# 1. Scrape text data from a selection of articles at the link above. You can use Beautiful Soup, newspaper3k, or any other data-scraping library.

In [None]:
import requests
from bs4 import BeautifulSoup

# Define the URL of the website to scrape
url = "https://english.onlinekhabar.com/"

# Send a GET request to the website and retrieve the HTML content.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were news.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_content = response.text

# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")

# Find the relevant articles or sections on the webpage
articles = soup.find_all("article")

# Collect the text of every <article> element; child text nodes are separated
# by a single space so adjacent inline elements do not run together.
article_texts = []
for article in articles:
    text = article.get_text(separator=" ")
    article_texts.append(text)

# Each article's text is terminated by a newline in the combined output.
scraped_text = "".join(chunk + "\n" for chunk in article_texts)

# Print the scraped text
print(scraped_text)


# 2. Split the text into sentences and apply the necessary NLP processing. You can use NLTK, spaCy, or any other NLP library.

In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Example text
text = "This is an example sentence. It showcases NLP processing."

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Load the stop-word list once, as a set: the original looked it up again for
# every single token, and list membership is O(n) per check.
stop_words = set(stopwords.words('english'))

# Process each sentence: tokenize, lowercase, drop stop words, lemmatize, re-join.
processed_sentences = []
for sentence in sentences:
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)

    # Convert to lowercase, then remove stop words
    # (punctuation tokens are not stop words and are kept)
    lowered_words = (word.lower() for word in words)
    words = [word for word in lowered_words if word not in stop_words]

    # Lemmatize the words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a sentence
    processed_sentence = " ".join(lemmatized_words)

    # Append the processed sentence to the list
    processed_sentences.append(processed_sentence)

# Print the processed sentences
for sentence in processed_sentences:
    print(sentence)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FCT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FCT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\FCT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


example sentence .
showcase nlp processing .


# 3. Extract the subject, object and relationship from each sentence. Extracting entities with modifiers is a plus point


In [17]:
import spacy

# Load spaCy's small English pipeline and parse one example sentence.
nlp = spacy.load("en_core_web_sm")
sentence = "Apple Inc. is headquartered in Cupertino, California."
doc = nlp(sentence)

# Start with empty results; each is overwritten by the LAST matching token.
subject, object_, relationship = "", "", ""

# Walk the dependency labels: any label containing "subj" marks a subject,
# any label containing "obj" marks an object, and the ROOT token is taken as
# the relationship.
for token in doc:
    dependency = token.dep_
    word = token.text

    if "subj" in dependency:
        subject = word

    if "obj" in dependency:
        object_ = word

    if "ROOT" in dependency:
        relationship = word

# Report the extracted triple.
print(f"Subject: {subject}")
print(f"Object: {object_}")
print(f"Relationship: {relationship}")


ImportError: cannot import name dataclass_transform

# 4. Build a directed graph from the above data, with entities as nodes and relationships as edges. Label each node and edge with the corresponding text. You can use networkx, ArangoDB, Neo4j, or any other graph/network library. Save the graph database.

# 5. Get an answer to the given question. Question sentences should be in natural language, follow the necessary steps to get the answer from the graph db.

In [None]:
# !pip uninstall numpy

In [4]:
# !pip install BeautifulSoup4
# !pip install newspaper3k
# !pip install pymongo
# !pip install nltk
# !pip install networkx
# !pip install spacy
# !pip install py2neo

# python -m spacy download en_core_web_sm

In [None]:

import requests
from bs4 import BeautifulSoup
import spacy
import networkx as nx
import re

# Step 1: Scraping text data from selected articles
url = "https://english.onlinekhabar.com/"
# Bound the wait with a timeout and fail loudly on an HTTP error instead of
# silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# sections = soup.select("section.ok-news-grid")[:3]
# print(sections)

# For all categories 
# for section in sections:
#     section_detail = section.select_one("h2 a")
#     if section_detail:
#         title = section_detail.text.strip()
#         link = section_detail["href"]
#         print("----")
#         print("Title:", title)
#         print("Link:", link)

# print("----")


In [None]:
# Content of Home page

import requests
from bs4 import BeautifulSoup
import spacy
import networkx as nx
import re
from tqdm import tqdm

# Step 1: Scraping text data from selected articles
REQUEST_TIMEOUT = 10  # seconds; prevents a stalled connection from hanging the cell

url = "https://english.onlinekhabar.com/"
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

sections = soup.select("div.ok-news-post")

# Collect the article URL from each post card, skipping cards that have no
# "h2 a" headline link or whose link has no href.
links = []
for section in sections:
    section_detail = section.select_one("h2 a")
    link = section_detail.get("href") if section_detail is not None else None
    if link:
        links.append(link)


def _text_or_empty(node):
    """Return ``node.text``, or an empty string when the selector matched nothing."""
    return node.text if node is not None else ""


def inside_content(links_list):
    """Download every article in ``links_list`` and extract its main fields.

    Args:
        links_list: iterable of article URLs.

    Returns:
        A dict mapping each successfully fetched URL to a record with the keys
        "Link", "Author", "Date", "Title" and "Content". Fields whose element
        is missing from the page are empty strings. URLs that fail to download
        are reported and skipped.
    """
    articles = {}
    for link in tqdm(links_list):
        try:
            response = requests.get(link, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole crawl;
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Skipping {link}: {exc}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        body = soup.find("div", class_="post-content-wrap")
        paragraphs = body.find_all("p") if body is not None else []

        articles[link] = {
            "Link": link,
            "Author": _text_or_empty(soup.select_one("div.ok-author span")),
            "Date": _text_or_empty(soup.select_one("span.ok-post-date")),
            "Title": _text_or_empty(soup.select_one("h1")),
            # Separate paragraphs with a newline so the last word of one
            # paragraph is not fused with the first word of the next.
            "Content": "\n".join(p.get_text() for p in paragraphs),
        }
    return articles


# Keep the result at module level: assigning inside the function only created
# a local variable and left this name bound to an empty dict.
news_dict = inside_content(links)



  3%|██▏                                                                                | 2/76 [00:00<00:07,  9.36it/s]

---------------------------------
---------------------------------


  7%|█████▍                                                                             | 5/76 [00:00<00:08,  8.61it/s]

---------------------------------
---------------------------------
---------------------------------


  9%|███████▋                                                                           | 7/76 [00:00<00:08,  7.98it/s]

---------------------------------
---------------------------------


 12%|█████████▊                                                                         | 9/76 [00:01<00:08,  8.24it/s]

---------------------------------
---------------------------------


 14%|███████████▊                                                                      | 11/76 [00:01<00:07,  8.98it/s]

---------------------------------
---------------------------------


 16%|████████████▉                                                                     | 12/76 [00:01<00:07,  8.92it/s]

---------------------------------


 18%|███████████████                                                                   | 14/76 [00:01<00:09,  6.21it/s]

---------------------------------
---------------------------------


 21%|█████████████████▎                                                                | 16/76 [00:02<00:08,  6.94it/s]

---------------------------------
---------------------------------


 24%|███████████████████▍                                                              | 18/76 [00:02<00:07,  7.85it/s]

---------------------------------
---------------------------------


 28%|██████████████████████▋                                                           | 21/76 [00:02<00:06,  8.31it/s]

---------------------------------
---------------------------------
---------------------------------
---------------------------------


 32%|█████████████████████████▉                                                        | 24/76 [00:03<00:06,  8.13it/s]

---------------------------------
---------------------------------


 34%|████████████████████████████                                                      | 26/76 [00:03<00:06,  7.96it/s]

---------------------------------
---------------------------------


 37%|██████████████████████████████▏                                                   | 28/76 [00:03<00:06,  8.00it/s]

---------------------------------
---------------------------------


 39%|████████████████████████████████▎                                                 | 30/76 [00:03<00:05,  8.78it/s]

---------------------------------
---------------------------------


 42%|██████████████████████████████████▌                                               | 32/76 [00:04<00:06,  7.32it/s]

---------------------------------
---------------------------------


 45%|████████████████████████████████████▋                                             | 34/76 [00:04<00:05,  8.05it/s]

---------------------------------
---------------------------------


 47%|██████████████████████████████████████▊                                           | 36/76 [00:04<00:04,  8.29it/s]

---------------------------------
---------------------------------


 49%|███████████████████████████████████████▉                                          | 37/76 [00:04<00:04,  8.35it/s]

In [None]:
# Show the module-level `news_dict` defined by the scraping cell above.
print(news_dict)

In [3]:

# Step 2: Applying necessary NLP processing
# NOTE(review): `text` is not defined in this cell; it is whatever an earlier
# cell left in the notebook namespace. Confirm it holds the scraped article
# content rather than a leftover example string.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)


def _with_modifiers(token):
    """Return the token's text prefixed by its adjectival modifiers.

    Modifiers are kept in the order spaCy yields them from ``token.children``.
    Prepending them one at a time, as the previous code did, reversed them
    whenever a noun had more than one.
    """
    modifiers = [child.text for child in token.children if child.dep_ == "amod"]
    return " ".join([*modifiers, token.text])


# Step 3: Extracting subject, object, and relationship from each sentence.
# A triple is emitted for every (nominal subject, direct object) pair that
# shares the same governing verb; the verb's text is the relationship.
edges = []
for sent in doc.sents:
    for token in sent:
        if token.dep_ != "nsubj" or token.head.pos_ != "VERB":
            continue
        subject = _with_modifiers(token)
        for child in token.head.children:
            if child.dep_ == "dobj":
                obj = _with_modifiers(child)
                edges.append((subject, obj, token.head.text))

# Step 4: Building a directed graph from the above data
# Nodes are entity strings; each edge carries the verb in its "relation" attribute.
G = nx.DiGraph()
for edge in edges:
    G.add_edge(edge[0], edge[1], relation=edge[2])

# Step 5: Getting an answer to the given question
question = "What is the latest news on politics?"
parsed_question = nlp(question)

# `query` is the human-readable topic used in the printed messages;
# `keywords` is what is actually matched against the graph.
query = ""
keywords = set()
for token in parsed_question:
    if token.pos_ == "NOUN":
        query += f"{token.text.title()} "
        keywords.add(token.text.lower())
    elif token.pos_ == "PROPN":
        query += f"{token.text} "
        keywords.add(token.text.lower())

if query:
    # Match each question keyword case-insensitively against the individual
    # words of both endpoint labels. Comparing the whole multi-word query for
    # exact equality with a node label could never match a question that
    # contains more than one noun.
    answers = []
    for source, target, data in G.edges(data=True):
        node_words = set(f"{source} {target}".lower().split())
        if keywords & node_words:
            # Report the full fact, not just the verb.
            answers.append(f"{source} {data['relation']} {target}")
    if answers:
        summary = "; ".join(answers)
        print(f"The latest news on {query.strip()} is: {summary}")
    else:
        print(f"No news found on {query.strip()}.")
else:
    print("Invalid question format.")

No news found on News Politics.


In [6]:
# !conda list
