In [141]:
import os
import pickle
import re
import time
from itertools import chain

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token
from tqdm import tqdm


# Raise pandas' maximum displayed column width to 200 characters so long text cells stay readable.
pd.set_option('display.max_colwidth', 200)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [30]:
# Load the article dataset from a path relative to the notebook; later cells read its "text" column.
data_df = pd.read_csv("../input/onllinekhabar-english/onlinekhabar_english.csv")

In [46]:
# Convert the frame into a list of rows, concatenate the rows into one flat
# list of cell values, and keep only the first 250 values.
# NOTE(review): this flattens every column of data_df, not only "text" --
# confirm that is intended, since each value is later passed to
# create_svo_triples as if it were article text.
article_list = data_df.values.tolist()
flattened_article_data = [cell for row in article_list for cell in row][:250]

In [40]:
# Split every article in the "text" column into sentences (one list per
# article), then concatenate those lists into a single flat list of sentences.
sentence_data = [sent_tokenize(article_text) for article_text in data_df["text"]]
flattened_sentence_data = list(chain.from_iterable(sentence_data))

In [5]:
# Shell escape: download the spaCy model that a later cell loads with spacy.load('en_core_web_lg').
!python -m spacy download en_core_web_lg

In [36]:
# Dependency labels that decide which role a token plays in a triple.
SUBJECTS = [
    "nsubj",
    "nsubjpass",
    "csubj",
    "csubjpass",
    "agent",
    "expl",
]
VERBS = ["ROOT", "advcl"]
OBJECTS = [
    "dobj",
    "dative",
    "attr",
    "oprd",
    "pobj",
]

# Plain copy of the model; used by remove_stop_words_and_punct.
non_nc = spacy.load("en_core_web_lg")

# Second copy of the same model with the 'merge_noun_chunks' component added;
# used to parse full texts when extracting triples.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_noun_chunks")


In [37]:
def remove_special_characters(text):
    """Return ``text`` with every newline, carriage return and tab replaced by a space.

    Each matching character is replaced individually, so the string keeps its
    length and consecutive control characters become consecutive spaces.
    """
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text):
    """Strip stop words and punctuation from ``text``.

    The text is tokenized with the module-level ``non_nc`` pipeline. Tokens
    flagged ``is_stop`` or ``is_punct`` are discarded and the remaining tokens
    are joined with single spaces. An empty string is returned when no token
    survives.

    NOTE(review): only ``is_stop`` and ``is_punct`` are checked, so any other
    kind of token (e.g. whitespace tokens, if the tokenizer emits them) is kept
    and ends up in the joined result.
    """
    kept_tokens = [
        str(tok)
        for tok in non_nc(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return ' '.join(kept_tokens)

In [38]:
def create_svo_lists(doc):
    """Sort the tokens of ``doc`` into subject, verb and object candidates.

    A token is classified by its dependency label ``dep_``, checked against
    SUBJECTS, then VERBS, then OBJECTS; tokens matching none are ignored.

    Returns:
        Three lists ``(subjects, verbs, objects)`` of ``(text, idx)`` pairs,
        where ``idx`` is the token's ``idx`` attribute. Subjects and objects
        use the lower-cased token text, verbs use the lemma.
    """
    subjects, verbs, objects = [], [], []

    for tok in doc:
        role = tok.dep_
        if role in SUBJECTS:
            subjects.append((tok.lower_, tok.idx))
            continue
        if role in VERBS:
            verbs.append((tok.lemma_, tok.idx))
            continue
        if role in OBJECTS:
            objects.append((tok.lower_, tok.idx))

    return subjects, verbs, objects

def remove_duplicates(tup, tup_posn):
    """Keep only the first item seen for each distinct value at index ``tup_posn``.

    Args:
        tup: iterable of indexable items (tuples in this notebook).
        tup_posn: index of the field used as the de-duplication key.

    Returns:
        A new list with the surviving items in their original order.
    """
    first_by_key = {}
    for item in tup:
        # setdefault only stores the item when the key is new, so the first
        # occurrence wins; dicts preserve insertion order.
        first_by_key.setdefault(item[tup_posn], item)
    return list(first_by_key.values())


def remove_dates(tup_ls):
    """Drop every triple whose third element consists only of digits.

    Args:
        tup_ls: iterable of triples whose element at index 2 is a string.

    Returns:
        A new list holding the remaining triples in their original order.
    """
    return [triple for triple in tup_ls if not triple[2].isdigit()]

In [39]:
def create_svo_triples(text):
    """Extract (subject, verb, object) triples from ``text``.

    The text is cleaned with ``remove_special_characters``, parsed with the
    module-level ``nlp`` pipeline and split into candidates by
    ``create_svo_lists``. Every subject is paired with every object, and each
    object is linked to the verb whose position is closest to it (the first
    such verb on ties). Stop words and punctuation are stripped from subjects
    and objects; pairs where either side ends up empty are skipped.

    The combined list is then de-duplicated on the object (index 2), so only
    the first triple for each distinct object text is kept, and triples whose
    object is purely numeric are removed.

    Args:
        text: raw text to analyse.

    Returns:
        A list of ``(subject, verb_lemma, object)`` string triples; empty when
        no subject, verb or object candidate is found.
    """
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc)

    # Nothing to pair up. The verb check also keeps min() below from being
    # called on an empty sequence.
    if not (subject_ls and verb_ls and object_ls):
        return []

    # Stop-word removal runs a spaCy pipeline, so do it once per subject and
    # once per object instead of once per (subject, object) pair.
    clean_subjects = []
    for subj_text, _subj_idx in subject_ls:
        no_sw_subj = remove_stop_words_and_punct(subj_text)
        if no_sw_subj:
            clean_subjects.append(no_sw_subj)
    if not clean_subjects:
        return []

    # The verb chosen for an object depends only on that object, so resolve
    # it once per object as well.
    object_relations = []
    for obj_text, obj_idx in object_ls:
        no_sw_obj = remove_stop_words_and_punct(obj_text)
        if not no_sw_obj:
            continue
        nearest_verb = min(verb_ls, key=lambda v: abs(obj_idx - v[1]))
        object_relations.append((nearest_verb[0], no_sw_obj))

    # Same ordering as a nested subject/object loop: subjects outer, objects inner.
    graph_tup_ls = [
        (subj, verb, obj)
        for subj in clean_subjects
        for verb, obj in object_relations
    ]

    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    clean_tup_ls = remove_dates(dedup_tup_ls)

    return clean_tup_ls

In [55]:
%%time
# Extract triples from every entry of flattened_article_data and collect them
# in one flat list; %%time reports how long the whole cell takes.
final_tups = []
for i in flattened_article_data:
    final_tups.extend(create_svo_triples(i))

In [54]:
# Number of triples collected in final_tups.
len(final_tups)

In [56]:
# Split each (subject, relation, object) triple into the three columns of the
# edge table: subject -> 'source', object -> 'target', relation -> 'edge'.
source = [subj for subj, _rel, _obj in final_tups]
target = [obj for _subj, _rel, obj in final_tups]
relations = [rel for _subj, rel, _obj in final_tups]

kg_dff = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

In [57]:
# Build a directed multigraph from the edge table: 'source' and 'target' name
# the end points, and edge_attr=True attaches the remaining column(s) to each edge.
GG = nx.from_pandas_edgelist(
    kg_dff,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

In [58]:
# Draw the whole knowledge graph using a spring layout.
# No seed is passed to spring_layout, so node positions may differ between runs.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(GG, k=0.5)  # k regulates the distance between nodes
nx.draw(
    GG,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=500,
    edge_cmap=plt.cm.Blues,
)
plt.show()

In [142]:
# Writing the graph into a pickle file.
# nx.write_gpickle was removed in NetworkX 3.0; the standard-library pickle
# module is used instead, which works on older NetworkX releases as well.
# Only load this file back from a trusted location -- unpickling executes code.
with open("final_graph.gpickle", "wb") as graph_file:
    pickle.dump(GG, graph_file, pickle.HIGHEST_PROTOCOL)

In [140]:
# Querying the graph with source node "balen shah" and edge "have".
# NOTE(review): the source literal below starts with two spaces -- presumably
# matching how the node label is stored in kg_dff; confirm before changing it.
GgG = nx.from_pandas_edgelist(
    kg_dff[(kg_dff['source'] == "  balen shah") & (kg_dff['edge'] == "have")],
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

plt.figure(figsize=(3, 3))
pos = nx.spring_layout(GgG, k=0.5)  # k regulates the distance between nodes
nx.draw(
    GgG,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=500,
    edge_cmap=plt.cm.Blues,
)
plt.show()

In [143]:
def create_svo_triples_for_questions(text):
    """Extract (subject, verb, object) triples from a question.

    Works like ``create_svo_triples`` but keeps subject and object texts as
    produced by ``create_svo_lists`` (no stop-word or punctuation removal), so
    question words remain available in the triple.

    Every subject is paired with every object; the verb is the one whose
    position is closest to the object (first one on ties). The result is
    de-duplicated on the object (index 2) and triples with a purely numeric
    object are dropped.

    Returns:
        A list of ``(subject, verb_lemma, object)`` string triples.
    """
    doc = nlp(remove_special_characters(text))
    subjects, verbs, objects = create_svo_lists(doc)

    candidates = []
    for subj_text, _subj_idx in subjects:
        for obj_text, obj_idx in objects:
            # Index of the verb closest to this object.
            nearest = min(
                range(len(verbs)),
                key=lambda k: abs(obj_idx - verbs[k][1]),
            )
            if not (subj_text and obj_text):
                continue
            candidates.append((subj_text, verbs[nearest][0], obj_text))

    return remove_dates(remove_duplicates(candidates, 2))

In [144]:
def searching(text):
    """Answer a natural-language question from the ``kg_dff`` edge table.

    The question is turned into triples with
    ``create_svo_triples_for_questions`` and only the first triple
    ``(subject, verb, object)`` is used:

    * if the triple contains "which" or "who", rows whose edge equals the verb
      and whose target equals the object are selected and the source of the
      first row is the answer;
    * otherwise, if it contains "what", rows whose source equals the subject
      and whose edge equals the verb are selected and the target of the first
      row is the answer.

    The selected rows are also drawn as a small graph.

    Args:
        text: the question to answer.

    Returns:
        ``"The answer is: <value>"`` on success, or a message starting with
        ``"No answer found"`` when no triple can be extracted, the question
        type is not supported, or no row in ``kg_dff`` matches.
    """
    triples = create_svo_triples_for_questions(text)
    if not triples:
        return "No answer found: could not extract a subject-verb-object triple from the question."

    question_triple = triples[0]
    subj, verb, obj = question_triple

    # Each membership test must be spelled out: `"which" or "who" in t` is
    # always true because the non-empty string "which" is truthy on its own.
    if "which" in question_triple or "who" in question_triple:
        matches = kg_dff[(kg_dff['edge'] == verb) & (kg_dff['target'] == obj)]
        answer_column = "source"
    elif "what" in question_triple:
        matches = kg_dff[(kg_dff['source'] == subj) & (kg_dff['edge'] == verb)]
        answer_column = "target"
    else:
        return "No answer found: only 'who', 'which' and 'what' questions are supported."

    if matches.empty:
        return "No answer found: nothing in the knowledge graph matches the question."

    G_Answer = nx.from_pandas_edgelist(
        matches,
        "source",
        "target",
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )
    pos = nx.spring_layout(G_Answer, k=0.5)  # k regulates the distance between nodes
    nx.draw(G_Answer, with_labels=True, node_color='skyblue', node_size=500, edge_cmap=plt.cm.Blues, pos=pos)
    plt.show()

    return "The answer is: " + str(matches.iloc[0][answer_column])

In [154]:
# Let's test our graph with an example question in natural language.

searching("who controled coronavirus outbreak in Nepal?")

In [157]:
# Example "who" question: the returned string is shown as the cell output.
searching("who formed japan nepal friendship parliamentarian committee?")

In [158]:
# Example "who" question whose parse yields more than one object candidate; searching() uses the first triple only.
searching("who added bamdev gautam to national assembly?")

In [167]:
# Example short "who" question with a single verb and object.
searching("who informs cybercrime?")