**Knowledge Graph Construction**

Based on Python code written by Prateek Joshi, adapted for COMP3220 (Document Processing and the Semantic Web).

In [54]:
# Libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib notebook

# Progression bar
from tqdm import tqdm

In [55]:
# SpaCy is an open-source software library for advanced natural language processing.
import spacy
from spacy.matcher import Matcher 


# The en_core_web_sm  model for English was trained on written web text (blogs, news, 
# comments), and includes vocabulary, vectors, syntax and entities.

nlp = spacy.load('en_core_web_sm')

In [56]:
pd.set_option('display.max_colwidth', 200)

In [57]:
# Read CSV file that contains more than 4300 sentences extracted from 500 Wikipedia articles. 
#  Each of these sentences contains exactly two entities: one subject and one object. 

candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")

candidate_sentences['sentence'].sample(5)

1543    kannada actor rajkumar began working with veeranna and later became an important actor.
342                       the musical score and the incidental music were composed by stothart.
3339                           although mostly used by kodak cameras, it became very popular.  
1892                                                        the film was given an imax release.
3511                                                   levels i, iia, and iib are unrestricted.
Name: sentence, dtype: object

In [58]:
# Let’s check the subject and object of one of these sentences.

doc = nlp("the drawdown process is governed by astm standard d823")
doc = nlp("the film had 200 patents")
for tok in doc:
    print(tok.text, "-->", tok.dep_)

# There is only one subject (‘process’) and one object (‘standard’). 
# You can find a list the dependency lables here:
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

the --> det
film --> nsubj
had --> ROOT
200 --> nummod
patents --> dobj


### Task 1 ###
Complete the Python function (get_entities) below that should loop through a sentence and extract the subject (ent1) and the object (ent2). Note that an entity can span across multiple tokens (e.g. 'red wine'), but the dependency parser tags only 
individual tokens as subjet (subj) and object (obj).

In order to deal with this problem, the code should use a variable (prv_tok_dep) that will hold the dependency tag of the 
previous token in the sentence, and a variable (prv_tok_text) that will hold the previous token itself. Finally, the two variables (prefix and modifier) should be used to hold the sequence of tokens that is assocated with the subject or the object.

You can find everything that you need to know about spaCy here: https://course.spacy.io/en/chapter1 (Chapter 1).

In [37]:
def get_entities(sent):
    
    ent1 = ""           # Variable for storing the subject.
    ent2 = ""           # Variable for storing the object.

    prv_tok_dep = ""    # Variable for dependency tag of previous token in the sentence.
    prv_tok_text = ""   # Variable for previous token in the sentence.

    prefix = ""         # Variable for storing compounds.
    modifier = ""       # Variable for storing modifieres.

    # Loop through the tokens in the sentence.
    doc = nlp(sent)
    for i,tok in enumerate(doc):
        # Check if a token is a punctuation mark or not.
        if tok.dep_!="punct":
            # Check if a token is a compound one or not.
            if tok.dep_=="compound":
            # If yes, then store the token in the prefix variable.
                prefix = tok.text
            # Check if the previous token was also a compound one.
                if prv_tok_dep == "compound":
              # If yes, then update the prefix variable.
                  prefix = prv_tok_text+" "+tok.text

        # Check if a token is a modifier or not.
        if tok.dep_.endswith('mod') == True:
            # If yes, then store the token in the modifier varible.
            modifier = tok.text
            # Check if the previous token was a compound one.
            if prv_tok_dep == 'compound':
              # If yes, then update the modifier variable.
              modifier = prv_tok_text+" "+tok.text

        # Check if a token is the subject.
        if tok.dep_.find('subj') == True:
            # If yes, then concatenate the modifier, prefix, and token
            # and assign the result to the subject variable (ent1).
            ent1 = modifier+" "+prefix+" "+tok.text
            # Reset the following variables: prefix, modifier, prv_tok_dep, and prv_tok_text.
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        # Check if a token is the object.
        if tok.dep_.find('obj') == True:
            # If yes, then concatenate the modifier, prefix, and token 
            # and assign the result to the object variable (ent2).
            ent2 = modifier+" "+prefix+" "+tok.text

            # Update the variable for the dependency tag for the previous token. 
            prv_tok_dep = doc[i-1].dep_
            # Update the variable for the previous token in the sentence.
            prv_tok_text = doc[i-1].text

    return [ent1.strip(), ent2.strip()]

In [38]:
# Test function:

print(get_entities("the film had 200 patents"))

['film', '200  patents']


In [39]:
# Extract these entity pairs (subject, object) for all the sentences.

entity_pairs = []

for i in tqdm(candidate_sentences["sentence"]):
    entity_pairs.append(get_entities(i))
  

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 4318/4318 [00:18<00:00, 228.16it/s]


In [40]:
print(entity_pairs[10:20])

[['we', 'tests'], ['', 'international sales rights'], ['canadian robbie robertson', 'soundtrack'], ['it', 'original music tracks'], ['it', 'reviewed  franchise'], ['she', 'accidentally  mystique'], ['military  forces', 'arrest'], ['train', 'vuk'], ['kota eberhardt', 'selene gallio'], ['singer', 'sequel']]


### Task 2 ###

The relation (ROOT) will connect a subject with an object in our knowledge graph.

Use the rule-based pattern matcher of spaCy to extract the dependency relation (ROOT) of a sentence. 
Once this ROOT (or main verb) is identified, then the pattern should check whether that ROOT 
is followed by a preposition (prep), an agent word (agent) or an adjective (ADJ). If that is the case, then that word
is added to the ROOT. 

Check the spaCy video about rule-based matching (Chapter 1; 10. Rule-based matching) here: https://course.spacy.io/en/chapter1 to get an idea about how to write such a pattern.

In [48]:
def get_relation(sent):
  """Extract Verbs"""
  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  # Define the pattern 
  pattern = [{'DEP': 'ROOT'},
             {'DEP': 'prep', 'OP':'?'},
             {'DEP': 'agent', 'OP':'?'},
             {'POS': 'ADJ', 'OP': '?'}]
  
  matcher.add("matching_1", [pattern]) 

  matches = matcher(doc) # (match_id, start, end)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [49]:
# Test function:

print(get_relation("John completed the task"))

completed


In [50]:
# Get the relations from all the Wikipedia sentences:

relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

print(relations)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 4318/4318 [00:18<00:00, 230.89it/s]

['decides', 'heard in', 'paralyzed by', 'set on', 'wails with', "'s", 'joined', 'revealed', 'revealed as', 'tried', 'went through', 'circling', 'supervised', 'features', 'is', 'injures', 'tasked with', 'attacked by', 'portrays', 'return', 'stand', 'considered for', 'served as', 'completed on', 'released', 'released', 'distributed by', 'ended', 'chandler', 'dedicated to executive', 'approach former', 'frees', 'disowns', 'appears as', 'credited as', 'stated', 'empathizing with', 'was', 'contributed to', 'took', 'changed', 'became', 'look like big', 'perform', 'managed', 'released on', 'written by', 'conducted by', 'released', 'released', 'scheduled', 'offered in', 'includes', 'include limited', 'are ready', 'was', 'tend', 'include', 'prevalent in typical', 'blart', 'are', 'had many', 'supervises', 'oversee', 'credited', 'start in', 'started as', 'attended', 'use different', 'take', 'equipped with sound', 'designed for', 'cooperates with', 'femis in', 'is', 'presented in', 'presented in',




In [51]:
print(pd.Series(relations).value_counts()[:50])

is               348
was              283
released on       82
are               73
were              67
include           61
                  50
's                41
released          39
have              31
has               29
became            29
become            26
composed by       26
released in       26
included          22
called            21
produced          21
been              20
considered        19
used              18
had               18
be                16
made              16
received          15
hired             14
went              14
scheduled         14
directed by       13
wrote             13
introduced in     13
set               12
wanted            11
won               11
produced by       11
began in          11
began             11
cast as           10
features          10
gave              10
sold              10
stars             10
\n                10
written by        10
reported           9
includes           9
going              9
gives        

In [52]:
# Extract subjects.
subjects = [i[0] for i in entity_pairs]

# Extract objects.
objects = [i[1] for i in entity_pairs]

In [53]:
# Build a pandas dataframe of source, relation target for the knowledge graph.
kg_df = pd.DataFrame({'source':subjects, 'edge':relations, 'target':objects})

kg_df.head(10)

Unnamed: 0,source,edge,target
0,connie,decides,own
1,later scream,heard in,distance
2,christian,paralyzed by,then elder
3,temple,set on,fire
4,outside cult,wails with,him
5,it,'s,religious awakening
6,c. mackenzie,joined,craig cast
7,di francia,revealed,action cast
8,sebastian maniscalco,revealed as,later ben cast
9,we,tried,just film


In [275]:
# Function to plot the knowledge graph for a specific relation. 

def plot_kg_specific_relations(relation):  
    
    # Construct the graph.
    G = nx.from_pandas_edgelist(kg_df[kg_df['edge']==relation], "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())

    pos = nx.spring_layout(G, k=0.9)
    plt.figure(figsize=(9, 9)) 
    
    # Draw the graph.
    nx.draw(G, pos, edge_color='black', width=1, linewidths=1, node_size=1000, font_size=8, node_color='orange', alpha=0.9, labels={node:node for node in G.nodes()})
    
    labels = {}
    
    for _, row in kg_df.iterrows():
        if (row[1] == relation): 
            labels[(row[0], row[2])] = row[1]
    
    # Add the label names in form of a dictionnary { (Subject, Object):Relation } to the graph.
    nx.draw_networkx_edge_labels(G, pos, edge_labels=labels, font_size=8, font_color='black')
    plt.show()


In [276]:
plot_kg_specific_relations('composed by')

<IPython.core.display.Javascript object>

In [277]:
plot_kg_specific_relations('released in')

<IPython.core.display.Javascript object>

In [278]:
plot_kg_specific_relations('written by')

<IPython.core.display.Javascript object>

In [294]:
# Function to plot the knowledge graph for a specific entity. 

def plot_kg_specific_entity(entity):
    G=nx.from_pandas_edgelist(kg_df[kg_df['source']==entity], "source", "target", 
                              edge_attr=True, create_using=nx.MultiDiGraph())

    plt.figure(figsize=(9,9))
    pos = nx.spring_layout(G, k = 0.9) 
    nx.draw(G, pos, edge_color='black', width=1, linewidths=1, node_size=1000, font_size=8, node_color='skyblue', alpha=0.9, labels={node:node for node in G.nodes()})
    plt.show()

In [295]:
plot_kg_specific_entity('schwarzenegger')

<IPython.core.display.Javascript object>