In [1]:
import spacy
nlp = spacy.load("en_core_web_md")

In [2]:
import pdb
import os
import json

from spacy.matcher import Matcher 
from spacy.tokens import Span
from spacy import displacy

In [3]:
def read_OMCS_file(filename, language='en', save_filename=None):
    '''
    Given the filename and the language retrieves the sentences from the
    provided language and optionally stores them in another file.

    Every line of the file is handled as a tab-separated record: the
    field at index 1 is taken as the sentence and the field at index 4
    as its language code. Lines with fewer than five fields are reported
    on stdout and skipped.

    input:
        filename      : string containing the name of the file
        language      : string containing the name of the language
                        to look for.
        save_filename : optional string; when given (and non-empty) the
                        resulting list is also written to this path as
                        JSON.

    output: List of sentences from the OMCS text file belonging
            to that particular language.

    raises:
        AssertionError : if `filename` is not an existing file.
    '''
    assert os.path.isfile(filename), 'File does not exist!'

    data_list_language = []

    # Explicit encoding: the dump contains non-ASCII sentences, so the
    # platform default must not decide how the bytes are decoded.
    with open(filename, 'r', encoding='utf-8') as f:
        for data_line in f:
            # Drop the line terminator before splitting, otherwise a language
            # code sitting in the last column would carry a trailing '\n' and
            # never compare equal to `language`.
            fields = data_line.rstrip('\n').split('\t')
            try:
                line_language = fields[4]
            except IndexError:
                # Too few columns: report the raw line and carry on.
                print('Found a faulty sentence :', data_line)
                continue
            if line_language == language:
                data_list_language.append(fields[1])

    if save_filename:
        with open(save_filename, 'w', encoding='utf-8') as fp:
            json.dump(data_list_language, fp)

    return data_list_language
    
    
    

In [4]:
# Pull the English sentences (default language='en') out of the OMCS dump and
# also write them, as JSON, to 'english_sentences.txt'.
data = read_OMCS_file('./data/omcs-sentences-free.txt', save_filename='english_sentences.txt')


Found a faulty sentence : 1531350	Um(a) golfinho é usado(a) para brinc

Found a faulty sentence : 1194903	Some people like to check their e-mail first thing in the morning.

Found a faulty sentence : 1194907	People take showers in the morning.

Found a faulty sentence : 1194921	People eat cereal for breakfast.

Found a faulty sentence : 1194929	People can travel from work to home on a bicycle.

Found a faulty sentence : 1194960	Paper clips hold sheets of paper together.

Found a faulty sentence : 1044642	The statement "Telescopes make things look larger." is true because Refracting telescopes use lenses to gather and bend

Found a faulty sentence : (898159 rows)



In [5]:
# Load the sentence list from 'english_sentences.txt'; the content is parsed
# as JSON despite the .txt extension.
with open('english_sentences.txt', 'r') as fp:
    en_data = json.load(fp)

In [6]:
# Printing token information using spaCy: for each token of one sample
# sentence show its text, fine-grained tag, dependency label, and the text
# and coarse POS of its syntactic head.
sentence = en_data[121]
tag_info = nlp(sentence)
for token in tag_info:
    print(token.text, token.tag_, token.dep_, token.head.text, token.head.pos_)

Sometimes RB advmod causes VERB
lightning NN nsubj causes VERB
causes VBZ ROOT causes VERB
electricity NN compound shortouts NOUN
shortouts NNS dobj causes VERB


In [7]:
# Getting noun chunks using spaCy: for each chunk show its text, the text of
# its root token, the root's dependency label, and the text of the root's head.
# `sentence` is whatever the previous cell left in the global namespace.
doc = nlp(sentence)
for chunk in doc.noun_chunks:
    print(chunk.text, " | ", chunk.root.text, " | ", chunk.root.dep_, " | ",
            chunk.root.head.text)

lightning  |  lightning  |  nsubj  |  causes
electricity shortouts  |  shortouts  |  dobj  |  causes


In [8]:
# Navigating the parse tree: for each token show its dependency label, its
# head (text and coarse POS), and the texts of its direct children.
# Note: this rebinds the global `sentence` and `doc`.
sentence = "the film had 200 patents."
doc = nlp(sentence)
for token in doc:
    print(token.text," | ",  token.dep_," | ",  token.head.text," | ",  token.head.pos_," | ", 
            [child.text for child in token.children])

the  |  det  |  film  |  NOUN  |  []
film  |  nsubj  |  had  |  AUX  |  ['the']
had  |  ROOT  |  had  |  AUX  |  ['film', 'patents', '.']
200  |  nummod  |  patents  |  NOUN  |  []
patents  |  dobj  |  had  |  AUX  |  ['200']
.  |  punct  |  had  |  AUX  |  []


In [9]:
#code for getting the entities and relations from the tutorial

def get_entities(sent):
  '''
  Heuristically extract a (subject, object) pair of entity strings from a
  sentence (tutorial code).

  The sentence is parsed with the module-level `nlp` pipeline and scanned
  token by token. Tokens labelled "compound" are remembered as a prefix and
  tokens whose label ends in "mod" as a modifier; when a token whose label
  contains "subj" (resp. "obj") is reached, the entity is assembled as
  "<modifier> <prefix> <token>". Punctuation tokens are ignored entirely.

  input:
      sent : a sentence in the form of a string.

  output:
      [ent1, ent2] : list of two strings, the subject entity and the object
                     entity. Either may be "" when nothing was found. When
                     several subjects/objects occur, the last one wins.
  '''
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  def _join_parts(*parts):
    # Join only the non-empty parts so that a missing modifier or prefix
    # does not leave a double space inside the entity string.
    return " ".join(part for part in parts if part)

  #############################################################

  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ == "punct":
      continue

    # check: token is a compound word or not
    if tok.dep_ == "compound":
      prefix = tok.text
      # if the previous word was also a 'compound' then add the current word to it
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text

    # check: token is a modifier or not
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      # if the previous word was a 'compound' then add the current word to it
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text

    ## chunk 3
    # Substring test instead of `find(...) == True`: the latter only matched
    # labels in which "subj" starts at index 1.
    if "subj" in tok.dep_:
      ent1 = _join_parts(modifier, prefix, tok.text)
      # start collecting afresh for the object
      prefix = ""
      modifier = ""

    ## chunk 4
    if "obj" in tok.dep_:
      ent2 = _join_parts(modifier, prefix, tok.text)

    ## chunk 5
    # update variables (done for every non-punctuation token, so no separate
    # reset is needed after a subject has been found)
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]


def get_relation(sent):
  '''
  Extract the relation (predicate) of a sentence with a token pattern
  (tutorial code).

  The pattern is: the ROOT token, optionally followed by a token labelled
  "prep", one labelled "agent", and an adjective. The text of the last match
  reported by the matcher is returned.

  input:
      sent : a sentence in the form of a string.

  output:
      The matched span as a string, or "" when the matcher finds nothing
      (e.g. for an empty sentence).
  '''
  doc = nlp(sent)

  # Matcher class object
  matcher = Matcher(nlp.vocab)

  # define the pattern
  pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  # NOTE(review): this is the spaCy v2 call signature (key, on_match, *patterns);
  # spaCy v3 expects matcher.add("matching_1", [pattern]) - confirm against the
  # installed version before upgrading.
  matcher.add("matching_1", None, pattern)

  matches = matcher(doc)
  if not matches:
    # Nothing matched: return an empty relation instead of failing on the
    # index lookup below.
    return ""

  # each match is (match_id, start, end); use the last one
  _, start, end = matches[-1]
  return doc[start:end].text

In [None]:
#my code for extracting entities and relations

    
def check_verb_subject_in_clause(clause_token_list):
    '''
    Given a clause as a list of tokens, checks if it contains both
    a subject and a verb.

    A token counts as a subject when its dependency label contains "subj".
    A token counts as a verb when its coarse POS is "VERB" or "AUX", or when
    its dependency label is "ROOT" or "aux" (so a ROOT token qualifies
    whatever its part of speech).

    input:
        clause_token_list : iterable of tokens exposing `dep_` and `pos_`.

    output:
        contains : A boolean flag indicating the presence
                   of a verb and a subject in the clause.
    '''
    verb_pos_tags = ("VERB", "AUX")
    verb_dep_tags = ("ROOT", "aux")
    contains_verb = False
    contains_sub = False
    for token in clause_token_list:
        # Substring test: `token.dep_.find("subj") == True` was only true when
        # "subj" happened to start at index 1 of the label.
        if "subj" in token.dep_:
            contains_sub = True
        if token.pos_ in verb_pos_tags or token.dep_ in verb_dep_tags:
            contains_verb = True
        if contains_verb and contains_sub:
            # both found, no need to look at the remaining tokens
            return True
    return False
    

        
def get_subtree_excluding_subclause(node):
    '''
    Given the node, returns the subtree rooted at that node with embedded
    clauses left out: ideally to extract the subject or object subtree.

    The subtree is collected recursively as: kept tokens of the left
    children, the node itself, kept tokens of the right children. A child's
    (already pruned) subtree is dropped only when the child's dependency
    label is one of the clausal labels AND check_verb_subject_in_clause
    reports that the subtree holds both a verb and a subject.

    input:
        node : a token exposing `lefts`, `rights`, `dep_` and `text`.

    output:
        entity        : list of the kept tokens.
        entity_string : the texts of those tokens joined by single spaces.
        dep           : the dependency label of `node`.
    '''
    # The comma after "prep" matters: without it Python concatenates the two
    # adjacent literals into "prepadvcl" and neither label is recognised.
    clausal_list = ["ROOT", "csubj", 'ccomp', "prep",
                    'advcl', 'acl', 'conj',
                    'relcl']

    def _kept_tokens(child):
        # Tokens contributed by one child: its pruned subtree, or nothing when
        # the child heads a full clause (clausal label + verb + subject).
        subtree_tokens, _, dep = get_subtree_excluding_subclause(child)
        if dep in clausal_list and check_verb_subject_in_clause(subtree_tokens):
            return []
        return subtree_tokens

    entity = []
    for left in node.lefts:
        entity.extend(_kept_tokens(left))

    entity.append(node)

    for right in node.rights:
        entity.extend(_kept_tokens(right))

    entity_string = " ".join(tok.text for tok in entity).strip()
    return entity, entity_string, node.dep_
    
def extract_clauses(sent):
    '''
    Break a sentence into its clauses.

    The sentence is parsed with the module-level `nlp` pipeline. Every token
    whose dependency label is one of the clause-heading labels below is
    expanded with get_subtree_excluding_subclause; the resulting token list
    is kept only if check_verb_subject_in_clause accepts it, i.e. it holds
    both a verb and a subject.

    input:
        sent : a sentence in the form of a string.

    output:
        A list of clauses, each clause being a list of tokens.
    '''
    # Dependency labels that may head a clause:
    #   ROOT  - root of the sentence
    #   csubj - clausal subject
    #   ccomp - clausal complement
    #   prep  - prepositional modifier
    #   advcl - adverbial clause modifier
    #   acl   - clausal modifier of a noun
    #   conj  - conjunct
    #   relcl - relative clause modifier
    clause_heads = ("ROOT", "csubj", "ccomp", "prep",
                    "advcl", "acl", "conj", "relcl")

    parsed = nlp(sent)
    found = []
    for tok in parsed:
        if tok.dep_ not in clause_heads:
            continue
        candidate = get_subtree_excluding_subclause(tok)[0]
        # a candidate without both a verb and a subject is not a clause
        if check_verb_subject_in_clause(candidate):
            found.append(candidate)
    return found




# --- Sample sentences used to exercise the clause extraction -----------------
# One sentence from the corpus plus hand-written simple, compound and complex
# sentences.
sentence = en_data[889]
sentence0 = 'The sky is blue.'
sentence1 = "the city of the loir-et-cher is part of the greater mumbai"
sentence2 = 'Even with the weather being that nasty, the couple and their families decided to go ahead \
with the wedding as planned.'
sentence3 = 'My little daughter loves to play with her dolls.'
sentence4 = 'Because she loves her students, Mrs Stevens will be sad on the last day of school.'
sentence5 = 'My mother suggested that I should consult a doctor'
sentence6 = 'In the spring, Damien will run his first marathon.'
sentence7 = 'If there is no leftover pizza, Rosa usually eats \
whole-grain cereal.'
sentence8 = 'Huffing and puffing, we arrived at the classroom door with only seven seconds to spare'
# NOTE(review): `compound_sentence` is assigned twice; the second assignment
# replaces the first, so only the three-clause sentence is reachable.
compound_sentence = 'I like road bikes, and he likes mountain bikes.'
compound_sentence = 'She ran with the dogs, I swam with the fishes, and they biked to the mountains.'
complex_sent = 'John retired when he turned 65'
complex_sent2 = 'Whether you agree with me or not makes little \
difference to our inverstors, who by the way, are the ones most \
affected by whatever mistake we make.'
complex_sent3 = 'Whoever thought of the idea is a genius.'
complex_compound1 = "Bill voted against the measure \
because he felt that it wasn't strong enough, but he also offered \
to continue discussions, which we will do next week." 


# --- Analyse one of the sentences above --------------------------------------
# Change `current` to try a different sentence.
current = sentence8
doc = nlp(current)
print("The sentence :", current," \n\n")

# Per token: text, dependency label, coarse POS, head text, head POS, then the
# texts of its left children, right children and all children.
print("Its analysis :")
for token in doc:
    print(token.text," | ",  token.dep_," | ", token.pos_, " | ", token.head.text," | ",  token.head.pos_," | ", 
           [left.text for left in token.lefts], " | ",
           [right.text for right in token.rights], " | ",
           [child.text for child in token.children])

# Each clause is printed as a list of tokens.
print("The extracted clauses :")
clauses = extract_clauses(current)
for clause in clauses:
    print(clause)
# NOTE(review): presumably displacy.serve blocks the cell while it serves the
# visualisation; displacy.render may be the better fit inside a notebook -
# confirm against the spaCy documentation.
displacy.serve(doc, style='dep')



In [None]:
#my code for extracting entities and relations

def extract_subject_object(sent):
    '''
    Extract the first subject and the first direct object of a sentence.

    The sentence is parsed with the module-level `nlp` pipeline. For the first
    token whose dependency label contains "subj" (resp. "dobj"), the texts of
    its direct children - except determiners, punctuation, conjuncts and
    prepositions - are collected, followed by the token's own text. Note that
    all kept children are placed before the token, including children that
    stand to its right in the sentence.

    input:
        sent : a sentence in the form of a string.

    output:
        (sub_string, obj_string) : tuple of two strings; either is "" when no
                                   matching token was found.
    '''
    skipped_child_deps = ("det", "punct", "conj", "prep")

    def _with_children(token):
        # Texts of the kept direct children, then the token itself.
        words = [child.text for child in token.children
                 if child.dep_ not in skipped_child_deps]
        words.append(token.text)
        return words

    subject = []
    obj = []
    for token in nlp(sent):
        # Substring tests instead of `find(...) == True`: that comparison is
        # only true when the substring starts at index 1, and
        # "dobj".find("dobj") is 0, so the object was never picked up.
        if not subject and "subj" in token.dep_:
            subject = _with_children(token)

        if not obj and "dobj" in token.dep_:
            obj = _with_children(token)

    sub_string = " ".join(subject).strip()
    obj_string = " ".join(obj).strip(" ")
    return (sub_string, obj_string)


def extract_entities_from_clause(token_list):
    '''
    Given a clause in the form of a token list (to preserve the dependency of
    the tokens from the original sentence) returns the possible entities
    (subject, object) together with the relation token.

    Every token whose dependency label contains "obj" yields one object
    entity (its subtree as built by get_subtree_excluding_subclause). The
    subject entity is built the same way from the tokens whose label contains
    "subj"; if there are several, the last one wins. In the absence of an
    object, the subtree of the relation token is searched for a substitute:
    tokens labelled "prep", "acomp", "advmod" or "attr".

    input:
        token_list : list of tokens forming one clause.

    output:
        subject_entity_list : the subject string, repeated once per object.
        object_entity_list  : one string per object (or substitute) found.
        relation_list       : the relation token returned by
                              extract_relation_from_clause (possibly None),
                              repeated once per object.
        entity2_dep_list    : dependency label of each object token.
        All four lists have the same length; they are empty when neither an
        object nor a substitute was found.
    '''
    intrans_verb_dep_list = ["prep", "acomp", "advmod", "attr"]
    object_entity_list = []
    entity2_dep_list = []
    subject_entity = ''

    for token in token_list:
        # Substring tests instead of `find(...) == True`, which only matched
        # labels where the substring starts at index 1.
        if "obj" in token.dep_:
            _, entity_string, _ = get_subtree_excluding_subclause(token)
            object_entity_list.append(entity_string)
            entity2_dep_list.append(token.dep_)

        if "subj" in token.dep_:
            _, subject_entity, _ = get_subtree_excluding_subclause(token)

    relation_token = extract_relation_from_clause(token_list)

    # if there are no objects look for a substitute below the relation token
    if not object_entity_list:
        print('No object, so looking for something else')
        # The search walks `relation_token.subtree`, not `token_list`.
        relation_subtree = relation_token.subtree if relation_token is not None else []
        for token in relation_subtree:
            if token.dep_ in intrans_verb_dep_list:
                _, entity_string, _ = get_subtree_excluding_subclause(token)
                object_entity_list.append(entity_string)
                entity2_dep_list.append(token.dep_)

    # pair the single subject and relation with every object found
    subject_entity_list = [subject_entity] * len(object_entity_list)
    relation_list = [relation_token] * len(object_entity_list)

    return subject_entity_list, object_entity_list, relation_list, entity2_dep_list


def extract_relation_from_clause(token_list):
    '''
    Pick the token that acts as the relation (verb) of a clause.

    The clause is given as a token list so that the dependency information
    of the original sentence is preserved. The list is scanned up to three
    times and the first hit of the first successful scan is returned:
        1. a token whose dependency label is 'ROOT'
        2. a token whose coarse POS is 'VERB'
        3. a token whose dependency label is 'aux'

    input:
        token_list : list of tokens forming one clause.

    output:
        The selected token, or None when no scan finds a candidate.
    '''
    # ordered from most to least preferred
    selectors = (
        lambda tok: tok.dep_ == 'ROOT',
        lambda tok: tok.pos_ == 'VERB',
        lambda tok: tok.dep_ == 'aux',
    )

    for accepts in selectors:
        hit = next((tok for tok in token_list if accepts(tok)), None)
        if hit is not None:
            return hit

    return None
    
    
            
            
def extract_entity_relation_tuples(sent):
    '''
    Turn a sentence into a list of entity-relation tuples.

    The sentence is split into clauses with extract_clauses and the
    entities and relation of each clause are obtained from
    extract_entities_from_clause.

    input:
        sent : A sentence in the form of a string.

    output:
        entity_rel_list : A list with one entry per (subject, object) pair
                          found in any clause, each of the form
                          ((subj, obj), rel).
    '''
    triples = []
    for clause_tokens in extract_clauses(sent):
        subjects, objects, relations, _ = extract_entities_from_clause(clause_tokens)
        # the three lists are read position by position
        triples.extend(
            ((subject, objects[pos]), relations[pos])
            for pos, subject in enumerate(subjects)
        )
    return triples
    



# --- Sample sentences used to exercise the entity/relation extraction --------
# One sentence from the corpus plus hand-written simple, compound and complex
# sentences.
sentence = en_data[8]
sentence0 = 'The sky is blue.'
sentence1 = "the city of the loir-et-cher is part of the greater mumbai"
sentence2 = 'Even with the weather being that nasty, the couple and their families decided to go ahead \
with the wedding as planned.'
sentence3 = 'My little daughter loves to play with her dolls.'
sentence4 = 'Because she loves her students, Mrs Stevens will be sad on the last day of school.'
sentence5 = 'My mother suggested that I should consult a doctor'
sentence6 = 'In the spring, Damien will run his first marathon.'
sentence7 = 'If there is no leftover pizza, Rosa usually eats \
whole-grain cereal.'
sentence8 = 'Huffing and puffing, we arrived at the classroom door with only seven seconds to spare'
# NOTE(review): `compound_sentence` is assigned twice; the second assignment
# replaces the first, so only the three-clause sentence is reachable.
compound_sentence = 'I like road bikes, and he likes mountain bikes.'
compound_sentence = 'She ran with the dogs, I swam with the fishes, and they biked to the mountains.'
complex_sent = 'John retired when he turned 65'
complex_sent2 = 'Whether you agree with me or not makes little \
difference to our inverstors, who by the way, are the ones most \
affected by whatever mistake we make.'
complex_sent3 = 'Whoever thought of the idea is a genius.'
complex_compound1 = "Bill voted against the measure \
because he felt that it wasn't strong enough, but he also offered \
to continue discussions, which we will do next week." 


# --- Run the extraction on one of the sentences above ------------------------
# Change `current` to try a different sentence.
current = sentence6
doc = nlp(current)
print("The sentence :", current," \n\n")
 

# For every clause print its tokens and the 4-tuple returned by
# extract_entities_from_clause (subjects, objects, relations, object labels).
print("The extracted clauses :")
clauses = extract_clauses(current)

for clause in clauses:
    print("Clause :", clause)
    print(extract_entities_from_clause(clause))
    
print("\n\n")
#getting entity relations from the blog
# (get_entities / get_relation yield a single subject/object/relation per sentence)
entities_other = get_entities(current)
relation_other = get_relation(current)
print("\nentity and relations from the blog:\nSubject : {} | Object : {} | Relation : {}\n\n".format(entities_other[0], 
                                                                               entities_other[1],
                                                                               relation_other))

#getting entity relations my code
# (one ((subject, object), relation) tuple per object found in each clause)
entity_list = extract_entity_relation_tuples(current)
for entity in entity_list:
    subj, obj = entity[0]
    rel = entity[1]
    print("\nentity and relations from my code:\nSubject : {} | Object : {} | Relation : {}\n\n".format(subj, 
                                                                              obj, rel))

#print(extract_entities_from_clause(current))
#print(extract_relation_from_clause(current))

# NOTE(review): presumably displacy.serve blocks the cell while it serves the
# visualisation; displacy.render may be the better fit inside a notebook -
# confirm against the spaCy documentation.
displacy.serve(doc, style='dep')
    

The sentence : In the spring, Damien will run his first marathon.  


The extracted clauses :
Clause : [In, the, spring, ,, Damien, will, run, his, first, marathon, .]
(['Damien', 'Damien'], ['the spring', 'his first marathon'], [run, run], ['pobj', 'dobj'])




entity and relations from the blog:
Subject : Damien | Object : first  marathon | Relation : run



entity and relations from my code:
Subject : Damien | Object : the spring | Relation : run



entity and relations from my code:
Subject : Damien | Object : his first marathon | Relation : run




  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

