# Information extraction using spaCy

Extract triples (subject, predicate, object) from text via Subtree Matching

In [None]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 
from spacy.matcher import Matcher 
from spacy.tokens import Span
from spacy import displacy

from nltk.tokenize import sent_tokenize

# Let pandas show up to 200 characters per column so long text cells are not cut short.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Load the small English spaCy pipeline; `nlp` is the parser used by the cells that follow.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Passive-voice example sentence.
text = "Tableau was recently acquired by Salesforce." 

# Parse the sentence and render its dependency tree inline in the notebook.
doc = nlp(text)
displacy.render(doc, style = 'dep', jupyter = True)

In [None]:
# Print every token with its dependency label (dep_) and part-of-speech tag (pos_).
for tok in doc: 
    print(tok.text,"-->",tok.dep_,"-->",tok.pos_)

In [None]:
def subtree_matcher(doc):
    """Return the (subject, object) token texts of a parsed sentence.

    The sentence is treated as passive when any token's dependency label
    contains "subjpass" (e.g. ``nsubjpass``, ``csubjpass``). In a passive
    sentence the grammatical subject is reported as the object, and the
    token whose label ends in "obj" is reported as the subject, so that
    active and passive phrasings of the same fact give the same pair.

    Args:
        doc: A re-iterable sequence of tokens exposing ``dep_`` and ``text``
            (presumably a spaCy ``Doc`` or ``Span``). It is iterated twice.

    Returns:
        A ``(subj, obj)`` tuple of strings. Each element is the text of a
        single token, not of the whole phrase. When several tokens qualify
        for a role the last one wins; a role with no match is ``''``.
    """
    # Substring test instead of `str.find(...) == True`: find() returns an
    # index, which only equals True when the match happens to start at 1.
    is_passive = any("subjpass" in tok.dep_ for tok in doc)

    subj = ''
    obj = ''

    for tok in doc:
        dep = tok.dep_
        if is_passive:
            # Passive: the grammatical subject is the thing acted upon ...
            if "subjpass" in dep:
                obj = tok.text
            # ... and the "*obj" token is reported as the acting subject.
            if dep.endswith("obj"):
                subj = tok.text
        else:
            if dep.endswith("subj"):
                subj = tok.text
            if dep.endswith("obj"):
                obj = tok.text

    return subj, obj

In [None]:
# Run the matcher on the parsed passive example; the cell displays the returned (subject, object) tuple.
subtree_matcher(doc)

In [None]:
# Second passive example, with an extra descriptive phrase set off by commas after the subject.
text2 = "Careem, a ride-hailing major in the middle east, was acquired by Uber."

# subtree_matcher keeps the last matching token per role, so a later "*obj" token overrides an earlier one.
doc2 = nlp(text2)
subtree_matcher(doc2)

In [None]:
# Active-voice phrasing of the first example, to exercise the non-passive branch of the matcher.
text3 = "Salesforce recently acquired Tableau."

doc3 = nlp(text3)
subtree_matcher(doc3)

In [None]:
from nltk.tokenize import sent_tokenize

# Multi-sentence sample; it is split into sentences and each one is matched separately.
corpus = "Neuerkirch and Külz abandoned their old fossil fuel heating system for a 100% renewable heating system – a biomass plant supplies 75% of the energy and a solar thermal plant provides for the rest. Only a few homes are not connected to the central system."
sentences = sent_tokenize(corpus)
for sentence in sentences:
    # subtree_matcher reads token attributes (dep_, text), so each sentence
    # string has to be parsed by the spaCy pipeline first. Iterating a raw
    # str yields characters, which have no dep_ attribute.
    print(subtree_matcher(nlp(sentence)))

In [1]:
import spacy
import textacy

# Load the large English pipeline; this rebinds `nlp`, so later cells parse with this model.
nlp = spacy.load("en_core_web_lg")
# NOTE: despite its name, `text` holds the parsed result of nlp(...), not a raw string.
text = nlp("Neuerkirch and Külz abandoned their old fossil fuel heating system for a 100% renewable heating system – a biomass plant supplies 75% of the energy and a solar thermal plant provides for the rest. Only a few homes are not connected to the central system.")
# Extract (subject, verb, object) triples with textacy; the result is iterated in the next cell.
text_ext = textacy.extract.subject_verb_object_triples(text)

In [2]:
# Print each extracted triple on its own line.
for item in text_ext:
    print(item)

(Neuerkirch, abandoned, heating system)
(Külz, abandoned, heating system)
(biomass plant, supplies, %)


In [15]:
# Repeat the textacy extraction on the final sentence of the sample text by itself.
# NOTE: this rebinds `text2`, which earlier held a raw string, to a parsed document.
text2 = nlp("Only a few homes are not connected to the central system.")
text_ext2 = textacy.extract.subject_verb_object_triples(text2)

# Print whatever triples were found for this single sentence.
for item in text_ext2:
    print(item)