In [1]:
import spacy
import random
from collections import Counter

import seaborn as sns #for visualization
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('seaborn')
sns.set(font_scale=2)
import json
def pretty_print(pp_object):
    print(json.dumps(pp_object, indent=2))
    
from IPython.display import Markdown, display
def printmd(string, color=None):
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))

In [2]:
from pprint import pprint

In [3]:
# Spipe = spacy.load('en_core_web_lg')

nlp_pipe = spacy.load('en_coref_lg')

In [4]:
def show_dep(doc):
    options = {'compact':False, 'collapse_punct':False, 'collapse_phrases':True}
    spacy.displacy.render(doc, jupyter=True, style='dep')

## Constants

### POS

In [5]:
# POS
VERB = "VERB"

# NOUNS -
NOUNS_POS = ["NOUN", "PROPN", "ADJ", "PRON"]

PREP_POS = ["ADP"]

DET_POS = ["DET"]

### DEPENDENCY TAG

In [6]:
SUBJECT_DEP = ["nsubj", "nsubjpass", "dobj"]

COMPOUND_DEP = ["compound"]

PREPOSITIONAL_MOD = ["prep"]

OBJ_PREPOSITION = ["pobj"]

RELATIVE_CLAUSE_MOD = ["relcl"]

APPOS = ["appos"]

AND = ["conj"]

In [7]:
for token in doc1:
    print(token.text, token.pos_, token.tag_, token.dep_, token.is_stop)

NameError: name 'doc1' is not defined

In [7]:
for word in doc1:
    if word.pos_ == "VERB" and word.head == word:
        print("({}) - {}".format(word.pos_, word))
        for decendent in word.children:
            print("\t({}) - {} - {}".format(decendent.pos_, decendent.dep_, decendent))

(VERB) - show
	(PROPN) - nsubj - Hanity
	(PUNCT) - punct - .


### Find the ROOT of a sentence

In [8]:
# This will work correctly on a doc object of a sentence
def get_root(sent_doc):
    # There is always a single root for a sentence.
    # This is just a safety guard
    roots = list()
    for word in sent_doc:
        if word.head == word:
            roots.append(word)
    if len(roots) != 1:
        return None
    return roots[0]            

### Find Compound names [ON HOLD]

In [9]:
def get_compound(noun):
    compound_candidates = list()
    compound_candidates.append(noun)
    for child in noun.children:
        if (child.pos_ in NOUNS_POS or child.pos_ in DET_POS) and child.dep_ in COMPOUND_DEP:
            compound_candidates.extend(get_compound(child))
    return compound_candidates

### Find all subjects

In [10]:
def get_nominal_subjects(root):
    subject_list = list()
    for child in root.children:
        if child.pos_ in NOUNS_POS and child.dep_ in SUBJECT_DEP:
            subject_list.append(child)
    return subject_list

### Find the `appos`
Check if there is chain of `VERB --(nsubj)--> NOUN --(appos)--> NOUN`

In [11]:
def get_appos(noun):
    # Assume that there will be only one appos
    for child in noun.children:
        if child.pos_ in NOUNS_POS and child.dep_ in APPOS:
            return child
    return None

### Find the list of `nsubj`

In [12]:
def get_nsubjs(noun):
    conjs = list()
    conjs.append(noun)
    for child in noun.children:
        if child.pos_ in NOUNS_POS and child.dep_ in AND:
            conjs.extend(get_nsubjs(child))
    return conjs

In [27]:
def get_prep(noun):
    # assume there will be only one of these attached to the main noun
    prep = []
    for child in noun.children:
        if child.pos_ in PREP_POS and child.dep_ in PREPOSITIONAL_MOD:
            prep.append(child)
    return prep[-1]

In [14]:
def get_attributes(nominal_subj):
    appos_noun = get_appos(nominal_subj)
    if appos_noun is None:
        return get_nsubjs(nominal_subj)
    else:
        return get_nsubjs(appos_noun)

In [15]:
def get_objects(prep):
    for child in prep.children:
        if child.pos_ in NOUNS_POS and child.dep_ in OBJ_PREPOSITION:
            return child
    return None

In [16]:
def get_table_attr(main_obj):
    # Check if there, relative clause modifier
    relative_verb = None
    # check if 'relcl'
    for child in main_obj.children:
        if child.pos_ in VERB and child.dep_ in RELATIVE_CLAUSE_MOD:
            relative_verb = child
            break
    intermediate_prep = None
    # If there isn't then, check the for prep.
    if relative_verb is None:
        intermediate_prep = get_prep(main_obj)
    else:
        intermediate_prep = get_prep(relative_verb)
    objs = get_objects(intermediate_prep)
    return [main_obj, objs]

In [19]:
text1 = "show John Hanity the names, contact numbers, weight and local city of all the students in the computer science department."
test(text1)

root : show
nsubj : Hanity
Attributes :
[[names], [numbers, contact], [weight], [city]]
Prep : of
Objects : students
All objs : 
[[students], [department, science, computer]]


In [20]:
text2 = "what are the names, contact numbers and so on for all the students in the computer science department?"
test(text2)

root : are
nsubj : names
Attributes :
[[names], [numbers, contact]]
Prep : for
Objects : students
All objs : 
[[students], [department, science, computer]]


In [21]:
text3 = "get me the names, contact numbers and so on for all the students, who are in the computer science department."
test(text3)

root : get
nsubj : names
Attributes :
[[names], [numbers, contact]]
Prep : for
Objects : students
All objs : 
[[students], [department, science, computer]]


In [18]:
def test(text):
    doc = nlp_pipe(text)
    text_root = get_root(doc)
    print("root : {}".format(text_root))
    # I'll just be testing the first subjs
    nominal_subjs = get_nominal_subjects(text_root)
    print("nsubj : {}".format(nominal_subjs[0]))
    if len(nominal_subjs) > 1:
        print("There are more subjs are attached to ROOT")
    attrs = get_attributes(nominal_subjs[0])
    print("Attributes :")
    pprint([get_compound(attr) for attr in attrs])
    main_prep = get_prep(attrs[0])
    print("Prep : {}".format(main_prep))
    main_object = get_objects(main_prep)
    print("Objects : {}".format(main_object))
    all_objs = get_table_attr(main_object)
    print("All objs : ")
    pprint([get_compound(obj) for obj in all_objs])
    show_dep(doc)

In [22]:
def get_attrs_n_objects(text):
    doc = nlp_pipe(text)
    text_root = get_root(doc)
    attributes = get_attributes(get_nominal_subjects(text_root)[0])
    main_prep = get_prep(attributes[0])
    main_object = get_objects(main_prep)
    all_objects = get_table_attr(main_object)
    attributes = [get_compound(attr) for attr in attributes]
    all_objects = [get_compound(obj) for obj in all_objects]
    return attributes, all_objects

In [23]:
def join(list_attrs):
    return [" ".join([a.text for a in attrs]) for attrs in list_attrs]

In [24]:
text1 = "show John Hanity the names, contact numbers, weight and local city of all the students in the computer science department."
attributes, table_info = get_attrs_n_objects(text1)

print("attribututes : ")
pprint(join(attributes))
print("\ntable_info : ")
pprint(join(table_info))

attribututes : 
['names', 'numbers contact', 'weight', 'city']

table_info : 
['students', 'department science computer']


In [26]:
test4= "give me max of all the marks for the students of computer science students above 30"
doc4 = nlp_pipe(test4)
show_dep(doc4)

In [28]:
test(test4)

root : give
nsubj : max
Attributes :
[[max]]
Prep : for
Objects : students
All objs : 


AttributeError: 'NoneType' object has no attribute 'children'