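# Parsing
Examples of syntactic parsing with three libraries:
- NLTK (context-free grammar parsing and regular-expression chunking)
- spaCy (dependency parsing)
- Stanza (dependency parsing)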

In [2]:
# Sample multi-sentence, informal English text. It is the input to
# preprocessText() further down, and individual sentences from it are reused
# in the chunking and dependency-parsing cells.
michelle = """Hi! I'm Michelle and I'm 22.
I really,really like this guy. He's 27 and everything  I like in a guy. We have so much in common.
We met around three and a half months ago. A week after we met, he texted me and we didn't stop talking for a whole month and a half. We talked day and night, sometimes 'til four in the morning.
Then, he started ignoring me. When that started to  happen, a red flag went up in my head, so I started ignoring him, too. Except I started missing him.
Before I started a new semester, I asked him what was the point of saving my number if he wasn't going to ask me out. (Yes, we haven't gone out on a date yet. We've talked about it, but he doesn't make it happen.)
I told him I wasn't going to have enough time for him, and if he really wanted to go out with me, he should make it happen soon rather than later.
I just don't understand why he hasn't asked me out yet. He gives me the money excuse, or the "every time I want to, something else comes up" excuse.
If he wants to see me he should've done so already... right?"""


### NLTK

In [3]:
import nltk
# Toy context-free grammar built from its textual form with nltk.CFG.fromstring.
# Upper-case symbols are non-terminals; quoted strings are the only words the
# grammar accepts. A PP may attach to a VP (VP -> VP PP) or sit inside an NP
# (NP -> Det N PP), so a sentence ending in a PP can receive two analyses.
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

In [4]:
# Pre-tokenized sentence; every token is a terminal of groucho_grammar.
elephant = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
# Chart parser over the toy grammar defined in the previous cell.
parser = nltk.ChartParser(groucho_grammar)

# Print every parse tree the parser yields for the sentence. The output below
# shows two trees: "in my pajamas" attached to the VP, and attached to the NP.
for tree in parser.parse(elephant):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [5]:
import nltk
# Chunk grammar for nltk.RegexpParser. Each rule names a chunk type and gives a
# tag pattern written over POS tags and over chunk types named by other rules
# (PP refers to NP, VP refers to NP/PP/CLAUSE, CLAUSE refers to NP and VP).
# The text after '#' inside the string is part of the grammar string itself.
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
# Chunk parser reused by later cells under the name `cp`.
cp = nltk.RegexpParser(grammar)
# Hand-tagged sentence as (token, POS tag) pairs, so no tagger is involved here.
# NOTE(review): "cat" is tagged NNP rather than NN; it still matches the NN.*
# part of the NP pattern, as the printed NP chunk shows.
mary = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NNP"),
    ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

# Chunk the tagged sentence. In the printed tree "saw/VBD" is left outside any
# VP -- presumably because the VP rule is applied before a CLAUSE chunk exists;
# TODO confirm whether RegexpParser's `loop` argument should be used here.
mary_parse = cp.parse(mary)
print(mary_parse)
    

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NNP)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [6]:
# Tag the pre-tokenized elephant sentence with nltk.pos_tag, then chunk it with
# the RegexpParser `cp` built in the previous cell.
elephant_pos = nltk.pos_tag(elephant)
elephant_parse = cp.parse(elephant_pos)
# In the printed tree only "an elephant" and "pajamas" become NP chunks:
# my/PRP$ is not covered by the NP pattern (DT|JJ|NN.*), so no NP directly
# follows in/IN and no PP is formed.
print(elephant_parse)

(S
  I/PRP
  shot/VBP
  (NP an/DT elephant/NN)
  in/IN
  my/PRP$
  (NP pajamas/NN))


In [7]:
import nltk
#nltk.download('words')
#nltk.download('maxent_ne_chunker')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag

def preprocessSent(sent):
    """Tokenize a single sentence and POS-tag its tokens.

    Args:
        sent: The sentence to process, as a string.

    Returns:
        The tagger's result for the tokenized sentence; the notebook's
        printed outputs show this as a list of (token, tag) pairs.
    """
    # Word-level tokenization feeds straight into the POS tagger.
    return pos_tag(word_tokenize(sent))

def preprocessText(text):
    """Split text into sentences, tokenize each one, and POS-tag the tokens.

    Args:
        text: Raw text (str) that may contain several sentences.

    Returns:
        A list with one entry per sentence, in document order; each entry
        is a list of (token, tag) pairs.
    """
    # Segment text into sentences
    sentences = sent_tokenize(text)
    # Tokenize each sentence
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Part-of-speech tag all sentences in one call: pos_tag_sents obtains the
    # tagger once for the whole batch instead of once per sentence.
    return nltk.pos_tag_sents(tokenized)


In [13]:
# Run the full preprocessing (sentence split -> tokenize -> POS tag) over the
# sample text. The result holds one list of (token, tag) pairs per sentence.
michelle_prep = preprocessText(michelle)
# Prints the complete nested list for the whole text, so the output is long.
print(michelle_prep)

[[('Hi', 'NN'), ('!', '.')], [('I', 'PRP'), ("'m", 'VBP'), ('Michelle', 'NNP'), ('and', 'CC'), ('I', 'PRP'), ("'m", 'VBP'), ('22', 'CD'), ('.', '.')], [('I', 'PRP'), ('really', 'RB'), (',', ','), ('really', 'RB'), ('like', 'IN'), ('this', 'DT'), ('guy', 'NN'), ('.', '.')], [('He', 'PRP'), ("'s", 'VBZ'), ('27', 'CD'), ('and', 'CC'), ('everything', 'NN'), ('I', 'PRP'), ('like', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('guy', 'NN'), ('.', '.')], [('We', 'PRP'), ('have', 'VBP'), ('so', 'RB'), ('much', 'JJ'), ('in', 'IN'), ('common', 'JJ'), ('.', '.')], [('We', 'PRP'), ('met', 'VBD'), ('around', 'IN'), ('three', 'CD'), ('and', 'CC'), ('a', 'DT'), ('half', 'JJ'), ('months', 'NNS'), ('ago', 'RB'), ('.', '.')], [('A', 'DT'), ('week', 'NN'), ('after', 'IN'), ('we', 'PRP'), ('met', 'VBD'), (',', ','), ('he', 'PRP'), ('texted', 'VBD'), ('me', 'PRP'), ('and', 'CC'), ('we', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('stop', 'VB'), ('talking', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('whole', 'JJ'), ('month', '

In [9]:
# Chunk a single tagged sentence with the RegexpParser `cp` defined earlier.
# Index 2 is the third sentence of the sample text ("I really,really like this guy.").
michelle_parse = cp.parse(michelle_prep[2])
# In the printed tree "like" carries the tag IN, so "like this guy" is chunked
# as a PP by the <IN><NP> rule.
print(michelle_parse)
    


(S
  I/PRP
  really/RB
  ,/,
  really/RB
  (PP like/IN (NP this/DT guy/NN))
  ./.)


###  SPACY

In [10]:
import spacy
from spacy import displacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse. Exactly one assignment is active; uncomment one of the
# alternatives (and comment out the active line) to try another input.
#sentence = "John loves Mary."
#sentence = 'Deemed universities charge huge fees'
#sentence = 'I really,really like this guy.'
#sentence = 'We have so much in common .'
#sentence = 'When that started to  happen, a red flag went up in my head.'
sentence = 'I have a red car.'

# nlp function returns an object with individual token information,
# linguistic features and relationships
doc = nlp(sentence)

# One shared row layout keeps the header and the data rows aligned.
row_format = "{:<15} | {:<8} | {:<15} | {:<20}"
print(row_format.format('Token', 'Relation', 'Head', 'Children'))
print("-" * 70)

for token in doc:
    # Print the token, dependency nature, head and all dependents of the token.
    # The children are materialized into a list and converted to a string so
    # the column shows them in bracketed form, e.g. [a, red].
    print(row_format.format(token.text, token.dep_, token.head.text,
                            str(list(token.children))))

# Use displaCy to visualize the dependency tree inline in the notebook
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

Token           | Relation | Head            | Children            
----------------------------------------------------------------------
I               | nsubj    | have            | []                  
have            | ROOT     | have            | [I, car, .]         
a               | det      | car             | []                  
red             | amod     | car             | []                  
car             | dobj     | have            | [a, red]            
.               | punct    | have            | []                  


### STANZA

In [11]:
import stanza

# Download the language model
stanza.download('en')

sentence = 'A week after we met, he texted me and we didn\'t stop talking for a whole month and a half.'
#sentence = 'I have a red car.'

# Build a Neural Pipeline
nlp = stanza.Pipeline('en', processors = "tokenize,mwt,pos,lemma,depparse")

# Pass the sentence through the pipeline
doc = nlp(sentence)

# Print the dependencies of the first sentence in the doc object
# Format - (Token, Index of head, Nature of dependency)
# Index starts from 1, 0 is reserved for ROOT
doc.sentences[0].print_dependencies()


# One shared row layout keeps the header and the data rows aligned.
row_format = "{:<15} | {:<10} | {:<15} "
print(row_format.format('Token', 'Relation', 'Head'))
print("-" * 50)

# Convert sentence object to dictionary
sent_dict = doc.sentences[0].to_dict()

# Keep only the entries that carry an integer head, i.e. the syntactic words.
# The head value is a 1-based position among those words, so the lookup below
# must index a list that contains nothing else. Any other record in sent_dict
# would shift the positions or lack 'deprel'.
# NOTE(review): the pipeline requests `mwt`; multi-word-token records are the
# expected source of such extra entries -- confirm against the installed
# Stanza version.
word_entries = [entry for entry in sent_dict if isinstance(entry.get('head'), int)]

# iterate to print the token, relation and head
for word in word_entries:
    # head == 0 marks the root of the sentence; otherwise resolve the head's text
    head_text = word_entries[word['head'] - 1]['text'] if word['head'] > 0 else 'ROOT'
    print(row_format.format(str(word['text']), str(word['deprel']), str(head_text)))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 40.1MB/s]                    
2021-10-21 12:43:03 INFO: Downloading default packages for language: en (English)...
2021-10-21 12:43:04 INFO: File exists: /users/kent/slee122/stanza_resources/en/default.zip.
2021-10-21 12:43:09 INFO: Finished downloading models and saved to /users/kent/slee122/stanza_resources.
2021-10-21 12:43:09 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-10-21 12:43:09 INFO: Use device: cpu
2021-10-21 12:43:09 INFO: Loading: tokenize
2021-10-21 12:43:09 INFO: Loading: pos
2021-10-21 12:43:10 INFO: Loading: lemma
2021-10-21 12:43:10 INFO: Loading: depparse
2021-10-21 12:43:10 INFO: Done loading processors!


('A', 2, 'det')
('week', 5, 'obl:npmod')
('after', 5, 'mark')
('we', 5, 'nsubj')
('met', 8, 'advcl')
(',', 8, 'punct')
('he', 8, 'nsubj')
('texted', 0, 'root')
('me', 8, 'obj')
('and', 14, 'cc')
('we', 14, 'nsubj')
('did', 14, 'aux')
("n't", 14, 'advmod')
('stop', 8, 'conj')
('talking', 14, 'xcomp')
('for', 19, 'case')
('a', 19, 'det')
('whole', 19, 'amod')
('month', 15, 'obl')
('and', 22, 'cc')
('a', 22, 'det')
('half', 19, 'conj')
('.', 8, 'punct')
Token           | Relation   | Head            
--------------------------------------------------
A               | det        | week            
week            | obl:npmod  | met             
after           | mark       | met             
we              | nsubj      | met             
met             | advcl      | texted          
,               | punct      | texted          
he              | nsubj      | texted          
texted          | root       | ROOT            
me              | obj        | texted          
and           