In [2]:

import spacy
import time
import nltk

In [3]:
### Spacy load ###
begin=time.time()

nlp = spacy.load('en')
begin=time.time()
doc = nlp('I like green eggs and ham.A woman is walking through the door.')

#for np in doc.noun_chunks:
#    print(np.text, np.root.text, np.root.dep_, np.root.head.text)
    # I I nsubj like
    # green eggs eggs dobj like
    # ham ham conj eggs


In [19]:
type(doc)

spacy.tokens.doc.Doc

In [6]:
### tokenization ###
begin=time.time()
doc = nlp('I like green eggs and ham.A woman is walking through the door.')
tokenlist=[]
for token in doc:
    tokenlist.append(token)
print(time.time()-begin)
tokenlist

0.0014889240264892578


[I,
 like,
 green,
 eggs,
 and,
 ham,
 .,
 A,
 woman,
 is,
 walking,
 through,
 the,
 door,
 .]

In [5]:
begin=time.time()
token=nltk.word_tokenize('I like green eggs and ham.A woman is walking through the door.')
print(time.time()-begin)
token

0.014837026596069336


['I',
 'like',
 'green',
 'eggs',
 'and',
 'ham.A',
 'woman',
 'is',
 'walking',
 'through',
 'the',
 'door',
 '.']

In [7]:
### Code to find the number of tokens longest common sequence ### 
def lcs(xstr, ystr):
    if not xstr or not ystr:
        return 0
    x, xs, y, ys = xstr[0], xstr[1:], ystr[0], ystr[1:]
    if x == y:
        return 1 + lcs(xs, ys)
    else:
        return max(lcs(xstr, ys), lcs(xs, ystr))  

In [9]:
begin=time.time()
doc = nlp('I like green eggs and ham.A woman is walking through the door.')
lcs_num=lcs(doc,doc)
print(time.time()-begin)

lcs_num

0.0028388500213623047


15

In [10]:
begin=time.time()
token=nltk.word_tokenize('I like green eggs and ham. A woman is walking through the door.')
lcs_num=lcs(token,token)
print(time.time()-begin)
lcs_num

0.002022981643676758


15

In [11]:
### edit_distance ###

token2=nltk.word_tokenize('I dont like red eggs and ham. A man is walking through the door.')
nltk.edit_distance(token,token2)

3

In [12]:
# Get first token of the processed document
token = doc[0]
print(token)

# Print sentences (one sentence per line)
for sent in doc.sents:
    print(sent)

I
I like green eggs and ham.
A woman is walking through the door.


In [13]:
# For each token, print corresponding part of speech tag
for token in doc:
    print('{} - {}'.format(token, token.pos_))

I - PRON
like - VERB
green - ADJ
eggs - NOUN
and - CCONJ
ham - NOUN
. - PUNCT
A - DET
woman - NOUN
is - VERB
walking - VERB
through - ADP
the - DET
door - NOUN
. - PUNCT


In [14]:
### dependency parsing ###
#Write a function that walks up the syntactic tree of the given token and collects all tokens to the root token (including root token).

def tokens_to_root(token):
    """
    Walk up the syntactic tree, collecting tokens to the root of the given `token`.
    :param token: Spacy token
    :return: list of Spacy tokens
    """
    tokens_to_r = []
    while token.head is not token:
        tokens_to_r.append(token)
        token = token.head
        tokens_to_r.append(token)

    return tokens_to_r

# For every token in document, print it's tokens to the root
for token in doc:
    print('{} --> {}'.format(token, tokens_to_root(token)))

# Print dependency labels of the tokens
for token in doc:
    print('-> '.join(['{}-{}'.format(dependent_token, dependent_token.dep_) for dependent_token in tokens_to_root(token)]))

I --> [I, like]
like --> []
green --> [green, eggs, eggs, like]
eggs --> [eggs, like]
and --> [and, eggs, eggs, like]
ham --> [ham, eggs, eggs, like]
. --> [., like]
A --> [A, woman, woman, walking]
woman --> [woman, walking]
is --> [is, walking]
walking --> []
through --> [through, walking]
the --> [the, door, door, through, through, walking]
door --> [door, through, through, walking]
. --> [., walking]
I-nsubj-> like-ROOT

green-amod-> eggs-dobj-> eggs-dobj-> like-ROOT
eggs-dobj-> like-ROOT
and-cc-> eggs-dobj-> eggs-dobj-> like-ROOT
ham-conj-> eggs-dobj-> eggs-dobj-> like-ROOT
.-punct-> like-ROOT
A-det-> woman-nsubj-> woman-nsubj-> walking-ROOT
woman-nsubj-> walking-ROOT
is-aux-> walking-ROOT

through-prep-> walking-ROOT
the-det-> door-pobj-> door-pobj-> through-prep-> through-prep-> walking-ROOT
door-pobj-> through-prep-> through-prep-> walking-ROOT
.-punct-> walking-ROOT


In [18]:
# Print all named entities with named entity types
doc_1 = nlp('WHO IS JAY?')
doc_2 = nlp('Who is jay?')
doc_3=nlp('Hpp is Happy')
doc_4=nlp(u'Xiaoming birthday is May 3rd')
for ent in doc_2.ents:
    print('DOC 2')
    print('{} - {}'.format(ent, ent.label_))
for ent in doc_1.ents:
    print('DOC 1')
    print('{} - {}'.format(ent, ent.label_))
for ent in doc_3.ents:
    print('DOC 3')
    print('{} - {}'.format(ent, ent.label_))
for ent in doc_4.ents:
    print('DOC 4')
    print('{} - {}'.format(ent, ent.label_))    
       
len(doc_1.ents)


DOC 1
JAY - PERSON
DOC 4
May 3rd - DATE


1

In [24]:
# Print noun chunks for doc_2
print([chunk for chunk in doc_2.noun_chunks])

[I, Paris, I, my old friend, uni]


In [25]:
# For every token in doc_2, print log-probability of the word, estimated from counts from a large corpus 
for token in doc_2:
    print(token, ',', token.prob)

I , -4.064180850982666
went , -8.474893569946289
to , -3.83851957321167
Paris , -11.6917724609375
where , -7.183883190155029
I , -4.064180850982666
met , -9.784490585327148
my , -5.918124675750732
old , -7.7954816818237305
friend , -8.825821876525879
Jack , -11.20296573638916
from , -6.028810501098633
uni , -19.579313278198242
. , -3.0729479789733887


In [20]:
### similarity ###
# For a given document, calculate similarity between 'apples' and 'oranges' and 'boots' and 'hippos'
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
print(apples.similarity(oranges))
print(boots.similarity(hippos))

print()
# Print similarity between sentence and word 'fruit'
apples_sent, boots_sent = doc.sents
fruit = doc.vocab[u'fruit']
print(apples_sent.similarity(fruit))
print(boots_sent.similarity(fruit))
print(boots_sent.similarity(apples_sent))

0.0
0.0

0.569403101179
0.323890751106
0.832116326852


### Example of nltk dependency grammar 
#### not useful for dependency parsing

In [29]:
import nltk


In [30]:
groucho_dep_grammar = nltk.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)

Dependency grammar with 7 productions
  'shot' -> 'I'
  'shot' -> 'elephant'
  'shot' -> 'in'
  'elephant' -> 'an'
  'elephant' -> 'in'
  'in' -> 'pajamas'
  'pajamas' -> 'my'


In [31]:
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
     print(tree)

(shot I (elephant an (in (pajamas my))))
(shot I (elephant an) (in (pajamas my)))
