# Spacy Introduction

Following https://github.com/cytora/pycon-nlp-in-10-lines/blob/master/00_spacy_intro.ipynb

In [1]:
# Import spacy and English models
import spacy
import numpy

nlp = spacy.load('en')

In [2]:
doc_2 = nlp(u"I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    print('{} - {}'.format(ent, ent.label_))

Paris - GPE
Jack - PERSON


In [10]:
# Read in data - sample 1

sample = open('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\President Trump- State Visit 2017-02-20.txt', 'r', encoding='utf8').read()

sample

'President Trump: State Visit\n\n16:30:00\n\nMr Charles Walker (in the Chair)\nThis is a very over-subscribed debate. If all hon. Members stick to five minutes and do not take too many, if any, interventions, everybody should get in.\n\nI remind those in the Public Gallery that this is a Chamber of the House of Commons. By all means listen and observe, but if there is any off-stage noise, I will suspend the sitting and clear the Public Gallery.\n\n16:31:00\n\nPaul Flynn (Newport West) (Lab)\nI beg to move,\n\nThat this House has considered e-petitions 171928 and 178844 relating to a state visit by President Donald Trump.\n\nIt is a pleasure to serve under the chairmanship of such a distinguished parliamentarian, Mr Walker. I thank the Petitions Committee for allowing me to introduce the petitions. There has been a great deal of misunderstanding about their nature. One of them, which has been signed by more than 300,000 people, states:\n\n“Donald Trump should be invited to make an offic

In [11]:
doc = nlp(sample)
for ent in doc.ents:
    print('{} - {}'.format(ent, ent.label_))

Trump - PERSON
16:30:00 - TIME
Charles Walker - PERSON
Chair - ORG
five minutes - TIME
the Public Gallery - ORG
the Public Gallery - ORG
16:31:00 - TIME


Paul Flynn - PERSON
Newport West - GPE
House - ORG
171928 - CARDINAL
178844 - DATE
Donald Trump - PERSON
Mr Walker - PERSON
the Petitions Committee - ORG
One - CARDINAL
more than 300,000 - CARDINAL
Donald Trump - PERSON
State - ORG
U.K. - GPE
1,850,000 - CARDINAL
a few days - DATE
Donald Trump - PERSON
UK - GPE
the US Government - ORG
State - ORG
Queen - PERSON
first - ORDINAL
Trump - PERSON
recent days - DATE
24 hours - TIME
seven days - DATE
Only two - CARDINAL
the United States - GPE
1952 - DATE
seven days - DATE
Trump - PERSON
the United States’s - GPE
one - CARDINAL
the United States - GPE
Alex Salmond - PERSON
Gordon - PERSON
SNP - ORG

Does the hon - WORK_OF_ART
seven days - DATE
Trump - PERSON
Paul Flynn - PERSON
Europe - LOC
Brexit - GPE
Brexit - GPE
Europe - LOC
Lithuania - GPE
Trump - PERSON
Mark Pritchard - PERSON
Wrekin 

In [5]:
sample2 = "please remove my name and information from the registered user list.  Do not \nsell my information.\n\nPhillip Allen"

doc2 = nlp(sample2)
for ent in doc2.ents:
    print('{} - {}'.format(ent, ent.label_))

Phillip Allen - PERSON


In [12]:
scores = numpy.zeros((len(doc), nlp.entity.model.nr_class))
with nlp.entity.step_through(doc) as state:
    while not state.is_final:
        action = state.predict()
        next_tokens = state.queue
        scores[next_tokens[0].i] = state.scores
        state.transition(action)

AttributeError: 'spacy.syntax.parser.ParserModel' object has no attribute 'nr_class'

In [36]:
doc.sentiment

0.0

In [16]:
nlp.entity.model.class?

Object `nlp.entity.model.class` not found.


## Word embedding / similarity

In [8]:
# For a given document, calculate similarity between 'apples' and 'oranges' and 'boots' and 'hippos'
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
print(apples.similarity(oranges))
print(boots.similarity(hippos))

print()
print("Print similarity between sentence and word 'fruit'")
apples_sent, boots_sent = doc.sents
fruit = doc.vocab[u'fruit']
print(apples_sent.similarity(fruit))
print(boots_sent.similarity(fruit))

0.0
0.0

Print similarity between sentence and word 'fruit'
0.569403188405
0.323890895482


In [29]:
tok1 = nlp(u"King")
tok2 = nlp(u"Queen")
print(tok1.similarity(tok2))

0.725261034541


In [30]:
tok1 = nlp(u"MND")
tok2 = nlp(u"MND Assocation")
print(tok1.similarity(tok2))

0.0


In [31]:
tok1 = nlp(u"Trump")
tok2 = nlp(u"Donald Trump")
print(tok1.similarity(tok2))

0.0


# Spacy intro to NLP

https://nicschrading.com/project/Intro-to-NLP-with-spaCy/


In [11]:
# Imports

import math

In [3]:
# Set up spaCy
from spacy.en import English
parser = English()

# Test Data
multiSentence = "There is an art, it says, or rather, a knack to flying." \
                 "The knack lies in learning how to throw yourself at the ground and miss." \
                 "In the beginning the Universe was created. This has made a lot of people "\
                 "very angry and been widely regarded as a bad move."

multiSentence

'There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.'

" spaCy does tokenization, sentence recognition, part of speech tagging, lemmatization, dependency parsing, and named entity recognition all at once!"

In [4]:
# all you have to do to parse text is this:
#note: the first time you run spaCy in a file it takes a little while to load up its modules
parsedData = parser(multiSentence)

# Let's look at the tokens
# All you have to do is iterate through the parsedData
# Each token is an object with lots of different properties
# A property with an underscore at the end returns the string representation
# while a property without the underscore returns an index (int) into spaCy's vocabulary
# The probability estimate is based on counts from a 3 billion word
# corpus, smoothed using the Simple Good-Turing method.
for i, token in enumerate(parsedData):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 1:
        break


original: 769 There
lowercased: 608 there
lemma: 608 there
shape: 684 Xxxxx
prefix: 568 T
suffix: 609 ere
log probability: -7.277902603149414
Brown cluster id: 1918
----------------------------------------
original: 513 is
lowercased: 513 is
lemma: 536 be
shape: 505 xx
prefix: 509 i
suffix: 513 is
log probability: -4.3297648429870605
Brown cluster id: 762
----------------------------------------
original: 591 an
lowercased: 591 an
lemma: 591 an
shape: 505 xx
prefix: 506 a
suffix: 591 an
log probability: -5.953293800354004
Brown cluster id: 3
----------------------------------------


In [6]:
# Let's look at the sentences
sents = []
# the "sents" property returns spans
# spans have indices into the original string
# where each index value represents a token
for span in parsedData.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)

for sentence in sents:
    print(sentence)

There is an art, it says, or rather, a knack to flying.
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created.
This has made a lot of people very angry and been widely regarded as a bad move.


In [8]:
# Let's look at the part of speech tags of the first sentence
for span in parsedData.sents:
    sent = [parsedData[i] for i in range(span.start, span.end)]
    break

for token in sent:
    print(token.orth_, token.pos_)

There ADV
is VERB
an DET
art NOUN
, PUNCT
it PRON
says VERB
, PUNCT
or CCONJ
rather ADV
, PUNCT
a DET
knack NOUN
to ADP
flying NOUN
. PUNCT


In [12]:
# Let's look at the dependencies of this example:
example = "The boy with the spotted dog quickly ran after the firetruck."
parsedEx = parser(example)
# shown as: original token, dependency tag, head word, left dependents, right dependents
for token in parsedEx:
    print(token.orth_, token.dep_, token.head.orth_, math.exp(token.prob), [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])

The det boy 0.003106612502375516 [] []
boy nsubj ran 4.882202263418575e-05 ['The'] ['with']
with prep boy 0.004683241583011624 [] []
the det dog 0.03253477391428923 [] []
spotted amod dog 3.139151557367171e-09 [] []
dog nsubj ran 9.102749578361624e-05 ['the', 'spotted'] []
quickly advmod ran 6.586847992105393e-05 [] []
ran ROOT ran 4.652857862703259e-05 ['boy', 'dog', 'quickly'] ['after', '.']
after prep ran 0.000545223816758833 [] ['firetruck']
the det firetruck 0.03253477391428923 [] []
firetruck pobj after 3.139151557367171e-09 ['the'] []
. punct ran 0.04628450778430217 [] []
