In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import tree
from IPython.display import SVG
from IPython.display import display
from subprocess import call
import pydotplus
from sklearn.externals.six import StringIO
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import pylab as p1



In [1]:
import spacy

In [2]:
# Load the 'en_core_web_lg' English model; `nlp` is the pipeline object that
# turns raw text into processed Doc objects in the cells below.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Sample paragraph used by the examples below. The line breaks (and the double
# space after "Kingdom.") are part of the string, so they show up as whitespace tokens.
text = """London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

In [4]:
# Run the pipeline on the sample text; `doc` is the processed Doc used by the later cells.
doc = nlp(text)

In [10]:
# Print the text held by the Doc (its .text attribute)
print(doc.text)

London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.



In [16]:
# Put a rule-based sentencizer at the front of the pipeline.
# Only remove an existing sentencizer: a freshly loaded model may not contain
# one, and removing a missing component fails.
if 'sentencizer' in nlp.pipe_names:
    nlp.remove_pipe('sentencizer')
# Build the component in this cell so the cell does not depend on a variable
# that is only defined further down the notebook.
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd, first=True)

In [18]:
# Sentence-based tokenization.
# create_pipe only builds the sentencizer component; the sentences iterated
# below come from whatever components are in the pipeline when nlp(text) runs.
sbd = nlp.create_pipe('sentencizer')
doc1 = nlp(text)

# Collect the sentence spans so they can be reused, then print each one.
sents_list = list(doc1.sents)
for sen in sents_list:
    print(sen)

London is the capital and most populous city of England and 
the United Kingdom.
 Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia.
It was founded by the Romans, who named it Londinium.




In [23]:
# Drop stop words with spaCy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS  # English stop-word collection (not used below)
# Token.is_stop is a boolean attribute: True when the token is a stop word.
# Punctuation and whitespace tokens are not stop words, so they are kept.
filtered_sen = [tok for tok in doc if not tok.is_stop]
print(filtered_sen)


[London, capital, populous, city, England, 
, United, Kingdom, .,  , Standing, River, Thames, south, east, 
, island, Great, Britain, ,, London, major, settlement, 
, millennia, ., founded, Romans, ,, named, Londinium, ., 
]


In [24]:
# Lemmatization: show every token next to its lemma (base form)
for tok in doc:
    print(tok.text, tok.lemma_)

London London
is be
the the
capital capital
and and
most most
populous populous
city city
of of
England England
and and

 

the the
United United
Kingdom Kingdom
. .
   
Standing stand
on on
the the
River River
Thames Thames
in in
the the
south south
east east

 

of of
the the
island island
of of
Great Great
Britain Britain
, ,
London London
has have
been be
a a
major major
settlement settlement

 

for for
two two
millennia millennium
. .
It -PRON-
was be
founded found
by by
the the
Romans Romans
, ,
who who
named name
it -PRON-
Londinium Londinium
. .

 



In [28]:
# Visual rendering of the named entities found in the Doc
from spacy import displacy
# style='ent' selects the entity visualiser; jupyter=True renders the result inline in the notebook
displacy.render(doc,style='ent',jupyter=True)

In [11]:
# Slicing a Doc by token index
first_sentence = doc[0:10]    # tokens 0-9
second_sentence = doc[10:20]  # tokens 10-19
# NOTE: these are fixed-size token windows, not real sentence boundaries,
# despite the variable names.
for span in (first_sentence, second_sentence):
    print(span.text)

London is the capital and most populous city of England
and 
the United Kingdom.  Standing on the


In [14]:
# Lexical attributes: find the words that occur right before punctuation.
# (The same idea carries over to other token flags, e.g. numbers.)
for token in doc[:-1]:  # skip the last token: it has no following token to inspect
    next_token = doc[token.i + 1]  # token.i is the token's index within the Doc
    if token.is_alpha and next_token.is_punct:
        print("Last word found: ", token.text)

Last word found:  Kingdom
Last word found:  Britain
Last word found:  millennia
Last word found:  Romans
Last word found:  Londinium


In [15]:
# Named entities are "real world objects" that have a name, e.g. a person,
# an organization or a country.
#
# doc.ents exposes the entities predicted by the model as Span objects, so each
# one offers its text and, through the label_ attribute, its entity label.
#
# For this text the model tags "London" and "the United Kingdom" as
# geopolitical entities (GPE) and "two millennia" as a DATE.
for ent in doc.ents:
    print(ent.text, ent.label_)
    

London GPE
England GPE
the United Kingdom GPE
the River Thames LOC
Great Britain GPE
London GPE
two millennia DATE
Romans NORP
Londinium LOC


In [16]:
# Same idea as the entity example: print each token with its part-of-speech tag
for tok in doc:
    print(tok.text, tok.pos_)

London PROPN
is VERB
the DET
capital NOUN
and CCONJ
most ADV
populous ADJ
city NOUN
of ADP
England PROPN
and CCONJ

 SPACE
the DET
United PROPN
Kingdom PROPN
. PUNCT
  SPACE
Standing VERB
on ADP
the DET
River PROPN
Thames PROPN
in ADP
the DET
south ADJ
east NOUN

 SPACE
of ADP
the DET
island NOUN
of ADP
Great PROPN
Britain PROPN
, PUNCT
London PROPN
has VERB
been VERB
a DET
major ADJ
settlement NOUN

 SPACE
for ADP
two NUM
millennia NOUN
. PUNCT
It PRON
was VERB
founded VERB
by ADP
the DET
Romans PROPN
, PUNCT
who PRON
named VERB
it PRON
Londinium PROPN
. PUNCT

 SPACE


In [30]:
# Besides part-of-speech tags, the model also predicts how words relate to each
# other, e.g. whether a word is the subject of the sentence or an object.
#
# dep_  -> the predicted dependency label
# head  -> the syntactic head token, i.e. the parent token this word attaches to
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.head.text)


London PROPN nsubj is
is VERB ROOT is
the DET det capital
capital NOUN attr is
and CCONJ cc capital
most ADV advmod populous
populous ADJ amod city
city NOUN conj capital
of ADP prep city
England PROPN pobj of
and CCONJ cc city

 SPACE  and
the DET det Kingdom
United PROPN compound Kingdom
Kingdom PROPN conj city
. PUNCT punct is
  SPACE  .
Standing VERB advcl been
on ADP prep Standing
the DET det Thames
River PROPN compound Thames
Thames PROPN pobj on
in ADP prep Standing
the DET det east
south ADJ amod east
east NOUN pobj in

 SPACE  east
of ADP prep east
the DET det island
island NOUN pobj of
of ADP prep island
Great PROPN compound Britain
Britain PROPN pobj of
, PUNCT punct been
London PROPN nsubj been
has VERB aux been
been VERB ROOT been
a DET det settlement
major ADJ amod settlement
settlement NOUN attr been

 SPACE  settlement
for ADP prep settlement
two NUM nummod millennia
millennia NOUN pobj for
. PUNCT punct been
It PRON nsubjpass founded
was VERB auxpass founded
founded VE

In [31]:
# Visual rendering of the dependency parse for a single sentence
text1 = "I am rather disappointed by Laurent Koscielny's decision to leave Arsenal in this fashion"
doc2 = nlp(text1)
# jupyter takes a boolean flag (True renders the parse inline in the notebook)
displacy.render(doc2, style='dep', jupyter=True)

In [20]:
# Quick tip: spacy.explain() returns a short description for the most common
# tags and labels. It works for POS tags, dependency labels and entity types.
print(spacy.explain('pobj'))
# Bare last expression: shown as the cell's output value rather than printed.
spacy.explain('GPE')

object of preposition


'Countries, cities, states'

## Rule-Based Matching
Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.

It's also more flexible: you can search for texts but also other lexical attributes.

You can even write rules that use the model's predictions.

For example, find the word "duck" only if it's a verb, not a noun.

Match patterns are lists of dictionaries. Each dictionary describes one token. The keys are the names of token attributes, mapped to their expected values.

In this example, we're looking for two tokens with the text "iPhone" and "X".

We can also match on other token attributes. Here, we're looking for two tokens whose lowercase forms equal "iphone" and "x".

We can even write patterns using attributes predicted by the model. Here, we're matching a token with the lemma "buy", plus a noun. The lemma is the base form, so this pattern would match phrases like "buying milk" or "bought flowers".

Match exact token texts

[{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
Match lexical attributes
[{'LOWER': 'iphone'}, {'LOWER': 'x'}]
Match any token attributes
[{'LEMMA': 'buy'}, {'POS': 'NOUN'}]

To use a pattern, we first import the matcher from spacy dot matcher.

We also load a model and create the nlp object.

The matcher is initialized with the shared vocabulary, nlp dot vocab. You'll learn more about this later – for now, just remember to always pass it in.

The matcher dot add method lets you add a pattern. The first argument is a unique ID to identify which pattern was matched. The second argument is an optional callback. We don't need one here, so we set it to None. The third argument is the pattern.

To match the pattern on a text, we can call the matcher on any doc.

This will return the matches.

When you call the matcher on a doc, it returns a list of tuples.

Each tuple consists of three values: the match ID, the start index and the end index of the matched span.

This means we can iterate over the matches and create a Span object: a slice of the doc at the start and end index.

Here's an example of a more complex pattern using lexical attributes.

We're looking for five tokens:

A token consisting of only digits.

Three case-insensitive tokens for "fifa", "world" and "cup".

And a token that consists of punctuation.

The pattern matches the tokens "2018 FIFA World Cup:".

In this example, we're looking for two tokens:

A verb with the lemma "love", followed by a noun.

This pattern will match "loved dogs" and "love cats".

Operators and quantifiers let you define how often a token should be matched. They can be added using the "OP" key.

Here, the "?" operator makes the determiner token optional, so it will match a token with the lemma "buy", an optional article and a noun.

"OP" can have one of four values:

An "!" negates the token, so it's matched 0 times.

A "?" makes the token optional, and matches it 0 or 1 times.

A "+" matches a token 1 or more times.

And finally, an "*" matches 0 or more times.

Operators can make your patterns a lot more powerful, but they also add more complexity – so use them wisely.

| Example | Description |
| --- | --- |
| `{'OP': '!'}` | Negation: match 0 times |
| `{'OP': '?'}` | Optional: match 0 or 1 times |
| `{'OP': '+'}` | Match 1 or more times |
| `{'OP': '*'}` | Match 0 or more times |

In [55]:
# Practice rule-based matching
from spacy.matcher import Matcher
# The Matcher is initialised with the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)

In [57]:
# Pattern: a verb token immediately followed by an adposition (ADP)
pattern = [{'POS': 'VERB'}, {'POS': 'ADP'}]
matcher.add('UNITED_KINGDOM_MATCHER', None, pattern)  # second argument: no callback
matches = matcher(doc)
# Each match is a (match_id, start, end) triple; doc[start:end] is the matched span
for match_id, begin, stop in matches:
    print(doc[begin:stop].text)

Standing on
founded by


To get the hash for a string, we can look it up in nlp dot vocab dot strings.

To get the string representation of a hash, we can look up the hash.

A Doc object also exposes its vocab and strings.

In [59]:
# Look up the hash value stored for the string 'London'
print(nlp.vocab.strings['London'])

5392354317538386956


In [60]:
# Reverse lookup: turn the hash back into its string
print(nlp.vocab.strings[5392354317538386956])

London


In [62]:
# The string store is also reachable through the Doc's own vocab
print(doc.vocab.strings['United'])

13226800834791099135


## Check OneNote for infographic on Lexemes

## Semantic Similarity

Here's an example. Let's say we want to find out whether two documents are similar.

First, we load the medium English model, "en_core_web_md".

We can then create two doc objects and use the first doc's similarity method to compare it to the second.

Here, a fairly high similarity score of 0.86 is predicted for "I like fast food" and "I like pizza".

The same works for tokens.

According to the word vectors, the tokens "pizza" and "pasta" are kind of similar, and receive a score of 0.7.

You can also use the similarity methods to compare different types of objects.

For example, a document and a token.

Here, the similarity score is pretty low and the two objects are considered fairly dissimilar.

Here's another example comparing a span – "pizza and pasta" – to a document about McDonalds.

The score returned here is 0.61, so it's determined to be kind of similar.

But how does spaCy do this under the hood?

Similarity is determined using word vectors, multi-dimensional representations of meanings of words.

You might have heard of Word2Vec, which is an algorithm that's often used to train word vectors from raw text.

Vectors can be added to spaCy's statistical models.

By default, the similarity returned by spaCy is the cosine similarity between two vectors – but this can be adjusted if necessary.

Vectors for objects consisting of several tokens, like the Doc and Span, default to the average of their token vectors.

That's also why you usually get more value out of shorter phrases with fewer irrelevant words.

Predicting similarity can be useful for many types of applications. For example, to recommend a user similar texts based on the ones they have read. It can also be helpful to flag duplicate content, like posts on an online platform.

However, it's important to keep in mind that there's no objective definition of what's similar and what isn't. It always depends on the context and what your application needs to do.

Here's an example: spaCy's default word vectors assign a very high similarity score to "I like cats" and "I hate cats". This makes sense, because both texts express sentiment about cats. But in a different application context, you might want to consider the phrases as very dissimilar, because they talk about opposite sentiments.

To give you an idea of what those vectors look like, here's an example.

First, we load the medium model again, which ships with word vectors.

Next, we can process a text and look up a token's vector using the dot vector attribute.

The result is a 300-dimensional vector of the word "banana".

### Intuitive explanation for word vector
Word Vector Representation
When we’re looking at words alone, it’s difficult for a machine to understand connections that a human would understand immediately. Engine and car, for example, have what might seem like an obvious connection (cars run using engines), but that link is not so obvious to a computer.

Thankfully, there’s a way we can represent words that captures more of these sorts of connections. A word vector is a numeric representation of a word that communicates its relationship to other words.

Each word is interpreted as a unique and lengthy array of numbers. You can think of these numbers as being something like GPS coordinates. GPS coordinates consist of two numbers (latitude and longitude), and if we saw two sets of GPS coordinates that were numerically close to each other (like 43,-70, and 44,-70), we would know that those two locations were relatively close together. Word vectors work similarly, although there are a lot more than two coordinates assigned to each word, so they’re much harder for a human to eyeball.

Using spaCy’s en_core_web_lg model (the one loaded in this notebook), let’s take a look at the length of a vector for a single word, and what that vector looks like using .vector and .shape.


In [65]:
# Word vector of the token at index 1 of the Doc
print(doc[1].vector)

[-8.4961e-02  5.0200e-01  2.3823e-03 -1.6755e-01  3.0721e-01 -2.3762e-01
  1.6069e-01 -3.6786e-01 -5.8347e-02  2.4990e+00 -2.3647e-03  1.0732e-02
 -3.0422e-01  8.4579e-02 -4.0299e-02 -4.1562e-01 -2.4494e-02  1.4691e+00
 -5.2932e-02 -7.4413e-02 -3.9244e-01 -3.2535e-01 -2.2333e-01  5.6823e-03
  3.5675e-01  1.9445e-01  5.6762e-02 -4.5502e-02 -2.8105e-01 -5.8896e-02
 -9.8626e-02  9.2177e-02  3.3172e-01 -3.9967e-02 -1.1766e-01  4.8373e-02
 -6.2241e-02 -1.0413e-01  9.9263e-04 -4.8925e-01  3.4786e-01  3.2724e-01
  1.3882e-01 -1.9917e-01  1.2995e-01  6.0549e-02 -2.3714e-01 -5.1295e-01
 -3.7396e-01  1.2902e-01  5.5797e-02  3.3444e-01 -1.8025e-01 -3.4740e-02
  2.8323e-01 -9.5301e-02  2.1143e-01 -7.6149e-02  1.5069e-01 -1.7441e-01
 -7.4768e-03 -7.8287e-02 -1.2751e-01  2.2545e-01  3.5101e-02 -6.1015e-01
 -2.6812e-01  6.1632e-02 -3.0503e-01 -1.3405e-01 -4.4271e-01 -1.7720e-01
  1.7663e-01 -3.1210e-01 -2.5722e-01 -2.4858e-02  7.2504e-02 -7.9759e-02
 -1.9214e-01  5.9602e-01  1.2880e-01 -7.4629e-02 -1

In [66]:
# Second sample text, used for the document-similarity comparison below
text1 = """London is the capital and most populous city of England and 
the United Kingdom. Arsenal is based out of this great city.
"""

In [67]:
# Process the second text into its own Doc
doc1 = nlp(text1)

In [68]:
# Compare the two Docs; prints a single similarity score
print(doc1.similarity(doc))

0.9648870102443627


Statistical models are useful if your application needs to be able to generalize based on a few examples.

For instance, detecting product or person names usually benefits from a statistical model. Instead of providing a list of all person names ever, your application will be able to predict whether a span of tokens is a person name. Similarly, you can predict dependency labels to find subject/object relationships.

To do this, you would use spaCy's entity recognizer, dependency parser or part-of-speech tagger.

Rule-based approaches on the other hand come in handy if there's a more or less finite number of instances you want to find. For example, all countries or cities of the world, drug names or even dog breeds.

In spaCy, you can achieve this with custom tokenization rules, as well as the matcher and phrase matcher.

Here's an example of a matcher rule for "golden retriever".

If we iterate over the matches returned by the matcher, we can get the match ID and the start and end index of the matched span. We can then find out more about it. Span objects give us access to the original document and all other token attributes and linguistic features predicted by the model.

For example, we can get the span's root token. If the span consists of more than one token, this will be the token that decides the category of the phrase. For example, the root of "Golden Retriever" is "Retriever". We can also find the head token of the root. This is the syntactic "parent" that governs the phrase – in this case, the verb "have".

Finally, we can look at the previous token and its attributes. In this case, it's a determiner, the article "a".

The phrase matcher is another helpful tool to find sequences of words in your data.

It performs a keyword search on the document, but instead of only finding strings, it gives you direct access to the tokens in context.

It takes Doc objects as patterns.

It's also really fast.

This makes it very useful for matching large dictionaries and word lists on large volumes of text.

Here's an example.

The phrase matcher can be imported from spacy dot matcher and follows the same API as the regular matcher.

Instead of a list of dictionaries, we pass in a Doc object as the pattern.

We can then iterate over the matches in the text, which gives us the match ID, and the start and end of the match. This lets us create a Span object for the matched tokens "Golden Retriever" to analyze it in context.

In [76]:
import spacy
from spacy.matcher import Matcher

# Reload the model and process the article text that the patterns are run on.
nlp = spacy.load("en_core_web_lg")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns (one dictionary per token).
# pattern1: the exact text "Amazon" followed by a title-cased proper noun.
pattern1 = [{"TEXT": "Amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# pattern2: "ad", a literal hyphen, "free", then a noun.
# The hyphen is matched on its token text; "PUNCT" is not a token attribute
# the Matcher understands, so it cannot be used as a pattern key.
pattern2 = [{"TEXT": "ad"}, {"TEXT": "-"}, {"TEXT": "free"}, {"POS": "NOUN"}]


# Initialize the Matcher and add the patterns (None = no callback)
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [77]:
# Check how "Amazon Prime" is split into tokens (one string per token)
[token.text for token in nlp("Amazon Prime")]

['Amazon', 'Prime']

Sometimes it’s more efficient to match exact strings instead of writing patterns describing the individual tokens. This is especially true for finite categories of things – like all countries of the world. We already have a list of countries, so let’s use this as the basis of our information extraction script. A list of string names is available as the variable COUNTRIES.

### Read Below example and see a new style of describing patterns different from the Golden retriever example

### spaCy Pipeline

You've already written this plenty of times by now: pass a string of text to the nlp object, and receive a Doc object.

But what does the nlp object actually do?

First, the tokenizer is applied to turn the string of text into a Doc object. Next, a series of pipeline components is applied to the Doc in order. In this case, the tagger, then the parser, then the entity recognizer. Finally, the processed Doc is returned, so you can work with it.

spaCy ships with the following built-in pipeline components.

The part-of-speech tagger sets the token dot tag attribute.

The dependency parser adds the token dot dep and token dot head attributes and is also responsible for detecting sentences and base noun phrases, also known as noun chunks.

The named entity recognizer adds the detected entities to the doc dot ents property. It also sets entity type attributes on the tokens that indicate if a token is part of an entity or not.

Finally, the text classifier sets category labels that apply to the whole text, and adds them to the doc dot cats property.

Because text categories are always very specific, the text classifier is not included in any of the pre-trained models by default. But you can use it to train your own system.

In [80]:
# Names of the components currently in the pipeline
nlp.pipe_names

['tagger', 'parser', 'ner']

To see the names of the pipeline components present in the current nlp object, you can use the nlp dot pipe names attribute.

For a list of component name and component function tuples, you can use the nlp dot pipeline attribute.

The component functions are the functions applied to the Doc to process it and set attributes – for example, part-of-speech tags or named entities.

Now that you know how spaCy's pipeline works, let's take a look at another very powerful feature: custom pipeline components.

Custom pipeline components let you add your own function to the spaCy pipeline that is executed when you call the nlp object on a text – for example, to modify the Doc and add more data to it.

After the text is tokenized and a Doc object has been created, pipeline components are applied in order. spaCy supports a range of built-in components, but also lets you define your own.

Custom components are executed automatically when you call the nlp object on a text.

They're especially useful for adding your own custom metadata to documents and tokens.

You can also use them to update built-in attributes, like the named entity spans.

Fundamentally, a pipeline component is a function or callable that takes a doc, modifies it and returns it, so it can be processed by the next component in the pipeline.

Components can be added to the pipeline using the nlp dot add pipe method. The method takes at least one argument: the component function.

To specify where to add the component in the pipeline, you can use the following keyword arguments:

Setting last to True will add the component last in the pipeline. This is the default behavior.

Setting first to True will add the component first in the pipeline, right after the tokenizer.

The "before" and "after" arguments let you define the name of an existing component to add the new component before or after. For example, before equals "ner" will add it before the named entity recognizer.

The other component to add the new component before or after needs to exist, though – otherwise, spaCy will raise an error.

Here's an example of a simple pipeline component.

We start off with the small English model.

We then define the component – a function that takes a Doc object and returns it.

Let's do something simple and print the length of the Doc that passes through the pipeline.

Don't forget to return the Doc so it can be processed by the next component in the pipeline! The Doc created by the tokenizer is passed through all components, so it's important that they all return the modified doc.

We can now add the component to the pipeline. Let's add it to the very beginning right after the tokenizer by setting first equals True.

When we print the pipeline component names, the custom component now shows up at the start. This means it will be applied first when we process a Doc.

In [103]:
def upper(doc1):
    """Print the length of ``doc1`` and hand the same object back.

    The function has the shape of a simple pipeline component: it receives a
    doc, reports ``len(doc1)`` on stdout, and returns the doc unchanged so a
    following step can keep working with it.

    Args:
        doc1: Any object supporting ``len()``.

    Returns:
        The very same ``doc1`` object that was passed in.
    """
    print(len(doc1))
    return doc1

# Registering the component is left commented out, so as written here `upper`
# is not added to the pipeline by this cell.
#nlp.add_pipe(upper,first=True)
# NOTE(review): this processes the literal string 'text', not the `text` variable -- confirm that is intended.
doc1 = nlp('text')