<a href="https://colab.research.google.com/github/VitikaJain25/NLP/blob/master/spaCy_Tutorial/spaCy_Doc_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter - 2

# Data Structures : Vocab, Lexemes and StringStore



# Strings to Hashes

In [1]:
from spacy.lang.en import English

# Blank English pipeline; the lookups below go through its vocab
nlp = English()
doc = nlp("I have a cat")

# The string store maps in both directions. First: string -> hash
string_store = nlp.vocab.strings
cat_hash = string_store["cat"]
print(cat_hash)

# Then: hash -> string, which gives back the original word
cat_string = string_store[cat_hash]
print(cat_string)

5439657043933447811
cat


In [2]:
from spacy.lang.en import English
from spacy.lang.de import German

# Separate English and German nlp objects
nlp = English()
nlp_de = German()

# Run the German pipeline over a text containing "Bowie". The Doc itself is
# not used afterwards; the call matters because the hash -> string lookup
# on nlp_de.vocab at the end of this cell needs the German vocab to have
# seen the string "Bowie".
doc = nlp_de("i have Bowie")

# Hash of the string "Bowie", taken from the English vocab
bowie_id = nlp.vocab.strings["Bowie"]
print(bowie_id)

# Resolve that same hash back to a string through the German vocab
german_strings = nlp_de.vocab.strings
print(german_strings[bowie_id])

2644858412616767388
Bowie


# Data Structures : DOC, Span, Token

A Doc is created automatically when we process a text with the nlp object, but we can also instantiate the class manually.

Creating Doc Manually

In [3]:
# Blank English pipeline; only its vocab is needed in this cell
from spacy.lang.en import English
nlp = English()

# Doc is the class we are going to instantiate by hand
from spacy.tokens import Doc

# Token texts for the new doc (three tokens), plus one Boolean per token
# telling whether that token is followed by a space.
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Build the doc from the shared vocab, the words and the spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Hello world!


Creating Span manually

In [4]:
# Doc and Span are both built by hand in this cell
from spacy.tokens import Doc, Span

# Same three-token text as before
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# `nlp` is the pipeline created in the previous cell
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Span(doc, start, end): the end index is exclusive, so 0..2 is "Hello world"
span = Span(doc, 0, 2)

# The same slice again, this time carrying an entity label
span_with_label = Span(doc, 0, 2, label="GREETING")

# Register the labelled span as the doc's only entity and show the result
doc.ents = [span_with_label]
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

[('Hello world', 'GREETING')]


In [5]:
from spacy.lang.en import English

nlp = English()

# Doc and Span are constructed manually below
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Assemble the doc out of the words and their trailing-space flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Tokens 2 and 3 ("David Bowie") form the span; label it as a person
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Make the span the doc's only entity
doc.ents = [span]

# Show each entity as a (text, label) pair
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


# Word Vectors and Semantic Similarity

Semantic Similarity

Doc Similarity

In [6]:
# Download the medium English model (en_core_web_md); the following cells
# load it by name with spacy.load("en_core_web_md").
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [7]:
import spacy

# The medium model is used here because it comes with word vectors
nlp = spacy.load("en_core_web_md")

# Two short documents to compare with each other
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")

similarity_score = doc1.similarity(doc2)
print(similarity_score)

0.8627204117787385


Token Similarity

In [8]:
# Compare two individual tokens: "pizza" (index 2) and "pasta" (index 4)
doc = nlp("I like pizza and pasta")
token1, token2 = doc[2], doc[4]
print(token1.similarity(token2))

0.7369546


We can also use the similarity method to compare different types of objects.

In [9]:
# Comparing a Doc with a Token
doc = nlp("I like pizza")
# nlp(...) always returns a Doc; index it to get the single Token "soap",
# so that this cell really compares a Doc with a Token.
token = nlp("soap")[0]

print(token)
print(doc.similarity(token))

# The two are considered dissimilar because the similarity score is low.

soap
0.32531983451318713


In [10]:
# Span vs. Doc: the span is "pizza and pasta" (tokens 2 to 4)
pasta_doc = nlp("I like pizza and pasta")
span = pasta_doc[2:5]
doc = nlp("McDonalds sells burgers")

similarity_score = span.similarity(doc)
print(similarity_score)

0.6199092090831612


Word Vectors

In [11]:
# Load the medium English model
nlp = spacy.load("en_core_web_md")

doc = nlp("I have a banana")

# The token at index 3 is "banana"; its word vector is exposed as .vector
banana_token = doc[3]
print(banana_token.vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

# Combining Model and Rules

Adding Statistical Predictions

Span objects give us access to the original document and all other token attributes and linguistic features predicted by the model.

For example, we can get the span's root token. If the span consists of more than one token, this will be the token that decides the category of the phrase. 

For example, the root of "Golden Retriever" is "Retriever". We can also find the head token of the root. This is the syntactic "parent" that governs the phrase – in this case, the verb "have".

Finally, we can look at the previous token and its attributes. In this case, it's a determiner, the article "a".


In [13]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load("en_core_web_sm")

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "golden"}, {"LOWER": "retriever"}]
# Patterns are passed as a list in the second argument. The older
# matcher.add("DOG", None, pattern) form is rejected by spaCy v3; the list
# form is accepted by spaCy v2.2.2 and later.
matcher.add("DOG", [pattern])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and root head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its POS tag. When the match starts at the
    # first token there is no previous token, and doc[start - 1] would
    # silently wrap around to the last token of the doc, so skip that case.
    if start > 0:
        previous_token = doc[start - 1]
        print("Previous token:", previous_token.text, previous_token.pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


# Phrase Matcher

*   Follows the same API as the regular Matcher
*   Instead of a list of dictionaries, we pass a Doc object as the pattern



In [14]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Instead of a list of dictionaries, a Doc object is used as the pattern
pattern = nlp("Golden Retriever")
# Patterns are passed as a list in the second argument. The older
# matcher.add("DOG", None, pattern) form is rejected by spaCy v3; the list
# form is accepted by spaCy v2.2.2 and later.
matcher.add("DOG", [pattern])

doc = nlp("I have a Golden Retriever")

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Slice the doc to get the matched span
    matched_span = doc[start:end]
    print("Matched Span : ", matched_span.text)

Matched Span :  Golden Retriever


In [20]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
# pattern1: the word "amazon" (any casing) followed by a title-cased proper noun
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# pattern2: "ad", "-", "free" (any casing) followed by a noun
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns.
# Patterns are passed as a list in the second argument. The older
# matcher.add(name, None, pattern) form is rejected by spaCy v3; the list
# form is accepted by spaCy v2.2.2 and later.
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2", [pattern2])

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # match_id is a hash; resolve it to the pattern name through the
    # string store, then print the name and the matched text
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
