<a href="https://colab.research.google.com/github/sateesh12/deep_learning_basics/blob/master/NLP_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Author : Sateesh Kalidas
#Date   : 10/July/2020

#Purpose: Basics of Spacy

In [None]:
from spacy.lang.en import English
nlp = English()


In [None]:
doc = nlp("Hello World!")
for token in doc:
  print(token.text + ":", token.is_alpha)

Hello: True
World: True
!: False


In [None]:
#Slicing of text
#Process the text
doc = nlp("I like tree kangaroos and narwhals.")
first_token = doc[0]
print(first_token.text)

I


In [None]:
# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


In [None]:
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza at Apple")
print("1.Printing all tokens")
for token in doc:
  print(token.text,token.pos_)

print("2.Print all entity")
for token in doc.ents:
  print(token.text, token.label_)



1.Printing all tokens
She PRON
ate VERB
the DET
pizza NOUN
at ADP
Apple PROPN
2.Print all entity
Apple ORG


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [None]:
# A case of not recognizing a new entity which is not part of the training data-set
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for token in doc.ents:
    # Print the entity text and label
    print(token.text, token.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

In [None]:
#Pattern matching for "iPhone X"
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"TEXT":"iPhone"},{"TEXT":"X"}]
matcher.add("IPH_MATCHER", None, pattern)
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)
print(matches)
for match_id,start,end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

[(2639739455581463429, 1, 3)]
iPhone X


In [None]:
#Pattern Matching for "iOS 7 or iOS 10 or iOS 11"
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

In [None]:
#Pattern match for Pronoun version of the lemma download
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [None]:
#Pattern match for ADJ + Noun + Optional Noun
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "+"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

In [6]:
#Convert between hash and strings
import spacy
from spacy.lang.en import English
nlp = English()
doc = nlp("I have a cat")


cat_hash = nlp.vocab.strings[doc[3].text]
print(cat_hash)

cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)


cat
5439657043933447811
cat


In [None]:
# Doc object creation 
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Oh, really?!"
words = ["Oh","," ,"really", "?", "!"]
spaces = [False, True, False, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

In [5]:
from spacy.lang.en import English

nlp = English()

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


In [21]:
# Use token attributes only for better performance
# spacy converts every token into hash, better to use the hash to do most operation
import spacy
from spacy.lang.en import English

#Without the model the below code will NOT identify parts-of-speec
nlp = spacy.load("en_core_web_sm")

doc = nlp("Berlin run")
print(doc)
for  token in doc:
  print(token.text,token.pos_)
  if(token.pos_ == "PROPN"):
    next_token = doc[token.i + 1]
    print("Found a proper noun",token.text)
    if(next_token.pos_ == "VERB"):
      print("Found a proper noun:", token.text, "followed by a verb:",next_token.text)
      

Berlin run
Berlin PROPN
Found a proper noun Berlin
Found a proper noun: Berlin followed by a verb: run
run VERB


In [2]:
#Inspect word vectors, the internals of spacy
!python -m spacy download en_core_web_lg
import spacy
from spacy.lang.en import English

nlp = spacy.load("en_core_web_lg")
doc = nlp("Bananna has high pottasium content.")
print(doc[0].vector)


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
[ 0.48161   -0.26935    0.41607    0.097462  -0.21416    0.13311
 -0.096497   0.044064  -0.26328   -1.4447    -0.059216   0.014736
 -0.61483    0.3467    -0.063183   0.22785   -0.11686   -1.1117
  0.12027    0.1859     0.83987    0.4215     0.10633    0.26942
  0.6032    -0.6417     0.013047  -0.57084   -0.064691  -0.32419
 -0.43331    0.39903    0.5483    -0.046713   0.35956    0.43647
  0.18186   -0.082784  -0.58176   -0.1223     0.17687    0.47564
 -0.71711    0.25396    0.26958    1.3095     0.61861    0.13637
  0.18156    0.13038    0.27144    0.21337   -0.021689  -0.011762
  0.46194   -0.63902    0.23845    0.0067374  0.14203   -0.58214
 -0.11631    0.21218    0.43444    0.57715   -0.5672    -0.4454
 -0.25828   -0.37902   -0.50193    0.15941    0.081702   0.34822
 -0.18409    0.64842    0.51085    0.086275   0.13315   -0.084919
 -0.18875   -0.13966    0.3175     0.06454

In [3]:
#Comparing two documents similary
import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_lg")

doc1 = nlp("It is a warm summer day.")
doc2 = nlp("It's sunny outside.")

similarity = doc1.similarity(doc2)
print(similarity)


0.8827609047123806


In [9]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
#pattern1 = [{"LOWER": "Amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern1 = [{"TEXT":"Amazon"}]
pattern2 = [{"TEXT": "ad-free"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)



PATTERN1 Amazon
PATTERN1 Amazon
