<a href="https://colab.research.google.com/github/VitikaJain25/NLP/blob/master/spaCy_Tutorial/spaCy_Doc_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# To find the percentage in the given text

In [3]:
from spacy.lang.en import English

# Create a blank English nlp object (no trained model is loaded here)
nlp = English()

# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

# Iterate over the tokens in doc
for token in doc:
  # Skip tokens that do not resemble a number
  if not token.like_num:
    continue
  # Index of the token that follows the number
  next_index = token.i + 1
  # A number may be the very last token; indexing past the end of the doc
  # would raise IndexError, so there is nothing to check in that case
  if next_index >= len(doc):
    continue
  # A number immediately followed by "%" is reported as a percentage
  if doc[next_index].text == "%":
    print ("Percentage found: ", token.text)

Percentage found:  60
Percentage found:  4


# POS Tag

In [4]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")
# Run the pipeline over a short sentence
doc = nlp("She ate the pizza")
# Show each token's text next to its predicted part-of-speech tag
for word in doc:
  print(word.text, word.pos_)

She PRON
ate VERB
the DET
pizza NOUN


# Syntactic Dependencies

In [5]:
# Show how the words of the sentence relate to each other
for word in doc:
  # Columns: text, predicted POS tag, predicted dependency label, and the text
  # of the syntactic head (the parent token this word is attached to)
  parent = word.head
  print(word.text, word.pos_, word.dep_, parent.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


# Named Entities

In [6]:
# Run the pipeline over a sentence
doc = nlp("Apple is looking at buying U.K. statup at $ 1 Billion")
# "doc.ents" gives access to the entities predicted by the model
for entity in doc.ents:
  # Show the entity text together with its label
  print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$ 1 Billion MONEY


# To get quick definitions of common tags and labels


In [7]:
# Look up a short description of the "GPE" entity label
spacy.explain("GPE")

'Countries, cities, states'

In [8]:
# Look up a short description of the "dobj" dependency label
spacy.explain("dobj")

'direct object'

# Need For Rule Based Matcher

In [9]:
# Statistical models are not always right: whether a prediction is correct
# depends on the training data and on the text being processed.
# In this example, "iPhone X" is not recognised as an entity.

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Run the pipeline over the text
doc = nlp(text)

# Show every entity the model predicted, together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

# Slice out tokens 1 and 2, which form the span "iPhone X"
iphone_x = doc[1:3]

# Show the text of the span
print("Missing entity:", iphone_x.text)


# To overcome this, spaCy provides a rule-based matcher, which can help to
# find specific words and phrases in text.

Apple ORG
Missing entity: iPhone X


# Rule Based Matcher

In [10]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with shared vocab
matcher = Matcher(nlp.vocab)

# Pattern: two consecutive tokens whose exact text is "iPhone" and "X"
pattern = [{"TEXT" : "iPhone"}, {"TEXT" : "X"}]
# matcher.add - lets you add patterns
# First argument - Unique id to identify which pattern was matched
# Second argument - list of patterns to match
# NOTE: the older call form matcher.add(id, None, pattern) is rejected by
# spaCy v3; the list form used here needs spaCy 2.2.2 or newer.
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
# Returns a list of tuples. Each tuple consists of 3 values: Match id, Start index, End index of matched span
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
  # Get the matched span
  matched_span = doc[start:end]
  print(match_id)
  print(start)
  print(end)
  print(matched_span.text)

9528407286733565721
1
3
iPhone X


# Another example - using Lexical Attributes



In [0]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with shared vocab
matcher = Matcher(nlp.vocab)

# Looking for 5 tokens: a token consisting of only digits, 3 case-insensitive
# tokens for 'fifa', 'world' and 'cup', and a token that consists of punctuation.
pattern = [{"IS_DIGIT" : True},
           {"LOWER" : "fifa"},
           {"LOWER" : "world"},
           {"LOWER" : "cup"},
           {"IS_PUNCT" : True}]

# matcher.add - lets you add patterns
# First argument - Unique id to identify which pattern was matched
# Second argument - list of patterns to match
# NOTE: the older call form matcher.add(id, None, pattern) is rejected by
# spaCy v3; the list form used here needs spaCy 2.2.2 or newer.
matcher.add("FIFA_PATTERN", [pattern])

# Process some text
doc = nlp("2018 FIFA World Cup: France won!")

# Call the matcher on the doc
# Returns a list of tuples. Each tuple consists of 3 values: Match id, Start index, End index of matched span
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
  # Get the matched span
  matched_span = doc[start:end]
  print(match_id)
  print(start)
  print(end)
  print(matched_span.text)

# Another example - using lemma and part-of-speech attributes

In [12]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with shared vocab
matcher = Matcher(nlp.vocab)

# Looking for 2 tokens: a verb with the lemma "love" followed by a noun
pattern = [{"LEMMA" : "love" , "POS" : "VERB"}, {"POS" : "NOUN"}]

# matcher.add - lets you add patterns
# First argument - Unique id to identify which pattern was matched
# Second argument - list of patterns to match
# NOTE: the older call form matcher.add(id, None, pattern) is rejected by
# spaCy v3; the list form used here needs spaCy 2.2.2 or newer.
matcher.add("DOG_CAT_PATTERN", [pattern])

# Process some text
doc = nlp("I loved dogs but now I love cats more.")

# Call the matcher on the doc
# Returns a list of tuples. Each tuple consists of 3 values: Match id, Start index, End index of matched span
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
  # Get the matched span
  matched_span = doc[start:end]
  print(match_id)
  print(start)
  print(end)
  print(matched_span.text)

16454400285746368565
1
3
loved dogs
16454400285746368565
6
8
love cats


# Using Operators and Quantifiers

Operators and quantifiers let you define how often a token should be matched.

In [14]:
import spacy
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with shared vocab
matcher = Matcher(nlp.vocab)

# Here, the "?" operator makes the determiner token optional.
# So the pattern matches a token with the lemma "buy", an optional
# determiner (article), and a noun.
pattern = [{"LEMMA" : "buy"}, {"POS" : "DET" , "OP" : "?"}, {"POS" : "NOUN"}]

# matcher.add - lets you add patterns
# First argument - Unique id to identify which pattern was matched
# Second argument - list of patterns to match
# NOTE: the older call form matcher.add(id, None, pattern) is rejected by
# spaCy v3; the list form used here needs spaCy 2.2.2 or newer.
matcher.add("SMARTPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("I bought a smartphone. Now I'm buying apps.")

# Call the matcher on the doc
# Returns a list of tuples. Each tuple consists of 3 values: Match id, Start index, End index of matched span
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
  # Get the matched span
  matched_span = doc[start:end]
  print(match_id)
  print(start)
  print(end)
  print(matched_span.text)

3772001973323200979
1
4
bought a smartphone
3772001973323200979
8
10
buying apps


Example for Operators and Quantifiers:

{"OP" : "!"} - Negation : match 0 times

{"OP" : "?"} - Optional : match 0 or 1 times

{"OP" : "+"} - Match 1 or more times

{"OP" : "*"} - Match 0 or more times