In [4]:
import spacy 
# spacy.blank("en") creates the English `nlp` object; calling it on a string
# tokenizes the text and returns a Doc whose tokens the following cells
# print, index and slice.
nlp = spacy.blank("en")
doc = nlp("Hello World!")


In [5]:
print(doc.text)  # .text gives the Doc's content back as a plain string

Hello World!


In [6]:
# Iterating over a Doc yields its tokens in order; the "!" is a token of its own.
for token in doc:
    print(token.text)

Hello
World
!


In [7]:
print(doc[1].text) # Integer indexing (0-based) on a Doc returns a single token

World


In [8]:
# Slicing a Doc gives a span of tokens; the end index is exclusive, so
# doc[1:3] covers tokens 1 and 2.
span = doc[1:3]
print(span.text)

World!


In [9]:
doc = nlp("It costs $598,76.22 .")

# Lexical attributes describe a token on its own; they do not depend on the
# token's context. Each row below pairs a printed label with the Token
# attribute it reports.
#   i        -> position of the token inside the doc (very handy for look-ahead)
#   like_num -> also True for number words such as "ten", not only for digits
for label, attr in (
    ("Index", "i"),
    ("Text", "text"),
    ("is_alpha", "is_alpha"),
    ("is_punct", "is_punct"),
    ("like_num", "like_num"),
):
    values = [getattr(token, attr) for token in doc]
    print(f"{label}  :  {values}")

print(len(doc))

Index  :  [0, 1, 2, 3, 4]
Text  :  ['It', 'costs', '$', '598,76.22', '.']
is_alpha  :  [True, True, False, False, False]
is_punct  :  [False, False, False, False, True]
like_num  :  [False, False, False, True, False]
5


In [10]:
# In this example, you’ll use spaCy’s Doc and Token objects, and lexical attributes to find percentages in a text. You’ll be looking for two subsequent tokens: a number and a percent sign.

# Use the like_num token attribute to check whether a token in the doc resembles a number.
# Get the token following the current token in the document. The index of the next token in the doc is token.i + 1.
# Check whether the next token’s text attribute is a percent sign "%".


nlp = spacy.blank("en")

doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. "
        "Now less than 4% are."  )

# Deliberately broken version, kept to illustrate a common pitfall.
for i in range(0,len(doc)):
    if doc[i].like_num:
        # doc[i+1] is a Token object, not a str, so comparing it with "%" is
        # never true and this cell prints nothing.
        # NOTE(review): doc[i+1] would also go out of range if the last token
        # were number-like; this text ends with "." so it does not happen here.
        if doc[i+1] == "%":
            print(doc[i],doc[i+1])

# The logic looks right but the comparison is wrong: compare token.text (a str)
# with "%", never the Token object itself.

In [11]:
nlp = spacy.blank("en")

doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Print every number-like token that is immediately followed by a "%" token.
# token.i is the token's own index in the doc, so token.i + 1 is its successor.
for token in doc:
    # Guard the look-ahead: a number-like token at the very end of the doc has
    # no successor, and doc[len(doc)] would raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        # Compare strings with strings: use .text, not the Token object.
        if next_token.text == "%":
            print(f"{token.text}{next_token.text}")

60%
4%


In [14]:
# Part-of-speech tagging
# "en_core_web_sm" is a pipeline package (the small English one). It bundles:
#   - binary weights used to make predictions
#   - the vocabulary
#   - meta information and the config file
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")

for token in doc:
    print(f"{token.text} {token.pos_}")

She PRON
ate VERB
the DET
pizza NOUN


In [16]:
# Columns printed per token: text, part of speech, dependency label, head text.
#   dep_ -> predicted dependency label (subject, root, object, determiner, ...)
#   head -> the parent token this word is attached to
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_, token.head.text)))

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [22]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# doc.ents is an attribute (not a method call) exposing the named entities the
# NER model predicted for this text; each entity has a text and a label.
ents = doc.ents

for ent in ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


In [23]:
# spacy.explain() is a helper that returns a readable description of a tag or label.
# The tuple is printed directly instead of through an f-string: reusing double
# quotes inside a double-quoted f-string is a SyntaxError before Python 3.12,
# and printing the tuple produces exactly the same text.
print((spacy.explain("det"), spacy.explain("ORG"), spacy.explain("GPE")))

('determiner', 'Companies, agencies, institutions, etc.', 'Countries, cities, states')


In [28]:
nlp = spacy.load("en_core_web_sm")

text = "It's official: Apple is the first U.S. public company to reach $1 trillion market value"

doc = nlp(text)

# Three lines are printed: the text itself, the type of the Doc
# (<class 'spacy.tokens.doc.Doc'>) and the type of doc.text (<class 'str'>).
# Always go through .text when a plain string is needed.
print(doc.text, type(doc), type(doc.text), sep="\n")

It's official: Apple is the first U.S. public company to reach $1 trillion market value
<class 'spacy.tokens.doc.Doc'>
<class 'str'>


In [33]:
# You’ll now get to try one of spaCy’s trained pipeline packages and see its predictions in action. Feel free to try it out on your own text! To find out what a tag or label means, you can call spacy.explain in the loop. For example: spacy.explain("PROPN") or spacy.explain("GPE").

# Part 1
# Process the text with the nlp object and create a doc.
# For each token, print the token text, the token’s .pos_ (part-of-speech tag) and the token’s .dep_ (dependency label).

# One tuple per token: text, POS tag, dependency label, head text, followed by
# the spacy.explain() descriptions of the POS tag and the dependency label.
for token in doc:
    row = (
        token.text,
        token.pos_,
        token.dep_,
        token.head.text,
        spacy.explain(token.pos_),
        spacy.explain(token.dep_),
    )
    print(row)

('It', 'PRON', 'nsubj', "'s", 'pronoun', 'nominal subject')
("'s", 'AUX', 'ccomp', 'is', 'auxiliary', 'clausal complement')
('official', 'ADJ', 'acomp', "'s", 'adjective', 'adjectival complement')
(':', 'PUNCT', 'punct', "'s", 'punctuation', 'punctuation')
('Apple', 'PROPN', 'nsubj', 'is', 'proper noun', 'nominal subject')
('is', 'AUX', 'ROOT', 'is', 'auxiliary', 'root')
('the', 'DET', 'det', 'company', 'determiner', 'determiner')
('first', 'ADJ', 'amod', 'company', 'adjective', 'adjectival modifier')
('U.S.', 'PROPN', 'nmod', 'company', 'proper noun', 'modifier of nominal')
('public', 'ADJ', 'amod', 'company', 'adjective', 'adjectival modifier')
('company', 'NOUN', 'attr', 'is', 'noun', 'attribute')
('to', 'PART', 'aux', 'reach', 'particle', 'auxiliary')
('reach', 'VERB', 'relcl', 'company', 'verb', 'relative clause modifier')
('$', 'SYM', 'quantmod', 'trillion', 'symbol', 'modifier of quantifier')
('1', 'NUM', 'compound', 'trillion', 'numeral', 'compound')
('trillion', 'NUM', 'nummod

In [34]:
# Named entities predicted for the current doc: entity text, then its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [36]:
nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

# Only "Apple" is recognised here; the model does not extract "iPhone X",
# so that one has to be handled manually.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG


In [54]:
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# Rule-based matching: every dict in the pattern describes one token, so this
# pattern looks for the token "iPhone" directly followed by the token "X".
pattern = [
    {"TEXT": "iPhone"},
    {"TEXT": "X"},
]

# The matcher shares the pipeline's vocabulary; patterns are registered under a name.
matcher = Matcher(nlp.vocab)
matcher.add("IPHONE_PATTERN", [pattern])

doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

In [55]:
# matches is a list of (match_id, start, end) tuples; doc[start:end] is the matched span.

In [56]:
# Turn each (match_id, start, end) triple back into the matched text.
for match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


In [57]:
# Pattern: a digit token, then the tokens "fifa", "world", "cup" (compared on
# their lowercase form), then a punctuation token.
# NOTE(review): this cell only builds the pattern and the doc; the pattern is
# never added to a matcher here, so nothing is matched or printed.
pattern = (
    [{"IS_DIGIT": True}]
    + [{"LOWER": word} for word in ("fifa", "world", "cup")]
    + [{"IS_PUNCT": True}]
)
doc = nlp("2018 FIFA World Cup : Drance won!")

In [60]:
nlp = spacy.load("en_core_web_sm")

# Fresh matcher with a single rule: the token "iPhone" followed by the token "X".
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher = Matcher(nlp.vocab)
matcher.add("I_Phone_Test", [pattern])

doc = nlp("Apple has made iPhone X and launched it in late 1990s.")
matches = matcher(doc)

# Each match is (match_id, start, end); slice the doc to recover the matched text.
for match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


In [62]:
# Second rule: the token "iOS" followed by a token made of digits.
pattern1 = [
    {"TEXT": "iOS"},
    {"IS_DIGIT": True},
]
matcher.add("complex_pattern", [pattern1])

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper. iPhone X"
)

# The matcher now holds several patterns at once (the earlier iPhone rule is
# still registered), which is why "iPhone X" is reported next to the iOS hits.
matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start:end].text)

iOS 7
iOS 11
iOS 10
iPhone X


In [65]:
# Third rule: any token whose lemma is "download" (downloaded, downloading,
# download) followed by a token tagged as a proper noun.
pattern2 = [
    {"LEMMA": "download"},
    {"POS": "PROPN"},
]
matcher.add("complex_pattern_1", [pattern2])

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

matches = matcher(doc)

for match_id, start, end in matches:
    print(doc[start:end].text)

downloaded Fortnite
downloading Minecraft
download Winzip


In [None]:
# Match Email Addresses
# Task: Write a pattern to match email addresses, such as:

# "example@domain.com"
# "contact_us@company.co.uk"
# The pattern should:

# Use token attributes like IS_ALPHA, IS_DIGIT, and TEXT.
# Ensure the presence of an "@" symbol and a valid domain.


