# Introduction to spaCy

In [1]:
# The English class provides a blank pipeline (tokenizer only, no trained model)
from spacy.lang.en import English

nlp = English()

# Calling nlp on a string returns a Doc; iterating a Doc yields its tokens
doc = nlp("This is a sentence.")
for tok in doc:
    print(tok.text)

doc = nlp("Hello World!")

# A Doc is indexable: position 1 is the second token
token = doc[1]
print(token.text)

# Slicing a Doc gives a Span (the end index is exclusive)
span = doc[1:3]
print(span.text)

# Lexical attributes: one row per attribute, one list entry per token
doc = nlp("It costs $5.  het422")

attribute_rows = (
    ("Index:   ", "i"),
    ("Text:    ", "text"),
    ("is_alpha ", "is_alpha"),
    ("is_digit ", "is_digit"),
    ("is_punct ", "is_punct"),
)
for label, attr_name in attribute_rows:
    print(label, [getattr(tok, attr_name) for tok in doc])

This
is
a
sentence
.
World
World!
Index:    [0, 1, 2, 3, 4, 5, 6]
Text:     ['It', 'costs', '$', '5', '.', ' ', 'het422']
is_alpha  [True, True, False, False, False, False, False]
is_digit  [False, False, False, True, False, False, False]
is_punct  [False, False, False, False, True, False, False]


# Practice

<h3>Getting Started</h3>
<p>1. Import the English class from spacy.lang.en and create the nlp object.</p>
<p>2. Create a doc and print its text.</p>


In [2]:
from spacy.lang.en import English

# Build a blank English pipeline and run it over a single sentence
nlp = English()
doc = nlp("This is a sentence.")

# Doc.text gives back the text of the whole document
print(doc.text)

This is a sentence.


<h3>Document, spans, tokens</h3>
<h6>Step 1</h6>
<p>1 Import the English language class and create the nlp object.</p>
<p>2 Process the text and instantiate a Doc object in the variable doc.</p>
<p>3 Select the first token of the Doc and print its text.</p>

<h6>Step 2</h6>
<p>Create a slice of the Doc for the tokens “tree kangaroos” and “tree kangaroos and narwhals”.</p>

In [3]:
from spacy.lang.en import English

# Blank English pipeline; tokenization is all that slicing needs
nlp = English()
doc = nlp("I like tree kangaroos and narwhals.")

# Tokens 2-3 -> "tree kangaroos" (slice end is exclusive)
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# Tokens 2-5 -> "tree kangaroos and narwhals"; stopping at 6 leaves out the final "."
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


<h3>Lexical attributes</h3>

<p> 1 Use the like_num token attribute to check whether a token in the doc resembles a number.</p>
<p> 2 Get the token following the current token in the document. The index of the next token in the doc is token.i + 1.</p>
<p> 3 Check whether the next token’s text attribute is a percent sign “%”.</p>

In [4]:
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by "%"
for token in doc:
    # Skip anything that does not resemble a number
    if not token.like_num:
        continue
    next_index = token.i + 1
    # A number-like token at the very end of the doc has no successor;
    # indexing past the end would raise IndexError, so skip it.
    if next_index >= len(doc):
        continue
    next_token = doc[next_index]
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


# Statistical models

In [5]:
# Model packages
#   Install with: python -m spacy download en_core_web_sm
#   A package contains:
#     - Binary weights
#     - Vocabulary
#     - Meta information (language, pipeline)

import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

doc = nlp("She ate the pizza")

# Part-of-speech tags: one line per token with its predicted tag
for tok in doc:
    print(tok.text, tok.pos_)
print()

# Syntactic dependencies: tag, dependency label and the text of the head token
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.head.text)
print()

# Named entities: doc.ents holds the spans the model predicted
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for entity in doc.ents:
    print(entity.text, entity.label_)
print()

# Last expression of the cell, so the description of the label is displayed
spacy.explain("GPE")

She PRON
ate VERB
the DET
pizza NOUN

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate

Apple ORG
U.K. GPE
$1 billion MONEY



'Countries, cities, states'

# Practice

<h3>Loading models</h3>
<p>1 Use spacy.load to load the small English model "en_core_web_sm".</p>
<p>2 Process the text and print the document text.</p>

In [6]:
import spacy

# spacy.load returns the pipeline object for the named model package
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the pipeline over the text, then echo the document text back
doc = nlp(text)
print(doc.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


<h3>Predicting linguistic annotations</h3>
<p>1 Process the text with the nlp object and create a doc.</p>
<p>2 For each token, print the token text, the token’s .pos_ (part-of-speech tag) and the token’s .dep_ (dependency label).</p>
<p>3 Iterate over the doc.ents and print the entity text and label_ attribute.</p>

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the pipeline so tokens carry predicted tags and the doc carries entities
doc = nlp(text)

# One row per token: text, part-of-speech tag, dependency label
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    # Left-align each field in a fixed-width column so the rows line up
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")
print()

# One row per predicted entity: its text and label
for ent in doc.ents:
    print(ent.text, ent.label_)

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


<h3>Predicting named entities in context</h3>
<p>1 Process the text with the nlp object.</p>
<p>2 Iterate over the entities and print the entity text and label.</p>
<p>3 Looks like the model didn’t predict “iPhone X”. Create a span for those tokens manually.</p>

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

# Show the entities the model predicted on its own
for ent in doc.ents:
    print(ent.text, ent.label_)

# Build the span for "iPhone X" by hand: tokens 1 and 2 (slice end is exclusive)
iphone_x = doc[1:3]
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


# Rule-based Matching

In [9]:
# Match patterns
# --------------
# A pattern is a list of dictionaries, one dictionary per token.
#
# Match exact token texts:
#   [{"TEXT": "iPhone"}, {"TEXT": "X"}]
#
# Match lexical attributes:
#   [{"LOWER": "iphone"}, {"LOWER": "x"}]
#
# Match any token attributes:
#   [{"LEMMA": "buy"}, {"POS": "NOUN"}]

import spacy

# import the matcher
from spacy.matcher import Matcher


def print_matched_spans(doc, matches):
    """Print the text of every matched span in ``doc``, then a blank line.

    ``matches`` is the sequence of ``(match_id, start, end)`` tuples returned
    by calling a ``Matcher`` on ``doc``; ``start``/``end`` are token indices
    used to slice the matched span out of ``doc``.
    """
    for _match_id, start, end in matches:
        print(doc[start:end].text)
    print()


# load a model and create the nlp object
nlp = spacy.load("en_core_web_sm")

# initialize the matcher with the shared vocab.
# The same matcher is reused for every example below, so each pattern is
# registered under its own key.
matcher = Matcher(nlp.vocab)

# using matcher (1): exact token texts
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
# Patterns are passed as a list: this signature is accepted by spaCy v2.2.2+
# and is the only one supported by spaCy v3 (the old
# ``add(key, None, pattern)`` form raises there).
matcher.add("IPHONE_PATTERN", [pattern])

matches = matcher(doc)
print_matched_spans(doc, matches)

# using matcher (2): lexical attributes
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True},
]
matcher.add("FIFA_PATTERN", [pattern])

doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)
print_matched_spans(doc, matches)

# using matcher (3): predicted token attributes (lemma + part of speech)
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"},
]
matcher.add("LOVE_PATTERN", [pattern])

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)
print_matched_spans(doc, matches)

# using matcher (4): operators -- "OP": "?" makes the determiner optional
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},
    {"POS": "NOUN"},
]
matcher.add("BUY_PATTERN", [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)
print_matched_spans(doc, matches)


iPhone X

2018 FIFA World Cup:

loved dogs
love cats

bought a smartphone
buying apps

