## NLP Object

In [1]:
# import the English language class
from spacy.lang.en import English

# Create the nlp object by instantiating the English language class
nlp = English()

## Doc Object

In [2]:
# Calling the nlp object on a string of text produces a Doc
Doc = nlp("Hello World!")

# A Doc is iterable; printing each item shows one token per line
for tok in Doc:
    print(tok)

Hello
World
!


In [3]:
# Index into the Doc to pick out a single token (index 1 is the second token)
token = Doc[1]

# The .text attribute gives the token's text; as the last expression in the
# cell, its value is what the notebook displays.
token.text # string type object

'World'

## Span Object

In [4]:
# Slicing a Doc returns a Span object
Doc = nlp("Hello World!")

# "Hello World!" is split into three tokens (indices 0, 1, 2), so the
# exclusive end index for "everything from token 1 onward" is 3, not 4.
span = Doc[1:3]

# The span's text covers the tokens "World" and "!"
span.text

'World!'

## Lexical Attributes

In [5]:
Doc = nlp("It cost $5.")

# Each list below has one entry per token, in document order.
print("Index: ", [tok.i for tok in Doc])                # position of the token in the Doc
print("Text: ", [tok.text for tok in Doc])              # the token's text
print("Is_alpha: ", [tok.is_alpha for tok in Doc])      # is_alpha flag per token
print("Is_punct: ", [tok.is_punct for tok in Doc])      # is_punct flag per token
print("like_num: ", [tok.like_num for tok in Doc])      # like_num flag per token

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'cost', '$', '5', '.']
Is_alpha:  [True, True, False, False, False]
Is_punct:  [False, False, False, False, True]
like_num:  [False, False, False, True, False]


## Statistical models

### Load spaCy model

In [6]:
import spacy

# Load the 'en_core_web_sm' model package. This rebinds `nlp`, replacing the
# English() object created at the start of the notebook.
nlp = spacy.load('en_core_web_sm')

### Predicting part-of-speech 

In [7]:
# Run the loaded model over a short sentence
doc = nlp("he ate the pizza")

# Walk the tokens and show each one's text next to its predicted
# part-of-speech tag
for tok in doc:
    print(tok.text, tok.pos_)

he PRON
ate VERB
the DET
pizza NOUN


### Predicting Syntactic Dependencies 

In [8]:
# Reuses the `doc` from the part-of-speech cell.
# Columns printed per token: text, part-of-speech tag, dependency label,
# and the text of the token's syntactic head.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.head.text)

he PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


### Predicting Named Entities

In [9]:
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# doc.ents holds the predicted entities. Each item is an entity that may
# cover several tokens (e.g. "$1 billion"), so the loop variable is named
# `ent` rather than `token`.
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


## Rule based matching

In [10]:
import spacy
from spacy.matcher import Matcher

# Load the model and create the nlp object.
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the pipeline's shared vocab
matcher = Matcher(nlp.vocab)

# A pattern is a list of dicts, one dict per token to match:
# here the exact text "iPhone" followed by the exact text "X".
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Register the pattern under the key "IPHONE_PATTERN".
# NOTE(review): this is the (key, on_match, *patterns) call form; newer spaCy
# releases expect matcher.add(key, [pattern]) - confirm against the installed
# spaCy version.
matcher.add("IPHONE_PATTERN", None, pattern)

# Process some text
doc = nlp("New iPhone X release date leaked")

# Calling the matcher on the doc yields (match_id, start, end) triples
matches = matcher(doc)

# start/end are token indices, so slicing the doc recovers the matched span
for _match_id, start, end in matches:
    print(doc[start:end])

iPhone X


## Matching Lexical Attributes

In [11]:
# One dict per token, matched in sequence.
custom_patter_1 = [
    {"IS_DIGIT": True},   # a token made of digits
    {"LOWER": "fifa"},    # lowercase form equals "fifa"
    {"LOWER": "world"},   # lowercase form equals "world"
    {"LOWER": "cup"},     # lowercase form equals "cup"
    {"IS_PUNCT": True},   # a punctuation token
]

# Added to the same `matcher` created in the rule-based matching cell, so the
# rule registered there is still active as well.
matcher.add("cust_matcher_1", None, custom_patter_1)

doc = nlp("2018 FIFA World Cup: France won!")

# Run every registered rule over the doc
matches = matcher(doc)

# Each match is (match_id, start, end); slice the doc to show the matched span
for _match_id, start, end in matches:
    print(doc[start:end])

2018 FIFA World Cup:


## Matching Other Token Attributes

In [12]:
# Two-token pattern: a verb whose lemma is "love", followed by a noun.
custom_patter_2 = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"},
]

# Added to the same `matcher` as the earlier cells, under its own key
matcher.add("cust_matcher_2", None, custom_patter_2)

doc = nlp("I loved dogs but now I love cats more.")

# Run every registered rule over the doc
matches = matcher(doc)

# Each match is (match_id, start, end); slice the doc to show the matched span
for _match_id, start, end in matches:
    print(doc[start:end])

loved dogs
love cats


## Using Operators and Quantifiers

In [13]:
# Three-slot pattern: a token whose lemma is "buy", an optional determiner,
# then a noun.
custom_patter_3 = [
    {'LEMMA':'buy'},
    {'POS':"DET", 'OP':'?'}, # optional: match 0 or 1 times
    {'POS':'NOUN'}
]

# Register this pattern under its own key. The key "cust_matcher_2" is already
# used for the love/noun pattern; reusing it would file both patterns under
# one match ID.
matcher.add("cust_matcher_3",None,custom_patter_3)
doc = nlp("I bought a smartphone. Now I'm buying apps.")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the match span (start/end are token indices into the doc)
    matched_span = doc[start:end]
    print(matched_span)

bought a smartphone
buying apps
