In [1]:
from spacy.lang.en import English

In [None]:
import os


In [2]:
nlp = English()

In [4]:
doc = nlp("Hello world!")

In [5]:
for token in doc:
    print(token.text) # word or punctuation chars

Hello
world
!


In [6]:
token = doc[1]

In [7]:
token

world

In [9]:
span = doc[1:4]

In [10]:
span 

world!

In [13]:
span = doc[1:2]

In [14]:
span 

world

In [16]:
doc = nlp('It costs $5.')

In [19]:
print('Tokens: ', [token for token in doc])

Tokens:  [It, costs, $, 5, .]


In [20]:
print('Index: ', [token.i for token in doc])

Index:  [0, 1, 2, 3, 4]


In [23]:
print('is_alpha: ', [token.is_alpha for token in doc])

is_alpha:  [True, True, False, False, False]


In [24]:
print('is_punct: ', [token.is_punct for token in doc])

is_punct:  [False, False, False, False, True]


In [25]:
print('like_num: ', [token.like_num for token in doc])

like_num:  [False, False, False, True, False]


In [26]:
doc = nlp('It costs $5 and ten cents.')

In [28]:
print('like_num: ', [token.like_num for token in doc])
# can detect "TEN" as a number

like_num:  [False, False, False, True, False, True, False, False]


In [30]:
first_token = doc[0]

In [31]:
first_token 

It

In [None]:
# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than

In [32]:
# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

In [34]:
# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i+1]
        # Check if the next token's text equals '%'
        if next_token.text == '%':
            print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


In [35]:
import spacy

nlp = spacy.load('en_core_web_md')

In [36]:
doc = nlp('She ate the pizza')

In [51]:
#iterate over the tokens
for token in doc:
    print(f"{token.text:{6}} {token.pos_:->{10}}")

She    ------PRON
ate    ------VERB
the    -------DET
pizza  ------NOUN


In [63]:
for token in doc:
    print(f"{token.text:{6}} {token.pos_:->{10}} {token.dep_:>{10}} {token.head.text:>{8}}")
    # dep_ return dependencies (subject or object)
    # token.head.tetx --> parent token of. Shows the child tokens
    # nsubj (nominal subject), dobj(direct object), det(determiner)

She    ------PRON      nsubj      ate
ate    ------VERB       ROOT      ate
the    -------DET        det    pizza
pizza  ------NOUN       dobj      ate


In [65]:
# Named Entitites

doc = nlp(u'Apple is looking at buying U.K. starup for $1 billion')

In [71]:
print('like_num: ', [token.like_num for token in doc])

like_num:  [False, False, False, False, False, False, False, False, False, True, True]


In [74]:
# Iterate overt the predicted entitites
for ent in doc.ents:
    print(f'{ent.text:{10}} {ent.label_}')

Apple      ORG
U.K.       GPE
$1 billion MONEY


In [76]:
spacy.explain('GPE')

'Countries, cities, states'

In [78]:
spacy.explain('MONEY')

'Monetary values, including unit'

In [80]:
spacy.explain('NNP')  # also for dependency labels

'noun, proper singular'

In [81]:
spacy.explain('dobj')

'direct object'

In [88]:
## Rule-Match 


from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

## Add the pattern to the macther
pattern = [{'ORTH':'iPhone'}, {'ORTH':'X'}]
matcher.add('IPHONE_PATTERN', None, pattern)  # IPHONE_PATTERN is a unique ID

In [89]:
doc = nlp("New iPhone X relase date leaked")

In [90]:
matches = matcher(doc)

In [93]:
matches

[(9528407286733565721, 1, 3)]

In [95]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

iPhone X


In [107]:
pattern=[{'IS_DIGIT': True},
         {'LOWER':'fifa'},
         {'LOWER':'world'},
         {'LOWER':'cup'},
         {'IS_PUNCT': True}]

# Token include digits, punct and case INsensitive fifa, world, cup

In [108]:
doc = nlp('2018 FIFA World Cup: France Won!')

In [109]:
matcher.add('WORLDCUP_PATTERN', None, pattern)

In [110]:
macthes = matcher(doc)

In [111]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

FIFA World


In [117]:
pattern = [{'LEMMA':'love', 'POS':'VERB'},{'POS':'NOUN'}]

# We are looking for a 'Love' verb followed by a noun

In [118]:
doc = nlp('I loved dogs but now I love cats more')

In [119]:
matcher.add('LOVE_PATTERN', None, pattern)
macthes = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

loved dogs


In [121]:

# Write one pattern that matches adjectives ('ADJ') followed by one or two 'NOUN's 
# (one noun and one optional noun)


doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP': '?'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('ADJ_NOUN_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 8
Match found: beautiful design
Match found: beautiful design
Match found: smart search
Match found: smart search
Match found: automatic labels
Match found: automatic labels
Match found: optional voice responses
Match found: optional voice responses


In [122]:
## Write one pattern that only matches mentions of the full iOS versions: 
# "iOS 7", "iOS 11" and "iOS 10".


doc = nlp("After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.")

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{'TEXT': 'iOS'}, {'IS_DIGIT': True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('IOS_VERSION_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 9
Match found: radical system
Match found: radical system
Match found: wide redesign
Match found: wide redesign
Match found: aesthetic upheaval
Match found: aesthetic upheaval
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [123]:
coffee_hash = nlp.vocab.strings['coffee']

In [124]:
coffee_hash

3197928453018144401

In [125]:
coffee_string = nlp.vocab.strings[coffee_hash]

In [126]:
coffee_string

'coffee'

In [131]:
## LEXEMES

doc = nlp('I love coffee')
lexeme = nlp.vocab['coffee']

# print the lexical attributes

print(lexeme.text, lexeme.orth, lexeme.is_alpha, lexeme.like_num)

coffee 3197928453018144401 True False
