In [1]:
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

In [2]:
nlp = English()

In [3]:
doc = nlp("This is a sentence.")

In [4]:
print(doc.text)

This is a sentence.


In [5]:
first_token = doc[0]
print(first_token.text)

This


In [6]:
sentence = doc[3:4]
print(sentence.text)

sentence


In [7]:
doc_1 = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

In [8]:
for token in doc_1:
    if token.like_num:
        next_token = doc_1[token.i + 1]
        if next_token.text == '%':
            print('Percent found:', token.text)

Percent found: 60
Percent found: 4


In [9]:
nlp = spacy.load('en_core_web_sm')


In [10]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

In [11]:
doc_2 = nlp(text)

In [12]:
print(doc_2.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


In [13]:
for token in doc_2:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [14]:
for ent in doc_2.ents:
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [15]:
text_1 = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

In [16]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [17]:
doc_3 = nlp(text_1)

In [18]:
matcher = Matcher(nlp.vocab)

In [19]:
pattern = [{'ORTH':'iPhone'}, {'ORTH':'X'}]

In [20]:
matcher.add('IPHONE_X_PATTERN', None, pattern)

In [21]:
matches = matcher(doc_3)

In [22]:
print("Matches:", [doc_3[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [23]:
doc_4 = nlp("After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.")


In [24]:
pattern_1 = [{'TEXT':'iOS'}, {'IS_DIGIT':True}]

In [25]:
matcher.add('IOS_VERSION_PATTERN', None, pattern_1)

In [26]:
matches = matcher(doc_4)

In [27]:
print('Total matches found:', len(matches))

Total matches found: 3


In [28]:
doc_5 = nlp("i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack it... do I also need to download Winzip?")

In [29]:
pattern_2 = [{'LEMMA': 'download'}, {'POS':'PROPN'}]

In [30]:
matcher.add('DOWNLOAD_THINGS_PATTERN', None, pattern_2)

In [31]:
matches_2 = matcher(doc_5)

In [32]:
print('Total matches found', len(matches_2))

Total matches found 3


In [33]:
for match_id, start, end in matches_2:
    print('Match found:', doc[start:end].text)

Match found: is a
Match found: 
Match found: 


In [34]:
doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

In [35]:
pattern= [{'POS': 'ADJ'}, {'POS':'ADJ'}, {'POS':'NOUN'}, {'OP':'?'}]

In [36]:
matcher.add('ADJ_NOUN_PATTERN', None, pattern)

In [37]:
matches = matcher(doc)

In [38]:
print('Total matches found:', len(matches))

Total matches found: 0


In [39]:
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

In [40]:
# Vocab 

person_hash = nlp.vocab.strings['PERSON']
print(person_hash)

380


In [41]:
person_string = nlp.vocab.strings[person_hash]
print(person_string)

PERSON


In [42]:
words = ['spaCy', 'is', 'cool', '!']
spaces = [True, True, False, False]

In [43]:
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [44]:
print(doc.text)

spaCy is cool!


In [45]:
words = ['Go', ',', 'get', 'started', '!']
spaces = [False, True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


In [46]:
words = ['I', 'like', 'David', 'Bowie']
spaces = [True, True, True, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

I like David Bowie


In [47]:
span = Span(doc, 2, 4, label="PERSON")
print(span)

David Bowie


In [48]:
doc.ents = [span]

In [49]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('David Bowie', 'PERSON')]


In [50]:
nlp = spacy.load('en_core_web_md')

In [51]:
doc = nlp("Teo bananas in pyjamas")

In [52]:
bananas_vector = doc[1].vector

In [53]:
print(bananas_vector)

[-2.2009e-01 -3.0322e-02 -7.9859e-02 -4.6279e-01 -3.8600e-01  3.6962e-01
 -7.7178e-01 -1.1529e-01  3.3601e-02  5.6573e-01 -2.4001e-01  4.1833e-01
  1.5049e-01  3.5621e-01 -2.1508e-01 -4.2743e-01  8.1400e-02  3.3916e-01
  2.1637e-01  1.4792e-01  4.5811e-01  2.0966e-01 -3.5706e-01  2.3800e-01
  2.7971e-02 -8.4538e-01  4.1917e-01 -3.9181e-01  4.0434e-04 -1.0662e+00
  1.4591e-01  1.4643e-03  5.1277e-01  2.6072e-01  8.3785e-02  3.0340e-01
  1.8579e-01  5.9999e-02 -4.0270e-01  5.0888e-01 -1.1358e-01 -2.8854e-01
 -2.7068e-01  1.1017e-02 -2.2217e-01  6.9076e-01  3.6459e-02  3.0394e-01
  5.6989e-02  2.2733e-01 -9.9473e-02  1.5165e-01  1.3540e-01 -2.4965e-01
  9.8078e-01 -8.0492e-01  1.9326e-01  3.1128e-01  5.5390e-02 -4.2423e-01
 -1.4082e-02  1.2708e-01  1.8868e-01  5.9777e-02 -2.2215e-01 -8.3950e-01
  9.1987e-02  1.0180e-01 -3.1299e-01  5.5083e-01 -3.0717e-01  4.4201e-01
  1.2666e-01  3.7643e-01  3.2333e-01  9.5673e-02  2.5083e-01 -6.4049e-02
  4.2143e-01 -1.9375e-01  3.8026e-01  7.0883e-03 -2

In [55]:
doc_1 = nlp("It's a warm summer day")
doc_2 = nlp("It's sunny outside")

In [56]:
similarities = doc_1.similarity(doc_2)
print(similarities)

0.8789265574516525


In [57]:
doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]
similarity = token1.similarity(token2)
print(similarity)

0.22325331


In [60]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")
span1 = doc[3:5]
span2 = doc[12:15]
similarity = span1.similarity(span2)
print(similarity)

0.75173926
