In [1]:
#checking if the spacy module exists and installing it it doesn't exist
!pip install -U spacy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


The spacy module is an open source Python library that is used for Natural Language Processing (NLP). It features Named Entity Recognition (NER), Parts of Speech (POS) tagging, dependency parsing, word vectors and a lot more.

In [2]:
#importing the spacy module
import spacy

In [3]:
#loading the English language model
nlp = spacy.load('en_core_web_sm')

In [4]:
#processing an English sentence
doc_1 = nlp('Tesla is looking at buying U.S. startup for $6 millio')

In [5]:
#using for loop to perform tokenization and printing each token in the sentence with its part of speech and dependency
for token in doc_1:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM nmod
6 NUM pobj
millio NOUN pobj


In [6]:
#viewing the nlp pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x730f1eeb2320>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x730f1eeb1ea0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x730e6debf8b0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x730e6d565e40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x730e6d548ec0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x730e70f05690>)]

In [7]:
#viewing the nlp pipe names
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# Processing other English language sentences and tokenizing them

In [8]:
doc_2 = nlp("Tesla isn't looking into startups anymore.")

In [9]:
for token in doc_2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [10]:
#viewing the word in the 0th index
doc_2[0]

Tesla

In [11]:
#viewing the part of speech, of the word in the 0th index
doc_2[0].pos_

'PROPN'

In [12]:
#viewing the dependency of the word in the 0th index
doc_2[0].dep_

'nsubj'

In [13]:
#finding out the meaning of the dependency label
spacy.explain('nsubj')

'nominal subject'

In [14]:
#viewing the word in the 4th index
doc_2[4].lemma_

'into'

In [15]:
doc_3 = nlp(u"Tesla isn't looking into startups anymore.")

In [16]:
for token in doc_3:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [17]:
doc_3[0]

Tesla

In [18]:
doc_3[0].pos_

'PROPN'

In [19]:
doc_3[0].dep_

'nsubj'

In [20]:
doc_3[4].lemma_

'into'

# Tokenization of other strings

In [21]:
my_string = "We\'re moving to L.A.!"

In [22]:
my_string

"We're moving to L.A.!"

In [23]:
my_str = " We're moving to L.A.!"

In [24]:
doc_4 = nlp(my_string)

In [25]:
for token in doc_4:
    print(token.text, end=" | ")

We | 're | moving | to | L.A. | ! | 

In [26]:
doc_5 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [27]:
for token in doc_5:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [28]:
doc_6 = nlp(u"A 5km UBER cab costs $5")

In [29]:
for token in doc_6:
    print(token)

A
5
km
UBER
cab
costs
$
5


In [30]:
sample_sentence = "A 5km UBER cab costs $5"

In [31]:
for i in sample_sentence:
    print(i)

A
 
5
k
m
 
U
B
E
R
 
c
a
b
 
c
o
s
t
s
 
$
5


In [32]:
sample_sentence_one = "A5kmUBERcabcosts$5"

In [33]:
for i in sample_sentence_one:
    print(i)

A
5
k
m
U
B
E
R
c
a
b
c
o
s
t
s
$
5


In [34]:
doc_7 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [35]:
for token in doc_7:
    print(token)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [36]:
doc_8 = nlp(u"Let's visit St.Louis in the U.S. next year.")

In [37]:
for token in doc_8:
    print(token)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [38]:
len(doc_7)

11

In [40]:
len(doc_7.vocab)

801

In [41]:
doc_7[2:5]

visit St. Louis

In [42]:
doc_7[3]

St.

In [43]:
doc_9 = nlp(u"Apple to build a Hong Kong factory for $6 million.")

In [44]:
for token in doc_9:
    print(token, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | . | 

In [45]:
#usisng for loop, to understand the attributes of the sentence, like the text entity, its label and explanation of the entity label
for ent in doc_9.ents:
    print(ent.text + " - " + ent.label_ + " - " + str(spacy.explain(ent.label_)))

Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [46]:
#printingt he length of the entities of the 9th document
len(doc_9.ents)

3

In [47]:
#importing displacy from the spacy module
from spacy import displacy

Displacy is a modern syntactic dependency visualizer

In [48]:
#processing an English sentence
doc = nlp(u"Apple is going to build a Hong Kong factory for $6 million.")

In [49]:
#visualizing the dependency parse tree of the ENglish sentence
displacy.render(doc, style='dep', jupyter=True,options={'distance':110})

In [50]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")
displacy.render(doc, style='ent', jupyter=True)