In [1]:
import spacy
from spacy import displacy

In [2]:
# Load the English pipeline: tokenizer, tagger, parser and NER
MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(MODEL_NAME)

In [3]:
# Sample passage that the following cells run through the pipeline; the
# adjacent string literals are joined into a single str by implicit concatenation.
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")

In [4]:
# Run the loaded pipeline over the sample text
doc = nlp(text)
# Last expression of the cell: displays the class of the object nlp() returned
type(doc)

spacy.tokens.doc.Doc

## Syntax Analysis

In [5]:
# Analyze syntax: gather the text of every noun chunk, then show the list
noun_phrases = []
for chunk in doc.noun_chunks:
    noun_phrases.append(chunk.text)
print("Noun phrases:", noun_phrases)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']


In [6]:
# Lemmas of the tokens whose part-of-speech tag (`pos_`) is VERB
verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']


In [7]:
# List each named entity in the document next to its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


## Tokenization

In [8]:
# Show the tokenization: one token text per line
print("\n".join(tok.text for tok in doc))

When
Sebastian
Thrun
started
working
on
self
-
driving
cars
at
Google
in
2007
,
few
people
outside
of
the
company
took
him
seriously
.
“
I
can
tell
you
very
senior
CEOs
of
major
American
car
companies
would
shake
my
hand
and
turn
away
because
I
was
n’t
worth
talking
to
,
”
said
Thrun
,
in
an
interview
with
Recode
earlier
this
week
.


## Text-Preprocessing

In [9]:
# For every token, show its text with its `is_stop` and `is_punct` flags
for tok in doc:
    print(f"{tok.text} -- {tok.is_stop} --- {tok.is_punct}")

When -- True --- False
Sebastian -- False --- False
Thrun -- False --- False
started -- False --- False
working -- False --- False
on -- True --- False
self -- False --- False
- -- False --- True
driving -- False --- False
cars -- False --- False
at -- True --- False
Google -- False --- False
in -- True --- False
2007 -- False --- False
, -- False --- True
few -- True --- False
people -- False --- False
outside -- False --- False
of -- True --- False
the -- True --- False
company -- False --- False
took -- False --- False
him -- True --- False
seriously -- False --- False
. -- False --- True
“ -- False --- True
I -- True --- False
can -- True --- False
tell -- False --- False
you -- True --- False
very -- True --- False
senior -- False --- False
CEOs -- False --- False
of -- True --- False
major -- False --- False
American -- False --- False
car -- False --- False
companies -- False --- False
would -- True --- False
shake -- False --- False
my -- True --- False
hand -- False --- False


In [10]:
# Keep only the tokens that are neither stop words nor punctuation
doc_cleaned = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]

# Show what survived the filter, one token per line
for kept in doc_cleaned:
    print(kept.text)

Sebastian
Thrun
started
working
self
driving
cars
Google
2007
people
outside
company
took
seriously
tell
senior
CEOs
major
American
car
companies
shake
hand
turn
away
worth
talking
said
Thrun
interview
Recode
earlier
week


## Sentence Boundary Detection

In [11]:
# Walk over the sentences found in the document and print each one
for sent in doc.sents:
    print(f'A sentence: {sent}')

A sentence: When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.
A sentence: “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.


## Lemmatization

In [12]:
# Print the lemma of every token in a short example text
text = 'she played chess against rita she likes playing chess.'
doc = nlp(text)
for tok in doc:
    print(tok.lemma_)

she
play
chess
against
rita
she
like
play
chess
.


## Strings to Hashes

In [13]:
# Strings to hashes and back
# Process a text containing the word first (presumably this is what registers
# "singing" in the vocab's string store so the reverse lookup works — TODO confirm)
doc = nlp("I love singing")

# Look up the hash for the word "singing"
word_hash = nlp.vocab.strings["singing"]
print(word_hash)

# Look up the word_hash to get the string back
word_string = nlp.vocab.strings[word_hash]
print(word_string)

13402777656386554723
singing


## Lexical attributes

In [14]:
# Print the tokens whose `like_num` attribute is set
text = ' 2022 is good than 2020'
doc = nlp(text)
numeric_tokens = [tok for tok in doc if tok.like_num]
for tok in numeric_tokens:
    print(tok)

2022
2020


## Text Classification - Detecting Email Addresses

In [15]:
# Two employee records, each with a name, an age and an e-mail address
employee_text = """name : Akshada age: 25 email : akshada@gmail.com
                 name : Aishwarya age: 30 email: aishwarya1990@gmail.com"""

# Run the pipeline over the records
employee_doc = nlp(employee_text)

# Select the tokens flagged by the `like_email` attribute and print their text
email_tokens = [tok for tok in employee_doc if tok.like_email]
for tok in email_tokens:
    print(tok.text)

akshada@gmail.com
aishwarya1990@gmail.com


## Part of Speech (POS) Tagging

In [16]:
# Tag a short two-sentence example and print each token with its `pos_` value
my_text = 'John plays basketball,if time permits. He played in high school too.'
my_doc = nlp(my_text)
for tok in my_doc:
    print(f"{tok.text} ----  {tok.pos_}")

John ----  PROPN
plays ----  VERB
basketball ----  NOUN
, ----  PUNCT
if ----  SCONJ
time ----  NOUN
permits ----  VERB
. ----  PUNCT
He ----  PRON
played ----  VERB
in ----  ADP
high ----  ADJ
school ----  NOUN
too ----  ADV
. ----  PUNCT


In [17]:
# Ask spaCy for a human-readable description of the SCONJ tag printed above
spacy.explain('SCONJ')

'subordinating conjunction'

## Visualization

In [18]:
# displacy is imported in the notebook's first (imports) cell
my_text='He always liked to see Sunrise. He never wanted to sit at home in the early mornings'
my_doc=nlp(my_text)

# Render the document with displacy's dependency style (style='dep') inline
displacy.render(my_doc,style='dep',jupyter=True)

## Named Entity Recognition (NER)

In [19]:
# Preparing the spaCy document
# NOTE: this rebinds `text` and `doc`, replacing the values set by earlier cells
text = 'Tony Stark owns the company Stark Industries . Bill Gates works at Microsoft and lives in Washington. He loves to read a lot of books'
doc = nlp(text)

# Printing the named entities (the document's `ents` attribute)
print(doc.ents)

(Tony Stark, Stark Industries, Bill Gates, Microsoft, Washington)


In [20]:
# Show every entity together with its label
for ent in doc.ents:
    print(f"{ent.text} ---  {ent.label_}")

Tony Stark ---  PERSON
Stark Industries ---  ORG
Bill Gates ---  PERSON
Microsoft ---  ORG
Washington ---  GPE


In [21]:
# Visualize the named entities with displacy (style='ent');
# displacy is imported in the notebook's first (imports) cell
displacy.render(doc,style='ent',jupyter=True)


## Word Vectors and Similarity

In [22]:
# Check, token by token, whether a word vector is available (`has_vector`)
tokens = nlp("I love to read books")

for tok in tokens:
    print(f"{tok.text}   {tok.has_vector}")

I   True
love   True
to   True
read   True
books   True


In [23]:
# Print the norm of each token's word vector (`vector_norm`), not the vector itself
tokens = nlp("I aspire to become an astronaut")
for token in tokens:
    # print() with separate arguments is kept so the numeric value is rendered via str()
    print(token.text,' ',token.vector_norm)

I   6.4231944
aspire   6.497354
to   4.74484
become   5.52595
an   5.5624833
astronaut   7.3070984


In [24]:
# Compute Similarity
# NOTE: despite their names, token_1 and token_2 hold the objects returned by
# nlp(...) for one-word texts; similarity() is called on those objects.
token_1=nlp("good")
token_2=nlp("excellent")

# Score between the two one-word texts
similarity_score=token_1.similarity(token_2)
print(similarity_score)

0.777407893397578


In [25]:
# Compare 'sun' against two other one-word texts and print each score
sun = nlp('sun')
sunshine = nlp('sunshine')
ocean = nlp('ocean')

comparisons = (
    ('Sun and Sunshine  ', sunshine),
    ('Sun and Ocean  ', ocean),
)
for label, other in comparisons:
    print(label, sun.similarity(other))

Sun and Sunshine   0.7453431436906861
Sun and Ocean   0.559622645398276


## Word Vector Representation

In [26]:
# Inspect the vector of a one-word text: first its shape, then its values
mango = nlp('mango')
mango_vector = mango.vector
print(mango_vector.shape)
print(mango_vector)

(300,)
[-0.39086    0.33664    0.62282   -0.14561    0.089456   0.66506
  0.18482   -0.3824     0.21445    0.054463  -0.4647     0.067074
 -1.0379    -0.29004    0.023828  -0.092388  -0.0023503  0.98985
  0.37535   -0.13404    0.11986    0.65159    0.15408    0.71978
 -0.32733   -0.5124     0.050161   0.042919   0.094624  -0.89656
 -0.0084081 -0.40156    0.26622   -0.5297    -0.33762    0.19521
  0.15476   -0.039438  -1.0185     0.24024   -0.35842    0.020426
  0.0086298 -0.087828   0.21932    0.86854   -0.31764    0.49903
  0.22552    0.36338    0.12186   -0.35134    0.17495    0.015455
  1.0356    -0.72409    0.018939  -0.095863  -0.1387    -0.092748
  0.090583  -0.4015    -0.045474   0.35143    0.15385   -0.48643
 -0.023118  -0.2167    -0.31057    0.60142   -0.10843    0.71502
 -0.15498    0.51862    0.24765    0.062018   0.34098    0.03768
 -0.38138   -0.61405    0.25635   -0.019619  -0.10214    0.10405
 -0.19426   -0.063548   0.94613    0.53859   -0.99649   -0.23616
  0.10187   -0

## Dependency Parsing

In [27]:
# For each noun chunk print: the chunk, its root token, the root's dependency
# label, and the text of the root's head token
doc = nlp(" In pursuit of a wall, President Trump ran into one.")

for noun_chunk in doc.noun_chunks:
    root = noun_chunk.root
    print(noun_chunk.text, root.text, root.dep_, root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [28]:
# Draw the current document with displacy's dependency style, inline in the notebook
displacy.render(
    doc,
    style="dep",
    jupyter=True,
)