**NLP Basics**

Name: Prashanth B

# Spacy Installation

In [1]:
!pip install spacy
import spacy



In [2]:
# Download the medium English pipeline; it is loaded later with spacy.load('en_core_web_md').
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


# Create blank language object and tokenize words in a sentence

In [3]:
nlp = spacy.blank("en")
nlp

<spacy.lang.en.English at 0x7f264440c820>

In [4]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")
doc

Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.

In [5]:
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [6]:
doc[0]

Dr.

In [7]:
type(doc[0])

spacy.tokens.token.Token

In [8]:
doc[0].text

'Dr.'

In [9]:
type(doc[0].text)

str

In [10]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [11]:
len(dir(token))

122

In [12]:
type(nlp)

spacy.lang.en.English

In [13]:
type(doc)

spacy.tokens.doc.Doc

In [14]:
type(token)

spacy.tokens.token.Token

In [15]:
nlp.pipe_names

[]

In [16]:
doc[0:5]

Dr. Strange loves pav bhaji

In [17]:
type(doc[0:5])

spacy.tokens.span.Span

In [18]:
#Token attributes
doc = nlp("Tony gave two $ to Peter.")

In [19]:
token0 = doc[0]
token0

Tony

In [20]:
token0.is_alpha, token0.like_num, token0.is_currency

(True, False, False)

In [21]:
token1 = doc[1]
token1

gave

In [22]:
token1.is_alpha, token1.like_num, token1.is_currency

(True, False, False)

In [23]:
token2 = doc[2]
token2

two

In [24]:
token2.is_alpha, token2.like_num, token2.is_currency

(True, True, False)

In [25]:
token3 = doc[3]
token3

$

In [26]:
token3.is_alpha, token3.like_num, token3.is_currency

(False, False, True)

In [27]:
for token in doc:
    print(token, "  |  ", "index:", token.i, "  |  ",
          "is_alpha:", token.is_alpha, "  |  ",
          "is_punct:", token.is_punct, "  |  ",
          "like_num:", token.like_num, "  |  ",
          "is_currency:", token.is_currency,
         )

Tony   |   index: 0   |   is_alpha: True   |   is_punct: False   |   like_num: False   |   is_currency: False
gave   |   index: 1   |   is_alpha: True   |   is_punct: False   |   like_num: False   |   is_currency: False
two   |   index: 2   |   is_alpha: True   |   is_punct: False   |   like_num: True   |   is_currency: False
$   |   index: 3   |   is_alpha: False   |   is_punct: False   |   like_num: False   |   is_currency: True
to   |   index: 4   |   is_alpha: True   |   is_punct: False   |   like_num: False   |   is_currency: False
Peter   |   index: 5   |   is_alpha: True   |   is_punct: False   |   like_num: False   |   is_currency: False
.   |   index: 6   |   is_alpha: False   |   is_punct: True   |   like_num: False   |   is_currency: False


# Collecting Email IDs of students from students information sheet

In [28]:
l = ['Virat   5 June, 1882    virat@kohli.com',
 'Maria   12 April, 2001  maria@sharapova.com',
 'Serena  24 June, 1998   serena@williams.com',
 'Joe      1 May, 1997    joe@root.com']

In [29]:
text = " ".join(l)
text

'Virat   5 June, 1882    virat@kohli.com Maria   12 April, 2001  maria@sharapova.com Serena  24 June, 1998   serena@williams.com Joe      1 May, 1997    joe@root.com'

In [30]:
doc = nlp(text)
doc

Virat   5 June, 1882    virat@kohli.com Maria   12 April, 2001  maria@sharapova.com Serena  24 June, 1998   serena@williams.com Joe      1 May, 1997    joe@root.com

In [31]:
# Keep the text of every token that spaCy flags as looking like an e-mail address.
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

# Support in other languages

In [32]:
nlp = spacy.blank("hi")
nlp

<spacy.lang.hi.Hindi at 0x7f2643fd1a50>

In [33]:
doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")
doc

भैया जी! 5000 ₹ उधार थे वो वापस देदो

In [34]:
for token in doc:
    print(token, "---", token.is_currency)

भैया --- False
जी --- False
! --- False
5000 --- False
₹ --- True
उधार --- False
थे --- False
वो --- False
वापस --- False
देदो --- False


# Customizing tokenizer

In [35]:
from spacy.symbols import ORTH

In [36]:
nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
doc

gimme double cheese extra large healthy pizza

In [37]:
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [38]:
# Register a tokenizer exception: the exact string "gimme" is split into the two tokens "gim" and "me".
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

In [39]:
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Sentence Tokenization or Segmentation

In [40]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
doc

Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi

In [None]:
# The pipeline is still empty at this point, so nothing has set sentence
# boundaries and iterating doc.sents raises ValueError. The error is the point
# of this cell, so catch and display it instead of letting it abort "Run All".
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [41]:
nlp.pipeline

[]

In [42]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f2643ffbbc0>

In [43]:
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x7f2643ffbbc0>)]

In [44]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


# Pipelines Tutorial

In [45]:
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")
doc

Captain america ate 100$ of samosa. Then he said I can do this all day.

In [46]:
for token in doc:
  print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [47]:
nlp.pipe_names

[]

In [48]:
nlp = spacy.load(name = 'en_core_web_md')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [49]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f2643cf0b80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f2643cf0ac0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f2643ce44a0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f2643c21900>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f2643a8b2c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f2643ce46d0>)]

In [50]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
  print(token, " | ", spacy.explain(token.pos_), " | ", token.lemma_)

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  noun  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


# Named Entity Recognition

In [51]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
doc

Tesla Inc is going to acquire twitter for $45 billion

In [52]:
doc.ents

(Tesla Inc, $45 billion)

In [53]:
for ent in doc.ents:
  print(ent.text, "---", ent.label_)

Tesla Inc --- ORG
$45 billion --- MONEY


In [54]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [55]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")
doc

Michael Bloomberg founded Bloomberg in 1982

In [56]:
for ent in doc.ents:
  print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Michael Bloomberg  |  PERSON  |  People, including fictional
Bloomberg  |  ORG  |  Companies, agencies, institutions, etc.
1982  |  DATE  |  Absolute or relative dates or periods


In [57]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
for ent in doc.ents:
  print(ent.text, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


Setting custom entities

In [58]:
doc = nlp("Threads is going to acquire Twitter for $45 billion")
for ent in doc.ents:
  print(ent.text, " | ", ent.label_)

Twitter  |  ORG
$45 billion  |  MONEY


In [59]:
s = doc[0:1]
s

Threads

In [60]:
type(s)

spacy.tokens.span.Span

In [61]:
from spacy.tokens import Span
# Build a one-token span over doc[0:1] ("Threads") and label it as an organisation.
s1 = Span(doc, 0, 1, label="ORG")
# default="unmodified" keeps the entities already present on the doc
# (Twitter, $45 billion) and only adds the new span.
doc.set_ents([s1], default="unmodified")

In [62]:
for ent in doc.ents:
  print(ent.text, " | ", ent.label_)

Threads  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


# Stemming in NLTK

In [63]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [64]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

In [65]:
for word in words:
  print(word, "  |  ", stemmer.stem(word))

eating   |   eat
eats   |   eat
eat   |   eat
ate   |   ate
adjustable   |   adjust
rafting   |   raft
ability   |   abil
meeting   |   meet


# Lemmatization in Spacy

In [66]:
doc1 = nlp("Mando talked for 3 hours although talking isn't his thing")
doc1

Mando talked for 3 hours although talking isn't his thing

In [67]:
for token in doc1:
  print(token, " --- ", token.lemma_)

Mando  ---  Mando
talked  ---  talk
for  ---  for
3  ---  3
hours  ---  hour
although  ---  although
talking  ---  talk
is  ---  be
n't  ---  not
his  ---  his
thing  ---  thing


In [68]:
doc2 = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc2:
    print(token, " --- ", token.lemma_)

eating  ---  eating
eats  ---  eat
eat  ---  eat
ate  ---  eat
adjustable  ---  adjustable
rafting  ---  rafting
ability  ---  ability
meeting  ---  meet
better  ---  well


# Customizing lemmatizer

In [69]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [70]:
ar = nlp.get_pipe('attribute_ruler')

In [71]:
# Add an attribute_ruler rule: tokens whose exact text is "Bro" or "Brah" get the lemma "Brother".
ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"})

# Re-run the pipeline so the new rule is applied, then show each token with its lemma.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
  print(token.text, "  ---  ", token.lemma_)

Bro   ---   Brother
,   ---   ,
you   ---   you
wanna   ---   wanna
go   ---   go
?   ---   ?
Brah   ---   Brother
,   ---   ,
do   ---   do
n't   ---   not
say   ---   say
no   ---   no
!   ---   !
I   ---   I
am   ---   be
exhausted   ---   exhausted


In [72]:
doc[6], doc[6].lemma_

(Brah, 'Brother')

# Parts of Speech (POS) Tags

In [73]:
doc = nlp("Elon flew to mars yesterday. He carried tesla with him")
doc

Elon flew to mars yesterday. He carried tesla with him

In [74]:
for token in doc:
  print(token," | ", token.pos_, " | ", spacy.explain(token.pos_))

Elon  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
mars  |  PROPN  |  proper noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
carried  |  VERB  |  verb
tesla  |  NOUN  |  noun
with  |  ADP  |  adposition
him  |  PRON  |  pronoun


In [75]:
doc2 = nlp("Wow! Dr. Strange made 265 million $ on the very first day")

for token in doc2:
    print(token," | ", token.pos_, " | ", spacy.explain(token.pos_))

Wow  |  INTJ  |  interjection
!  |  PUNCT  |  punctuation
Dr.  |  PROPN  |  proper noun
Strange  |  PROPN  |  proper noun
made  |  VERB  |  verb
265  |  NUM  |  numeral
million  |  NUM  |  numeral
$  |  SYM  |  symbol
on  |  ADP  |  adposition
the  |  DET  |  determiner
very  |  ADV  |  adverb
first  |  ADJ  |  adjective
day  |  NOUN  |  noun


Tags

In [76]:
for token in doc2:
    print(token," | ", token.pos_, " | ", spacy.explain(token.pos_), " | ",
          token.tag_, " | ", spacy.explain(token.tag_))

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
265  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
$  |  SYM  |  symbol  |  $  |  symbol, currency
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [77]:
doc = nlp("He quits the job")
doc[1]

quits

In [78]:
print(doc[1].text, "  |  ", doc[1].tag_, "  |  ", spacy.explain(doc[1].tag_))

quits   |   VBZ   |   verb, 3rd person singular present


In [79]:
doc = nlp("he quit the job")
print(doc[1].text, "  |  ", doc[1].tag_, "  |  ", spacy.explain(doc[1].tag_))

quit   |   VBD   |   verb, past tense


# Removing all SPACE, PUNCT and X token from text

In [80]:
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31,
                  2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints
and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft.
“As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse
and growing markets, with a common underlying technology stack and an operating model that reinforces a
common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments,
increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive
vice president and chief financial officer of Microsoft."""

In [81]:
doc = nlp(earnings_text)
doc

Microsoft Corp. today announced the following results for the quarter ended December 31,
                  2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints
and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft.
“As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse
and growing markets, with a common underlying technology stack and an operating model that reinforces a
common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments,
increased

In [82]:
for token in doc:
  print(token, "  |  ", token.pos_, "  |  ", spacy.explain(token.pos_))

Microsoft   |   PROPN   |   proper noun
Corp.   |   PROPN   |   proper noun
today   |   NOUN   |   noun
announced   |   VERB   |   verb
the   |   DET   |   determiner
following   |   VERB   |   verb
results   |   NOUN   |   noun
for   |   ADP   |   adposition
the   |   DET   |   determiner
quarter   |   NOUN   |   noun
ended   |   VERB   |   verb
December   |   PROPN   |   proper noun
31   |   NUM   |   numeral
,   |   PUNCT   |   punctuation

                     |   SPACE   |   space
2021   |   NUM   |   numeral
,   |   PUNCT   |   punctuation
as   |   ADP   |   adposition
compared   |   VERB   |   verb
to   |   ADP   |   adposition
the   |   DET   |   determiner
corresponding   |   ADJ   |   adjective
period   |   NOUN   |   noun
of   |   ADP   |   adposition
last   |   ADJ   |   adjective
fiscal   |   ADJ   |   adjective
year   |   NOUN   |   noun
:   |   PUNCT   |   punctuation


   |   SPACE   |   space
·   |   PUNCT   |   punctuation
           |   SPACE   |   space
Revenue   | 

In [83]:
# Keep every token whose coarse POS tag is not SPACE, PUNCT or X.
filtered_tokens = [
    token
    for token in doc
    if token.pos_ not in ("SPACE", "PUNCT", "X")
]

In [84]:
filtered_tokens[:10]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter]

In [85]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 12,
 92: 46,
 100: 23,
 90: 9,
 85: 17,
 93: 16,
 97: 27,
 103: 17,
 84: 22,
 87: 6,
 99: 5,
 89: 12,
 86: 3,
 94: 3,
 95: 2}

In [86]:
doc.vocab[96].text

'PROPN'

In [87]:
spacy.explain(doc.vocab[96].text)

'proper noun'

In [88]:
print("Total count:")
for k,v in count.items():
    print(doc.vocab[k].text, ":", v, "|", spacy.explain(doc.vocab[k].text))

Total count:
PROPN : 12 | proper noun
NOUN : 46 | noun
VERB : 23 | verb
DET : 9 | determiner
ADP : 17 | adposition
NUM : 16 | numeral
PUNCT : 27 | punctuation
SPACE : 17 | space
ADJ : 22 | adjective
AUX : 6 | auxiliary
SYM : 5 | symbol
CCONJ : 12 | coordinating conjunction
ADV : 3 | adverb
PART : 3 | particle
PRON : 2 | pronoun


# Bag Of Words

In [89]:
l = ["How to Make a girl Happy? It's not at all diff...",
"We made it! Eta at taunton is 12:30 as planned...",
"Beautiful truth : Expression of the face could...",
"Oh:) as usual vijay film or its different?"]

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()

l_cv = v.fit_transform(l)

In [91]:
l_cv

<4x33 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [92]:
l_array = l_cv.toarray()
l_array

array([[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]])

In [93]:
l_array.shape

(4, 33)

In [94]:
v.get_feature_names_out()

array(['12', '30', 'all', 'as', 'at', 'beautiful', 'could', 'diff',
       'different', 'eta', 'expression', 'face', 'film', 'girl', 'happy',
       'how', 'is', 'it', 'its', 'made', 'make', 'not', 'of', 'oh', 'or',
       'planned', 'taunton', 'the', 'to', 'truth', 'usual', 'vijay', 'we'],
      dtype=object)

In [95]:
#for each word, index number is created
v.vocabulary_

{'how': 15,
 'to': 28,
 'make': 20,
 'girl': 13,
 'happy': 14,
 'it': 17,
 'not': 21,
 'at': 4,
 'all': 2,
 'diff': 7,
 'we': 32,
 'made': 19,
 'eta': 9,
 'taunton': 26,
 'is': 16,
 '12': 0,
 '30': 1,
 'as': 3,
 'planned': 25,
 'beautiful': 5,
 'truth': 29,
 'expression': 10,
 'of': 22,
 'the': 27,
 'face': 11,
 'could': 6,
 'oh': 23,
 'usual': 30,
 'vijay': 31,
 'film': 12,
 'or': 24,
 'its': 18,
 'different': 8}

In [96]:
v.get_feature_names_out()[31]

'vijay'

# Stop words

In [97]:
from spacy.lang.en.stop_words import STOP_WORDS

In [98]:
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [99]:
len(STOP_WORDS)

326

In [100]:
nlp = spacy.load("en_core_web_md")
doc = nlp("We just opened our wings, the flying part is coming soon")
doc

We just opened our wings, the flying part is coming soon

In [101]:
for token in doc:
  if token.is_stop:
    print(token)

We
just
our
the
part
is


In [102]:
def preprocess(text):
  """Return `text` with stop words removed.

  The text is run through the module-level `nlp` pipeline; the `.text` of every
  token that is not flagged `is_stop` is kept and the survivors are joined
  with single spaces.
  """
  kept = []
  for token in nlp(text):
    if token.is_stop:
      continue
    kept.append(token.text)
  return " ".join(kept)

In [103]:
preprocess("Musk wants time to prepare for a trial over his")

'Musk wants time prepare trial'

In [104]:
preprocess("The other is not other but your divine brother")

'divine brother'

For sentiment analysis, language translation, or chatbot problems, removing stop words can destroy the meaning of the text. For example, "not" is a stop word, so removing it flips the sentiment of the sentence below:

In [105]:
preprocess("this is not a good movie")

'good movie'

# Bag of N Grams

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()

In [107]:
#for each word index number is generated
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [108]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [109]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [110]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [111]:
nlp = spacy.load("en_core_web_md")

In [112]:
def preprocess(text):
  """Remove stop words and punctuation from `text` and lemmatize what is left.

  The text is run through the module-level `nlp` pipeline; tokens flagged
  `is_stop` or `is_punct` are skipped and the `lemma_` of each remaining token
  is joined with single spaces.
  """
  lemmas = [
    token.lemma_
    for token in nlp(text)
    if not (token.is_stop or token.is_punct)
  ]
  return " ".join(lemmas)

In [113]:
preprocess("Thor ate pizza")

'Thor eat pizza'

In [114]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [115]:
corpus_processed = [preprocess(text) for text in corpus]
corpus_processed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [116]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [117]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

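Out-of-vocabulary (OOV) terms

E.g. "Hulk" was not in the corpus the vectorizer was fitted on, so it (and any n-gram containing it) is ignored by `transform`; only `eat`, `eat pizza` and `pizza` are counted.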

In [118]:
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

# Text Representation: TF-IDF

What is TF-IDF?

TF stands for Term Frequency: the ratio of the number of times a particular word
appears in a document to the total number of words in that document.

Term Frequency(TF) = [number of times word appeared /
total no of words in a document]

Term Frequency values range between 0 and 1.

The more often a word occurs in a document,
the closer its value is to 1.

IDF stands for Inverse Document Frequency: the log of the ratio of the total
number of documents/datapoints in the whole dataset to the number of documents that
contain the particular word.

Inverse Document Frequency(IDF) = [log(Total number of documents / number of documents that contains the word)]

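In IDF, if a word occurs in many documents and is common across the whole corpus, the ratio approaches 1, so its IDF value (the log of that ratio) approaches 0. Note that scikit-learn's `TfidfVectorizer` uses a smoothed variant by default, IDF = ln((1 + N) / (1 + df)) + 1, which is why the IDF scores printed below never drop below 1.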

Finally:

TF-IDF = Term Frequency(TF) * Inverse Document Frequency(IDF)

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [120]:
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [121]:
v = TfidfVectorizer()
# Learn the vocabulary and IDF weights and build the TF-IDF matrix in a single
# pass, instead of analysing the same corpus once in fit() and again in transform().
transform_output = v.fit_transform(corpus)

In [122]:
v.vocabulary_

{'thor': 25,
 'eating': 10,
 'pizza': 22,
 'loki': 17,
 'is': 16,
 'ironman': 15,
 'ate': 7,
 'already': 0,
 'apple': 5,
 'announcing': 4,
 'new': 20,
 'iphone': 14,
 'tomorrow': 26,
 'tesla': 24,
 'model': 19,
 'google': 12,
 'pixel': 21,
 'microsoft': 18,
 'surface': 23,
 'amazon': 2,
 'eco': 11,
 'dot': 9,
 'am': 1,
 'biryani': 8,
 'and': 3,
 'you': 27,
 'are': 6,
 'grapes': 13}

In [123]:
v.get_feature_names_out()

array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'grapes',
       'iphone', 'ironman', 'is', 'loki', 'microsoft', 'model', 'new',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [124]:
all_feature_names = v.get_feature_names_out()

# vocabulary_ maps each term to its column index; idf_ is indexed by that same column.
for word in all_feature_names:
    idf_score = v.idf_[v.vocabulary_[word]]
    print(f"{word} : {idf_score:.2f}")

already : 2.39
am : 2.39
amazon : 2.39
and : 2.39
announcing : 1.29
apple : 2.39
are : 2.39
ate : 2.39
biryani : 2.39
dot : 2.39
eating : 1.98
eco : 2.39
google : 2.39
grapes : 2.39
iphone : 2.39
ironman : 2.39
is : 1.13
loki : 2.39
microsoft : 2.39
model : 2.39
new : 1.29
pixel : 2.39
pizza : 2.39
surface : 2.39
tesla : 2.39
thor : 2.39
tomorrow : 1.29
you : 2.39


In [125]:
transform_output.toarray()

array([[0.24266547, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24266547, 0.        , 0.        ,
        0.40286636, 0.        , 0.        , 0.        , 0.        ,
        0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
        0.        , 0.        , 0.72799642, 0.        , 0.        ,
        0.24266547, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.5680354 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5680354 ,
        0.        , 0.26982671, 0.        , 0.        , 0.        ,
        0.30652086, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30652086, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.2698

In [126]:
transform_output.toarray()[0]

array([0.24266547, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24266547, 0.        , 0.        ,
       0.40286636, 0.        , 0.        , 0.        , 0.        ,
       0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
       0.        , 0.        , 0.72799642, 0.        , 0.        ,
       0.24266547, 0.        , 0.        ])

# Word Embeddings

In [128]:
nlp = spacy.load("en_core_web_md")

In [130]:
# "prashanth" is an unfamiliar word here, i.e. out of vocabulary (OOV)
doc = nlp("dog cat banana prashanth")

# For each token, show whether it has a word vector and whether it is OOV.
for token in doc:
    print(token.text, " | ", "Vector:", token.has_vector, " | ", "OOV:", token.is_oov)

dog  |  Vector: True  |  OOV: False
cat  |  Vector: True  |  OOV: False
banana  |  Vector: True  |  OOV: False
prashanth  |  Vector: False  |  OOV: True


In [131]:
#Each word vector has dimension 300
doc[0].vector.shape

(300,)

In [132]:
doc[1].vector.shape #for cat

(300,)

In [133]:
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [134]:
#Comparing the words similarity with the base token
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451533308853552
tiger <-> bread: 0.04764611675903374
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.6150360888607199


In [135]:
def print_similarity(base_word, words_to_compare):
    """Print the similarity of each token in `words_to_compare` to `base_word`.

    Both strings are run through the module-level `nlp` pipeline. One line is
    printed per token of `words_to_compare`, in the form
    "<token> <-> <base text>:  <similarity>".
    """
    base_doc = nlp(base_word)
    candidates = nlp(words_to_compare)
    base_text = base_doc.text
    for candidate in candidates:
        score = candidate.similarity(base_doc)
        print(f"{candidate.text} <-> {base_text}: ", score)

In [136]:
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone:  0.3634123187833088
samsung <-> iphone:  0.6683552428198852
iphone <-> iphone:  1.0
dog <-> iphone:  0.062353975727114645
kitten <-> iphone:  0.09053956522798948


In [137]:
# Look up the word vectors of the four words directly from the pipeline's vocabulary.
king = nlp.vocab["king"].vector
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector
queen = nlp.vocab["queen"].vector

# Vector arithmetic for the classic analogy king - man + woman; compared with `queen` in the next cell.
result = king - man + woman

In [138]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)

**Thank you**