# **Part-of-Speech Tagging with spaCy**

In [2]:
# spaCy supplies the pipeline used for the tagging examples in the cells below.
import spacy
# Download the small English pipeline. Per the installer output, the runtime may
# need to be restarted before the newly installed package can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
# Load the small English pipeline downloaded above; `nlp` is reused by the spaCy cells below.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Tag a short two-sentence example and list every token with its coarse POS tag.
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him")

for token in doc:
  coarse = token.pos_
  # sep reproduces the "  |  " column divider between the printed fields
  print(token, coarse, spacy.explain(coarse), sep="  |  ")

Elon  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
mars  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
carried  |  VERB  |  verb
biryani  |  ADJ  |  adjective
masala  |  NOUN  |  noun
with  |  ADP  |  adposition
him  |  PRON  |  pronoun


In [17]:
# Same listing as before, extended with the fine-grained tag (token.tag_) and its explanation.
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day")

for token in doc:
  fields = (
    token,
    token.pos_, spacy.explain(token.pos_),
    token.tag_, spacy.explain(token.tag_),
  )
  # sep reproduces the "  |  " column divider between the printed fields
  print(*fields, sep="  |  ")

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
265  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
$  |  NUM  |  numeral  |  CD  |  cardinal number
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [23]:
# Inspect only the second token (index 1) to see how the verb form is tagged.
doc = nlp("He quits his job")
verb = doc[1]
print(
  verb.text,
  verb.pos_, spacy.explain(verb.pos_),
  verb.tag_, spacy.explain(verb.tag_),
  sep="  |  ",
)

quits  |  VERB  |  verb  |  VBZ  |  verb, 3rd person singular present


In [24]:
# Same check as the previous cell with "quit" instead of "quits", to compare the fine-grained tag.
doc = nlp("He quit his job")
verb = doc[1]
print(
  verb.text,
  verb.pos_, spacy.explain(verb.pos_),
  verb.tag_, spacy.explain(verb.tag_),
  sep="  |  ",
)

quit  |  VERB  |  verb  |  VBD  |  verb, past tense


# **Remove all SPACE, PUNCT, and X tokens from the text**

In [26]:
# Multi-paragraph sample text (an earnings announcement) used as input for the
# filtering and counting cells below. It mixes bullet lines, currency amounts,
# percentages, and quoted speech.
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology etc. is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

# Run the pipeline over the earnings text and list every token with its coarse
# and fine-grained tags. `doc` is reused by the filtering and counting cells below.
doc = nlp(earnings_text)
for token in doc:
  coarse, fine = token.pos_, token.tag_
  print(token, coarse, spacy.explain(coarse), fine, spacy.explain(fine), sep="  |  ")

Microsoft  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Corp.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
today  |  NOUN  |  noun  |  NN  |  noun, singular or mass
announced  |  VERB  |  verb  |  VBD  |  verb, past tense
the  |  DET  |  determiner  |  DT  |  determiner
following  |  VERB  |  verb  |  VBG  |  verb, gerund or present participle
results  |  NOUN  |  noun  |  NNS  |  noun, plural
for  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
quarter  |  NOUN  |  noun  |  NN  |  noun, singular or mass
ended  |  VERB  |  verb  |  VBD  |  verb, past tense
December  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
31  |  NUM  |  numeral  |  CD  |  cardinal number
,  |  PUNCT  |  punctuation  |  ,  |  punctuation mark, comma
2021  |  NUM  |  numeral  |  CD  |  cardinal number
,  |  PUNCT  |  punctuation  |  ,  |  punctuation mark, comma
as  |  SCONJ  |  subordinating conjun

In [27]:
# Keep every token whose coarse POS is not whitespace, punctuation, or X.
EXCLUDED_POS = ("SPACE", "PUNCT", "X")
filtered_tokens = [token for token in doc if token.pos_ not in EXCLUDED_POS]

In [30]:
# Preview only the first 15 kept tokens instead of displaying the whole list.
filtered_tokens[:15]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2021,
 as]

# **Counting the number of tokens per POS tag in the document**

In [33]:
# Tally tokens by coarse POS. The result is keyed by integer attribute IDs
# (not tag names), so the IDs are resolved to strings in the cells below.
count = doc.count_by(spacy.attrs.POS)
count

{96: 13,
 92: 46,
 100: 24,
 90: 9,
 85: 16,
 93: 16,
 97: 27,
 98: 1,
 84: 20,
 103: 10,
 87: 6,
 99: 5,
 89: 12,
 101: 2,
 86: 3,
 94: 3,
 95: 2}

In [36]:
# Resolve POS ID 103 from the count dict back to its string name via the vocab.
doc.vocab[103].text

'SPACE'

In [39]:
# Resolve POS ID 93 from the count dict back to its string name via the vocab.
doc.vocab[93].text

'NUM'

In [35]:
# Resolve POS ID 96 from the count dict back to its string name via the vocab.
doc.vocab[96].text

'PROPN'

# **Another way to show the counts: map each POS ID to its name**

In [44]:
# View of the (POS ID, count) pairs; the loop in the next cell iterates over these.
count.items()

dict_items([(96, 13), (92, 46), (100, 24), (90, 9), (85, 16), (93, 16), (97, 27), (98, 1), (84, 20), (103, 10), (87, 6), (99, 5), (89, 12), (101, 2), (86, 3), (94, 3), (95, 2)])

In [43]:
# Print each POS name next to its count, resolving the integer ID through the vocab.
for pos_id, n_tokens in count.items():
  pos_name = doc.vocab[pos_id].text
  print(f"{pos_name} | {n_tokens}")

PROPN | 13
NOUN | 46
VERB | 24
DET | 9
ADP | 16
NUM | 16
PUNCT | 27
SCONJ | 1
ADJ | 20
SPACE | 10
AUX | 6
SYM | 5
CCONJ | 12
X | 2
ADV | 3
PART | 3
PRON | 2


# **Part-of-Speech Tagging with NLTK**

In [46]:
import nltk
# Tokenizer data (unpacked under tokenizers/ per the download log);
# presumably what nltk.word_tokenize below relies on — confirm.
nltk.download('punkt')
# Tagger data (unpacked under taggers/ per the download log);
# presumably what nltk.pos_tag below relies on — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [47]:
# Sample sentence for the NLTK tagging demo.
text = "the little yellow dog barked at the cat"

In [48]:
# Tokenize the sample sentence into individual words; `words` is the input to the tagger in the next cell.
words = nltk.word_tokenize(text)

In [49]:
# Perform POS tagging on the tokenized words; the result is iterated as (word, tag) pairs below.
pos_tags = nltk.pos_tag(words)

In [50]:
# One "word: tag" line per tagged token.
for word, tag in pos_tags:
  print(word, tag, sep=": ")

the: DT
little: JJ
yellow: JJ
dog: NN
barked: VBD
at: IN
the: DT
cat: NN


# **Loading a .txt file into a dataset**

In [58]:
import pandas as pd

# Read the whole lyrics file into a single string. The encoding is passed
# explicitly so the result does not depend on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 — confirm against the actual file.
# NOTE(review): the /content/ path only exists on the original Colab runtime;
# adjust it when running elsewhere.
with open("/content/lyrics-Alanwalker(8).txt", "r", encoding="utf-8") as file:
  data = file.read()

# Single-row frame holding the complete lyrics text in one "lyrics" cell.
# Note: this rebinds `text`, which earlier held the NLTK sample sentence.
text = pd.DataFrame({"lyrics" : [data]})
text.head()

Unnamed: 0,lyrics
0,-Faded-\nYou were the shadow to my light\nDid ...


In [59]:
# Repeats the import and downloads from the earlier NLTK setup cell; the log
# below reports both packages as already up-to-date.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [67]:
# Tokenize the full lyrics string, tag every token, and print one "word: tag"
# line per token. This prints the entire corpus; the saved output shows the
# notebook front end truncated it to the last 5000 lines.
tokens = nltk.word_tokenize(data)
pos_tags = nltk.pos_tag(tokens)

for word, tag in pos_tags:
  print(word, tag, sep=": ")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
,: ,
ooh: FW
Follow: NNP
everywhere: RB
I: PRP
go: VBP
Top: RB
over: IN
the: DT
mountains: NNS
or: CC
valley: NN
low: JJ
Give: NNP
you: PRP
everything: NN
you: PRP
've: VBP
been: VBN
dreaming: VBG
of: IN
Just: NNP
let: VB
me: PRP
in: IN
,: ,
ooh: NN
Then: RB
she: PRP
ran: VBD
faster: JJR
than: IN
Start: NNP
screaming: NN
,: ,
``: ``
Is: VBZ
there: RB
someone: NN
out: IN
there: RB
?: .
'': ''
Please: NNP
help: VB
me: PRP
Just: NNP
let: VB
me: PRP
in: IN
,: ,
ooh: IN
-Alone: JJ
pt2-: NN
We: PRP
were: VBD
young: JJ
Posters: NNS
on: IN
the: DT
wall: NN
Praying: VBG
we: PRP
're: VBP
the: DT
ones: NNS
That: IN
the: DT
teacher: NN
would: MD
n't: RB
call: VB
We: PRP
would: MD
stare: VB
at: IN
each: DT
other: JJ
'Cause: IN
we: PRP
were: VBD
always: RB
in: IN
trouble: NN
And: CC
all: PDT
the: DT
cool: JJ
kids: NNS
Did: VBP
their: PRP$
own: JJ
thing: NN
I: PRP
was: VBD
on: IN
the: DT
outside: JJ
Always: NNP
looking: VBG
in: IN
Yeah: