In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline package by name. `nlp` is the callable
# used in the cells below to turn raw text into a parsed document.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Two short sentences to tag. Note that `doc` is rebound to other texts in
# later cells, so the cells are meant to be run top to bottom.
doc = nlp("Elon flew to Mars yesterday. He carried biriyani masala with him.")

In [8]:
# One line per token: its text, then the coarse part-of-speech tag followed by
# the human-readable description of that tag.
for token in doc:
    pos = token.pos_
    print(f"{token.text}  |  {pos} {spacy.explain(pos)}")

Elon  |  PROPN proper noun
flew  |  VERB verb
to  |  ADP adposition
Mars  |  PROPN proper noun
yesterday  |  NOUN noun
.  |  PUNCT punctuation
He  |  PRON pronoun
carried  |  VERB verb
biriyani  |  PROPN proper noun
masala  |  NOUN noun
with  |  ADP adposition
him  |  PRON pronoun
.  |  PUNCT punctuation


In [9]:
# Show the loaded pipeline as a list of (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2930c1af0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2930c12b0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x292faec00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x29322c2d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x29321ff10>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x292faee30>)]

In [10]:
# Same pipeline, but only the component names.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [11]:
# A second sample with an interjection, an abbreviated title, numbers and a
# currency symbol, to see how each of them gets tagged.
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day")

In [12]:
# Print every token next to its coarse POS tag and that tag's description.
for token in doc:
    described_pos = f"{token.pos_} {spacy.explain(token.pos_)}"
    print(token, described_pos, sep="  |  ")

Wow  |  INTJ interjection
!  |  PUNCT punctuation
Dr.  |  PROPN proper noun
Strange  |  PROPN proper noun
made  |  VERB verb
265  |  NUM numeral
million  |  NUM numeral
$  |  NUM numeral
on  |  ADP adposition
the  |  DET determiner
very  |  ADV adverb
first  |  ADJ adjective
day  |  NOUN noun


In [15]:
# "made" is a VERB, but pos_ alone does not tell us that it is in the past tense.
# The fine-grained tag (tag_) does: "made" comes out as VBD, "verb, past tense".
for token in doc:
    columns = (
        token,
        token.pos_,
        spacy.explain(token.pos_),
        token.tag_,
        spacy.explain(token.tag_),
    )
    print(*columns, sep="  |  ")


Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
265  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
$  |  NUM  |  numeral  |  CD  |  cardinal number
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [18]:
# Present-tense sentence: look at how much detail the tagger gives for "quits".
doc = nlp("He quits the job")

for token in doc:
    pos, tag = token.pos_, token.tag_
    print(f"{token.text}  |  {pos}  |  {spacy.explain(pos)}  |  {tag}  |  {spacy.explain(tag)}")

He  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
quits  |  VERB  |  verb  |  VBZ  |  verb, 3rd person singular present
the  |  DET  |  determiner  |  DT  |  determiner
job  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [19]:
# Same sentence with "quits" changed to "quit": compare the tag reported for
# the verb with the one from the previous cell.
doc = nlp("He quit the job")

for token in doc:
    fields = [
        token.text,
        token.pos_,
        spacy.explain(token.pos_),
        token.tag_,
        spacy.explain(token.tag_),
    ]
    print("  |  ".join(map(str, fields)))

He  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
quit  |  VERB  |  verb  |  VBD  |  verb, past tense
the  |  DET  |  determiner  |  DT  |  determiner
job  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [20]:
# spaCy infers the tense from context: "quits" was tagged VBZ (3rd person singular present), while "quit" after "He" was tagged VBD (past tense).

In [21]:
# Fun example: an excerpt from a Microsoft earnings release for the quarter
# ended December 31, 2021. The text deliberately keeps its bullet characters,
# runs of spaces, line breaks and curly quotes, which the later cells filter out.
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

# Parse the whole excerpt; this rebinds `doc`, which the following cells use.
doc = nlp(earnings_text)

In [25]:
# Remove tokens that carry no content from the Microsoft earnings report,
# based on their coarse POS tag:
#   SPACE - whitespace runs and line breaks
#   PUNCT - punctuation
#   X     - "other" (tokens that fit no regular category)
# A comprehension keeps the filter in a single expression and, unlike the
# append loop it replaces, does not rebind the notebook-level `token` name.
filtered_tokens = [
    token
    for token in doc
    if token.pos_ not in ('SPACE', 'X', 'PUNCT')
]

In [23]:
# pos_ is an ordinary Python str (see the output), which is why the token
# filter can compare it directly against string literals such as 'PUNCT'.
type(doc[1].pos_)

str

In [27]:
# Peek at the first 20 kept tokens instead of dumping the whole list.
print(filtered_tokens[:20])

[Microsoft, Corp., today, announced, the, following, results, for, the, quarter, ended, December, 31, 2021, as, compared, to, the, corresponding, period]


In [28]:
# How many nouns, verbs, etc. does the text contain?
# count_by tallies tokens by the given attribute (here POS). The resulting dict
# is keyed by integer IDs rather than readable labels, as the output of the
# next cell shows.
count = doc.count_by(spacy.attrs.POS)

In [29]:
# Raw result: integer POS id -> number of tokens with that POS.
count

{96: 13,
 92: 46,
 100: 24,
 90: 9,
 85: 16,
 93: 16,
 97: 27,
 98: 1,
 84: 20,
 103: 10,
 87: 6,
 99: 5,
 89: 12,
 86: 3,
 94: 3,
 95: 2}

In [30]:
# Look up one of the integer keys from `count` in the vocab to recover its
# readable label.
# NOTE(review): 96 is copied by hand from the `count` output above; presumably
# this id is stable across spaCy versions, but confirm before relying on it.
doc.vocab[96].text

'PROPN'

In [31]:
# Translate every integer POS id back to its label through the vocab and print
# it next to the number of tokens carrying that POS.
for pos_id, occurrences in count.items():
    print(doc.vocab[pos_id].text, occurrences, sep="  |  ")

PROPN  |  13
NOUN  |  46
VERB  |  24
DET  |  9
ADP  |  16
NUM  |  16
PUNCT  |  27
SCONJ  |  1
ADJ  |  20
SPACE  |  10
AUX  |  6
SYM  |  5
CCONJ  |  12
ADV  |  3
PART  |  3
PRON  |  2
