In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
introduction_text = "This tutorial is about Natural Language Processing in Spacy."

In [4]:
introduction_doc = nlp(introduction_text)

# Extract tokens for the given doc

In [6]:
print([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


In [7]:
about_text = (
    "Hello all, I am Dr. Chetana. Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

In [9]:
about_doc = nlp(about_text)
about_doc

Hello all, I am Dr. Chetana. Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing.

In [10]:
sentences = list(about_doc.sents)
sentences

[Hello all, I am Dr. Chetana.,
 Gus Proto is a Python developer currently working for a London-based Fintech company.,
 He is interested in learning Natural Language Processing.]

In [11]:
len(sentences)

3

In [12]:
for sentence in sentences:
    print(sentence)

Hello all, I am Dr. Chetana.
Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [13]:
for token in about_doc:
    print(token, token.idx)

Hello 0
all 6
, 9
I 11
am 13
Dr. 16
Chetana 20
. 27
Gus 29
Proto 33
is 39
a 42
Python 44
developer 51
currently 61
working 71
for 79
a 83
London 85
- 91
based 92
Fintech 98
company 106
. 113
He 115
is 118
interested 121
in 132
learning 135
Natural 144
Language 152
Processing 161
. 171


In [14]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [16]:
len(spacy_stopwords), spacy_stopwords

(326,
 {"'d",
  "'ll",
  "'m",
  "'re",
  "'s",
  "'ve",
  'a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amount',
  'an',
  'and',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'are',
  'around',
  'as',
  'at',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  'both',
  'bottom',
  'but',
  'by',
  'ca',
  'call',
  'can',
  'cannot',
  'could',
  'did',
  'do',
  'does',
  'doing',
  'done',
  'down',
  'due',
  'during',
  'each',
  'eight',
  'either',
  'eleven',
  'else',
  'elsewhere',
  'empty',
  'enough',
  'even',
  'ever',
  'every',
  'everyone',
  'everything',
  'everywhere',
  'except',
  'few',
  'fifteen',

In [17]:
for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

n‘t
upon
latter
top
nine
never
somehow
around
very
at


In [18]:
for token in about_doc:
    if not token.is_stop:
        print(token)

Hello
,
Dr.
Chetana
.
Gus
Proto
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


# Lemmatization

In [19]:
conference_help_text = (
    "Gus is helping organize a developer"
    "conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)

In [20]:
conference_help_doc = nlp(conference_help_text)
conference_help_doc

Gus is helping organize a developerconference on Applications of Natural Language Processing. He keeps organizing local Python meetups and several internal talks at his workplace.

In [21]:
for token in conference_help_doc:
    print(token, token.lemma_)

Gus Gus
is be
helping helping
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He he
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his his
workplace workplace
. .


In [24]:
import pandas as pd

# Part of Speech Tagging


df = pd.DataFrame(
    [[token, token.tag_, token.pos_, spacy.explain(token.tag_)] for token in about_doc],
    columns=["token", "TAG", "Part of speech", "explanation"],
)
df

Unnamed: 0,token,TAG,Part of speech,explanation
0,Hello,UH,INTJ,interjection
1,all,DT,PRON,determiner
2,",",",",PUNCT,"punctuation mark, comma"
3,I,PRP,PRON,"pronoun, personal"
4,am,VBP,AUX,"verb, non-3rd person singular present"
5,Dr.,NNP,PROPN,"noun, proper singular"
6,Chetana,NNP,PROPN,"noun, proper singular"
7,.,.,PUNCT,"punctuation mark, sentence closer"
8,Gus,NNP,PROPN,"noun, proper singular"
9,Proto,NNP,PROPN,"noun, proper singular"


In [25]:
nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)

In [26]:
nouns

[developer, company]

In [27]:
adjectives

[interested]

In [28]:
from spacy import displacy

In [29]:
about_interest_text = "He is interested in learning" " Natural Language Processing."

In [30]:
about_interest_doc = nlp(about_interest_text)

In [31]:
displacy.serve(about_interest_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [10/Jun/2024 20:07:50] "GET / HTTP/1.1" 200 6751
127.0.0.1 - - [10/Jun/2024 20:07:51] "GET /favicon.ico HTTP/1.1" 200 6751
