# Preprocessing using spacy

In [1]:
import spacy

In [2]:
nlp=spacy.load('en_core_web_sm')

In [3]:
doc1=nlp('India, a South Asian nation, is the seventh-largest country by area, the second-most populous country with over 1.38 billion people, and the most populous democracy in the world. India boasts of an immensely rich cultural heritage, including numerous languages, traditions, and people. The country holds its uniqueness in its diversity, and hence has adapted itself to international changes with poise and comfort. While the economy has welcomed international companies to invest in it with open arms since liberalisation in the 1990s, Indians have been prudent and proactive in adopting global approaches and skills. Indian villagers have proudly taken up farming, advanced agriculture and unique handicrafts as their profession on one hand, while the modern industries and professional services sectors are coming up in a big way on the other.Thus, the country is attracting many global majors for strategic investments owing to the presence of a vast range of industries, investment avenues and a supportive Government. A huge population, mostly comprising the youth, is a strong driver for demand and an ample source of manpower.')

In [4]:
type(doc1)

spacy.tokens.doc.Doc

## Tokenization

In [5]:
for token in doc1:
    print(token)

India
,
a
South
Asian
nation
,
is
the
seventh
-
largest
country
by
area
,
the
second
-
most
populous
country
with
over
1.38
billion
people
,
and
the
most
populous
democracy
in
the
world
.
India
boasts
of
an
immensely
rich
cultural
heritage
,
including
numerous
languages
,
traditions
,
and
people
.
The
country
holds
its
uniqueness
in
its
diversity
,
and
hence
has
adapted
itself
to
international
changes
with
poise
and
comfort
.
While
the
economy
has
welcomed
international
companies
to
invest
in
it
with
open
arms
since
liberalisation
in
the
1990s
,
Indians
have
been
prudent
and
proactive
in
adopting
global
approaches
and
skills
.
Indian
villagers
have
proudly
taken
up
farming
,
advanced
agriculture
and
unique
handicrafts
as
their
profession
on
one
hand
,
while
the
modern
industries
and
professional
services
sectors
are
coming
up
in
a
big
way
on
the
other
.
Thus
,
the
country
is
attracting
many
global
majors
for
strategic
investments
owing
to
the
presence
of
a
vast
range
of
industries
,
in

In [6]:
len(doc1)

201

In [7]:
# Count the number of tokens

t_count=0
for token in doc1:
    t_count=t_count+1
    print(token)
print('The no of tokens:',t_count)

India
,
a
South
Asian
nation
,
is
the
seventh
-
largest
country
by
area
,
the
second
-
most
populous
country
with
over
1.38
billion
people
,
and
the
most
populous
democracy
in
the
world
.
India
boasts
of
an
immensely
rich
cultural
heritage
,
including
numerous
languages
,
traditions
,
and
people
.
The
country
holds
its
uniqueness
in
its
diversity
,
and
hence
has
adapted
itself
to
international
changes
with
poise
and
comfort
.
While
the
economy
has
welcomed
international
companies
to
invest
in
it
with
open
arms
since
liberalisation
in
the
1990s
,
Indians
have
been
prudent
and
proactive
in
adopting
global
approaches
and
skills
.
Indian
villagers
have
proudly
taken
up
farming
,
advanced
agriculture
and
unique
handicrafts
as
their
profession
on
one
hand
,
while
the
modern
industries
and
professional
services
sectors
are
coming
up
in
a
big
way
on
the
other
.
Thus
,
the
country
is
attracting
many
global
majors
for
strategic
investments
owing
to
the
presence
of
a
vast
range
of
industries
,
in

## Stop words

In [8]:
from spacy.lang.en.stop_words import STOP_WORDS

In [9]:
print(STOP_WORDS)

{'when', 'who', 'seemed', 'or', 'first', 'perhaps', 'we', 'that', 'above', 'see', 'go', "'s", 'others', 'there', 'behind', 'done', 'without', 'does', 'mine', 'but', 'be', 'as', 'either', 'eight', 'serious', 'thus', 'keep', 'they', 'hundred', 'in', 'nor', 'whereafter', 'due', '’m', 'throughout', 'regarding', 'call', 'whenever', 'anyway', 'whose', 'take', 'our', 'became', 'well', 'yours', 'at', 'through', 'amongst', 'could', '‘ll', 'ever', 'few', 'thru', "n't", 'nine', 'would', 'why', 'itself', 'she', 'you', 'this', '’d', 'under', 'further', 'other', 'neither', "'d", 'nowhere', 'latterly', 'side', 'thereafter', 'which', 'whom', 'himself', 'ten', 'nevertheless', 'both', 'since', 'twenty', 'herself', 'across', 'becoming', 'toward', 'whereas', 'of', 'until', 'doing', 'therefore', 'for', 'also', 'else', 'everything', 'five', 'my', 'might', 'up', 're', 'do', 'somewhere', 'an', 'anyhow', 'how', 'her', 'own', 'whither', 'everywhere', 'while', 'its', "'ll", 'about', 'towards', 'every', 'nothing'

In [10]:
len(STOP_WORDS)

326

## Is it a stop word

In [11]:
for token in doc1:
    print(token,'==>',token.is_stop)

India ==> False
, ==> False
a ==> True
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> True
the ==> True
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> True
area ==> False
, ==> False
the ==> True
second ==> False
- ==> False
most ==> True
populous ==> False
country ==> False
with ==> True
over ==> True
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> True
the ==> True
most ==> True
populous ==> False
democracy ==> False
in ==> True
the ==> True
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> True
an ==> True
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> True
people ==> False
. ==> False
The ==> True
country ==> False
holds ==> False
its ==> True
uniqueness ==> False
in ==> True
its ==> True
diversity ==> False
, ==> False
and ==> True
hence ==> True
h

In [12]:
# Count the no of non-stop words
s_count=0
for token in doc1:
    print(token,'==>',token.is_stop)
print('\n\n The non-stop words:\n')
for token in doc1:
    if token.is_stop==False:
        s_count=s_count+1
        print(token)
print('\n The count of non-stop words:',s_count)

India ==> False
, ==> False
a ==> True
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> True
the ==> True
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> True
area ==> False
, ==> False
the ==> True
second ==> False
- ==> False
most ==> True
populous ==> False
country ==> False
with ==> True
over ==> True
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> True
the ==> True
most ==> True
populous ==> False
democracy ==> False
in ==> True
the ==> True
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> True
an ==> True
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> True
people ==> False
. ==> False
The ==> True
country ==> False
holds ==> False
its ==> True
uniqueness ==> False
in ==> True
its ==> True
diversity ==> False
, ==> False
and ==> True
hence ==> True
h

## Is it a punctuation ?

In [14]:
# Count the no of non-punctations
p_count=0
for token in doc1:
    print(token,'==>',token.is_punct)
print('\n\n The non-punctuation words:\n')
for token in doc1:
    if token.is_punct==False:
        p_count=p_count+1
        print(token)
print('\n The count of non-punctuations:',p_count)

India ==> False
, ==> True
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> True
is ==> False
the ==> False
seventh ==> False
- ==> True
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> True
the ==> False
second ==> False
- ==> True
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> True
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> True
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> True
including ==> False
numerous ==> False
languages ==> False
, ==> True
traditions ==> False
, ==> True
and ==> False
people ==> False
. ==> True
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> True
and ==> False
hence =

In [15]:
# Count the no of punctations
p_count=0
for token in doc1:
    print(token,'==>',token.is_punct)
print('\n\n The punctuation words:\n')
for token in doc1:
    if token.is_punct==True:
        p_count=p_count+1
        print(token)
print('\n The count of punctuations:',p_count)

India ==> False
, ==> True
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> True
is ==> False
the ==> False
seventh ==> False
- ==> True
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> True
the ==> False
second ==> False
- ==> True
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> True
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> True
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> True
including ==> False
numerous ==> False
languages ==> False
, ==> True
traditions ==> False
, ==> True
and ==> False
people ==> False
. ==> True
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> True
and ==> False
hence =

## Is it a left punctuation

In [16]:
# Count the no of left punctations
lp_count=0
for token in doc1:
    print(token,'==>',token.is_left_punct)
print('\n\n The punctuation words:\n')
for token in doc1:
    if token.is_left_punct==True:
        lp_count=lp_count+1
        print(token)
print('\n The count of leftv punctuations:',lp_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

## Is it a right punctuation?

In [17]:
# Count the no of right punctations
rp_count=0
for token in doc1:
    print(token,'==>',token.is_right_punct)
print('\n\n The right punctuation words:\n')
for token in doc1:
    if token.is_right_punct==True:
        rp_count=rp_count+1
        print(token)
print('\n The count of right punctuations:',rp_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

## Is it an alphabet?

In [18]:
# Count the tokens, whcih are made of alphabets
a_count=0
for token in doc1:
    print(token,'==>',token.is_alpha)
print('\n\n The alphabets tokens:\n')
for token in doc1:
    if token.is_alpha==True:
        a_count=a_count+1
        print(token)
print('\n The count of alphabet-tokens:',a_count)

India ==> True
, ==> False
a ==> True
South ==> True
Asian ==> True
nation ==> True
, ==> False
is ==> True
the ==> True
seventh ==> True
- ==> False
largest ==> True
country ==> True
by ==> True
area ==> True
, ==> False
the ==> True
second ==> True
- ==> False
most ==> True
populous ==> True
country ==> True
with ==> True
over ==> True
1.38 ==> False
billion ==> True
people ==> True
, ==> False
and ==> True
the ==> True
most ==> True
populous ==> True
democracy ==> True
in ==> True
the ==> True
world ==> True
. ==> False
India ==> True
boasts ==> True
of ==> True
an ==> True
immensely ==> True
rich ==> True
cultural ==> True
heritage ==> True
, ==> False
including ==> True
numerous ==> True
languages ==> True
, ==> False
traditions ==> True
, ==> False
and ==> True
people ==> True
. ==> False
The ==> True
country ==> True
holds ==> True
its ==> True
uniqueness ==> True
in ==> True
its ==> True
diversity ==> True
, ==> False
and ==> True
hence ==> True
has ==> True
adapted ==> True
it

## is it a digit?

In [19]:
# Count the no of digits
d_count=0
for token in doc1:
    print(token,'==>',token.is_digit)
print('\n\n The digits:\n')
for token in doc1:
    if token.is_digit==True:
        d_count=d_count+1
        print(token)
print('\n The count of digits:',d_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

In [20]:
doc2=nlp('1000 is a big number')
for token in doc2:
    print(token,'==>',token.is_digit)

1000 ==> True
is ==> False
a ==> False
big ==> False
number ==> False


## Is it  lower case ?

In [21]:
# Count the no of lower case tokens
l_count=0
for token in doc1:
    print(token,'==>',token.is_lower)
print('\n\n The lower case words:\n')
for token in doc1:
    if token.is_lower==True:
        l_count=l_count+1
        print(token)
print('\n The count of lower case tokens:',l_count)

India ==> False
, ==> False
a ==> True
South ==> False
Asian ==> False
nation ==> True
, ==> False
is ==> True
the ==> True
seventh ==> True
- ==> False
largest ==> True
country ==> True
by ==> True
area ==> True
, ==> False
the ==> True
second ==> True
- ==> False
most ==> True
populous ==> True
country ==> True
with ==> True
over ==> True
1.38 ==> False
billion ==> True
people ==> True
, ==> False
and ==> True
the ==> True
most ==> True
populous ==> True
democracy ==> True
in ==> True
the ==> True
world ==> True
. ==> False
India ==> False
boasts ==> True
of ==> True
an ==> True
immensely ==> True
rich ==> True
cultural ==> True
heritage ==> True
, ==> False
including ==> True
numerous ==> True
languages ==> True
, ==> False
traditions ==> True
, ==> False
and ==> True
people ==> True
. ==> False
The ==> False
country ==> True
holds ==> True
its ==> True
uniqueness ==> True
in ==> True
its ==> True
diversity ==> True
, ==> False
and ==> True
hence ==> True
has ==> True
adapted ==> Tr

## Is it upper case?


In [22]:
# Count the no of upper case tokens
u_count=0
for token in doc1:
    print(token,'==>',token.is_upper)
print('\n\n The lower case words:\n')
for token in doc1:
    if token.is_upper==True:
        u_count=u_count+1
        print(token)
print('\n The count of upper case tokens:',u_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

## Is it title case?

In [23]:
# Count the no of title case tokens
t_count=0
for token in doc1:
    print(token,'==>',token.is_title)
print('\n\n The title case words:\n')
for token in doc1:
    if token.is_title==True:
        t_count=t_count+1
        print(token)
print('\n The count of title case tokens:',t_count)

India ==> True
, ==> False
a ==> False
South ==> True
Asian ==> True
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> True
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> True
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> False


## Is it a bracket ?

In [24]:
# Count the no of bracket tokens
b_count=0
for token in doc1:
    print(token,'==>',token.is_bracket)
print('\n\n The bracket tokens:\n')
for token in doc1:
    if token.is_bracket==True:
        b_count=b_count+1
        print(token)
print('\n The count of bracket tokens:',b_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

## is it a quote?


In [25]:
# Count the no of quote tokens
q_count=0
for token in doc1:
    print(token,'==>',token.is_quote)
print('\n\n The quote words:\n')
for token in doc1:
    if token.is_quote==True:
        q_count=q_count+1
        print(token)
print('\n The count of quote tokens:',q_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> False
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> False
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> False
billion ==> False
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> F

## is it like a number?

In [26]:
# Count the no tokens
n_count=0
for token in doc1:
    print(token,'==>',token.like_num)
print('\n\n The numbers:\n')
for token in doc1:
    if token.like_num==True:
        n_count=n_count+1
        print(token)
print('\n The count of number tokens:',n_count)

India ==> False
, ==> False
a ==> False
South ==> False
Asian ==> False
nation ==> False
, ==> False
is ==> False
the ==> False
seventh ==> True
- ==> False
largest ==> False
country ==> False
by ==> False
area ==> False
, ==> False
the ==> False
second ==> True
- ==> False
most ==> False
populous ==> False
country ==> False
with ==> False
over ==> False
1.38 ==> True
billion ==> True
people ==> False
, ==> False
and ==> False
the ==> False
most ==> False
populous ==> False
democracy ==> False
in ==> False
the ==> False
world ==> False
. ==> False
India ==> False
boasts ==> False
of ==> False
an ==> False
immensely ==> False
rich ==> False
cultural ==> False
heritage ==> False
, ==> False
including ==> False
numerous ==> False
languages ==> False
, ==> False
traditions ==> False
, ==> False
and ==> False
people ==> False
. ==> False
The ==> False
country ==> False
holds ==> False
its ==> False
uniqueness ==> False
in ==> False
its ==> False
diversity ==> False
, ==> False
and ==> False

## Is it like a url?

In [27]:
doc3=nlp('The site of Times of India is www.timesofindia.com.')

In [28]:
for token in doc3:
    print(token.text,'==>',token.like_url)

The ==> False
site ==> False
of ==> False
Times ==> False
of ==> False
India ==> False
is ==> False
www.timesofindia.com ==> True
. ==> False


## is it like an email ID?

In [29]:
doc4=nlp('My email ID is abc1234@nmims.edu')

In [30]:
for token in doc4:
    print(token.text,'==>',token.like_email)

My ==> False
email ==> False
ID ==> False
is ==> False
abc1234@nmims.edu ==> True


In [31]:
doc4=nlp('My email ID is abc1234@nmims.edu.in')

In [32]:
for token in doc4:
    print(token.text,'==>',token.like_email)

My ==> False
email ==> False
ID ==> False
is ==> False
abc1234@nmims.edu.in ==> True


In [35]:
doc4=nlp(' My email ID is abc1234@nmims.edu.in, in the data')

In [36]:
for token in doc4:
    print(token.text,'==>',token.like_email)

  ==> False
My ==> False
email ==> False
ID ==> False
is ==> False
abc1234@nmims.edu.in ==> True
, ==> False
in ==> False
the ==> False
data ==> False


# Parts of Speech - POS

In [37]:
for token in doc2:
    print(token)

1000
is
a
big
number


In [40]:
for token in doc2:
    print(token,'==>',token.pos_)

1000 ==> NUM
is ==> AUX
a ==> DET
big ==> ADJ
number ==> NOUN


In [41]:
spacy.explain('AUX')

'auxiliary'

## Token in a DF

In [42]:
# Creating columns for the DF

cols=['Token','POS','Explain_POS','Tag','Explain_Tag']
cols

['Token', 'POS', 'Explain_POS', 'Tag', 'Explain_Tag']

In [43]:
rows=[]
for token in doc1:
    row=token,token.pos_,spacy.explain(token.pos_),token.tag_,spacy.explain(token.tag_)
    rows.append(row)
rows

[(India, 'PROPN', 'proper noun', 'NNP', 'noun, proper singular'),
 (,, 'PUNCT', 'punctuation', ',', 'punctuation mark, comma'),
 (a, 'DET', 'determiner', 'DT', 'determiner'),
 (South,
  'ADJ',
  'adjective',
  'JJ',
  'adjective (English), other noun-modifier (Chinese)'),
 (Asian,
  'ADJ',
  'adjective',
  'JJ',
  'adjective (English), other noun-modifier (Chinese)'),
 (nation, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (,, 'PUNCT', 'punctuation', ',', 'punctuation mark, comma'),
 (is, 'AUX', 'auxiliary', 'VBZ', 'verb, 3rd person singular present'),
 (the, 'DET', 'determiner', 'DT', 'determiner'),
 (seventh, 'ADV', 'adverb', 'RB', 'adverb'),
 (-, 'PUNCT', 'punctuation', 'HYPH', 'punctuation mark, hyphen'),
 (largest, 'ADJ', 'adjective', 'JJS', 'adjective, superlative'),
 (country, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (by, 'ADP', 'adposition', 'IN', 'conjunction, subordinating or preposition'),
 (area, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (,, 'PUNCT', 'p

In [45]:
import pandas as pd
token_df=pd.DataFrame(rows,columns=cols)
token_df

Unnamed: 0,Token,POS,Explain_POS,Tag,Explain_Tag
0,India,PROPN,proper noun,NNP,"noun, proper singular"
1,",",PUNCT,punctuation,",","punctuation mark, comma"
2,a,DET,determiner,DT,determiner
3,South,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
4,Asian,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
...,...,...,...,...,...
196,ample,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
197,source,NOUN,noun,NN,"noun, singular or mass"
198,of,ADP,adposition,IN,"conjunction, subordinating or preposition"
199,manpower,NOUN,noun,NN,"noun, singular or mass"


In [46]:
token_df['POS'].value_counts()

NOUN     48
ADJ      29
PUNCT    24
ADP      24
DET      20
VERB     12
CCONJ    10
AUX       9
ADV       9
PRON      5
NUM       4
PROPN     3
SCONJ     3
PART      1
Name: POS, dtype: int64