# Spacy

In [1]:
import spacy # import spacy library. check first time installation instructions on spacy website https://spacy.io/usage

ModuleNotFoundError: No module named 'spacy'

In [2]:
# Load the spaCy model 'en_core_web_sm' and keep it under the name `nlp`.
# The 'en' prefix stands for English: spaCy supports multiple languages, so
# the English language model has to be loaded explicitly first.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample text used by the rest of the notebook. It is assembled from adjacent
# string literals, one per sentence; the last sentence has no trailing period.
text = (
    'The cutting edge technologies available today can make life enormously exiting as well as dangerous and complicated. '
    'This can also be a mischievous comment generating controversies across the political spectrum. '
    'This can spill over beyond India and impact the rest of the world to the tune of $ 10 million'
)

## Creating a Spacy Document Object

In [4]:
# Run the loaded pipeline over the text to create a spaCy document object.
# The document stores a lot of information about each word (token) in the string.
doc = nlp(text)

In [5]:
doc  # bare last expression: the notebook displays the document's text content as it is

The cutting edge technologies available today can make life enormously exiting as well as dangerous and complicated. This can also be a mischievous comment generating controversies across the political spectrum. This can spill over beyond India and impact the rest of the world to the tune of $ 10 million

## Extracting Sentences from a Spacy Document

In [6]:
# Iterate over the sentences of the spaCy document and print each one on its
# own line.
for sentence in doc.sents:
    print(sentence)

The cutting edge technologies available today can make life enormously exiting as well as dangerous and complicated.
This can also be a mischievous comment generating controversies across the political spectrum.
This can spill over beyond India and impact the rest of the world to the tune of $ 10 million


## Extracting Part of Speech Tags from a Spacy Document

In [8]:
# Part-of-speech (POS) tags --> verb, noun, adjective, etc.
# Each row shows the token text (padded to 20 characters), its POS tag
# (padded to 10 characters) and the description returned by spacy.explain.
for token in doc:
    row = f'{token.text:20}{token.pos_:10}{spacy.explain(token.pos_)}'
    print(row)

The                 DET       determiner
cutting             VERB      verb
edge                NOUN      noun
technologies        NOUN      noun
available           ADJ       adjective
today               NOUN      noun
can                 AUX       auxiliary
make                VERB      verb
life                NOUN      noun
enormously          ADV       adverb
exiting             VERB      verb
as                  ADV       adverb
well                ADV       adverb
as                  ADP       adposition
dangerous           ADJ       adjective
and                 CCONJ     coordinating conjunction
complicated         ADJ       adjective
.                   PUNCT     punctuation
This                DET       determiner
can                 AUX       auxiliary
also                ADV       adverb
be                  VERB      verb
a                   DET       determiner
mischievous         ADJ       adjective
comment             NOUN      noun
generating          NOUN      noun
c

## Lemmatized Form of Words using `lemma_` attribute

In [9]:
# Lemmatizing is a smarter way of getting the root words compared to stemming.
# token.text is the string form of the token; token.lemma_ gives access to
# the lemmatized form of the same token.
for token in doc:
    row = f'{token.text:20}{token.lemma_}'
    print(row)

The                 the
cutting             cut
edge                edge
technologies        technology
available           available
today               today
can                 can
make                make
life                life
enormously          enormously
exiting             exit
as                  as
well                well
as                  as
dangerous           dangerous
and                 and
complicated         complicated
.                   .
This                this
can                 can
also                also
be                  be
a                   a
mischievous         mischievous
comment             comment
generating          generating
controversies       controversy
across              across
the                 the
political           political
spectrum            spectrum
.                   .
This                this
can                 can
spill               spill
over                over
beyond              beyond
India               India
and 

## Named Entity Recognition using Spacy

In [30]:
# Named entity recognition (NER) --> person, place, location, country, city,
# state, continent, money, etc.
# Despite its plural name, `entities` holds a single entity on each iteration.
for entities in doc.ents:
    line = f'{entities.text:20}{entities.label_}'
    print(line)

today               DATE
India               GPE
$ 10 million        MONEY


spaCy is smart enough to combine `$`, `10` and `million` and understand that these terms together mean an amount of money!
GPE stands for geopolitical entity (for example countries, cities and states).

# NLTK

In [10]:
import nltk

In [11]:
from nltk.stem import PorterStemmer

In [12]:
# Create a single Porter stemmer instance; the stemming cells further down reuse it.
stemmer = PorterStemmer()

In [13]:
# Compare the stemmed form produced by NLTK with the lemmatized form produced
# by spaCy earlier: stemming is not so good, and many stemmed forms of the
# words have no meaning.
for token in doc:
    row = f'{token.text:20}{stemmer.stem(token.text)}'
    print(row)

The                 the
cutting             cut
edge                edg
technologies        technolog
available           avail
today               today
can                 can
make                make
life                life
enormously          enorm
exiting             exit
as                  as
well                well
as                  as
dangerous           danger
and                 and
complicated         complic
.                   .
This                thi
can                 can
also                also
be                  be
a                   a
mischievous         mischiev
comment             comment
generating          gener
controversies       controversi
across              across
the                 the
political           polit
spectrum            spectrum
.                   .
This                thi
can                 can
spill               spill
over                over
beyond              beyond
India               india
and                 and
impact      

## Lemmatization using nltk

In [14]:
from nltk.stem import WordNetLemmatizer

In [20]:
# Create the WordNet lemmatizer instance used by the lemmatization cells below.
lems = WordNetLemmatizer()

In [22]:
from nltk import word_tokenize # As WordNetLemmatizer works only with nltk tokens

In [23]:
# Tokenize the raw text with NLTK; the result is a plain list of token strings.
toks = word_tokenize(text)
toks  # display the token list

['The',
 'cutting',
 'edge',
 'technologies',
 'available',
 'today',
 'can',
 'make',
 'life',
 'enormously',
 'exiting',
 'as',
 'well',
 'as',
 'dangerous',
 'and',
 'complicated',
 '.',
 'This',
 'can',
 'also',
 'be',
 'a',
 'mischievous',
 'comment',
 'generating',
 'controversies',
 'across',
 'the',
 'political',
 'spectrum',
 '.',
 'This',
 'can',
 'spill',
 'over',
 'beyond',
 'India',
 'and',
 'impact',
 'the',
 'rest',
 'of',
 'the',
 'world',
 'to',
 'the',
 'tune',
 'of',
 '$',
 '10',
 'million']

In [25]:
# Print every NLTK token next to its WordNet lemma; the output reads much
# better than the stemmed output.
# NOTE(review): the printed output still maps 'as' -> 'a' and leaves 'cutting'
# unchanged — presumably because no part of speech is passed to lemmatize();
# confirm against the NLTK documentation.
for token in toks:
    row = f'{token:20}{lems.lemmatize(token)}'
    print(row)

The                 The
cutting             cutting
edge                edge
technologies        technology
available           available
today               today
can                 can
make                make
life                life
enormously          enormously
exiting             exiting
as                  a
well                well
as                  a
dangerous           dangerous
and                 and
complicated         complicated
.                   .
This                This
can                 can
also                also
be                  be
a                   a
mischievous         mischievous
comment             comment
generating          generating
controversies       controversy
across              across
the                 the
political           political
spectrum            spectrum
.                   .
This                This
can                 can
spill               spill
over                over
beyond              beyond
India               India

In [26]:
# Collect the stemmed and the lemmatized form of every NLTK token so the two
# can be compared side by side in a dataframe.
stemmed = [stemmer.stem(token) for token in toks]
lemmatized = [lems.lemmatize(token) for token in toks]

In [27]:
import pandas as pd 

In [29]:
# Put the original, stemmed and lemmatized word forms side by side, one row
# per token.
df = pd.DataFrame(
    {
        'original': toks,
        'stemmed': stemmed,
        'lemmatized': lemmatized,
    }
)
df  # the lemmatized column is much more sensible than the stemmed one

Unnamed: 0,original,stemmed,lemmatized
0,The,the,The
1,cutting,cut,cutting
2,edge,edg,edge
3,technologies,technolog,technology
4,available,avail,available
5,today,today,today
6,can,can,can
7,make,make,make
8,life,life,life
9,enormously,enorm,enormously
