## Python for NLP: Tokenization, Stemming, and Lemmatization with SpaCy Library
https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library/

In [3]:
!pip install -U spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/95/9c/afd55bb35cc03e4b3dadc41dd48bc26e0678b08d59f32411735c35bda550/spacy-2.1.8-cp36-cp36m-manylinux1_x86_64.whl (30.8MB)
[K     |████████████████████████████████| 30.9MB 833kB/s eta 0:00:01    |████▏                           | 4.0MB 516kB/s eta 0:00:52     |██████████▎                     | 9.9MB 718kB/s eta 0:00:30     |███████████████████▏            | 18.5MB 826kB/s eta 0:00:15     |████████████████████████████    | 26.9MB 818kB/s eta 0:00:05
[?25hCollecting thinc<7.1.0,>=7.0.8 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/18/a5/9ace20422e7bb1bdcad31832ea85c52a09900cd4a7ce711246bfb92206ba/thinc-7.0.8-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 479kB/s eta 0:00:01     |█████████████████████████████▍  | 1.9MB 479kB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading https://files.pythonhosted.org/packages/3d/

In [8]:
# Download the English language model. Per the recorded output, this installs
# the `en_core_web_sm` package and links it under the shortcut name `en`.
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/rodrigo/anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
/home/rodrigo/anaconda3/lib/python3.6/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


### Basic Functionality

In [13]:
import spacy

# Load the English model by its package name instead of the `en` shortcut link.
# The download step reports `en_core_web_sm` as the installed package, and the
# package name keeps working in environments where shortcut links do not exist.
sp = spacy.load('en_core_web_sm')

# Running the pipeline on a string yields a document that can be iterated
# token by token.
sentence = sp('Manchester United is looking to sign a forward for $90 million')

for token in sentence:
    print(token.text)

Manchester
United
is
looking
to
sign
a
forward
for
$
90
million


In [14]:
# Show each token next to its part-of-speech tag.
for token in sentence:
    print(f"{token.text} {token.pos_}")

Manchester PROPN
United PROPN
is VERB
looking VERB
to PART
sign VERB
a DET
forward NOUN
for ADP
$ SYM
90 NUM
million NUM


In [15]:
# A negated variant of the first sentence, parsed for the dependency example.
sentence2 = sp("Manchester United isn't looking to sign any forward.")

In [16]:
# Print text, part-of-speech tag and dependency label for every token.
for token in sentence2:
    print(f"{token.text} {token.pos_} {token.dep_}")

Manchester PROPN compound
United PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
to PART aux
sign VERB xcomp
any DET det
forward ADV advmod
. PUNCT punct


In [17]:
document = sp('Hello from Stackabuse. The site with the best Python Tutorials. What are you looking for?')

# Use a dedicated loop variable: calling it `sentence` would overwrite the
# document of the same name created earlier with the last sentence span, so
# re-running the earlier cells would silently operate on different data.
for sent in document.sents:
    print(sent)

Hello from Stackabuse.
The site with the best Python Tutorials.
What are you looking for?


In [18]:
# Tokens are addressable by position; index 4 is the first token after
# "Hello from Stackabuse." (the trailing period counts as its own token).
document[4]

The

In [19]:
# Check whether the token at index 4 begins a new sentence.
document[4].is_sent_start

True

### Tokenization

In [21]:
# A quoted sentence containing a contraction and dotted abbreviations,
# used to show how the tokenizer treats punctuation.
sentence3 = sp('"They\'re leaving U.K. for U.S.A."')
print(sentence3)

"They're leaving U.K. for U.S.A."


In [22]:
# One token per line: quotes and the contraction are split off, while the
# abbreviations stay intact.
for token in sentence3:
    print(token.text)

"
They
're
leaving
U.K.
for
U.S.A.
"


In [23]:
# A sentence mixing commas, a hyphenated word and an e-mail address.
sentence4 = sp("Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com")
print(sentence4)

Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com


In [24]:
# One token per line: the hyphenated word is split, the e-mail address is not.
for token in sentence4:
    print(token.text)

Hello
,
I
am
non
-
vegetarian
,
email
me
the
menu
at
abc-xyz@gmai.com


In [25]:
# Number of tokens in the document; matches the 14 tokens printed above.
len(sentence4)

14

### Detecting Entities

In [30]:
# Sentence with an organisation, a person and a monetary amount for the
# entity-recognition example.
sentence5 = sp('Manchester United is looking to sign Harry Kane for $90 million')

for token in sentence5:
    print(token.text)

Manchester
United
is
looking
to
sign
Harry
Kane
for
$
90
million


In [34]:
# List every detected entity with its label and spaCy's explanation of it.
for ent in sentence5.ents:
    print(f"{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}")

Manchester United - ORG - Companies, agencies, institutions, etc.
Harry Kane - PERSON - People, including fictional
$90 million - MONEY - Monetary values, including unit


### Detecting Nouns

In [45]:
# NOTE: this rebinds `sentence5` (previously the entity example) to a longer
# headline, so cells that read `sentence5` depend on execution order.
sentence5 = sp('Latest Rumours: Manchester United is looking to sign Harry Kane for $90 million')

In [46]:
# Print each noun chunk found in the sentence.
for chunk in sentence5.noun_chunks:
    print(chunk.text)

Latest Rumours
Manchester United
Harry Kane


### Stemming

#### Porter Stemmer

In [40]:
import nltk

# Import the one class that is used instead of `*`, so it is clear where
# `PorterStemmer` comes from and the notebook namespace is not flooded with
# every public name of the module.
from nltk.stem.porter import PorterStemmer

In [42]:
# Word family sharing one root, plus the Porter stemmer that will reduce it.
tokens = [
    'compute',
    'computer',
    'computed',
    'computing',
]
stemmer = PorterStemmer()

In [43]:
# Show each word next to the stem produced for it.
for word in tokens:
    print(f"{word} --> {stemmer.stem(word)}")

compute --> comput
computer --> comput
computed --> comput
computing --> comput


#### Snowball Stemmer

In [48]:
from nltk.stem.snowball import SnowballStemmer

# Same word family as the Porter example, stemmed with the English Snowball
# stemmer for comparison.
stemmer = SnowballStemmer(language='english')
tokens = ['compute', 'computer', 'computed', 'computing']

for word in tokens:
    print(f"{word} --> {stemmer.stem(word)}")

compute --> comput
computer --> comput
computed --> comput
computing --> comput


### Lemmatization

In [50]:
# The same word family as in the stemming examples, this time run through spaCy.
sentence6 = sp('compute computer computed computing')

In [51]:
# Print each token followed by its lemma.
for token in sentence6:
    print(f"{token.text} {token.lemma_}")

compute compute
computer computer
computed compute
computing computing


In [52]:
# Lemmatize a full sentence containing inflected verb forms and a pronoun.
sentence7 = sp('A letter has been written, asking him to be released')

for token in sentence7:
    print(f"{token.text}  ===> {token.lemma_}")

A  ===> a
letter  ===> letter
has  ===> have
been  ===> be
written  ===> write
,  ===> ,
asking  ===> ask
him  ===> -PRON-
to  ===> to
be  ===> be
released  ===> release
