## Stemming

In [2]:
import nltk #Spacy does not use a stemmer, only lemmatization

In [3]:
from nltk.stem.porter import PorterStemmer

In [4]:
# Create one Porter stemmer instance; it is reused for every word in the loop below.
p_stemmer = PorterStemmer()

In [6]:
# Sample vocabulary: several forms of "run" plus two "-ly" adverbs, used to compare stemmers.
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly']

In [8]:
# Show each sample word next to the stem produced by the Porter stemmer.
for original in words:
    stemmed = p_stemmer.stem(original)
    print(f'{original} ------> {stemmed}')

run ------> run
runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
fairly ------> fairli


In [9]:
from nltk.stem.snowball import SnowballStemmer #more advanced stemmer

In [10]:
# Snowball stemmer configured for English; reused by the two comparison loops below.
s_stemmer = SnowballStemmer(language = 'english')

In [12]:
# Same comparison as the Porter loop, this time with the Snowball stemmer.
for original in words:
    print(original, s_stemmer.stem(original), sep=' ------> ')

run ------> run
runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
fairly ------> fair


In [13]:
# Second sample: words sharing the "gener-" prefix, to see which ones a stemmer collapses together.
words2 = ['generous', 'generate', 'generously', 'generation']

In [15]:
# Print every "gener-" word alongside its Snowball stem.
for original in words2:
    pair = (original, s_stemmer.stem(original))
    print(' ------> '.join(pair))

generous ------> generous
generate ------> generat
generously ------> generous
generation ------> generat


## Lemmatization

In [20]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is used for lemmatization and stop words below.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Run the sentence through the pipeline; it mixes several forms of "run" and "race".
# (No u'' prefix needed: string literals are already unicode in Python 3.)
doc1 = nlp('I am a runner running in races because I ran a race today.')

In [30]:
# One row per token: token, part-of-speech tag, integer lemma ID, lemma string.
for tok in doc1:
    columns = (tok, tok.pos_, tok.lemma, tok.lemma_)
    # ' \t ' reproduces print()'s default single-space separator around each tab.
    print(' \t '.join(str(column) for column in columns))

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
races 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


A better-formatted version: the same columns, aligned with fixed-width f-string fields.

In [36]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, integer lemma ID, lemma string.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (for example the ``Doc`` returned by
            ``nlp(...)``).

    Returns:
        The last token that was printed, or ``None`` when ``text`` is empty.
        The return value is kept for backward compatibility; end the calling
        cell with ``show_lemmas(doc);`` to stop Jupyter from echoing it.
    """
    # Tracked explicitly so an empty input returns None instead of raising
    # UnboundLocalError on the never-bound loop variable.
    last_token = None
    for token in text:
        # Column widths: 12 for the text, 6 for the POS tag, and 22
        # (left-aligned, since ints right-align by default) for the lemma ID.
        print(f'{token.text:12} {token.pos_:6} {token.lemma:<22} {token.lemma_}')
        last_token = token
    return last_token

In [37]:
# Trailing ';' stops Jupyter from echoing the function's return value
# (the last token, shown as a stray "." under the table).
show_lemmas(doc1);

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
races        NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


.

## Stop words

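Stop words are very common words, like 'a' and 'the', that can be filtered from text. The exact count depends on the spaCy version; in this run there are 326 by default, since the set holds 327 entries after one word is added below.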

In [39]:
# Dump the default stop-word collection; the printed order is arbitrary (it displays as a set).
print(nlp.Defaults.stop_words)

{'what', 'therefore', 'could', 'take', 'otherwise', 'twelve', 'hers', 'i', 'how', 'be', 'latter', '’d', 'nobody', 'name', 'it', 'others', 'whenever', 'go', '‘ve', 'perhaps', 'whom', 'been', 'n’t', 'the', 'thereupon', 'ours', '’re', 'either', 'just', 'their', 'give', 'does', 'also', 'as', 'though', 'a', 'that', 'moreover', 'hereby', 'when', 'mine', 'behind', 'front', 'can', 'made', 'someone', 'nowhere', 'yet', 'then', 'at', 'eight', 'almost', 'whither', 'too', 'enough', 'everything', 'afterwards', 'himself', 're', 'thence', 'hereupon', 'make', 'ca', 'those', 'rather', 'off', 'via', 'seem', 'herein', 'above', 'within', 'often', 'up', 'first', 'to', 'well', 'only', 'whatever', 'hereafter', 'mostly', 'both', 'throughout', 'through', 'would', 'really', 'whence', 'various', 'call', 'must', '‘ll', 'whereby', 'alone', 'together', 'top', 'yours', 'wherever', 'three', 'before', 'them', 'since', 'while', 'not', 'always', 'of', 'nine', 'again', 'where', 'against', 'itself', 'fifty', 'did', "'re", 

Check whether a given word is a stop word:

In [44]:
nlp.vocab['become'].is_stop # looks up a single vocabulary entry, so check one word per call

True

Adding a stop word is a two-step process: add it to the default stop-word set, then set the `is_stop` flag on its vocabulary entry.

In [45]:
# Step 1: add the word to the defaults' stop-word collection.
nlp.Defaults.stop_words.add('btw')

In [47]:
# Step 2: set the is_stop flag on the word's vocabulary entry.
nlp.vocab['btw'].is_stop = True

In [48]:
# Confirm that 'btw' is now reported as a stop word.
nlp.vocab['btw'].is_stop

True

In [49]:
# Size of the stop-word collection after 'btw' was added.
len(nlp.Defaults.stop_words)

327

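To remove a stop word, reverse the process: call `nlp.Defaults.stop_words.remove('btw')` to delete it from the set, then set `nlp.vocab['btw'].is_stop = False`.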