In [None]:
import pandas as pd 
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import warnings 
# Silence every warning category for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation notices from imported libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Sample corpus: three quoted narrative paragraphs separated by blank lines.
# This string is the raw input that gets lowercased and tokenized later in the notebook.
text = """
"Looking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled "warm and fuzzy feelings." As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips. I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar's Palace in Lake Tahoe."

"The day I picked my dog up from the pound was one of the happiest days of both of our lives. I had gone to the pound just a week earlier with the idea that I would just "look" at a puppy. Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening. I knew within minutes of walking in the door that I would get a puppy… but it wasn't until I saw him that I knew I had found my puppy."

"Looking for houses was supposed to be a fun and exciting process. Unfortunately, none of the ones that we saw seemed to match the specifications that we had established. They were too small, too impersonal, too close to the neighbors. After days of finding nothing even close, we began to wonder: was there really a perfect house out there for us?"
"""

In [3]:
# Echo the raw string as the cell's last expression so its repr (embedded "\n" and escaped quotes) is visible.
text

'\n"Looking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled "warm and fuzzy feelings." As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips. I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar\'s Palace in Lake Tahoe."\n\n"The day I picked my dog up from the pound was one of the happiest days of both of our lives. I had gone to the pound just a week earlier with the idea that I would just "look" at a puppy. Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening. I knew within minutes of walking in the door that I would get a puppy… but it wasn\'t until I saw him that I knew I had found my puppy."\n\n"Looking for houses was supposed to be a fun and exciting process. Unfortuna

## Let's create word tokens
Tokenization breaks raw text into smaller units — words or sentences — called tokens. These tokens help in understanding the context and in building NLP models. Tokenization also helps in interpreting the meaning of the text by analyzing the sequence of the words.

In [5]:
import nltk
# Fetch the 'punkt' tokenizer data package into the local nltk_data directory (network access needed on first run).
# NOTE(review): presumably required by word_tokenize used later in the notebook; newer NLTK releases may
# need the 'punkt_tab' resource instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Split the lowercased corpus into word-level tokens.
# Lowercasing first means differently-cased spellings of a word yield the same token.
tokens = word_tokenize(text.lower())

# Show the token list followed by the token count, one per line.
print(tokens, len(tokens), sep="\n")

["''", 'looking', 'back', 'on', 'a', 'childhood', 'filled', 'with', 'events', 'and', 'memories', ',', 'i', 'find', 'it', 'rather', 'difficult', 'to', 'pick', 'one', 'that', 'leaves', 'me', 'with', 'the', 'fabled', '``', 'warm', 'and', 'fuzzy', 'feelings', '.', "''", 'as', 'the', 'daughter', 'of', 'an', 'air', 'force', 'major', ',', 'i', 'had', 'the', 'pleasure', 'of', 'traveling', 'across', 'america', 'in', 'many', 'moving', 'trips', '.', 'i', 'have', 'visited', 'the', 'monstrous', 'trees', 'of', 'the', 'sequoia', 'national', 'forest', ',', 'stood', 'on', 'the', 'edge', 'of', 'the', 'grand', 'canyon', 'and', 'have', 'jumped', 'on', 'the', 'beds', 'at', 'caesar', "'s", 'palace', 'in', 'lake', 'tahoe', '.', "''", '``', 'the', 'day', 'i', 'picked', 'my', 'dog', 'up', 'from', 'the', 'pound', 'was', 'one', 'of', 'the', 'happiest', 'days', 'of', 'both', 'of', 'our', 'lives', '.', 'i', 'had', 'gone', 'to', 'the', 'pound', 'just', 'a', 'week', 'earlier', 'with', 'the', 'idea', 'that', 'i', 'wo

[NLTK:Stemmer](https://www.nltk.org/howto/stem.html)

In [7]:
# Stem every token with the Porter algorithm; the result is index-aligned with `tokens`.
stemmer = PorterStemmer()
porter_stemmer = list(map(stemmer.stem, tokens))

# Show the stems followed by how many there are, one per line.
print(porter_stemmer, len(porter_stemmer), sep="\n")

["''", 'look', 'back', 'on', 'a', 'childhood', 'fill', 'with', 'event', 'and', 'memori', ',', 'i', 'find', 'it', 'rather', 'difficult', 'to', 'pick', 'one', 'that', 'leav', 'me', 'with', 'the', 'fabl', '``', 'warm', 'and', 'fuzzi', 'feel', '.', "''", 'as', 'the', 'daughter', 'of', 'an', 'air', 'forc', 'major', ',', 'i', 'had', 'the', 'pleasur', 'of', 'travel', 'across', 'america', 'in', 'mani', 'move', 'trip', '.', 'i', 'have', 'visit', 'the', 'monstrou', 'tree', 'of', 'the', 'sequoia', 'nation', 'forest', ',', 'stood', 'on', 'the', 'edg', 'of', 'the', 'grand', 'canyon', 'and', 'have', 'jump', 'on', 'the', 'bed', 'at', 'caesar', "'s", 'palac', 'in', 'lake', 'taho', '.', "''", '``', 'the', 'day', 'i', 'pick', 'my', 'dog', 'up', 'from', 'the', 'pound', 'wa', 'one', 'of', 'the', 'happiest', 'day', 'of', 'both', 'of', 'our', 'live', '.', 'i', 'had', 'gone', 'to', 'the', 'pound', 'just', 'a', 'week', 'earlier', 'with', 'the', 'idea', 'that', 'i', 'would', 'just', '``', 'look', "''", 'at', '

In [8]:
# Stem every token with the English Snowball stemmer; the result is index-aligned with `tokens`.
stemmer = SnowballStemmer('english')
snowball = [
    stemmer.stem(word)
    for word in tokens
]

# Show the stems followed by how many there are, one per line.
print(snowball, len(snowball), sep="\n")

["''", 'look', 'back', 'on', 'a', 'childhood', 'fill', 'with', 'event', 'and', 'memori', ',', 'i', 'find', 'it', 'rather', 'difficult', 'to', 'pick', 'one', 'that', 'leav', 'me', 'with', 'the', 'fabl', '``', 'warm', 'and', 'fuzzi', 'feel', '.', "''", 'as', 'the', 'daughter', 'of', 'an', 'air', 'forc', 'major', ',', 'i', 'had', 'the', 'pleasur', 'of', 'travel', 'across', 'america', 'in', 'mani', 'move', 'trip', '.', 'i', 'have', 'visit', 'the', 'monstrous', 'tree', 'of', 'the', 'sequoia', 'nation', 'forest', ',', 'stood', 'on', 'the', 'edg', 'of', 'the', 'grand', 'canyon', 'and', 'have', 'jump', 'on', 'the', 'bed', 'at', 'caesar', "'s", 'palac', 'in', 'lake', 'taho', '.', "''", '``', 'the', 'day', 'i', 'pick', 'my', 'dog', 'up', 'from', 'the', 'pound', 'was', 'one', 'of', 'the', 'happiest', 'day', 'of', 'both', 'of', 'our', 'live', '.', 'i', 'had', 'gone', 'to', 'the', 'pound', 'just', 'a', 'week', 'earlier', 'with', 'the', 'idea', 'that', 'i', 'would', 'just', '``', 'look', "''", 'at',

In [9]:
# Line up each original token with its Porter and Snowball stems, one row per token.
df = pd.DataFrame(
    dict(
        token=tokens,
        Porter_STEMMER=porter_stemmer,
        SNOWBALL_STEMMER=snowball,
    )
)
# Preview the first rows.
df.head()

Unnamed: 0,token,Porter_STEMMER,SNOWBALL_STEMMER
0,'','',''
1,looking,look,look
2,back,back,back
3,on,on,on
4,a,a,a


In [10]:
# Changed tokens: keep rows where at least one of the two stemmers altered the token.
df.loc[
    df["token"].ne(df["Porter_STEMMER"])
    | df["token"].ne(df["SNOWBALL_STEMMER"])
]

Unnamed: 0,token,Porter_STEMMER,SNOWBALL_STEMMER
1,looking,look,look
6,filled,fill,fill
8,events,event,event
10,memories,memori,memori
21,leaves,leav,leav
25,fabled,fabl,fabl
29,fuzzy,fuzzi,fuzzi
30,feelings,feel,feel
39,force,forc,forc
45,pleasure,pleasur,pleasur


In [11]:
# Unchanged values: rows where BOTH stemmers returned the token exactly as it was.
# The two conditions must be combined with `&` (logical AND). Using `|` would also keep
# tokens that only one stemmer left alone (e.g. a token Porter shortens but Snowball keeps),
# so such rows would show up in both the "changed" and the "unchanged" views.
df[(df.token == df.Porter_STEMMER) & (df.token == df.SNOWBALL_STEMMER)]

Unnamed: 0,token,Porter_STEMMER,SNOWBALL_STEMMER
0,'','',''
2,back,back,back
3,on,on,on
4,a,a,a
5,childhood,childhood,childhood
...,...,...,...
269,there,there,there
270,for,for,for
271,us,us,us
272,?,?,?
