# ***`Understand NLP`***

In [1]:
## Basic packages
import os
import sys
import shutil
import re                # Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
import pickle

from tqdm import tqdm

## Data processing and visualization packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## ML and Scientific python packages
import sklearn
import scipy

## NLP Packages
import nltk

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

%matplotlib inline

#### **Reading the files from current location**

In [2]:
import glob

In [3]:
file_names = [file_name.split("\\")[-1] for file_name in glob.glob(os.getcwd()+'\\doc*.txt')]

In [4]:
file_names

['doc1.txt', 'doc2.txt']

#### **Creating the corpus from files**

In [5]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [6]:
corpus = PlaintextCorpusReader(root=os.getcwd(),fileids=file_names)

In [7]:
corpus.fileids()

['doc1.txt', 'doc2.txt']

#### ***`Courpus Paras`***

In [8]:
print([para for para in corpus.paras()])

[[['"', 'This', 'is', 'the', 'worlds', 'greatest', 'mountain', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'In', 'our', 'childhood', 'days', ',', 'we', 'loved', 'to', 'watch', 'WWE', 'and', 'pro', 'wrestling', '."']], [['"', 'My', 'Name', 'is', 'Rama', 'Shyama', '.'], ['",', '"', 'Python', 'is', 'awsome', 'and', 'machine', 'learning', 'is', 'great', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'was', 'as', 'always', 'cripy', 'and', 'fingerlicious', '."']]]


#### ***`Courpus Sentences`***

In [9]:
corpus_sents = [sent for sent in corpus.sents()]
print(corpus_sents)

[['"', 'This', 'is', 'the', 'worlds', 'greatest', 'mountain', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'In', 'our', 'childhood', 'days', ',', 'we', 'loved', 'to', 'watch', 'WWE', 'and', 'pro', 'wrestling', '."'], ['"', 'My', 'Name', 'is', 'Rama', 'Shyama', '.'], ['",', '"', 'Python', 'is', 'awsome', 'and', 'machine', 'learning', 'is', 'great', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'was', 'as', 'always', 'cripy', 'and', 'fingerlicious', '."']]


#### ***`Courpus Words`***

In [10]:
corpus_words = [word for word in corpus.words()]
print(corpus_words)

['"', 'This', 'is', 'the', 'worlds', 'greatest', 'mountain', '.",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.",', '"', 'In', 'our', 'childhood', 'days', ',', 'we', 'loved', 'to', 'watch', 'WWE', 'and', 'pro', 'wrestling', '."', '"', 'My', 'Name', 'is', 'Rama', 'Shyama', '.",', '"', 'Python', 'is', 'awsome', 'and', 'machine', 'learning', 'is', 'great', '.",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.', 'But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.",', '"', 'The', 'dominoz', 'tacco', 'was', 'as', 'always', 'cripy', 'and', 'fingerlicious', '."']


#### ***`English Stopwords`***

In [11]:
eng_stopwords = stopwords.words('english')

In [12]:
for word in ['not','nor','no']:
    eng_stopwords.remove(word)

In [13]:
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

#### ***`Cleaning the Corpus`***
- ##### **Removing special characters**
- ##### **Removing unwanted spaces**
- ##### **Lower case the words**
- ##### **Tokenizing the words**

In [14]:
print(corpus_sents)

[['"', 'This', 'is', 'the', 'worlds', 'greatest', 'mountain', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'In', 'our', 'childhood', 'days', ',', 'we', 'loved', 'to', 'watch', 'WWE', 'and', 'pro', 'wrestling', '."'], ['"', 'My', 'Name', 'is', 'Rama', 'Shyama', '.'], ['",', '"', 'Python', 'is', 'awsome', 'and', 'machine', 'learning', 'is', 'great', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'was', 'as', 'always', 'cripy', 'and', 'fingerlicious', '."']]


In [15]:
cleaned_sent = []
for sent in corpus_sents:
    process_sent = [re.sub('[^A-Za-z]+', ' ', str(sent)).strip().lower()]
    cleaned_sent.append(process_sent)
    
print(cleaned_sent)

[['this is the worlds greatest mountain'], ['i love working on data science projects'], ['the nexon car is very affordable'], ['the pizza was cheap tasty and delicious'], ['in our childhood days we loved to watch wwe and pro wrestling'], ['my name is rama shyama'], ['python is awsome and machine learning is great'], ['the tata nexon car is very stylish dynamic and has a strong build'], ['but their after sales service is not good'], ['the pizza in the party was tasty and cheesy'], ['the dominoz tacco was as always cripy and fingerlicious']]


#### ***`Storing text messages in a DataFrame`***

In [16]:
pd.set_option('display.max_colwidth',2000)

In [17]:
text_df = pd.DataFrame(cleaned_sent).reset_index()
text_df.columns = ['Id','Pre_processed_Message']
text_df

Unnamed: 0,Id,Pre_processed_Message
0,0,this is the worlds greatest mountain
1,1,i love working on data science projects
2,2,the nexon car is very affordable
3,3,the pizza was cheap tasty and delicious
4,4,in our childhood days we loved to watch wwe and pro wrestling
5,5,my name is rama shyama
6,6,python is awsome and machine learning is great
7,7,the tata nexon car is very stylish dynamic and has a strong build
8,8,but their after sales service is not good
9,9,the pizza in the party was tasty and cheesy


#### ***`Removing Stopwords and Tokenization`***

In [18]:
text_df['Tokens'] = text_df['Pre_processed_Message'].apply(lambda row: [word for word in row.split(" ") if word not in eng_stopwords])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]"
5,5,my name is rama shyama,"[name, rama, shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]"


### ***`Stemming`***
#### ***`Porter Stemmer`***

In [19]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [20]:
port_stem = PorterStemmer()

In [21]:
text_df['Porter_Stems'] = text_df['Tokens'].apply(lambda row: [port_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]"


### **Porter Stemmer is simply chopping off the tails of the words. Not a good way!!**

#### ***`Snowball Stemmer`***

In [22]:
snow_stem = SnowballStemmer(language='english',ignore_stopwords=True)

In [23]:
text_df['Snowball_Stems'] = text_df['Tokens'].apply(lambda row: [snow_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]"


### **Snowball Stemmer is considered as a strongest method for taking a word to its root form. However, here, I didn't oberserved any difference with Porter Stemmer.**

#### ***`Lancaster Stemmer`***

In [24]:
lanc_stem = LancasterStemmer()

In [25]:
text_df['Lancaster_Stems'] = text_df['Tokens'].apply(lambda row: [lanc_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]"


### **Definitely Lancaster Stemmer is not a good approach to bring a word to its root form. It is chopping-off the vowels from the tails of the words.**

### ***`Lemmatizers`***
##### **Reference Links**

- https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    
- https://www.nltk.org/book/ch05.html


#### ***`Wordnet Lemmatizer`***

In [26]:
from nltk.stem.wordnet import WordNetLemmatizer

In [27]:
wnet_lemma = WordNetLemmatizer()

##### **Trying to understand the working of Wordnet Lemmatizer on some examples**

In [28]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautification','beauty','beautiful','unreliable',\
                 'unreliability','explain','explaines','explanation']

print("Given word along its Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("'{}' ---> '{}' ---> '{}'".format(word,snow_stem.stem(word),wnet_lemma.lemmatize(word)))

Given word along its Stems(Snowball) and Lemmas(Wordnet):

'cows' ---> 'cow' ---> 'cow'
'cow' ---> 'cow' ---> 'cow'
'boys' ---> 'boy' ---> 'boy'
'males' ---> 'male' ---> 'male'
'females' ---> 'femal' ---> 'female'
'accesses' ---> 'access' ---> 'access'
'accessed' ---> 'access' ---> 'accessed'
'caring' ---> 'care' ---> 'caring'
'cares' ---> 'care' ---> 'care'
'watches' ---> 'watch' ---> 'watch'
'watched' ---> 'watch' ---> 'watched'
'watching' ---> 'watch' ---> 'watching'
'is' ---> 'is' ---> 'is'
'are' ---> 'are' ---> 'are'
'were' ---> 'were' ---> 'were'
'we' ---> 'we' ---> 'we'
'did' ---> 'did' ---> 'did'
'does' ---> 'does' ---> 'doe'
'fruits' ---> 'fruit' ---> 'fruit'
'fruity' ---> 'fruiti' ---> 'fruity'
'tastes' ---> 'tast' ---> 'taste'
'tasty' ---> 'tasti' ---> 'tasty'
'beauties' ---> 'beauti' ---> 'beauty'
'beautification' ---> 'beautif' ---> 'beautification'
'beauty' ---> 'beauti' ---> 'beauty'
'beautiful' ---> 'beauti' ---> 'beautiful'
'unreliable' ---> 'unreli' ---> 'unreliable'


### **Here, I have observed that Lemmatizer works good only in the case of plural words that belongs to the category of 'Inflectional Bound Morphemes'. It doesn't really work with other kind or forms of words. On the other hand, Stemmers firmly believe on just chopping the tails of words which many times leads to incorrect or misspelt words.**

In [29]:
test_sentence = ("Waaooo, such an amazing match. I really enjoyed watching this event of WWE and pro wrestling. \
And, they are the best and my stress busters too.")

In [30]:
[wnet_lemma.lemmatize(word.replace(".","")) for word in test_sentence.split(" ")]      ## Lemmatizer

['Waaooo,',
 'such',
 'an',
 'amazing',
 'match',
 'I',
 'really',
 'enjoyed',
 'watching',
 'this',
 'event',
 'of',
 'WWE',
 'and',
 'pro',
 'wrestling',
 'And,',
 'they',
 'are',
 'the',
 'best',
 'and',
 'my',
 'stress',
 'buster',
 'too']

### **Nothing really changed here other than busters got converted to buster.**

In [31]:
[snow_stem.stem(word.replace(".","")) for word in test_sentence.split(" ")]      ## Stemmer

['waaooo,',
 'such',
 'an',
 'amaz',
 'match',
 'i',
 'realli',
 'enjoy',
 'watch',
 'this',
 'event',
 'of',
 'wwe',
 'and',
 'pro',
 'wrestl',
 'and,',
 'they',
 'are',
 'the',
 'best',
 'and',
 'my',
 'stress',
 'buster',
 'too']

### **Good amount of words tail chopping have been performed here and it looks like Stemmer lowercase the words before performing any action.**

### ***`Wordnet Lemmatizer with POS(Part-of-speech) tagging`***

##### **CASE-I**

In [32]:
wnet_lemma.lemmatize('watching',pos='v')

'watch'

In [33]:
wnet_lemma.lemmatize('watching',pos='a')

'watching'

In [34]:
wnet_lemma.lemmatize('watching',pos='s')

'watching'

In [35]:
wnet_lemma.lemmatize('watching',pos='n')

'watching'

In [36]:
wnet_lemma.lemmatize('watching',pos='r')

'watching'

### **In the previous and above examples lemmatizer didn't performed really well. Earlier 'watching' was not converted to 'watch' but here this conversion happened when we used the POS tag as 'verb'. This is beacuse we provided the correct ‘part-of-speech’ tag (POS tag) as the second argument to lemmatize(). Sometimes, the same word can have a multiple lemmas based on the meaning / context.**

##### **CASE-II :: POS-TAG as VERB**

In [37]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautes','beautification','beauty','beautiful',\
                 'unreliable','unreliability','explain','explaines','explanation','refuse','deny','good','better','best']

print("Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("{} --> '{}' ---> '{}'".format(nltk.pos_tag([word]),snow_stem.stem(word),wnet_lemma.lemmatize(word,pos='v')))

Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):

[('cows', 'NNS')] --> 'cow' ---> 'cow'
[('cow', 'NN')] --> 'cow' ---> 'cow'
[('boys', 'NNS')] --> 'boy' ---> 'boys'
[('males', 'NNS')] --> 'male' ---> 'males'
[('females', 'NNS')] --> 'femal' ---> 'females'
[('accesses', 'NNS')] --> 'access' ---> 'access'
[('accessed', 'VBN')] --> 'access' ---> 'access'
[('caring', 'VBG')] --> 'care' ---> 'care'
[('cares', 'NNS')] --> 'care' ---> 'care'
[('watches', 'NNS')] --> 'watch' ---> 'watch'
[('watched', 'VBN')] --> 'watch' ---> 'watch'
[('watching', 'VBG')] --> 'watch' ---> 'watch'
[('is', 'VBZ')] --> 'is' ---> 'be'
[('are', 'VBP')] --> 'are' ---> 'be'
[('were', 'VBD')] --> 'were' ---> 'be'
[('we', 'PRP')] --> 'we' ---> 'we'
[('did', 'VBD')] --> 'did' ---> 'do'
[('does', 'VBZ')] --> 'does' ---> 'do'
[('fruits', 'NNS')] --> 'fruit' ---> 'fruit'
[('fruity', 'NN')] --> 'fruiti' ---> 'fruity'
[('tastes', 'NNS')] --> 'tast' ---> 'taste'
[('tasty', 'NN')] --> 'tasti' ---> 'tasty

##### **CASE-III :: POS-TAG as ADVERB**

In [38]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautes','beautification','beauty','beautiful',\
                 'unreliable','unreliability','explain','explaines','explanation','refuse','deny','good','better','best']

print("Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("{} --> '{}' ---> '{}'".format(nltk.pos_tag([word]),snow_stem.stem(word),wnet_lemma.lemmatize(word,pos='r')))

Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):

[('cows', 'NNS')] --> 'cow' ---> 'cows'
[('cow', 'NN')] --> 'cow' ---> 'cow'
[('boys', 'NNS')] --> 'boy' ---> 'boys'
[('males', 'NNS')] --> 'male' ---> 'males'
[('females', 'NNS')] --> 'femal' ---> 'females'
[('accesses', 'NNS')] --> 'access' ---> 'accesses'
[('accessed', 'VBN')] --> 'access' ---> 'accessed'
[('caring', 'VBG')] --> 'care' ---> 'caring'
[('cares', 'NNS')] --> 'care' ---> 'cares'
[('watches', 'NNS')] --> 'watch' ---> 'watches'
[('watched', 'VBN')] --> 'watch' ---> 'watched'
[('watching', 'VBG')] --> 'watch' ---> 'watching'
[('is', 'VBZ')] --> 'is' ---> 'is'
[('are', 'VBP')] --> 'are' ---> 'are'
[('were', 'VBD')] --> 'were' ---> 'were'
[('we', 'PRP')] --> 'we' ---> 'we'
[('did', 'VBD')] --> 'did' ---> 'did'
[('does', 'VBZ')] --> 'does' ---> 'does'
[('fruits', 'NNS')] --> 'fruit' ---> 'fruits'
[('fruity', 'NN')] --> 'fruiti' ---> 'fruity'
[('tastes', 'NNS')] --> 'tast' ---> 'tastes'
[('tasty', 'NN')] 

### **Clearly, POS TAG plays a crucial role in understanding the meaning or context of the sentence.**

##### **Check the TAG type via Help**

In [39]:
nltk.help.upenn_tagset('RBR')

RBR: adverb, comparative
    further gloomier grander graver greater grimmer harder harsher
    healthier heavier higher however larger later leaner lengthier less-
    perfectly lesser lonelier longer louder lower more ...


In [40]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


### **This way we see some help on TAG codes.**

##### **CASE-IV :: POS-TAG of words in a sentence**

In [41]:
test_sent_tokens_tags = nltk.pos_tag(nltk.word_tokenize(re.sub('[^A-Za-z]+',' ',test_sentence)))
test_sent_tokens_tags

[('Waaooo', 'NNP'),
 ('such', 'PDT'),
 ('an', 'DT'),
 ('amazing', 'JJ'),
 ('match', 'NN'),
 ('I', 'PRP'),
 ('really', 'RB'),
 ('enjoyed', 'VBD'),
 ('watching', 'VBG'),
 ('this', 'DT'),
 ('event', 'NN'),
 ('of', 'IN'),
 ('WWE', 'NNP'),
 ('and', 'CC'),
 ('pro', 'JJ'),
 ('wrestling', 'NN'),
 ('And', 'CC'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('and', 'CC'),
 ('my', 'PRP$'),
 ('stress', 'NN'),
 ('busters', 'NNS'),
 ('too', 'RB')]

In [42]:
from nltk.corpus import wordnet

In [43]:
def get_wordnet_pos(tag):
    """Map POS tag to first character lemmatize() accepts"""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag,wordnet.NOUN)

In [44]:
get_wordnet_pos('better')

'n'

In [45]:
wnet_lemma.lemmatize('better',get_wordnet_pos('better'))

'better'

In [46]:
get_wordnet_pos('watching')

'n'

In [47]:
wnet_lemma.lemmatize('watching',get_wordnet_pos('watching'))

'watching'

In [48]:
print([wnet_lemma.lemmatize(word,get_wordnet_pos(tag)) for word,tag in test_sent_tokens_tags])

['Waaooo', 'such', 'an', 'amazing', 'match', 'I', 'really', 'enjoy', 'watch', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling', 'And', 'they', 'be', 'the', 'best', 'and', 'my', 'stress', 'buster', 'too']


### **Now, the results are quite good. Here, if you closely examine 'amazing' is unchanged because as per the context of the sentence it belongs to the Adjective class and in this category its lemma is 'amazing'(refer below cell). Therefore, it remains the same.**

In [49]:
wnet_lemma.lemmatize('amazing',pos='a')          ## a for adjective, n for noun, v for verb and r for adverb

'amazing'

#### **CASE-V :: Lets check the same with a sentence having some homonyms.**

In [50]:
homonym_sent = ("They refuse to permit us to obtain the refuse permit")

In [51]:
homonym_sent_tokens = nltk.word_tokenize(homonym_sent,preserve_line=False)
homonym_sent_tokens

['They',
 'refuse',
 'to',
 'permit',
 'us',
 'to',
 'obtain',
 'the',
 'refuse',
 'permit']

In [52]:
homonym_sent_tokens_tags = nltk.pos_tag(homonym_sent_tokens)
homonym_sent_tokens_tags

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [53]:
wnet_lemma.lemmatize('refuse','v')

'refuse'

In [54]:
print([wnet_lemma.lemmatize(word,get_wordnet_pos(tag)) for word,tag in homonym_sent_tokens_tags])

['They', 'refuse', 'to', 'permit', 'u', 'to', 'obtain', 'the', 'refuse', 'permit']


### **Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning "deny," while REFuse is a noun meaning "trash" (i.e. they are not homophones).**

### **Thus, we need to know which word is being used in order to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)**

### ***`TextBlob Lemmatizer`***

In [55]:
import textblob
from textblob import TextBlob, Word

##### **TextBlob :: Word Lemmatizer**

In [56]:
word='amazing'

In [57]:
txt_blob_word = Word(word,pos_tag='n')

In [58]:
txt_blob_word

'amazing'

##### **TextBlob :: Sentence Lemmatizer**

In [59]:
test_sentence

'Waaooo, such an amazing match. I really enjoyed watching this event of WWE and pro wrestling. And, they are the best and my stress busters too.'

In [60]:
txt_blob_sent = TextBlob(test_sentence,tokenizer=nltk.tokenize.NLTKWordTokenizer(),pos_tagger=textblob.taggers.NLTKTagger())

In [61]:
txt_blob_sent

TextBlob("Waaooo, such an amazing match. I really enjoyed watching this event of WWE and pro wrestling. And, they are the best and my stress busters too.")

##### **Some properties of TextBlob Sentence Lemmatizer**

In [62]:
txt_blob_sent.detect_language()         ## It returns the language code of the given text or sentence

'en'

In [63]:
txt_blob_sent.ngrams()                  ## It returns the n-grams by default n==3

[WordList(['Waaooo', 'such', 'an']),
 WordList(['such', 'an', 'amazing']),
 WordList(['an', 'amazing', 'match']),
 WordList(['amazing', 'match', 'I']),
 WordList(['match', 'I', 'really']),
 WordList(['I', 'really', 'enjoyed']),
 WordList(['really', 'enjoyed', 'watching']),
 WordList(['enjoyed', 'watching', 'this']),
 WordList(['watching', 'this', 'event']),
 WordList(['this', 'event', 'of']),
 WordList(['event', 'of', 'WWE']),
 WordList(['of', 'WWE', 'and']),
 WordList(['WWE', 'and', 'pro']),
 WordList(['and', 'pro', 'wrestling']),
 WordList(['pro', 'wrestling', 'And']),
 WordList(['wrestling', 'And', 'they']),
 WordList(['And', 'they', 'are']),
 WordList(['they', 'are', 'the']),
 WordList(['are', 'the', 'best']),
 WordList(['the', 'best', 'and']),
 WordList(['best', 'and', 'my']),
 WordList(['and', 'my', 'stress']),
 WordList(['my', 'stress', 'busters']),
 WordList(['stress', 'busters', 'too'])]

In [64]:
txt_blob_sent_tags = txt_blob_sent.pos_tags                  ## It returns the POS tags
txt_blob_sent_tags

[('Waaooo', 'NNP'),
 ('such', 'JJ'),
 ('an', 'DT'),
 ('amazing', 'JJ'),
 ('match', 'NN'),
 ('I', 'PRP'),
 ('really', 'RB'),
 ('enjoyed', 'VBD'),
 ('watching', 'VBG'),
 ('this', 'DT'),
 ('event', 'NN'),
 ('of', 'IN'),
 ('WWE', 'NNP'),
 ('and', 'CC'),
 ('pro', 'JJ'),
 ('wrestling', 'NN'),
 ('And', 'CC'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('and', 'CC'),
 ('my', 'PRP$'),
 ('stress', 'NN'),
 ('busters', 'NNS'),
 ('too', 'RB')]

In [65]:
txt_blob_sent.sentiment                ## This returns the sentiment score of the text

Sentiment(polarity=0.525, subjectivity=0.5999999999999999)

### **`Polarity` :: The key aspect of sentiment analysis is to analyze a body of text for understanding the opinion expressed by it. Typically, we quantify this sentiment with a positive or negative value, called polarity. The overall sentiment is often inferred as positive, neutral or negative from the sign of the polarity score.**

### **`Subjectivity` :: In natural language, subjectivity refers to an expression of opinions, evaluations, feelings, and speculations (Banfield, 1982; Wiebe, 1994) and thus incorporates sentiment.**

In [66]:
txt_blob_sent.tokenize()           ## This performs the tokenization

WordList(['Waaooo', ',', 'such', 'an', 'amazing', 'match.', 'I', 'really', 'enjoyed', 'watching', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling.', 'And', ',', 'they', 'are', 'the', 'best', 'and', 'my', 'stress', 'busters', 'too', '.'])

In [67]:
txt_blob_sent.tokens              ## This returns the tokens of the text

WordList(['Waaooo', ',', 'such', 'an', 'amazing', 'match.', 'I', 'really', 'enjoyed', 'watching', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling.', 'And', ',', 'they', 'are', 'the', 'best', 'and', 'my', 'stress', 'busters', 'too', '.'])

In [68]:
txt_blob_sent.words               ## This returns the words of the given text with no special characters

WordList(['Waaooo', 'such', 'an', 'amazing', 'match', 'I', 'really', 'enjoyed', 'watching', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling', 'And', 'they', 'are', 'the', 'best', 'and', 'my', 'stress', 'busters', 'too'])

In [69]:
" ".join([w.lemmatize() for w in txt_blob_sent.words])

'Waaooo such an amazing match I really enjoyed watching this event of WWE and pro wrestling And they are the best and my stress buster too'

### ***`TextBlob Lemmatizer with POS Tagging`***

In [70]:
" ".join([w.lemmatize(pos=get_wordnet_pos(tag)) for w,tag in txt_blob_sent_tags])

'Waaooo such an amazing match I really enjoy watch this event of WWE and pro wrestling And they be the best and my stress buster too'

### **Quite similar results here as well. Without POS Tagging it really doesn't perform any major action, however, with POS Tagging it work pretty well.**

### ***`TreeTaggerWrapper Lemmatizer`***

In [71]:
import treetaggerwrapper as ttw

In [72]:
tree_tagger = ttw.TreeTagger(TAGLANG='en',TAGDIR="C:\TreeTagger")

In [73]:
test_sentence

'Waaooo, such an amazing match. I really enjoyed watching this event of WWE and pro wrestling. And, they are the best and my stress busters too.'

In [74]:
tree_tagger_tags = tree_tagger.tag_text(text=re.sub('[^A-Za-z]',' ',test_sentence))

In [75]:
tree_tagger_tags

['Waaooo\tVVG\tWaaooo',
 'such\tPDT\tsuch',
 'an\tDT\tan',
 'amazing\tJJ\tamazing',
 'match\tNN\tmatch',
 'I\tPP\tI',
 'really\tRB\treally',
 'enjoyed\tVVD\tenjoy',
 'watching\tVVG\twatch',
 'this\tDT\tthis',
 'event\tNN\tevent',
 'of\tIN\tof',
 'WWE\tNP\tWWE',
 'and\tCC\tand',
 'pro\tJJ\tpro',
 'wrestling\tNN\twrestling',
 'And\tCC\tand',
 'they\tPP\tthey',
 'are\tVBP\tbe',
 'the\tDT\tthe',
 'best\tJJS\tgood',
 'and\tCC\tand',
 'my\tPP$\tmy',
 'stress\tNN\tstress',
 'busters\tNNS\tbuster',
 'too\tRB\ttoo']

### **This work fabulous. Really TreeTagger has done a great job here!! 'Best' is also changed to 'good'.**

In [76]:
tree_tagger_lemmas = [tag.split('\t')[-1] for tag in tree_tagger_tags]

In [77]:
print(tree_tagger_lemmas)

['Waaooo', 'such', 'an', 'amazing', 'match', 'I', 'really', 'enjoy', 'watch', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling', 'and', 'they', 'be', 'the', 'good', 'and', 'my', 'stress', 'buster', 'too']


### **In conclusion, I can say that out of all the Lemmatizers TreeTagger works the best. However, others also did a good job with the POS Tagging and without it all of the lemmatizers gives the diminish results. Whereas, majority of the Stemmers follow the principle of chopping the tails.**

### ***`Applying Lemmatizers on the dataframe dataset`***

In [78]:
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]"


#### **POS Tagging**

In [79]:
text_df['POS_Tags'] = text_df['Pre_processed_Message'].apply(lambda row: nltk.pos_tag(nltk.word_tokenize(row)))
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]"


#### **WordNet Lemmas with POS Tagging**

In [80]:
text_df['Wordnet_Lemmas'] = text_df['POS_Tags'].apply(lambda row: [wnet_lemma.lemmatize(word,get_wordnet_pos(tag)) for word,tag in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags,Wordnet_Lemmas
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, be, the, worlds, great, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, work, on, data, science, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, be, very, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, be, cheap, tasty, and, delicious]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, be, rama, shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, be, awsome, and, machine, learning, be, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, be, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, be, tasty, and, cheesy]"


#### **TextBlob POS Tagging**
##### **NOTE :: This step is not necessary as we have already generated the POS Tags.**

In [81]:
text_df['TextBlob_Tags'] = text_df['Pre_processed_Message'].apply(lambda row: TextBlob(text=row,\
                                                                                       tokenizer=nltk.tokenize.NLTKWordTokenizer(),\
                                                                                       pos_tagger=textblob.taggers.NLTKTagger()).pos_tags)
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags,Wordnet_Lemmas,TextBlob_Tags
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, be, the, worlds, great, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, work, on, data, science, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, be, very, affordable]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, be, cheap, tasty, and, delicious]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, be, rama, shyama]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, be, awsome, and, machine, learning, be, great]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, be, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, be, tasty, and, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]"


#### **TextBlob Lemmas with POS Tagging**

In [82]:
text_df['TextBlob_Lemmas'] = text_df['TextBlob_Tags'].apply(lambda row: [Word(string=word,pos_tag=tag).lemmatize() for word,tag in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags,Wordnet_Lemmas,TextBlob_Tags,TextBlob_Lemmas
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, be, the, worlds, great, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, is, the, world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, work, on, data, science, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, working, on, data, science, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, be, very, affordable]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, is, very, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, be, cheap, tasty, and, delicious]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, wa, cheap, tasty, and, delicious]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, loved, to, watch, wwe, and, pro, wrestling]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, be, rama, shyama]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, is, rama, shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, be, awsome, and, machine, learning, be, great]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, is, awsome, and, machine, learning, is, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, is, very, stylish, dynamic, and, ha, a, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, be, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, is, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, be, tasty, and, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, wa, tasty, and, cheesy]"


#### **TreeTaggerWrapper Tags**

In [83]:
text_df['TreeTagger_Tags'] = text_df['Pre_processed_Message'].apply(lambda row: tree_tagger.tag_text(text=re.sub('[^A-Za-z]',' ',row)))
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags,Wordnet_Lemmas,TextBlob_Tags,TextBlob_Lemmas,TreeTagger_Tags
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, be, the, worlds, great, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, is, the, world, greatest, mountain]","[this\tDT\tthis, is\tVBZ\tbe, the\tDT\tthe, worlds\tNNS\tworld, greatest\tJJ\tgreatest, mountain\tNN\tmountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, work, on, data, science, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, working, on, data, science, project]","[i\tNP\ti, love\tNN\tlove, working\tVVG\twork, on\tIN\ton, data\tNN\tdata, science\tNN\tscience, projects\tNNS\tproject]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, be, very, affordable]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, is, very, affordable]","[the\tDT\tthe, nexon\tNN\tnexon, car\tNN\tcar, is\tVBZ\tbe, very\tRB\tvery, affordable\tJJ\taffordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, be, cheap, tasty, and, delicious]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, wa, cheap, tasty, and, delicious]","[the\tDT\tthe, pizza\tNN\tpizza, was\tVBD\tbe, cheap\tRB\tcheap, tasty\tJJ\ttasty, and\tCC\tand, delicious\tJJ\tdelicious]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, loved, to, watch, wwe, and, pro, wrestling]","[in\tIN\tin, our\tPP$\tour, childhood\tNN\tchildhood, days\tNNS\tday, we\tPP\twe, loved\tVVD\tlove, to\tTO\tto, watch\tVV\twatch, wwe\tNP\twwe, and\tCC\tand, pro\tJJ\tpro, wrestling\tNN\twrestling]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, be, rama, shyama]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, is, rama, shyama]","[my\tPP$\tmy, name\tNN\tname, is\tVBZ\tbe, rama\tNP\tRama, shyama\tNP\tShyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, be, awsome, and, machine, learning, be, great]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, is, awsome, and, machine, learning, is, great]","[python\tNN\tpython, is\tVBZ\tbe, awsome\tJJ\tawsome, and\tCC\tand, machine\tNN\tmachine, learning\tNN\tlearning, is\tVBZ\tbe, great\tJJ\tgreat]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, is, very, stylish, dynamic, and, ha, a, strong, build]","[the\tDT\tthe, tata\tNN\ttata, nexon\tNN\tnexon, car\tNN\tcar, is\tVBZ\tbe, very\tRB\tvery, stylish\tJJ\tstylish, dynamic\tNN\tdynamic, and\tCC\tand, has\tVHZ\thave, a\tDT\ta, strong\tJJ\tstrong, build\tVV\tbuild]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, be, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, is, not, good]","[but\tCC\tbut, their\tPP$\ttheir, after\tRB\tafter, sales\tJJ\tsales, service\tNN\tservice, is\tVBZ\tbe, not\tRB\tnot, good\tJJ\tgood]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, be, tasty, and, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, wa, tasty, and, cheesy]","[the\tDT\tthe, pizza\tNN\tpizza, in\tIN\tin, the\tDT\tthe, party\tNN\tparty, was\tVBD\tbe, tasty\tJJ\ttasty, and\tCC\tand, cheesy\tJJ\tcheesy]"


#### **TreeTaggerWrapper Lemmas**

In [84]:
text_df['TreeTagger_Lemmas'] = text_df['TreeTagger_Tags'].apply(lambda row: [tag.split("\t")[-1] for tag in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,POS_Tags,Wordnet_Lemmas,TextBlob_Tags,TextBlob_Lemmas,TreeTagger_Tags,TreeTagger_Lemmas
0,0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, be, the, worlds, great, mountain]","[(this, DT), (is, VBZ), (the, DT), (worlds, JJ), (greatest, JJS), (mountain, NN)]","[this, is, the, world, greatest, mountain]","[this\tDT\tthis, is\tVBZ\tbe, the\tDT\tthe, worlds\tNNS\tworld, greatest\tJJ\tgreatest, mountain\tNN\tmountain]","[this, be, the, world, greatest, mountain]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, work, on, data, science, project]","[(i, NN), (love, VBP), (working, VBG), (on, IN), (data, NNS), (science, NN), (projects, NNS)]","[i, love, working, on, data, science, project]","[i\tNP\ti, love\tNN\tlove, working\tVVG\twork, on\tIN\ton, data\tNN\tdata, science\tNN\tscience, projects\tNNS\tproject]","[i, love, work, on, data, science, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, be, very, affordable]","[(the, DT), (nexon, JJ), (car, NN), (is, VBZ), (very, RB), (affordable, JJ)]","[the, nexon, car, is, very, affordable]","[the\tDT\tthe, nexon\tNN\tnexon, car\tNN\tcar, is\tVBZ\tbe, very\tRB\tvery, affordable\tJJ\taffordable]","[the, nexon, car, be, very, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, be, cheap, tasty, and, delicious]","[(the, DT), (pizza, NN), (was, VBD), (cheap, JJ), (tasty, NN), (and, CC), (delicious, JJ)]","[the, pizza, wa, cheap, tasty, and, delicious]","[the\tDT\tthe, pizza\tNN\tpizza, was\tVBD\tbe, cheap\tRB\tcheap, tasty\tJJ\ttasty, and\tCC\tand, delicious\tJJ\tdelicious]","[the, pizza, be, cheap, tasty, and, delicious]"
4,4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","[(in, IN), (our, PRP$), (childhood, NN), (days, NNS), (we, PRP), (loved, VBD), (to, TO), (watch, VB), (wwe, NN), (and, CC), (pro, JJ), (wrestling, NN)]","[in, our, childhood, day, we, loved, to, watch, wwe, and, pro, wrestling]","[in\tIN\tin, our\tPP$\tour, childhood\tNN\tchildhood, days\tNNS\tday, we\tPP\twe, loved\tVVD\tlove, to\tTO\tto, watch\tVV\twatch, wwe\tNP\twwe, and\tCC\tand, pro\tJJ\tpro, wrestling\tNN\twrestling]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]"
5,5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, be, rama, shyama]","[(my, PRP$), (name, NN), (is, VBZ), (rama, JJ), (shyama, NN)]","[my, name, is, rama, shyama]","[my\tPP$\tmy, name\tNN\tname, is\tVBZ\tbe, rama\tNP\tRama, shyama\tNP\tShyama]","[my, name, be, Rama, Shyama]"
6,6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, be, awsome, and, machine, learning, be, great]","[(python, NN), (is, VBZ), (awsome, JJ), (and, CC), (machine, NN), (learning, NN), (is, VBZ), (great, JJ)]","[python, is, awsome, and, machine, learning, is, great]","[python\tNN\tpython, is\tVBZ\tbe, awsome\tJJ\tawsome, and\tCC\tand, machine\tNN\tmachine, learning\tNN\tlearning, is\tVBZ\tbe, great\tJJ\tgreat]","[python, be, awsome, and, machine, learning, be, great]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","[(the, DT), (tata, NN), (nexon, NN), (car, NN), (is, VBZ), (very, RB), (stylish, JJ), (dynamic, NN), (and, CC), (has, VBZ), (a, DT), (strong, JJ), (build, NN)]","[the, tata, nexon, car, is, very, stylish, dynamic, and, ha, a, strong, build]","[the\tDT\tthe, tata\tNN\ttata, nexon\tNN\tnexon, car\tNN\tcar, is\tVBZ\tbe, very\tRB\tvery, stylish\tJJ\tstylish, dynamic\tNN\tdynamic, and\tCC\tand, has\tVHZ\thave, a\tDT\ta, strong\tJJ\tstrong, build\tVV\tbuild]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, be, not, good]","[(but, CC), (their, PRP$), (after, IN), (sales, NNS), (service, NN), (is, VBZ), (not, RB), (good, JJ)]","[but, their, after, sale, service, is, not, good]","[but\tCC\tbut, their\tPP$\ttheir, after\tRB\tafter, sales\tJJ\tsales, service\tNN\tservice, is\tVBZ\tbe, not\tRB\tnot, good\tJJ\tgood]","[but, their, after, sales, service, be, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, be, tasty, and, cheesy]","[(the, DT), (pizza, NN), (in, IN), (the, DT), (party, NN), (was, VBD), (tasty, JJ), (and, CC), (cheesy, NN)]","[the, pizza, in, the, party, wa, tasty, and, cheesy]","[the\tDT\tthe, pizza\tNN\tpizza, in\tIN\tin, the\tDT\tthe, party\tNN\tparty, was\tVBD\tbe, tasty\tJJ\ttasty, and\tCC\tand, cheesy\tJJ\tcheesy]","[the, pizza, in, the, party, be, tasty, and, cheesy]"


## ***Stemmer and Lemmatizers Comparison***

In [85]:
text_df.columns

Index(['Id', 'Pre_processed_Message', 'Tokens', 'Porter_Stems',
       'Snowball_Stems', 'Lancaster_Stems', 'POS_Tags', 'Wordnet_Lemmas',
       'TextBlob_Tags', 'TextBlob_Lemmas', 'TreeTagger_Tags',
       'TreeTagger_Lemmas'],
      dtype='object')

In [86]:
text_df[['Pre_processed_Message', 'Tokens', 'Porter_Stems','Snowball_Stems', 'Lancaster_Stems',]]

Unnamed: 0,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems
0,this is the worlds greatest mountain,"[worlds, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]","[world, greatest, mountain]"
1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]"
2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[childhood, days, loved, watch, wwe, pro, wrestling]","[childhood, day, love, watch, wwe, pro, wrestl]","[childhood, day, love, watch, wwe, pro, wrestl]","[child, day, lov, watch, wwe, pro, wrestl]"
5,my name is rama shyama,"[name, rama, shyama]","[name, rama, shyama]","[name, rama, shyama]","[nam, ram, shyam]"
6,python is awsome and machine learning is great,"[python, awsome, machine, learning, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, great]","[python, awsom, machin, learn, gre]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]"
9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]"


In [87]:
text_df[['Pre_processed_Message', 'Wordnet_Lemmas', 'TextBlob_Lemmas', 'TreeTagger_Lemmas']]

Unnamed: 0,Pre_processed_Message,Wordnet_Lemmas,TextBlob_Lemmas,TreeTagger_Lemmas
0,this is the worlds greatest mountain,"[this, be, the, worlds, great, mountain]","[this, is, the, world, greatest, mountain]","[this, be, the, world, greatest, mountain]"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","[i, love, working, on, data, science, project]","[i, love, work, on, data, science, project]"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","[the, nexon, car, is, very, affordable]","[the, nexon, car, be, very, affordable]"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","[the, pizza, wa, cheap, tasty, and, delicious]","[the, pizza, be, cheap, tasty, and, delicious]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","[in, our, childhood, day, we, loved, to, watch, wwe, and, pro, wrestling]","[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]"
5,my name is rama shyama,"[my, name, be, rama, shyama]","[my, name, is, rama, shyama]","[my, name, be, Rama, Shyama]"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","[python, is, awsome, and, machine, learning, is, great]","[python, be, awsome, and, machine, learning, be, great]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","[the, tata, nexon, car, is, very, stylish, dynamic, and, ha, a, strong, build]","[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]"
8,but their after sales service is not good,"[but, their, after, sale, service, be, not, good]","[but, their, after, sale, service, is, not, good]","[but, their, after, sales, service, be, not, good]"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","[the, pizza, in, the, party, wa, tasty, and, cheesy]","[the, pizza, in, the, party, be, tasty, and, cheesy]"


### **After performing stemming and lematization I have decided to go ahead with the "TreeTagger" Lemmas. So, before performing the featurization we need to remove the stopwords from lemmas.**

In [88]:
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

In [89]:
cleaned_df = text_df[['Pre_processed_Message','TreeTagger_Lemmas']].copy(deep=True)

In [158]:
cleaned_df['Lemmas'] = cleaned_df['TreeTagger_Lemmas'].apply(lambda row: str(",".join([word for word in row if word not in eng_stopwords])))
cleaned_df

Unnamed: 0,Pre_processed_Message,TreeTagger_Lemmas,Lemmas,Spell_checked_Lemmas
0,this is the worlds greatest mountain,"[this, be, the, world, greatest, mountain]","world,greatest,mountain","[world, greatest, mountain]"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","love,work,data,science,project","[love, work, data, science, project]"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","nexon,car,affordable","[nixon, car, affordable]"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","pizza,cheap,tasty,delicious","[pizza, cheap, tasty, delicious]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","childhood,day,love,watch,wwe,pro,wrestling","[childhood, day, love, watch, we, pro, wrestling]"
5,my name is rama shyama,"[my, name, be, Rama, Shyama]","name,Rama,Shyama","[name, Rama, shama]"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","python,awsome,machine,learning,great","[python, awesome, machine, learning, great]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","tata,nexon,car,stylish,dynamic,strong,build","[tata, nixon, car, stylish, dynamic, strong, build]"
8,but their after sales service is not good,"[but, their, after, sales, service, be, not, good]","sales,service,not,good","[sales, service, not, good]"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","pizza,party,tasty,cheesy","[pizza, party, tasty, cheesy]"


### **Spell Checking**

#### **`Pyspellchecker`**

In [91]:
from spellchecker import SpellChecker

In [92]:
dir(SpellChecker)

['_SpellChecker__edit_distance_alt',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_case_sensitive',
 '_check_if_should_check',
 '_distance',
 '_tokenizer',
 '_word_frequency',
 'candidates',
 'correction',
 'distance',
 'edit_distance_1',
 'edit_distance_2',
 'export',
 'known',
 'split_words',
 'unknown',
 'word_frequency',
 'word_probability']

In [93]:
spell_check = SpellChecker()

In [94]:
[f'{word}:{spell_check.correction(word)}:{np.round(spell_check.word_probability(word),9)}' for word in ['forest','alw','cripy','hello']]

['forest:forest:7.4901e-05',
 'alw:alw:6.2e-08',
 'cripy:crispy:0.0',
 'hello:hello:3.9826e-05']

#### **Spell Checker on above generated Lemmas**

In [148]:
cleaned_df

Unnamed: 0,Pre_processed_Message,TreeTagger_Lemmas,Lemmas
0,this is the worlds greatest mountain,"[this, be, the, world, greatest, mountain]","world,greatest,mountain"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","love,work,data,science,project"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","nexon,car,affordable"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","pizza,cheap,tasty,delicious"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","childhood,day,love,watch,wwe,pro,wrestling"
5,my name is rama shyama,"[my, name, be, Rama, Shyama]","name,Rama,Shyama"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","python,awsome,machine,learning,great"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","tata,nexon,car,stylish,dynamic,strong,build"
8,but their after sales service is not good,"[but, their, after, sales, service, be, not, good]","sales,service,not,good"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","pizza,party,tasty,cheesy"


In [159]:
lemmas = cleaned_df['Lemmas'].apply(lambda row: row.split(","))
cleaned_df['Spell_checked_Lemmas'] = lemmas.apply(lambda row : [spell_check.correction(word) for word in row])
cleaned_df

Unnamed: 0,Pre_processed_Message,TreeTagger_Lemmas,Lemmas,Spell_checked_Lemmas
0,this is the worlds greatest mountain,"[this, be, the, world, greatest, mountain]","world,greatest,mountain","[world, greatest, mountain]"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","love,work,data,science,project","[love, work, data, science, project]"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","nexon,car,affordable","[nixon, car, affordable]"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","pizza,cheap,tasty,delicious","[pizza, cheap, tasty, delicious]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","childhood,day,love,watch,wwe,pro,wrestling","[childhood, day, love, watch, we, pro, wrestling]"
5,my name is rama shyama,"[my, name, be, Rama, Shyama]","name,Rama,Shyama","[name, Rama, shama]"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","python,awsome,machine,learning,great","[python, awesome, machine, learning, great]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","tata,nexon,car,stylish,dynamic,strong,build","[tata, nixon, car, stylish, dynamic, strong, build]"
8,but their after sales service is not good,"[but, their, after, sales, service, be, not, good]","sales,service,not,good","[sales, service, not, good]"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","pizza,party,tasty,cheesy","[pizza, party, tasty, cheesy]"


### **Spell Checking has introduced many changes in the lemmas which doesn't seems to be correct. All changes except 'cripy' to 'crispy' are unnecessary.**

In [183]:
cleaned_df['Lemmas'].iloc[-1] = ','.join([spell_check.correction(word) if word == 'cripy' else word for word in cleaned_df['Lemmas'].iloc[-1].split(",")])

In [184]:
cleaned_df

Unnamed: 0,Pre_processed_Message,TreeTagger_Lemmas,Lemmas,Spell_checked_Lemmas
0,this is the worlds greatest mountain,"[this, be, the, world, greatest, mountain]","world,greatest,mountain","[world, greatest, mountain]"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","love,work,data,science,project","[love, work, data, science, project]"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","nexon,car,affordable","[nixon, car, affordable]"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","pizza,cheap,tasty,delicious","[pizza, cheap, tasty, delicious]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","childhood,day,love,watch,wwe,pro,wrestling","[childhood, day, love, watch, we, pro, wrestling]"
5,my name is rama shyama,"[my, name, be, Rama, Shyama]","name,Rama,Shyama","[name, Rama, shama]"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","python,awsome,machine,learning,great","[python, awesome, machine, learning, great]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","tata,nexon,car,stylish,dynamic,strong,build","[tata, nixon, car, stylish, dynamic, strong, build]"
8,but their after sales service is not good,"[but, their, after, sales, service, be, not, good]","sales,service,not,good","[sales, service, not, good]"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","pizza,party,tasty,cheesy","[pizza, party, tasty, cheesy]"


#### **`TextBlob spellchecker`**

In [185]:
import textblob

In [188]:
for word in list(cleaned_df.iloc[10,1]):
    text = textblob.TextBlob(word)
    print(word , Word.spellcheck(text))

the [('the', 1.0)]
dominoz [('dominion', 0.7619047619047619), ('domingo', 0.23809523809523808)]
tacco [('tobacco', 0.6), ('tact', 0.175), ('sacro', 0.0625), ('tack', 0.0375), ('tacit', 0.0375), ('tache', 0.025), ('facto', 0.025), ('tarso', 0.0125), ('sarco', 0.0125), ('marco', 0.0125)]
be [('be', 1.0)]
as [('as', 1.0)]
always [('always', 1.0)]
cripy [('cried', 0.32342857142857145), ('city', 0.2057142857142857), ('cry', 0.16342857142857142), ('crime', 0.06971428571428571), ('copy', 0.052571428571428575), ('cries', 0.03542857142857143), ('crops', 0.027428571428571427), ('crop', 0.027428571428571427), ('grip', 0.014857142857142857), ('trip', 0.013714285714285714), ('crazy', 0.011428571428571429), ('crept', 0.010285714285714285), ('ripe', 0.006857142857142857), ('crisp', 0.006857142857142857), ('clip', 0.005714285714285714), ('chip', 0.005714285714285714), ('crile', 0.0034285714285714284), ('drip', 0.002285714285714286), ('crib', 0.002285714285714286), ('clips', 0.002285714285714286), ('tr

In [189]:
Word.spellcheck(text)

[('fingerlicious', 0.0)]

# ***`Featurization`***
- **It is the technique in which the words are converted into machine understandable format means in numbers.**
### **1. BAG of WORDS (BOW)**

In [190]:
cv = CountVectorizer(lowercase=True)

In [191]:
BOW = cv.fit_transform(cleaned_df['Lemmas'])

In [192]:
BOW

<11x45 sparse matrix of type '<class 'numpy.int64'>'
	with 50 stored elements in Compressed Sparse Row format>

In [193]:
pd.set_option('display.max_columns',100)

In [199]:
bow_features = pd.DataFrame(BOW.toarray(),columns=cv.get_feature_names())
bow_features

Unnamed: 0,affordable,always,awsome,build,car,cheap,cheesy,childhood,crispy,data,day,delicious,dominoz,dynamic,fingerlicious,good,great,greatest,learning,love,machine,mountain,name,nexon,not,party,pizza,pro,project,python,rama,sales,science,service,shyama,strong,stylish,tacco,tasty,tata,watch,work,world,wrestling,wwe
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [196]:
np.array(cv.get_feature_names()).shape

(45,)

In [197]:
print(cv.get_feature_names())

['affordable', 'always', 'awsome', 'build', 'car', 'cheap', 'cheesy', 'childhood', 'crispy', 'data', 'day', 'delicious', 'dominoz', 'dynamic', 'fingerlicious', 'good', 'great', 'greatest', 'learning', 'love', 'machine', 'mountain', 'name', 'nexon', 'not', 'party', 'pizza', 'pro', 'project', 'python', 'rama', 'sales', 'science', 'service', 'shyama', 'strong', 'stylish', 'tacco', 'tasty', 'tata', 'watch', 'work', 'world', 'wrestling', 'wwe']


In [198]:
print(cv.get_stop_words())       ## Here, in countvectoriser we can also initialize the stopwords but in this case I have kept it blank

None


### **2. N-grams**

##### **CASE-I :: Unigrams and Bi-grams**

In [200]:
n_gram_cv = CountVectorizer(ngram_range=(1,2))

In [206]:
ngrams_features = n_gram_cv.fit_transform(cleaned_df['Lemmas'])

In [207]:
ngrams_features

<11x83 sparse matrix of type '<class 'numpy.int64'>'
	with 89 stored elements in Compressed Sparse Row format>

In [209]:
cleaned_df

Unnamed: 0,Pre_processed_Message,TreeTagger_Lemmas,Lemmas,Spell_checked_Lemmas
0,this is the worlds greatest mountain,"[this, be, the, world, greatest, mountain]","world,greatest,mountain","[world, greatest, mountain]"
1,i love working on data science projects,"[i, love, work, on, data, science, project]","love,work,data,science,project","[love, work, data, science, project]"
2,the nexon car is very affordable,"[the, nexon, car, be, very, affordable]","nexon,car,affordable","[nixon, car, affordable]"
3,the pizza was cheap tasty and delicious,"[the, pizza, be, cheap, tasty, and, delicious]","pizza,cheap,tasty,delicious","[pizza, cheap, tasty, delicious]"
4,in our childhood days we loved to watch wwe and pro wrestling,"[in, our, childhood, day, we, love, to, watch, wwe, and, pro, wrestling]","childhood,day,love,watch,wwe,pro,wrestling","[childhood, day, love, watch, we, pro, wrestling]"
5,my name is rama shyama,"[my, name, be, Rama, Shyama]","name,Rama,Shyama","[name, Rama, shama]"
6,python is awsome and machine learning is great,"[python, be, awsome, and, machine, learning, be, great]","python,awsome,machine,learning,great","[python, awesome, machine, learning, great]"
7,the tata nexon car is very stylish dynamic and has a strong build,"[the, tata, nexon, car, be, very, stylish, dynamic, and, have, a, strong, build]","tata,nexon,car,stylish,dynamic,strong,build","[tata, nixon, car, stylish, dynamic, strong, build]"
8,but their after sales service is not good,"[but, their, after, sales, service, be, not, good]","sales,service,not,good","[sales, service, not, good]"
9,the pizza in the party was tasty and cheesy,"[the, pizza, in, the, party, be, tasty, and, cheesy]","pizza,party,tasty,cheesy","[pizza, party, tasty, cheesy]"


In [212]:
print(n_gram_cv.get_feature_names())

['affordable', 'always', 'always crispy', 'awsome', 'awsome machine', 'build', 'car', 'car affordable', 'car stylish', 'cheap', 'cheap tasty', 'cheesy', 'childhood', 'childhood day', 'crispy', 'crispy fingerlicious', 'data', 'data science', 'day', 'day love', 'delicious', 'dominoz', 'dominoz tacco', 'dynamic', 'dynamic strong', 'fingerlicious', 'good', 'great', 'greatest', 'greatest mountain', 'learning', 'learning great', 'love', 'love watch', 'love work', 'machine', 'machine learning', 'mountain', 'name', 'name rama', 'nexon', 'nexon car', 'not', 'not good', 'party', 'party tasty', 'pizza', 'pizza cheap', 'pizza party', 'pro', 'pro wrestling', 'project', 'python', 'python awsome', 'rama', 'rama shyama', 'sales', 'sales service', 'science', 'science project', 'service', 'service not', 'shyama', 'strong', 'strong build', 'stylish', 'stylish dynamic', 'tacco', 'tacco always', 'tasty', 'tasty cheesy', 'tasty delicious', 'tata', 'tata nexon', 'watch', 'watch wwe', 'work', 'work data', 'wo

In [208]:
ngrams_features = pd.DataFrame(ngrams_features.toarray(),columns=n_gram_cv.get_feature_names())
ngrams_features

Unnamed: 0,affordable,always,always crispy,awsome,awsome machine,build,car,car affordable,car stylish,cheap,cheap tasty,cheesy,childhood,childhood day,crispy,crispy fingerlicious,data,data science,day,day love,delicious,dominoz,dominoz tacco,dynamic,dynamic strong,fingerlicious,good,great,greatest,greatest mountain,learning,learning great,love,love watch,love work,machine,machine learning,mountain,name,name rama,nexon,nexon car,not,not good,party,party tasty,pizza,pizza cheap,pizza party,pro,pro wrestling,project,python,python awsome,rama,rama shyama,sales,sales service,science,science project,service,service not,shyama,strong,strong build,stylish,stylish dynamic,tacco,tacco always,tasty,tasty cheesy,tasty delicious,tata,tata nexon,watch,watch wwe,work,work data,world,world greatest,wrestling,wwe,wwe pro
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0


##### **CASE-II :: Unigrams, Bi-grams and Tri-grams**

In [213]:
ngrams_cv3 = CountVectorizer(ngram_range=(1,3))

In [214]:
ngrams_cv3_features = ngrams_cv3.fit_transform(cleaned_df['Lemmas'])

In [215]:
ngrams_cv3_features

<11x111 sparse matrix of type '<class 'numpy.int64'>'
	with 117 stored elements in Compressed Sparse Row format>

In [216]:
ngrams_cv3_features = pd.DataFrame(ngrams_cv3_features.toarray(),columns=ngrams_cv3.get_feature_names())
ngrams_cv3_features

Unnamed: 0,affordable,always,always crispy,always crispy fingerlicious,awsome,awsome machine,awsome machine learning,build,car,car affordable,car stylish,car stylish dynamic,cheap,cheap tasty,cheap tasty delicious,cheesy,childhood,childhood day,childhood day love,crispy,crispy fingerlicious,data,data science,data science project,day,day love,day love watch,delicious,dominoz,dominoz tacco,dominoz tacco always,dynamic,dynamic strong,dynamic strong build,fingerlicious,good,great,greatest,greatest mountain,learning,learning great,love,love watch,love watch wwe,love work,love work data,machine,machine learning,machine learning great,mountain,...,party tasty cheesy,pizza,pizza cheap,pizza cheap tasty,pizza party,pizza party tasty,pro,pro wrestling,project,python,python awsome,python awsome machine,rama,rama shyama,sales,sales service,sales service not,science,science project,service,service not,service not good,shyama,strong,strong build,stylish,stylish dynamic,stylish dynamic strong,tacco,tacco always,tacco always crispy,tasty,tasty cheesy,tasty delicious,tata,tata nexon,tata nexon car,watch,watch wwe,watch wwe pro,work,work data,work data science,world,world greatest,world greatest mountain,wrestling,wwe,wwe pro,wwe pro wrestling
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [217]:
ngrams_cv3.get_feature_names()

['affordable',
 'always',
 'always crispy',
 'always crispy fingerlicious',
 'awsome',
 'awsome machine',
 'awsome machine learning',
 'build',
 'car',
 'car affordable',
 'car stylish',
 'car stylish dynamic',
 'cheap',
 'cheap tasty',
 'cheap tasty delicious',
 'cheesy',
 'childhood',
 'childhood day',
 'childhood day love',
 'crispy',
 'crispy fingerlicious',
 'data',
 'data science',
 'data science project',
 'day',
 'day love',
 'day love watch',
 'delicious',
 'dominoz',
 'dominoz tacco',
 'dominoz tacco always',
 'dynamic',
 'dynamic strong',
 'dynamic strong build',
 'fingerlicious',
 'good',
 'great',
 'greatest',
 'greatest mountain',
 'learning',
 'learning great',
 'love',
 'love watch',
 'love watch wwe',
 'love work',
 'love work data',
 'machine',
 'machine learning',
 'machine learning great',
 'mountain',
 'name',
 'name rama',
 'name rama shyama',
 'nexon',
 'nexon car',
 'nexon car affordable',
 'nexon car stylish',
 'not',
 'not good',
 'party',
 'party tasty',
 'pa

##### **CASE-II :: Only Tri-grams**

In [218]:
ngrams_cv4 = CountVectorizer(ngram_range=(3,3))

In [219]:
ngrams_cv4_features = ngrams_cv4.fit_transform(cleaned_df['Lemmas'])

In [220]:
ngrams_cv4_features

<11x28 sparse matrix of type '<class 'numpy.int64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [221]:
ngrams_cv4_features = pd.DataFrame(ngrams_cv4_features.toarray(),columns=ngrams_cv4.get_feature_names())
ngrams_cv4_features

Unnamed: 0,always crispy fingerlicious,awsome machine learning,car stylish dynamic,cheap tasty delicious,childhood day love,data science project,day love watch,dominoz tacco always,dynamic strong build,love watch wwe,love work data,machine learning great,name rama shyama,nexon car affordable,nexon car stylish,party tasty cheesy,pizza cheap tasty,pizza party tasty,python awsome machine,sales service not,service not good,stylish dynamic strong,tacco always crispy,tata nexon car,watch wwe pro,work data science,world greatest mountain,wwe pro wrestling
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0


## **Different Tokenizers and Vectorizers**