# ***`Understand NLP`***

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import scipy
import nltk

%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os
import sys

#### **Reading the files from current location**

In [2]:
import glob

In [3]:
file_names = [file_name.split("\\")[-1] for file_name in glob.glob(os.getcwd()+'\\doc*.txt')]

In [4]:
file_names

['doc1.txt', 'doc2.txt']

#### **Creating the corpus from files**

In [5]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [6]:
corpus = PlaintextCorpusReader(root=os.getcwd(),fileids=file_names)

In [7]:
corpus.fileids()

['doc1.txt', 'doc2.txt']

#### ***`Courpus Paras`***

In [8]:
print([para for para in corpus.paras()])

[[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."']], [['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]]


#### ***`Courpus Sentences`***

In [9]:
corpus_sents = [sent for sent in corpus.sents()]
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


#### ***`Courpus Words`***

In [10]:
corpus_words = [word for word in corpus.words()]
print(corpus_words)

['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."', '"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.', 'But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']


#### ***`English Stopwords`***

In [11]:
eng_stopwords = stopwords.words('english')

In [12]:
for word in ['not','nor','no']:
    eng_stopwords.remove(word)

In [13]:
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

#### ***`Cleaning the Corpus`***
- ##### **Removing special characters**
- ##### **Removing unwanted spaces**
- ##### **Lower case the words**
- ##### **Tokenizing the words**

In [14]:
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


In [15]:
cleaned_sent = []
for sent in corpus_sents:
    process_sent = [re.sub('[^A-Za-z]+', ' ', str(sent)).strip().lower()]
    cleaned_sent.append(process_sent)
    
print(cleaned_sent)

[['my name is rajesh sharma'], ['i love working on data science projects'], ['the nexon car is very affordable'], ['the pizza was cheap tasty and delicious'], ['the dominoz pizza is tasty and loaded'], ['my name is raman revti sharma'], ['i love doing data analytics'], ['the tata nexon car is very stylish dynamic and has a strong build'], ['but their after sales service is not good'], ['the pizza in the party was tasty and cheesy'], ['the dominoz tacco is always cripy and fingerlicious']]


#### ***`Storing text messages in a DataFrame`***

In [16]:
pd.set_option('display.max_colwidth',2000)

In [17]:
text_df = pd.DataFrame(cleaned_sent).reset_index()
text_df.columns = ['Id','Pre_processed_Message']
text_df

Unnamed: 0,Id,Pre_processed_Message
0,0,my name is rajesh sharma
1,1,i love working on data science projects
2,2,the nexon car is very affordable
3,3,the pizza was cheap tasty and delicious
4,4,the dominoz pizza is tasty and loaded
5,5,my name is raman revti sharma
6,6,i love doing data analytics
7,7,the tata nexon car is very stylish dynamic and has a strong build
8,8,but their after sales service is not good
9,9,the pizza in the party was tasty and cheesy


#### ***`Removing Stopwords and Tokenization`***

In [18]:
text_df['Tokens'] = text_df['Pre_processed_Message'].apply(lambda row: [word for word in row.split(" ") if word not in eng_stopwords])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens
0,0,my name is rajesh sharma,"[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]"


### ***`Stemming`***
#### ***`Porter Stemmer`***

In [19]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [20]:
port_stem = PorterStemmer()

In [21]:
text_df['Porter_Stems'] = text_df['Tokens'].apply(lambda row: [port_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]"


### **Porter Stemmer is simply chopping off the tails of the words. Not a good way!!**

#### ***`Snowball Stemmer`***

In [22]:
snow_stem = SnowballStemmer(language='english',ignore_stopwords=True)

In [23]:
text_df['Snowball_Stems'] = text_df['Tokens'].apply(lambda row: [snow_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]","[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]","[dominoz, pizza, tasti, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]","[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]","[love, data, analyt]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]"


### **Snowball Stemmer is considered as a strongest method for taking a word to its root form. However, here, I didn't oberserved any difference with Porter Stemmer.**

#### ***`Lancaster Stemmer`***

In [24]:
lanc_stem = LancasterStemmer()

In [25]:
text_df['Lancaster_Stems'] = text_df['Tokens'].apply(lambda row: [lanc_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]","[name, rajesh, sharma]","[nam, rajesh, sharm]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]","[dominoz, pizza, tasti, load]","[dominoz, pizz, tasty, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]","[name, raman, revti, sharma]","[nam, ram, revt, sharm]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]","[love, data, analyt]","[lov, dat, analys]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]"


### **Definitely Lancaster Stemmer is not a good approach to bring a word to its root form. It is chopping-off the vowels from the tails of the words.**

### ***`Lemmatizers`***
##### **Reference Links**

- https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    
- https://www.nltk.org/book/ch05.html


#### ***`Wordnet Lemmatizer`***

In [26]:
from nltk.stem.wordnet import WordNetLemmatizer

In [27]:
wnet_lemma = WordNetLemmatizer()

##### **Trying to understand the working of Wordnet Lemmatizer on some examples**

In [28]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautification','beauty','beautiful','unreliable',\
                 'unreliability','explain','explaines','explanation']

print("Given word along its Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("'{}' ---> '{}' ---> '{}'".format(word,snow_stem.stem(word),wnet_lemma.lemmatize(word)))

Given word along its Stems(Snowball) and Lemmas(Wordnet):

'cows' ---> 'cow' ---> 'cow'
'cow' ---> 'cow' ---> 'cow'
'boys' ---> 'boy' ---> 'boy'
'males' ---> 'male' ---> 'male'
'females' ---> 'femal' ---> 'female'
'accesses' ---> 'access' ---> 'access'
'accessed' ---> 'access' ---> 'accessed'
'caring' ---> 'care' ---> 'caring'
'cares' ---> 'care' ---> 'care'
'watches' ---> 'watch' ---> 'watch'
'watched' ---> 'watch' ---> 'watched'
'watching' ---> 'watch' ---> 'watching'
'is' ---> 'is' ---> 'is'
'are' ---> 'are' ---> 'are'
'were' ---> 'were' ---> 'were'
'we' ---> 'we' ---> 'we'
'did' ---> 'did' ---> 'did'
'does' ---> 'does' ---> 'doe'
'fruits' ---> 'fruit' ---> 'fruit'
'fruity' ---> 'fruiti' ---> 'fruity'
'tastes' ---> 'tast' ---> 'taste'
'tasty' ---> 'tasti' ---> 'tasty'
'beauties' ---> 'beauti' ---> 'beauty'
'beautification' ---> 'beautif' ---> 'beautification'
'beauty' ---> 'beauti' ---> 'beauty'
'beautiful' ---> 'beauti' ---> 'beautiful'
'unreliable' ---> 'unreli' ---> 'unreliable'


### **Here, I have observed that Lemmatizer works good only in the case of plural words that belongs to the category of 'Inflectional Bound Morphemes'. It doesn't really work with other kind or forms of words. On the other hand, Stemmers firmly believe on chopping the tails of words which many times leads to incorrect words.**

In [29]:
test_sentence = ("Waaooo, such an amazing match. I really enjoyed watching this event of WWE and pro wrestling. \
And, they are my stress busters too.")

In [30]:
[wnet_lemma.lemmatize(word.replace(".","")) for word in test_sentence.split(" ")]      ## Lemmatizer

['Waaooo,',
 'such',
 'an',
 'amazing',
 'match',
 'I',
 'really',
 'enjoyed',
 'watching',
 'this',
 'event',
 'of',
 'WWE',
 'and',
 'pro',
 'wrestling',
 'And,',
 'they',
 'are',
 'my',
 'stress',
 'buster',
 'too']

### **Nothing really changed here other than busters got converted to buster.**

In [31]:
[snow_stem.stem(word.replace(".","")) for word in test_sentence.split(" ")]      ## Stemmer

['waaooo,',
 'such',
 'an',
 'amaz',
 'match',
 'i',
 'realli',
 'enjoy',
 'watch',
 'this',
 'event',
 'of',
 'wwe',
 'and',
 'pro',
 'wrestl',
 'and,',
 'they',
 'are',
 'my',
 'stress',
 'buster',
 'too']

### **Good amount of words tail chopping have been performed here and it looks like Stemmer lowercase the words before performing any action.**

### ***`Wordnet Lemmatizer with POS(Part-of-speech) tagging`***

##### **CASE-I**

In [32]:
wnet_lemma.lemmatize('watching',pos='v')

'watch'

In [33]:
wnet_lemma.lemmatize('watching',pos='a')

'watching'

In [34]:
wnet_lemma.lemmatize('watching',pos='s')

'watching'

In [35]:
wnet_lemma.lemmatize('watching',pos='n')

'watching'

In [36]:
wnet_lemma.lemmatize('watching',pos='r')

'watching'

### **In the previous and above examples lemmatizer didn't performed really well. Earlier 'watching' was not converted to 'watch' but here it got converted to 'watch' when we used the POS tag as 'verb'. This is beacuse we provided the correct ‘part-of-speech’ tag (POS tag) as the second argument to lemmatize(). Sometimes, the same word can have a multiple lemmas based on the meaning / context.**

##### **CASE-II :: POS-TAG as VERB**

In [37]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautes','beautification','beauty','beautiful',\
                 'unreliable','unreliability','explain','explaines','explanation','refuse','deny','good','better','best']

print("Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("{} --> '{}' ---> '{}'".format(nltk.pos_tag([word]),snow_stem.stem(word),wnet_lemma.lemmatize(word,pos='v')))

Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):

[('cows', 'NNS')] --> 'cow' ---> 'cow'
[('cow', 'NN')] --> 'cow' ---> 'cow'
[('boys', 'NNS')] --> 'boy' ---> 'boys'
[('males', 'NNS')] --> 'male' ---> 'males'
[('females', 'NNS')] --> 'femal' ---> 'females'
[('accesses', 'NNS')] --> 'access' ---> 'access'
[('accessed', 'VBN')] --> 'access' ---> 'access'
[('caring', 'VBG')] --> 'care' ---> 'care'
[('cares', 'NNS')] --> 'care' ---> 'care'
[('watches', 'NNS')] --> 'watch' ---> 'watch'
[('watched', 'VBN')] --> 'watch' ---> 'watch'
[('watching', 'VBG')] --> 'watch' ---> 'watch'
[('is', 'VBZ')] --> 'is' ---> 'be'
[('are', 'VBP')] --> 'are' ---> 'be'
[('were', 'VBD')] --> 'were' ---> 'be'
[('we', 'PRP')] --> 'we' ---> 'we'
[('did', 'VBD')] --> 'did' ---> 'do'
[('does', 'VBZ')] --> 'does' ---> 'do'
[('fruits', 'NNS')] --> 'fruit' ---> 'fruit'
[('fruity', 'NN')] --> 'fruiti' ---> 'fruity'
[('tastes', 'NNS')] --> 'tast' ---> 'taste'
[('tasty', 'NN')] --> 'tasti' ---> 'tasty

##### **CASE-II :: POS-TAG as ADVERB**

In [43]:
example_words = ['cows','cow','boys','males','females','accesses','accessed','caring','cares','watches','watched','watching','is','are',\
                 'were','we','did','does','fruits','fruity','tastes','tasty','beauties','beautes','beautification','beauty','beautiful',\
                 'unreliable','unreliability','explain','explaines','explanation','refuse','deny','good','better','best']

print("Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):\n")
for word in example_words:
    print("{} --> '{}' ---> '{}'".format(nltk.pos_tag([word]),snow_stem.stem(word),wnet_lemma.lemmatize(word,pos='r')))

Given word with its POS tag along, Stems(Snowball) and Lemmas(Wordnet):

[('cows', 'NNS')] --> 'cow' ---> 'cows'
[('cow', 'NN')] --> 'cow' ---> 'cow'
[('boys', 'NNS')] --> 'boy' ---> 'boys'
[('males', 'NNS')] --> 'male' ---> 'males'
[('females', 'NNS')] --> 'femal' ---> 'females'
[('accesses', 'NNS')] --> 'access' ---> 'accesses'
[('accessed', 'VBN')] --> 'access' ---> 'accessed'
[('caring', 'VBG')] --> 'care' ---> 'caring'
[('cares', 'NNS')] --> 'care' ---> 'cares'
[('watches', 'NNS')] --> 'watch' ---> 'watches'
[('watched', 'VBN')] --> 'watch' ---> 'watched'
[('watching', 'VBG')] --> 'watch' ---> 'watching'
[('is', 'VBZ')] --> 'is' ---> 'is'
[('are', 'VBP')] --> 'are' ---> 'are'
[('were', 'VBD')] --> 'were' ---> 'were'
[('we', 'PRP')] --> 'we' ---> 'we'
[('did', 'VBD')] --> 'did' ---> 'did'
[('does', 'VBZ')] --> 'does' ---> 'does'
[('fruits', 'NNS')] --> 'fruit' ---> 'fruits'
[('fruity', 'NN')] --> 'fruiti' ---> 'fruity'
[('tastes', 'NNS')] --> 'tast' ---> 'tastes'
[('tasty', 'NN')] 

### **Clearly, POS TAG plays a crucial role in understanding the meaning or context of the sentence.**

##### **Check the TAG type**

In [41]:
nltk.help.upenn_tagset('RBR')

RBR: adverb, comparative
    further gloomier grander graver greater grimmer harder harsher
    healthier heavier higher however larger later leaner lengthier less-
    perfectly lesser lonelier longer louder lower more ...


In [42]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


### **This way we see some help on TAG codes.**

##### **CASE-III :: POS-TAG of words in a sentence**

In [150]:
test_sent_tokens_tags = nltk.pos_tag(nltk.word_tokenize(re.sub('[^A-Za-z]+',' ',test_sentence)))
test_sent_tokens_tags

[('Waaooo', 'NNP'),
 ('such', 'PDT'),
 ('an', 'DT'),
 ('amazing', 'JJ'),
 ('match', 'NN'),
 ('I', 'PRP'),
 ('really', 'RB'),
 ('enjoyed', 'VBD'),
 ('watching', 'VBG'),
 ('this', 'DT'),
 ('event', 'NN'),
 ('of', 'IN'),
 ('WWE', 'NNP'),
 ('and', 'CC'),
 ('pro', 'JJ'),
 ('wrestling', 'NN'),
 ('And', 'CC'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('my', 'PRP$'),
 ('stress', 'JJ'),
 ('busters', 'NNS'),
 ('too', 'RB')]

In [54]:
from nltk.corpus import wordnet

In [148]:
def get_wordnet_pos(tag):
    """Map POS tag to first character lemmatize() accepts"""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag,wordnet.NOUN)

In [100]:
get_wordnet_pos('better')

'r'

In [101]:
wnet_lemma.lemmatize('better',get_wordnet_pos('better'))

'well'

In [102]:
get_wordnet_pos('watching')

'v'

In [103]:
wnet_lemma.lemmatize('watching',get_wordnet_pos('watching'))

'watch'

In [151]:
print([wnet_lemma.lemmatize(word,get_wordnet_pos(tag)) for word,tag in test_sent_tokens_tags])

['Waaooo', 'such', 'an', 'amazing', 'match', 'I', 'really', 'enjoy', 'watch', 'this', 'event', 'of', 'WWE', 'and', 'pro', 'wrestling', 'And', 'they', 'be', 'my', 'stress', 'buster', 'too']


### **Now, the results are quite good. Here, if you closely examine 'amazing' is unchanged because its context in the snetence belongs to the Adjective class and in this category its lemma is 'amazing'. Therefore, it remains the same.**

### **Lets check the same with a sentence having some homonyms.**

In [123]:
homonym_sent = ("They refuse to permit us to obtain the refuse permit")

In [127]:
homonym_sent_tokens = nltk.word_tokenize(homonym_sent,preserve_line=False)
homonym_sent_tokens

['They',
 'refuse',
 'to',
 'permit',
 'us',
 'to',
 'obtain',
 'the',
 'refuse',
 'permit']

In [145]:
homonym_sent_tokens_tags = nltk.pos_tag(homonym_sent_tokens)
homonym_sent_tokens_tags

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [144]:
wnet_lemma.lemmatize('refuse','v')

'refuse'

In [149]:
print([wnet_lemma.lemmatize(word,get_wordnet_pos(tag)) for word,tag in homonym_sent_tokens_tags])

['They', 'refuse', 'to', 'permit', 'u', 'to', 'obtain', 'the', 'refuse', 'permit']


### **Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning "deny," while REFuse is a noun meaning "trash" (i.e. they are not homophones).**

### **Thus, we need to know which word is being used in order to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)**

### ***`TextBlob Lemmatizer`***

In [36]:
from textblob import TextBlob, Word

In [37]:
word='beauti'

In [38]:
w = Word(word)

In [39]:
sent = TextBlob(test_sentence)

In [40]:
" ". join([w.lemmatize() for w in sent.words])

'Waaooo what a beautifull fight I really enjoy watching WWE and pro-wrestling And they are my stress buster too'

### ***`TreeTaggerWrapper Lemmatizer`***

In [41]:
import treetaggerwrapper as ttpw

In [42]:
tagger = ttpw.TreeTagger(TAGLANG='en',TAGDIR="C:\TreeTagger")

In [43]:
tags = tagger.tag_text(text=test_sentence)

In [44]:
lemmas = [t.split('\t')[-1] for t in tags]

In [45]:
lemmas

['Waaooo',
 ',',
 'what',
 'a',
 'beautifull',
 'fight',
 '.',
 'I',
 'really',
 'enjoy',
 'watch',
 'WWE',
 'and',
 'pro-wrestling',
 '.',
 'and',
 ',',
 'they',
 'be',
 'my',
 'stress',
 'buster',
 'too',
 '.']

In [46]:
text_df['Wordnet_Lemmas'] = text_df[''].apply(lambda row: [wnet_lemma.lemmatize(word) for word in row])
text_df

KeyError: ''

# ***`Featurization`***
### **1. BAG of WORDS (BOW)**

In [None]:
cv = CountVectorizer()

In [None]:
''.join(str(preprocess_sents))

In [None]:
BOW = cv.fit_transform(''.join(preprocess_sents))

In [None]:
BOW

In [None]:
np.array(cv.get_feature_names()).shape

In [None]:
print(cv.get_feature_names())

In [None]:
print(cv.get_stop_words())       ## Here, in countvectoriser we can also initialize the stopwords but in this case I have kept it blank

In [None]:
pd.set_option('display.max_columns',100)

In [None]:
bow_features = pd.DataFrame(BOW.toarray(),columns=cv.get_feature_names())
bow_features.head(10)

### **2. N-grams**

In [None]:
cv2 = CountVectorizer(ngram_range=(1,2))

In [None]:
print([' '.join(final_corpus_words)])

In [None]:
ngrams = cv2.fit_transform([' '.join(final_corpus_words)])

In [None]:
ngrams.toarray()

In [None]:
cv2.get_feature_names()

In [None]:
ngrams_features = pd.DataFrame(ngrams.toarray(),columns=cv2.get_feature_names())
ngrams_features.head(10)