# ***`Understand NLP`***

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import scipy
import nltk

%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os
import sys

#### **Reading the files from current location**

In [2]:
import glob

In [3]:
file_names = [file_name.split("\\")[-1] for file_name in glob.glob(os.getcwd()+'\\doc*.txt')]

In [4]:
file_names

['doc1.txt', 'doc2.txt']

#### **Creating the corpus from files**

In [5]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [6]:
corpus = PlaintextCorpusReader(root=os.getcwd(),fileids=file_names)

In [7]:
corpus.fileids()

['doc1.txt', 'doc2.txt']

#### ***`Courpus Paras`***

In [8]:
print([para for para in corpus.paras()])

[[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."']], [['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]]


#### ***`Courpus Sentences`***

In [9]:
corpus_sents = [sent for sent in corpus.sents()]
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


#### ***`Courpus Words`***

In [10]:
corpus_words = [word for word in corpus.words()]
print(corpus_words)

['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."', '"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.', 'But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']


#### ***`English Stopwords`***

In [11]:
eng_stopwords = stopwords.words('english')

In [12]:
for word in ['not','nor','no']:
    eng_stopwords.remove(word)

In [13]:
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

#### ***`Cleaning the Corpus`***
- ##### **Removing special characters**
- ##### **Removing unwanted spaces**
- ##### **Lower case the words**
- ##### **Tokenizing the words**

In [40]:
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


In [41]:
cleaned_sent = []
for sent in corpus_sents:
    process_sent = [re.sub('[^A-Za-z]+', ' ', str(sent)).strip().lower()]
    cleaned_sent.append(process_sent)
    
print(cleaned_sent)

[['my name is rajesh sharma'], ['i love working on data science projects'], ['the nexon car is very affordable'], ['the pizza was cheap tasty and delicious'], ['the dominoz pizza is tasty and loaded'], ['my name is raman revti sharma'], ['i love doing data analytics'], ['the tata nexon car is very stylish dynamic and has a strong build'], ['but their after sales service is not good'], ['the pizza in the party was tasty and cheesy'], ['the dominoz tacco is always cripy and fingerlicious']]


#### ***`Storing text messages in a DataFrame`***

In [59]:
pd.set_option('display.max_colwidth',2000)

In [86]:
text_df = pd.DataFrame(cleaned_sent).reset_index()
text_df.columns = ['Id','Pre_processed_Message']
text_df

Unnamed: 0,Id,Pre_processed_Message
0,0,my name is rajesh sharma
1,1,i love working on data science projects
2,2,the nexon car is very affordable
3,3,the pizza was cheap tasty and delicious
4,4,the dominoz pizza is tasty and loaded
5,5,my name is raman revti sharma
6,6,i love doing data analytics
7,7,the tata nexon car is very stylish dynamic and has a strong build
8,8,but their after sales service is not good
9,9,the pizza in the party was tasty and cheesy


#### ***`Removing Stopwords and Tokenization`***

In [87]:
text_df['Tokens'] = text_df['Pre_processed_Message'].apply(lambda row: [word for word in row.split(" ") if word not in eng_stopwords])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens
0,0,my name is rajesh sharma,"[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]"


### ***`Stemming`***
#### ***`Porter Stemmer`***

In [93]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [89]:
port_stem = PorterStemmer()

In [90]:
text_df['Porter_Stems'] = text_df['Tokens'].apply(lambda row: [port_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]"


### **Porter Stemmer is simply chopping off the tails of the words. Not a good way!!**

#### ***`Snowball Stemmer`***

In [91]:
snow_stem = SnowballStemmer(language='english',ignore_stopwords=True)

In [92]:
text_df['Snowball_Stems'] = text_df['Tokens'].apply(lambda row: [snow_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]","[name, rajesh, sharma]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]","[dominoz, pizza, tasti, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]","[name, raman, revti, sharma]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]","[love, data, analyt]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]"


### **Snowball Stemmer is considered as a strongest method for taking a word to its root form. However, here, I didn't oberserved any difference with Porter Stemmer.**

#### ***`Lancaster Stemmer`***

In [95]:
lanc_stem = LancasterStemmer()

In [97]:
text_df['Lancaster_Stems'] = text_df['Tokens'].apply(lambda row: [lanc_stem.stem(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]","[name, rajesh, sharma]","[nam, rajesh, sharm]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]","[dominoz, pizza, tasti, load]","[dominoz, pizz, tasty, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]","[name, raman, revti, sharma]","[nam, ram, revt, sharm]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]","[love, data, analyt]","[lov, dat, analys]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]"


### **Definitely Lancaster Stemmer is not a good approach to bring a word to its root form. It is chopping-off the vowels from the tails of the words.**

### ***`Lemmatizers`***
##### **Reference Links**

- https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    
- https://www.nltk.org/book/ch05.html


#### ***`Wordnet Lemmatizer`***

In [100]:
from nltk.stem.wordnet import WordNetLemmatizer

In [255]:
test_sentence = ("Waaooo, what a beautifull fight. I really enjoy watching WWE and pro-wrestling. And, they are my stress busters too.")

In [101]:
wnet_lemma = WordNetLemmatizer()

In [230]:
nltk.pos_tag(test_sentence.split(" "))

[('I', 'PRP'),
 ('really', 'RB'),
 ('enjoy', 'VB'),
 ('watching', 'VBG'),
 ('WWE', 'NNP'),
 ('and', 'CC'),
 ('pro-wrestling.', 'JJ'),
 ('And,', 'NNP'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('my', 'PRP$'),
 ('stress', 'JJ'),
 ('busters', 'NNS'),
 ('too.', 'VBP')]

In [233]:
nltk.pos_tag(test_sentence.split(" "))[0][1][0]

'P'

In [235]:
wnet_lemma.lemmatize(test_sentence)

'I really enjoy watching WWE and pro-wrestling. And, they are my stress busters too.'

In [236]:
from nltk.corpus import wordnet

In [237]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag)#, wordnet.ADV)

In [219]:
get_wordnet_pos('beautifull')

'n'

In [221]:
wnet_lemma.lemmatize('beautifull',get_wordnet_pos('beautifull'))

'beautifull'

!python -m pip install treetaggerwrapper

In [171]:
from textblob import TextBlob, Word

In [225]:
word='beauti'

In [247]:
w = Word(word)

In [256]:
sent = TextBlob(test_sentence)

In [257]:
" ". join([w.lemmatize() for w in sent.words])

'Waaooo what a beautifull fight I really enjoy watching WWE and pro-wrestling And they are my stress buster too'

In [185]:
import treetaggerwrapper as ttpw

In [188]:
tagger = ttpw.TreeTagger(TAGLANG='en',TAGDIR="C:\TreeTagger")

In [258]:
tags = tagger.tag_text(text=test_sentence)

In [259]:
lemmas = [t.split('\t')[-1] for t in tags]

In [260]:
lemmas

['Waaooo',
 ',',
 'what',
 'a',
 'beautifull',
 'fight',
 '.',
 'I',
 'really',
 'enjoy',
 'watch',
 'WWE',
 'and',
 'pro-wrestling',
 '.',
 'and',
 ',',
 'they',
 'be',
 'my',
 'stress',
 'buster',
 'too',
 '.']

In [107]:
text_df['Wordnet_Lemmas'] = text_df[''].apply(lambda row: [wnet_lemma.lemmatize(word) for word in row])
text_df

Unnamed: 0,Id,Pre_processed_Message,Tokens,Porter_Stems,Snowball_Stems,Lancaster_Stems,Wordnet_Lemmas
0,0,my name is rajesh sharma,"[name, rajesh, sharma]","[name, rajesh, sharma]","[name, rajesh, sharma]","[nam, rajesh, sharm]","[nam, rajesh, sharm]"
1,1,i love working on data science projects,"[love, working, data, science, projects]","[love, work, data, scienc, project]","[love, work, data, scienc, project]","[lov, work, dat, sci, project]","[lov, work, dat, sci, project]"
2,2,the nexon car is very affordable,"[nexon, car, affordable]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]","[nexon, car, afford]"
3,3,the pizza was cheap tasty and delicious,"[pizza, cheap, tasty, delicious]","[pizza, cheap, tasti, delici]","[pizza, cheap, tasti, delici]","[pizz, cheap, tasty, delicy]","[pizz, cheap, tasty, delicy]"
4,4,the dominoz pizza is tasty and loaded,"[dominoz, pizza, tasty, loaded]","[dominoz, pizza, tasti, load]","[dominoz, pizza, tasti, load]","[dominoz, pizz, tasty, load]","[dominoz, pizz, tasty, load]"
5,5,my name is raman revti sharma,"[name, raman, revti, sharma]","[name, raman, revti, sharma]","[name, raman, revti, sharma]","[nam, ram, revt, sharm]","[nam, ram, revt, sharm]"
6,6,i love doing data analytics,"[love, data, analytics]","[love, data, analyt]","[love, data, analyt]","[lov, dat, analys]","[lov, dat, analys]"
7,7,the tata nexon car is very stylish dynamic and has a strong build,"[tata, nexon, car, stylish, dynamic, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tata, nexon, car, stylish, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]","[tat, nexon, car, styl, dynam, strong, build]"
8,8,but their after sales service is not good,"[sales, service, not, good]","[sale, servic, not, good]","[sale, servic, not, good]","[sal, serv, not, good]","[sal, serv, not, good]"
9,9,the pizza in the party was tasty and cheesy,"[pizza, party, tasty, cheesy]","[pizza, parti, tasti, cheesi]","[pizza, parti, tasti, cheesi]","[pizz, party, tasty, cheesy]","[pizz, party, tasty, cheesy]"


# ***`Featurization`***
### **1. BAG of WORDS (BOW)**

In [18]:
cv = CountVectorizer()

In [23]:
''.join(str(preprocess_sents))

"[['name', 'rajesh', 'sharma'], ['love', 'working', 'data', 'science', 'projects'], ['nexon', 'car', 'affordable'], ['pizza', 'cheap', 'tasty', 'delicious'], ['dominoz', 'pizza', 'tasty', 'loaded'], ['name', 'raman', 'revti', 'sharma'], ['love', 'data', 'analytics'], ['tata', 'nexon', 'car', 'stylish', 'dynamic', 'strong', 'build'], ['sales', 'service', 'not', 'good'], ['pizza', 'party', 'tasty', 'cheesy'], ['dominoz', 'tacco', 'always', 'cripy', 'fingerlicious']]"

In [21]:
BOW = cv.fit_transform(''.join(preprocess_sents))

TypeError: sequence item 0: expected str instance, list found

In [None]:
BOW

In [None]:
np.array(cv.get_feature_names()).shape

In [None]:
print(cv.get_feature_names())

In [None]:
print(cv.get_stop_words())       ## Here, in countvectoriser we can also initialize the stopwords but in this case I have kept it blank

In [None]:
pd.set_option('display.max_columns',100)

In [None]:
bow_features = pd.DataFrame(BOW.toarray(),columns=cv.get_feature_names())
bow_features.head(10)

### **2. N-grams**

In [None]:
cv2 = CountVectorizer(ngram_range=(1,2))

In [None]:
print([' '.join(final_corpus_words)])

In [None]:
ngrams = cv2.fit_transform([' '.join(final_corpus_words)])

In [None]:
ngrams.toarray()

In [None]:
cv2.get_feature_names()

In [None]:
ngrams_features = pd.DataFrame(ngrams.toarray(),columns=cv2.get_feature_names())
ngrams_features.head(10)