# ***`Understand NLP`***

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import scipy
import nltk

%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os
import sys

#### **Reading the files from current location**

In [133]:
import glob

In [136]:
file_names = [file_name.split("\\")[-1] for file_name in glob.glob(os.getcwd()+'\\doc*.txt')]

In [137]:
file_names

['doc1.txt', 'doc2.txt']

#### **Creating the corpus from files**

In [92]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [138]:
corpus = PlaintextCorpusReader(root=os.getcwd(),fileids=file_names)

In [139]:
corpus.fileids()

['doc1.txt', 'doc2.txt']

#### ***`Courpus Paras`***

In [124]:
print([para for para in corpus.paras()])

[[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."']], [['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]]


#### ***`Courpus Sentences`***

In [235]:
corpus_sents = [sent for sent in corpus.sents()]
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


#### ***`Courpus Words`***

In [145]:
corpus_words = [word for word in corpus.words()]
print(corpus_words)

['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."', '"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.', 'But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']


#### ***`English Stopwords`***

In [161]:
eng_stopwords = stopwords.words('english')

In [162]:
for word in ['not','nor','no']:
    eng_stopwords.remove(word)

In [163]:
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

#### ***`Cleaning the Corpus`***
- ##### **Removing special characters**
- ##### **Removing unwanted spaces**
- ##### **Lower case the words**
- ##### **Tokenizing the words**

In [236]:
print(corpus_sents)

[['"', 'My', 'Name', 'is', 'Rajesh', 'Sharma', '.'], ['",', '"', 'I', 'love', 'working', 'on', 'data', 'Science', 'projects', '.'], ['",', '"', 'The', 'nexon', 'car', 'is', 'very', 'affordable', '.'], ['",', '"', 'The', 'pizza', 'was', 'cheap', ',', 'tasty', 'and', 'delicious', '.'], ['",', '"', 'The', 'dominoz', 'pizza', 'is', 'tasty', 'and', 'loaded', '."'], ['"', 'My', 'Name', 'is', 'Raman', 'Revti', 'Sharma', '.'], ['",', '"', 'I', 'love', 'doing', 'data', 'analytics', '.'], ['",', '"', 'The', 'tata', 'nexon', 'car', 'is', 'very', 'stylish', ',', 'dynamic', 'and', 'has', 'a', 'strong', 'build', '.'], ['But', 'their', 'after', 'sales', 'service', 'is', 'not', 'good', '.'], ['",', '"', 'The', 'pizza', 'in', 'the', 'party', 'was', 'tasty', 'and', 'cheesy', '.'], ['",', '"', 'The', 'dominoz', 'tacco', 'is', 'always', 'cripy', 'and', 'fingerlicious', '."']]


In [244]:
cleaned_sent = []
for sent in corpus_sents:
    process_sent = [re.sub('[^A-Za-z]+', ' ', str(sent)).strip().lower()]
    cleaned_sent.append(process_sent)
    
print(cleaned_sent)

[['my name is rajesh sharma'], ['i love working on data science projects'], ['the nexon car is very affordable'], ['the pizza was cheap tasty and delicious'], ['the dominoz pizza is tasty and loaded'], ['my name is raman revti sharma'], ['i love doing data analytics'], ['the tata nexon car is very stylish dynamic and has a strong build'], ['but their after sales service is not good'], ['the pizza in the party was tasty and cheesy'], ['the dominoz tacco is always cripy and fingerlicious']]


#### ***`Removing Stopwords`***

In [281]:
preprocess_sents = []

for sent in cleaned_sent:
    sent_words = []
    for word in re.sub('[^A-Za-z]+', ' ',str(sent)).strip().split(" "):
        if word not in eng_stopwords:
            sent_words.append(word)
    preprocess_sents.append(sent_words)

In [282]:
print(preprocess_sents)

[['name', 'rajesh', 'sharma'], ['love', 'working', 'data', 'science', 'projects'], ['nexon', 'car', 'affordable'], ['pizza', 'cheap', 'tasty', 'delicious'], ['dominoz', 'pizza', 'tasty', 'loaded'], ['name', 'raman', 'revti', 'sharma'], ['love', 'data', 'analytics'], ['tata', 'nexon', 'car', 'stylish', 'dynamic', 'strong', 'build'], ['sales', 'service', 'not', 'good'], ['pizza', 'party', 'tasty', 'cheesy'], ['dominoz', 'tacco', 'always', 'cripy', 'fingerlicious']]


# ***`Featurization`***
### **1. BAG of WORDS (BOW)**

In [283]:
cv = CountVectorizer()

In [284]:
BOW = cv.fit_transform([' '.join(preprocess_sents)])

TypeError: sequence item 0: expected str instance, list found

In [221]:
BOW

<1x35 sparse matrix of type '<class 'numpy.int64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [177]:
np.array(cv.get_feature_names()).shape

(35,)

In [215]:
print(cv.get_feature_names())

['affordable', 'always', 'analytics', 'build', 'car', 'cheap', 'cheesy', 'cripy', 'data', 'delicious', 'dominoz', 'dynamic', 'fingerlicious', 'good', 'loaded', 'love', 'name', 'nexon', 'not', 'party', 'pizza', 'projects', 'rajesh', 'raman', 'revti', 'sales', 'science', 'service', 'sharma', 'strong', 'stylish', 'tacco', 'tasty', 'tata', 'working']


In [216]:
print(cv.get_stop_words())       ## Here, in countvectoriser we can also initialize the stopwords but in this case I have kept it blank

None


In [217]:
pd.set_option('display.max_columns',100)

In [218]:
bow_features = pd.DataFrame(BOW.toarray(),columns=cv.get_feature_names())
bow_features.head(10)

Unnamed: 0,affordable,always,analytics,build,car,cheap,cheesy,cripy,data,delicious,dominoz,dynamic,fingerlicious,good,loaded,love,name,nexon,not,party,pizza,projects,rajesh,raman,revti,sales,science,service,sharma,strong,stylish,tacco,tasty,tata,working
0,1,1,1,1,2,1,1,1,2,1,2,1,1,1,1,2,2,2,1,1,3,1,1,1,1,1,1,1,2,1,1,1,3,1,1


### **2. N-grams**

In [222]:
cv2 = CountVectorizer(ngram_range=(1,2))

In [223]:
print([' '.join(final_corpus_words)])

['name rajesh sharma love working data science projects nexon car affordable pizza cheap tasty delicious dominoz pizza tasty loaded name raman revti sharma love data analytics tata nexon car stylish dynamic strong build sales service not good pizza party tasty cheesy dominoz tacco always cripy fingerlicious']


In [224]:
ngrams = cv2.fit_transform([' '.join(final_corpus_words)])

In [225]:
ngrams.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1,
        3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,
        1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [226]:
cv2.get_feature_names()

['affordable',
 'affordable pizza',
 'always',
 'always cripy',
 'analytics',
 'analytics tata',
 'build',
 'build sales',
 'car',
 'car affordable',
 'car stylish',
 'cheap',
 'cheap tasty',
 'cheesy',
 'cheesy dominoz',
 'cripy',
 'cripy fingerlicious',
 'data',
 'data analytics',
 'data science',
 'delicious',
 'delicious dominoz',
 'dominoz',
 'dominoz pizza',
 'dominoz tacco',
 'dynamic',
 'dynamic strong',
 'fingerlicious',
 'good',
 'good pizza',
 'loaded',
 'loaded name',
 'love',
 'love data',
 'love working',
 'name',
 'name rajesh',
 'name raman',
 'nexon',
 'nexon car',
 'not',
 'not good',
 'party',
 'party tasty',
 'pizza',
 'pizza cheap',
 'pizza party',
 'pizza tasty',
 'projects',
 'projects nexon',
 'rajesh',
 'rajesh sharma',
 'raman',
 'raman revti',
 'revti',
 'revti sharma',
 'sales',
 'sales service',
 'science',
 'science projects',
 'service',
 'service not',
 'sharma',
 'sharma love',
 'strong',
 'strong build',
 'stylish',
 'stylish dynamic',
 'tacco',
 'tacc

In [227]:
ngrams_features = pd.DataFrame(ngrams.toarray(),columns=cv2.get_feature_names())
ngrams_features.head(10)

Unnamed: 0,affordable,affordable pizza,always,always cripy,analytics,analytics tata,build,build sales,car,car affordable,car stylish,cheap,cheap tasty,cheesy,cheesy dominoz,cripy,cripy fingerlicious,data,data analytics,data science,delicious,delicious dominoz,dominoz,dominoz pizza,dominoz tacco,dynamic,dynamic strong,fingerlicious,good,good pizza,loaded,loaded name,love,love data,love working,name,name rajesh,name raman,nexon,nexon car,not,not good,party,party tasty,pizza,pizza cheap,pizza party,pizza tasty,projects,projects nexon,rajesh,rajesh sharma,raman,raman revti,revti,revti sharma,sales,sales service,science,science projects,service,service not,sharma,sharma love,strong,strong build,stylish,stylish dynamic,tacco,tacco always,tasty,tasty cheesy,tasty delicious,tasty loaded,tata,tata nexon,working,working data
0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,2,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,1,1,2,1,1,2,2,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,3,1,1,1,1,1,1,1
