In [70]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('datafinalcleaned1.csv')

In [5]:
del df['Unnamed: 0']

# Data Preparation - Split the data into train and test set

In [6]:
y=df['Score']
X=df[['ReviewText']]

In [7]:
X.head()

Unnamed: 0,ReviewText
0,I have bought several of the Vitality canned ...
1,This is a very healthy dog food Good for thei...
2,I fed this to my Golden Retriever and he hate...
3,I have to admit I was a sucker for the large ...
4,We have a 7 week old He had gas and constipat...


In [8]:
#spliting into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)

In [9]:
X_train.head()

Unnamed: 0,ReviewText
447215,It is good and soothing to drink I have not t...
253653,Whenever I need something a little special fo...
566123,I bought this almost a month ago and my dog a...
381706,Extract is listed as an ingredient Sounds har...
547077,I purchased these nuts as a gift and was disa...


# Data Prepration - Text Preprocessing

In [10]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [11]:
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()
## We can also use Lemmatizer instead of Stemmer
lemmatizer = WordNetLemmatizer()


# Step by Step explanation of Preprocessing

In [12]:
raw_text = 'I was buying my white pepper from a gourmet store and 6 ounces cost This pepper is just as good at a fraction of the price'
raw_text

'I was buying my white pepper from a gourmet store and 6 ounces cost This pepper is just as good at a fraction of the price'

In [13]:

sentence = re.sub("[^a-zA-Z]", " ", raw_text)
print(sentence)

I was buying my white pepper from a gourmet store and   ounces cost This pepper is just as good at a fraction of the price


In [14]:

sentence = sentence.lower()
print(sentence)

i was buying my white pepper from a gourmet store and   ounces cost this pepper is just as good at a fraction of the price


In [15]:

tokens = sentence.split()
print(tokens)

['i', 'was', 'buying', 'my', 'white', 'pepper', 'from', 'a', 'gourmet', 'store', 'and', 'ounces', 'cost', 'this', 'pepper', 'is', 'just', 'as', 'good', 'at', 'a', 'fraction', 'of', 'the', 'price']


In [16]:

clean_tokens = [t for t in tokens if t not in stopwords.words("english")]
print(clean_tokens)

['buying', 'white', 'pepper', 'gourmet', 'store', 'ounces', 'cost', 'pepper', 'good', 'fraction', 'price']


In [17]:

clean_tokens_stem = [stemmer.stem(word) for word in clean_tokens]
print(clean_tokens_stem)

['buy', 'white', 'pepper', 'gourmet', 'store', 'ounc', 'cost', 'pepper', 'good', 'fraction', 'price']


In [18]:

clean_tokens_lem = [lemmatizer.lemmatize(word) for word in clean_tokens]
print(clean_tokens_lem)

['buying', 'white', 'pepper', 'gourmet', 'store', 'ounce', 'cost', 'pepper', 'good', 'fraction', 'price']


In [88]:
def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [20]:
from tqdm import tqdm, tqdm_notebook

tqdm.pandas()

temp_df = X_train["ReviewText"].progress_apply(lambda x: preprocess(x, 'stem'))

temp_df.head()

temp_df.columns = ['clean_text_stem', 'text_length_stem']

temp_df.head()

X_train = pd.concat([X_train, temp_df], axis=1)

X_train.head()


X_train.to_csv('X_train_stem.csv')

In [21]:
X_train_stem=pd.read_csv("X_train_stem.csv")
X_train_stem

Unnamed: 0.1,Unnamed: 0,ReviewText,clean_text_stem,text_length_stem
0,447215,It is good and soothing to drink I have not t...,good sooth drink tri mani time yet share sister,9
1,253653,Whenever I need something a little special fo...,whenev need someth littl special dinner mario ...,37
2,566123,I bought this almost a month ago and my dog a...,bought almost month ago dog small corgi mix st...,27
3,381706,Extract is listed as an ingredient Sounds har...,extract list ingredi sound harmless right spec...,33
4,547077,I purchased these nuts as a gift and was disa...,purchas nut gift disappoint arriv small brown ...,15
...,...,...,...,...
425573,385156,I had no problem eating these bars they taste...,problem eat bar tast ok thought wow nice addit...,33
425574,321502,All 3 of my dogs just love them I think they ...,dog love think good cost whole lot less,8
425575,441633,I love all of the Happy Baby products but thi...,love happi babi product one favorit highlight ...,34
425576,239499,Great price and a really good hot addition to...,great price realli good hot addit meal like th...,16


## X_train_lemma

 temp_df = X_train['ReviewText'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df.head()

temp_df.columns = ['clean_text_lemma', 'text_length_lemma']

temp_df.head()

X_train = pd.concat([X_train, temp_df], axis=1)

X_train.head()

X_train.to_csv("X_train_lemma.csv")

In [22]:
X_train=pd.read_csv("X_train_lemma.csv")

X_train

Unnamed: 0.1,Unnamed: 0,ReviewText,clean_text_lemma,text_length_lemma
0,447215,It is good and soothing to drink I have not t...,good soothing drink tried many time yet sharin...,9
1,253653,Whenever I need something a little special fo...,whenever need something little special dinner ...,37
2,566123,I bought this almost a month ago and my dog a...,bought almost month ago dog small corgi mix st...,27
3,381706,Extract is listed as an ingredient Sounds har...,extract listed ingredient sound harmless right...,33
4,547077,I purchased these nuts as a gift and was disa...,purchased nut gift disappointed arrived small ...,15
...,...,...,...,...
425573,385156,I had no problem eating these bars they taste...,problem eating bar tasted ok thought wow nice ...,33
425574,321502,All 3 of my dogs just love them I think they ...,dog love think good cost whole lot le,8
425575,441633,I love all of the Happy Baby products but thi...,love happy baby product one favorite highlight...,34
425576,239499,Great price and a really good hot addition to...,great price really good hot addition meal like...,16


In [23]:
del X_train['Unnamed: 0']

In [24]:
X_train

Unnamed: 0,ReviewText,clean_text_lemma,text_length_lemma
0,It is good and soothing to drink I have not t...,good soothing drink tried many time yet sharin...,9
1,Whenever I need something a little special fo...,whenever need something little special dinner ...,37
2,I bought this almost a month ago and my dog a...,bought almost month ago dog small corgi mix st...,27
3,Extract is listed as an ingredient Sounds har...,extract listed ingredient sound harmless right...,33
4,I purchased these nuts as a gift and was disa...,purchased nut gift disappointed arrived small ...,15
...,...,...,...
425573,I had no problem eating these bars they taste...,problem eating bar tasted ok thought wow nice ...,33
425574,All 3 of my dogs just love them I think they ...,dog love think good cost whole lot le,8
425575,I love all of the Happy Baby products but thi...,love happy baby product one favorite highlight...,34
425576,Great price and a really good hot addition to...,great price really good hot addition meal like...,16


# Using Bag Of Words(BOW)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()

X_train_bow= vocab.fit_transform(X_train['clean_text_lemma'])

In [26]:
X_train_bow

<425578x83171 sparse matrix of type '<class 'numpy.int64'>'
	with 13510900 stored elements in Compressed Sparse Row format>

In [27]:
X_train_bow[0]

<1x83171 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

## Preprocessing the test data

In [28]:
X_test.head()

Unnamed: 0,ReviewText
346436,Ive tried tons of cheap cat litter brands and...
132237,My 35 pound pit bull mix chewed the rope in h...
261415,This product I love However the product pictu...
76796,The Switch Kiwi Berry tastes metallic and fak...
498830,i love ordering on line esp from amazon just ...


temp_df = X_test['ReviewText'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df.head()

temp_df.columns = ['clean_text_lemma', 'text_length_lemma']

temp_df.head()

X_test = pd.concat([X_test, temp_df], axis=1)

X_test.head()

X_test.to_csv("X_test.csv")

In [29]:
X_test=pd.read_csv("X_test.csv")

In [30]:
del X_test['Unnamed: 0']

In [31]:
X_test

Unnamed: 0,ReviewText,clean_text_lemma,text_length_lemma
0,Ive tried tons of cheap cat litter brands and...,ive tried ton cheap cat litter brand cheap yea...,25
1,My 35 pound pit bull mix chewed the rope in h...,pound pit bull mix chewed rope half le minute ...,54
2,This product I love However the product pictu...,product love however product pictured amazon s...,25
3,The Switch Kiwi Berry tastes metallic and fak...,switch kiwi berry taste metallic fake neither ...,67
4,i love ordering on line esp from amazon just ...,love ordering line esp amazon run coffee get e...,21
...,...,...,...
141855,These are the best of the mauna loa collectio...,best mauna loa collection u like coffee one so...,30
141856,I really like this cookie A bit dry but good ...,really like cookie bit dry good granddaughter ...,9
141857,These bones are awesome My dogs love them it ...,bone awesome dog love great purchased went bac...,19
141858,Very tasty No sugar or junk that I don want J...,tasty sugar junk want real food make great sna...,10


In [32]:
X_test_bow = vocab.transform(X_test['clean_text_lemma'])


In [33]:
X_test_bow

<141860x83171 sparse matrix of type '<class 'numpy.int64'>'
	with 4486102 stored elements in Compressed Sparse Row format>

In [137]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_vector = vectorizer.fit_transform(X_train['clean_text_lemma'])

In [35]:
X_test_vector = vectorizer.transform(X_test['clean_text_lemma'])

In [36]:
X_test_vector

<141860x83171 sparse matrix of type '<class 'numpy.float64'>'
	with 4486102 stored elements in Compressed Sparse Row format>

# Using Models

## Logistic Regression Using TF-IDF

In [138]:
from sklearn.linear_model import LogisticRegression
classifier= LogisticRegression()
classifier.fit(X_train_vector,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [139]:
y_test_pred = classifier.predict(X_test_vector)

In [140]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.7351191315381362
              precision    recall  f1-score   support

           1       0.65      0.67      0.66     13085
           2       0.45      0.20      0.28      7401
           3       0.45      0.28      0.35     10672
           4       0.51      0.26      0.34     20264
           5       0.79      0.95      0.86     90438

    accuracy                           0.74    141860
   macro avg       0.57      0.47      0.50    141860
weighted avg       0.70      0.74      0.70    141860



In [141]:
import joblib
from joblib import dump, load

In [142]:
joblib.dump(classifier,'logistic_Regression_TFIDF')

['logistic_Regression_TFIDF']

## Logistic Regression using BOW

In [40]:
from sklearn.linear_model import LogisticRegression
classifier= LogisticRegression()
classifier.fit(X_train_bow,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
y_test_pred = classifier.predict(X_test_bow)

In [42]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.7395883265191033
              precision    recall  f1-score   support

           1       0.67      0.67      0.67     13085
           2       0.44      0.26      0.33      7401
           3       0.47      0.30      0.37     10672
           4       0.53      0.27      0.35     20264
           5       0.80      0.95      0.87     90438

    accuracy                           0.74    141860
   macro avg       0.58      0.49      0.52    141860
weighted avg       0.70      0.74      0.71    141860



In [43]:
import joblib
from joblib import dump, load

In [44]:
joblib.dump(classifier,'logistic_Regression')

['logistic_Regression']

## Decision Tree Classifier

In [45]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_bow, y_train)

In [46]:
y_test_pred = classifier.predict(X_test_bow)

In [47]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.7526011560693642
              precision    recall  f1-score   support

           1       0.64      0.62      0.63     13085
           2       0.54      0.45      0.49      7401
           3       0.54      0.48      0.51     10672
           4       0.56      0.51      0.53     20264
           5       0.84      0.88      0.86     90438

    accuracy                           0.75    141860
   macro avg       0.62      0.59      0.60    141860
weighted avg       0.74      0.75      0.75    141860



In [48]:
joblib.dump(classifier,'Decision_Tree')

['Decision_Tree']

## SVC

In [49]:
#from sklearn.svm import SVC
#classifier = SVC()
#classifier.fit(X_train_bow, y_train)

In [50]:
#y_test_pred = classifier.predict(X_test_bow)
#print(accuracy_score(y_test, y_test_pred))

#print(classification_report(y_test, y_test_pred))

# MODEL DEPLOYMENT

In [56]:
model=joblib.load("logistic_Regression")

In [66]:
x=model.predict(X_test_bow)

In [77]:
np. unique(x,return_counts=True)

(array([1, 2, 3, 4, 5], dtype=int64),
 array([ 12993,   4409,   6902,  10197, 107359], dtype=int64))

In [85]:
X_test_bow[0]

<1x83171 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

# Model Texting

In [145]:
new_input=input("Enter your Review:")
new_input_pro=preprocess(new_input,'lemma')
df=pd.DataFrame(new_input_pro)
df.columns=["Review"]
new_input_vec=vocab.transform(df)
new_output =model.predict(new_input_vec)

print(new_input)
print("Score:",new_output)

Enter your Review:It was amazing, never tried it.
It was amazing, never tried it.
Score: [5]


In [143]:
model2=joblib.load("logistic_Regression_TFIDF")

In [144]:
new_input=input("Enter your Review:")
new_input_pro=preprocess(new_input,'lemma')
df=pd.DataFrame(new_input_pro)
df.columns=["Review"]
new_input_vec=vocab.transform(df)
new_output =model2.predict(new_input_vec)

print(new_input)
print("Score:",new_output)

Enter your Review:It was a bad food
It was a bad food
Score: [1]
