In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import re
import math

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
import pickle


In [3]:
data = pd.read_csv("Data.csv")
data.head(10)

Unnamed: 0.1,Unnamed: 0,overall,verified,reviewText,summary,Binary_Overall
0,0,5.0,True,This is the best novel I have read in 2 or 3 y...,A star is born,1.0
1,1,3.0,True,"Pages and pages of introspection, in the style...",A stream of consciousness novel,0.0
2,2,5.0,False,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...,1.0
3,3,5.0,False,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!,1.0
4,4,3.0,True,I was taken in by reviews that compared this b...,A dissenting view--In part.,0.0
5,5,4.0,True,I read this probably 50 years ago in my youth ...,Above average mystery,1.0
6,6,5.0,True,I read every Perry mason book voraciously. Fin...,Lam is cool!,1.0
7,7,5.0,True,I love this series of Bertha and Lamb.. Great...,Five Stars,1.0
8,8,5.0,True,Great read!,Five Stars,1.0
9,9,4.0,False,"Crows Can't Count, A.A. Fair\n\nMr. Harry Shar...",A Fast and Far Moving Adventure,1.0


In [4]:
#sample 
data = data.sample(100000)
data

Unnamed: 0.1,Unnamed: 0,overall,verified,reviewText,summary,Binary_Overall
4406660,4407685,5.0,False,Great little case. I am using the Inateck&nbs...,Great Case!,1.0
1900861,1901188,5.0,False,Please know what you are getting. This is a d...,Have had this for three years...,1.0
2563043,2563506,5.0,True,"This is the smallest bag I have, great for day...",TBC-410,1.0
1896809,1897134,5.0,True,"I tend to take more natural light photos, but ...",I'm still getting the hang of it (I really don...,1.0
4456532,4457577,5.0,True,Very easy to set up. Simple interface and much...,Good Buy,1.0
...,...,...,...,...,...,...
4676944,4678082,5.0,True,I was a bit worried from all the reviews on ho...,Easy set up and works great,1.0
1925638,1925974,5.0,True,Excellent product as described,Five Stars,1.0
6214761,6216571,5.0,True,This is the second fastest processor my mother...,Pentium D 945,1.0
4032533,4033419,5.0,True,"Great charger and batteries, Since the GoPro H...",nice replacement batteries and charger for the...,1.0


In [5]:
data.overall.value_counts()

5.0    64125
4.0    16871
3.0     7535
1.0     6920
2.0     4549
Name: overall, dtype: int64

In [6]:
#pip install imblearn

In [7]:
#! conda install --yes nltk==3.4.5
#! conda install --yes unidecode

In [8]:
#prepare data for model
import nltk

In [9]:
data.drop(columns = "Unnamed: 0",inplace = True)

In [10]:
X = data.reviewText
Y = data.overall

In [11]:
X = X.replace(r"\n", ' ', regex = True)

In [12]:
xtrain,xtest,y_train,y_test=train_test_split(X,Y); 
print('train size:',len(xtrain))
print('test size:',len(xtest))

train size: 75000
test size: 25000


In [13]:
#balance dataset. I tested with normal dataset but confusion matrix was pretty bad so i decided to balance dataset
from imblearn.under_sampling import RandomUnderSampler

undersampler=RandomUnderSampler(sampling_strategy='majority');

X_train_us,y_train_us=undersampler.fit_resample(xtrain.values.reshape(-1, 1),y_train);
X_train_us,y_train_us=undersampler.fit_resample(X_train_us,y_train_us);

print('Composición del training set:')
print(y_train_us.value_counts())

print('\nComposición del test set:')
print(y_test.value_counts())

Composición del training set:
3.0    5655
1.0    5249
5.0    3419
4.0    3419
2.0    3419
Name: overall, dtype: int64

Composición del test set:
5.0    16146
4.0     4173
3.0     1880
1.0     1671
2.0     1130
Name: overall, dtype: int64


In [14]:
from nltk.corpus import stopwords 
stop_words=stopwords.words('english');

In [15]:
vectorizer=TfidfVectorizer(stop_words=stop_words,strip_accents='unicode',ngram_range = (1,2));
X_train_us = vectorizer.fit_transform(X_train_us.ravel())
xtest = vectorizer.transform(xtest.values.astype("U"))

In [16]:
#use bayes for predicting.Used few parameters and sample instead of complete data to make prediction faster and dont make my PC implode

from sklearn.naive_bayes import MultinomialNB

params={'alpha':[0.1,0.5,1]};

GS_CV=GridSearchCV(MultinomialNB(),params,cv=10);

GS_CV.fit(X_train_us, y_train_us);

print('best score:',GS_CV.best_score_)
print('best params:',GS_CV.best_params_)

best score: 0.4421819539902473
best params: {'alpha': 0.1}


In [17]:
#accuracy score in train and test sucks. f1 also sucks so im gonna try a binary problem 
from sklearn.metrics import accuracy_score

pred = GS_CV.predict (xtest)
accuracy_score(y_test,pred)

0.29188

In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[1223,   20,  419,    5,    4],
       [ 483,   22,  614,    6,    5],
       [ 406,   22, 1387,   40,   25],
       [ 452,   12, 3112,  252,  345],
       [1780,   69, 8764, 1120, 4413]], dtype=int64)

In [36]:
#testing model with self-made reviews

texto_prueba=['worst keyboard i have seen in my life']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['worst keyboard i have seen in my life'] 

overall: [1.]
 proba: 0.6830395057759229


In [42]:
texto_prueba=['best computer ever']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['best computer ever'] 

overall: [1.]
 proba: 0.47185773287705524


In [38]:
texto_prueba=['mediocre tv']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['mediocre tv'] 

overall: [3.]
 proba: 0.48235393303720275


In [39]:
texto_prueba=['it has some flaws but its a good overall thing']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['it has some flaws but its a good overall thing'] 

overall: [3.]
 proba: 0.4382314985378033


In [58]:
texto_prueba=['this is awesome']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['this is awesome'] 

overall: [5.]
 proba: 0.5250702632242529


In [59]:
texto_prueba=['My mom has more vital signs than Hz of this monitor']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['My mom has more vital signs than Hz of this monitor'] 

overall: [3.]
 proba: 0.43170109093819825


In [61]:
texto_prueba=['Good product, I would buy it again']
texto_vec=vectorizer.transform(texto_prueba);
k = GS_CV.predict(texto_vec)
print('Comentario:',texto_prueba,'\n')
print("overall:" , k)
print(' proba:',GS_CV.predict_proba(texto_vec).max())

Comentario: ['Good product, I would buy it again'] 

overall: [3.]
 proba: 0.32665313838168797
