Data source: Amazon Alexa reviews dataset on Kaggle — https://www.kaggle.com/sid321axn/amazon-alexa-reviews#amazon_alexa.tsv

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated Alexa review file into a DataFrame.
reviews_path = 'amazon_alexa.tsv'
df = pd.read_csv(reviews_path, delimiter='\t')

In [3]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
df.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [7]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, reused for every word when the corpus is built.
wl = WordNetLemmatizer()
import re

In [13]:
# Build the cleaned corpus: one normalised string per review, in the same row
# order as `df` so it stays aligned with the labels.
#
# The stop-word collection is loop-invariant, so fetch it once and keep it as a
# set; previously stopwords.words('english') was re-evaluated and scanned for
# every single word of every review.
stop_words = set(stopwords.words('english'))

corpus = []
# fillna('') turns a missing review into an empty document instead of letting
# re.sub raise TypeError on NaN; astype(str) guards against non-string cells.
for text in df['verified_reviews'].fillna('').astype(str):
    # Replace everything that is not an ASCII letter with a space, lowercase,
    # and split on whitespace.
    tokens = re.sub('[^A-Za-z]', ' ', text).lower().split()
    # Drop stop words and lemmatize what remains.
    tokens = [wl.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(tokens))

In [97]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; fitted on `corpus` in the next cell.
cv = CountVectorizer()

In [98]:
# Learn the vocabulary from the cleaned corpus and build the term-count matrix,
# then densify it into a NumPy array for the steps that follow.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target labels: the `feedback` column of the review frame.
y = df['feedback']

In [99]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the classes are imbalanced (2893 vs 257 in the value_counts output
# earlier) and this split is not stratified — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [100]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier with default hyperparameters.
lr = LogisticRegression()

In [101]:
# Fit the baseline model on the original (un-resampled) training split.
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [102]:
# Predict labels for the held-out test split.
y_pred = lr.predict(X_test)

In [103]:
# Spot-check: predictions at test positions 333-343.
y_pred[333:344]

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [104]:
# True labels at the same positions, for comparison with the predictions above.
y_test.iloc[333:344]

2453    1
2918    1
3105    1
2674    1
166     1
124     1
2594    0
682     1
2329    1
572     1
1198    1
Name: feedback, dtype: int64

In [105]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

In [106]:
# Confusion matrix of true vs. predicted labels on the test split
# (called as confusion_matrix(y_true, y_pred)).
cm = confusion_matrix(y_test,y_pred)
cm

array([[ 27,  61],
       [  6, 851]], dtype=int64)

In [107]:
# classification_report returns a pre-formatted multi-line string; print it so
# the table renders with real line breaks instead of a repr full of '\n'.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.82      0.31      0.45        88\n           1       0.93      0.99      0.96       857\n\n    accuracy                           0.93       945\n   macro avg       0.88      0.65      0.70       945\nweighted avg       0.92      0.93      0.91       945\n'

In [108]:
# Overall accuracy on the test split. With classes this imbalanced, read it
# together with the confusion matrix rather than on its own.
accuracy_score(y_test,y_pred)

0.9291005291005291

In [109]:
from imblearn.over_sampling import SMOTE

In [110]:
# SMOTE synthesises minority-class samples at random; fix the seed (same value
# as the train/test split) so the resampled set and downstream metrics are
# reproducible across runs.
sm = SMOTE(random_state=42)

In [111]:
# Oversample the minority class on the training split only (the test split is
# left untouched). `fit_resample` is the supported API; the old `fit_sample`
# alias was deprecated and has been removed from newer imbalanced-learn releases.
X_train_smote, y_train_smote = sm.fit_resample(X_train.astype('float'), y_train)

In [112]:
from collections import Counter

In [113]:
# Compare the class distribution of the training labels before and after SMOTE.
counts_before = Counter(y_train)
counts_after = Counter(y_train_smote)
print('before smote :', counts_before)
print('after smote: ', counts_after)

before smote : Counter({1: 2036, 0: 169})
after smote:  Counter({1: 2036, 0: 2036})


In [114]:
# Refit the same `lr` object on the SMOTE-balanced training data; this replaces
# the baseline fit from earlier in the notebook.
lr.fit(X_train_smote,y_train_smote)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [115]:
# Predict on the untouched test split with the refit model (overwrites the
# baseline `y_pred`).
y_pred = lr.predict(X_test)

In [85]:
# Confusion matrix on the test split for the current `y_pred`.
# NOTE(review): this cell's execution count (85) is older than the surrounding
# cells (114-116), and its saved matrix implies (60 + 748) / 945 ~= 0.855
# accuracy rather than the 0.896 reported by the following cell. The saved
# output looks stale; re-run this cell.
cm = confusion_matrix(y_test,y_pred)
cm

array([[ 60,  28],
       [109, 748]], dtype=int64)

In [116]:
# Overall test accuracy for the current `y_pred` (after the SMOTE refit when
# cells are run in order).
accuracy_score(y_test,y_pred)

0.8962962962962963