In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
   

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns

import warnings
warnings.filterwarnings("ignore") 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.dummy import DummyClassifier
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Load the reviews CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/amazon-customer-reviews/Reviews.csv")
df.head()

In [None]:
# List the column names of the loaded frame.
df.columns

## Data Pre-processing

In [None]:
# Helpfulness ratio = HelpfulnessNumerator / HelpfulnessDenominator for rows whose
# denominator is positive; every other row gets the sentinel value -1.
has_votes = df['HelpfulnessDenominator'] > 0
vote_ratio = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
df['Helpful %'] = np.where(has_votes, vote_ratio, -1)
df.head()

In [None]:
# Bucket the helpfulness ratio into labelled bins.
# pd.cut builds right-closed intervals and, by default, leaves the first interval open
# on the left. With a bin edge at exactly -1 the "no votes" sentinel (-1) would match no
# bin and become NaN, while a genuine 0% ratio would be labelled 'Empty'.
# include_lowest=True plus an edge at -0.5 keeps the -1 sentinel in 'Empty' and moves
# real 0% ratios into '0-20%'.
df['Upvote %'] = pd.cut(
    df['Helpful %'],
    bins=[-1, -0.5, 0.2, 0.4, 0.6, 0.8, 1],
    labels=['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
    include_lowest=True,
)
df.head()

## Exploratory Data Analysis

In [None]:
# Count the non-null entries of every column for each (Score, Upvote %) combination.
df.groupby(['Score', 'Upvote %']).agg('count')

In [None]:
# Number of reviews (counted via Id) per (Score, Upvote %) pair, with the group keys
# turned back into ordinary columns.
score_upvote_groups = df.groupby(['Score', 'Upvote %'])
df_s = score_upvote_groups.agg({'Id': 'count'}).reset_index()
df_s

In [None]:
# Pivot the counts into an 'Upvote %' x 'Score' table.
# Passing values='Id' yields flat column labels (the scores themselves); without it the
# columns become a two-level ('Id', score) MultiIndex that clutters the heatmap axis.
pivot = df_s.pivot(index='Upvote %', columns='Score', values='Id')
pivot

In [None]:
# Draw the pivot table as an annotated heatmap (one cell per Upvote % / Score pair).
sns.heatmap(pivot, annot=True, cmap='YlGnBu')

- More than half of the reviews have received zero helpfulness votes
- Many people agree with (upvote) the reviews that give a score of '5'

## Bag-of-Words  
### Manual Approach

In [None]:
# Inspect the distinct Score values before building the bag-of-words model.
df['Score'].unique()

In [None]:
# A score of 3 is treated as neutral, so keep only the rows whose score differs from 3.
is_not_neutral = df['Score'] != 3
df2 = df[is_not_neutral]

In [None]:
# Feature column: the raw review text of the non-neutral reviews.
X = df2['Text']
# Show which scores remain after the neutral filter.
df2['Score'].unique()

In [None]:
# Binary target: scores 1 and 2 map to 0, scores 4 and 5 map to 1.
y_dict = {**dict.fromkeys((1, 2), 0), **dict.fromkeys((4, 5), 1)}
y = df2['Score'].map(y_dict)

In [None]:
# Turn the review text into a document-term count matrix, dropping English stop words.
c = CountVectorizer(stop_words='english')
X_c = c.fit_transform(X)

In [None]:
# (rows, columns) of the vectorized matrix; each column is one vocabulary term.
X_c.shape # no. of features increased after applying CountVectorizer

In [None]:
# Hold out part of the data to check model accuracy.
# A fixed random_state keeps the split, and therefore the reported score, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=42)

In [None]:
# Shape of the training portion of the split.
X_train.shape

In [None]:
# Fit a logistic regression (liblinear solver) on the training split.
log_reg = LogisticRegression(solver='liblinear')
model = log_reg.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split.
model.score(X_test, y_test)

In [None]:
# Fetch the vocabulary so the top 20 positive and top 20 negative words can be read
# off the model coefficients.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and only fall back to the old name on older installs.
if hasattr(c, 'get_feature_names_out'):
    w = c.get_feature_names_out()
else:
    w = c.get_feature_names()

In [None]:
# First (and, for this two-class target, presumably only) row of the coefficient
# matrix, as a plain Python list.
coef_rows = model.coef_.tolist()
coef = coef_rows[0]

In [None]:
# Pair every vocabulary word with its model coefficient.
coef_df = pd.DataFrame({'Word': w, 'Coefficient': coef})
coef_df

In [None]:
# Order from largest to smallest coefficient (ties broken by word, also descending).
coef_df = coef_df.sort_values(['Coefficient', 'Word'], ascending=False)

In [None]:
# Top 20 positive words: the rows with the largest coefficients.
coef_df.head(20)

In [None]:
# Top 20 negative words: the rows with the smallest coefficients.
coef_df.tail(20)

### Automated Approach

In [None]:
def text_fit(X, y, nlp_model, ml_model, coef_show=1, random_state=None):
    """Vectorize text, fit a classifier on a random split and print its test score.

    Parameters
    ----------
    X : iterable of str
        Raw documents handed to ``nlp_model.fit_transform``.
    y : array-like
        Labels aligned with ``X``.
    nlp_model : vectorizer
        Object exposing ``fit_transform``; when ``coef_show == 1`` it must also expose
        ``get_feature_names_out`` (or the legacy ``get_feature_names``).
    ml_model : estimator
        Object exposing ``fit`` and ``score``; must expose ``coef_`` when
        ``coef_show == 1``.
    coef_show : int, default 1
        When equal to 1, print the 20 words with the largest and the 20 words with the
        smallest coefficients.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    None
        All results are printed.
    """
    X_c = nlp_model.fit_transform(X)
    print('features: {}'.format(X_c.shape[1]))

    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=random_state)
    model = ml_model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(acc)

    if coef_show == 1:
        # Read the vocabulary from the vectorizer that was actually passed in, not from
        # a notebook-level global, so words and coefficients always line up.
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
        # when the newer accessor is missing.
        if hasattr(nlp_model, 'get_feature_names_out'):
            w = nlp_model.get_feature_names_out()
        else:
            w = nlp_model.get_feature_names()
        coef = model.coef_.tolist()[0]
        coef_df = pd.DataFrame({'Word': w, 'Coefficient': coef})
        coef_df = coef_df.sort_values(['Coefficient', 'Word'], ascending=False)
        print('\n')
        print('Top 20 Positive Words')
        print(coef_df.head(20))
        print('\n')
        print('Top 20 Negative Words')
        print(coef_df.tail(20))

In [None]:
# nlp model: bag-of-words counts with English stop words removed
c = CountVectorizer(stop_words='english')

# ml model: logistic regression using the liblinear solver
log_reg = LogisticRegression(solver='liblinear')

In [None]:
# Run the full vectorize / split / fit / report pipeline with the bag-of-words model.
text_fit(X, y, c, log_reg)

In [None]:
def predict(X, y, nlp_model, ml_model, random_state=None):
    """Vectorize text, fit a classifier on a random split and print test metrics.

    Parameters
    ----------
    X : iterable of str
        Raw documents handed to ``nlp_model.fit_transform``.
    y : array-like
        Labels aligned with ``X``.
    nlp_model : vectorizer
        Object exposing ``fit_transform``.
    ml_model : estimator
        Object exposing ``fit`` and ``predict``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    None
        The confusion matrix and the accuracy are printed.
    """
    X_c = nlp_model.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=random_state)
    model = ml_model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped the
    # confusion matrix comes out transposed (rows would be predictions, not truth).
    cm = confusion_matrix(y_test, pred)
    print(cm)
    acc = accuracy_score(y_test, pred)
    print(acc)

In [None]:
# Bag-of-words counts (this time without stop-word removal) + logistic regression.
c = CountVectorizer()
lr = LogisticRegression(solver='liblinear')

# Print the confusion matrix and accuracy for this combination.
predict(X, y, c, lr)

## TF-IDF

In [None]:
# Reference run with a DummyClassifier on plain counts; coef_show=0 skips the
# word/coefficient report.
c = CountVectorizer()
text_fit(X, y, c, DummyClassifier(), 0)

In [None]:
# TF-IDF features (English stop words removed) + logistic regression.
tfidf = TfidfVectorizer(stop_words='english')
lr = LogisticRegression(solver='liblinear')

# Report feature count and test score only (coef_show=0).
text_fit(X, y, tfidf, lr, 0)

In [None]:
# Confusion matrix and accuracy for the TF-IDF + logistic regression combination.
predict(X, y, tfidf, lr)

## Data preparation for Modeling

In [None]:
# Data preparation for predicting upvotes; the following cells focus on reviews with
# score '5'.
df.head()

In [None]:
# Keep only the reviews that gave a score of 5.
data = df[df['Score'] == 5]
data.head()

In [None]:
# Upvote buckets present among the score-5 reviews.
data['Upvote %'].unique()

In [None]:
# Keep only the clearly low (0-40%) and clearly high (60-100%) upvote buckets; the
# middle '40-60%' bucket and the 'Empty' bucket are dropped.
kept_buckets = ['80-100%', '60-80%', '20-40%', '0-20%']
in_kept_bucket = data['Upvote %'].isin(kept_buckets)
data2 = data[in_kept_bucket]
data2.head()

In [None]:
# Separate the independent variable (review text) to feed to the model.
X = data2['Text']

In [None]:
# Confirm which upvote buckets remain after filtering.
data2['Upvote %'].unique()

In [None]:
# Binary target: the 60-100% buckets map to 1, the 0-40% buckets map to 0.
y_dict = {'80-100%':1, '60-80%':1, '20-40%':0, '0-20%':0}
y = data2['Upvote %'].map(y_dict)

## Over Sampling - handle Imbalanced data

In [None]:
# If the dataset is imbalanced, common remedies are:
# - Undersampling: reduce the majority class to the same ratio as the minority class (loses data)
# - Oversampling: add extra minority-class data points to reach the same ratio
# - SMOTE-Tomek
# - Ensemble techniques, e.g. Random Forest (multiple decision trees)

In [None]:
# Check the per-class counts to see whether the target is imbalanced.
y.value_counts()

In [None]:
# Vectorize the current X (the filtered score-5 review texts). The X_c left over from
# the bag-of-words section was built from a different set of rows and does not line up
# with the current y.
tfidf_upvote = TfidfVectorizer(stop_words='english')
X_c = tfidf_upvote.fit_transform(X)

# Named `oversampler` rather than `os` so the standard-library `os` module imported in
# the first cell is not shadowed. The seed makes the resampling reproducible.
oversampler = RandomOverSampler(random_state=42)

# NOTE(review): resampling is applied to the full matrix before any train/test split,
# so duplicated rows can end up in whatever evaluation split is later drawn from X_c.
X_train_res, y_train_res = oversampler.fit_resample(X_c, y)

In [None]:
# Shape of the feature matrix after oversampling.
X_train_res.shape

In [None]:
# Shape of the target after oversampling.
y_train_res.shape

In [None]:
# Compare the per-class counts before and after oversampling.
original_counts = Counter(y)
resampled_counts = Counter(y_train_res)
print('Original dataset shape {}'.format(original_counts))
print('Resampled dataset shape {}'.format(resampled_counts))

## Cross-Validation (GridSearchCV)

In [None]:
# Base estimator for the grid search.
log_class = LogisticRegression(solver='liblinear')

In [None]:
# Search space: C over powers of ten from 10^-2 to 10^2, with both l1 and l2 penalties.
grid = {'C':10.0**np.arange(-2,3), 'penalty':['l1','l2']}

In [None]:
# 5-fold cross-validated grid search over `grid`, selecting by macro-averaged F1.
clf = GridSearchCV(estimator=log_class,
                  param_grid = grid,
                  cv=5,
                  n_jobs=-1, # use all resources of CPU
                  scoring='f1_macro')

In [None]:
# Run the grid search on the oversampled data.
clf.fit(X_train_res, y_train_res)

In [None]:
# Fixed seed so this split, and the metrics computed from it, are reproducible.
# NOTE(review): the split is drawn after `clf` has already been fitted, so the test rows
# here may also have been part of the data the grid search was trained on; treat the
# resulting scores as optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=42)

In [None]:
# Predict labels for the held-out rows with the fitted grid search.
pred = clf.predict(X_test)

In [None]:
# Confusion matrix of true labels against predictions.
confusion_matrix(y_test, pred)

In [None]:
# Share of held-out rows predicted correctly.
accuracy_score(y_test, pred)