In [None]:
#All the imports for this program
import numpy as np # linear algebra
import pandas as pd 
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import  accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns 

#List every file available under the Kaggle input directory
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)


In [None]:
#seed both random number generators so the run is reproducible:
#random.seed only affects Python's own generator, while scikit-learn falls back to
#numpy's global generator whenever an estimator is created without random_state
random.seed(30)
np.random.seed(30)

#read review data
reviews = pd.read_csv("../input/amazon-music-reviews/Musical_instruments_reviews.csv")

reviews.head()


In [None]:
#remove unnecessary features
#a single drop with errors='ignore' keeps this cell safe to re-run: the original
#`del` statements raise KeyError once the columns are already gone
UNUSED_COLUMNS = ['reviewerID', 'asin', 'unixReviewTime', 'reviewTime', 'reviewerName', 'helpful']
reviews = reviews.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [None]:
#check for missing values (printed explicitly: a bare expression in the middle of a
#cell produces no output)
print(reviews.isna().sum())

#combine the review text with the summary, treating missing values as ""
#the fill is done on the expressions themselves rather than with a chained
#`fillna(..., inplace=True)`, which does not update the frame under pandas copy-on-write;
#summary is filled too, because a single NaN operand would make the combined text NaN
reviews['review'] = reviews['reviewText'].fillna("") + ' ' + reviews['summary'].fillna("")

#the two source fields are no longer needed
reviews = reviews.drop(columns=['reviewText', 'summary'])


In [None]:
#ratings above 3 (i.e. 4 and 5) count as good = 1, everything else as bad = 0
is_good = reviews['overall'].gt(3)
reviews['overall'] = is_good.astype(int)

In [None]:
#At this point the frame holds `overall`, now a binary label (1 when the rating was above 3),
#and `review`, the review text followed by its summary
reviews.head()

In [None]:
#let's check the distribution of good (1) and bad (0) reviews
reviews.overall.value_counts()

In [None]:
#lets prep the data for modeling

#TfidfVectorizer is CountVectorizer followed by TfidfTransformer, so its output is
#already TF-IDF weighted and L2-normalised. The original cell passed that output through
#a second TfidfTransformer, which applied the IDF weighting and normalisation twice.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

#NOTE(review): the vocabulary and IDF weights are learned from all reviews, including
#the ones that later end up in the test split
features = tfidf.fit_transform(reviews.review)

labels = reviews.overall
features.shape


In [None]:
#use chi square test to list out the words used for good and bad reviews
N = 2

#get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for i in range(2):
    #chi2 returns the pair (statistics, p-values); always rank by the statistics at
    #index 0 (indexing with the class id ranked class 1 by p-value instead)
    features_chi2 = chi2(features, labels == i)
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    #NOTE(review): with only two classes the chi2 statistic is the same whichever class
    #is treated as positive, so both iterations list the same terms; the score says a
    #term separates the classes, not which class it favours
    print("# '{}':".format(i))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:
#split the data for training and testing
#stratify on the label so both splits keep the same good/bad ratio, consistent with
#the stratified k-fold used for cross validation
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 0, stratify = labels)


In [None]:
#I am planning to run multiple models and then choose the best one, so this function
#trains a model, scores its predictions on the hold-out set and cross validates it
#please note that I am using stratified KFold because the classes are not evenly distributed
def performClassification(name, estimator, X, y, X_train, y_train, X_test, y_test):
    """Fit ``estimator`` and return its evaluation scores as percentages.

    Parameters
    ----------
    name : str
        Label stored as the first element of the returned array.
    estimator : classifier with ``fit`` and ``predict``
        Fitted in place on ``X_train`` / ``y_train``.
    X, y : full feature matrix and labels
        Used only for the 8-fold stratified cross validation (ROC AUC scoring).
    X_train, y_train : data the estimator is fitted on.
    X_test, y_test : hold-out data the remaining scores are computed on.

    Returns
    -------
    pandas array
        ``[name, cv, accuracy, recall, precision, f1, roc_auc]`` where every score is
        multiplied by 100 and rounded to 3 decimals.
    """
    def as_percent(value):
        # All scores are reported on a 0-100 scale with 3 decimals.
        return round(value * 100, 3)

    model = estimator.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric: it needs continuous scores. Feeding it the hard 0/1
    # predictions collapses the curve to a single threshold, so prefer the decision
    # function or the positive-class probability and fall back to labels only when the
    # estimator offers neither.
    if hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    elif hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = y_pred

    kFold = StratifiedKFold(n_splits=8)

    # cross_val_score works on clones, so the estimator fitted above stays fitted on
    # the training split. np.asarray accepts a pandas Series as well as a plain array.
    cv_score = as_percent(cross_val_score(estimator, X, np.asarray(y).ravel(), cv=kFold, scoring='roc_auc').mean())

    accuracy = as_percent(accuracy_score(y_test, y_pred))
    recall = as_percent(recall_score(y_test, y_pred))
    precision = as_percent(precision_score(y_test, y_pred))
    f1 = as_percent(f1_score(y_test, y_pred))
    roc_auc = as_percent(roc_auc_score(y_test, y_score))

    returnArray = pd.array([name,cv_score,accuracy,recall,precision,f1,roc_auc])

    return returnArray

#create an empty data frame to store the scores; its columns match, in order, the
#values returned by performClassification
modelScores = pd.DataFrame(columns =['Name','CV','Accuracy','Recall','Precision','F1','Roc_Auc'])

In [None]:
#lets try different models
#every model is evaluated by the same helper on the same data
lm = LogisticRegression()
MNB = MultinomialNB()
SVC = LinearSVC()

candidate_models = [
    ('Logistic Regression', lm),
    ('Multinomial Naive Bayes', MNB),
    ('Linear SVM', SVC),
]

#DataFrame.append was removed in pandas 2.0, so collect one row per model and build the
#frame once; rebuilding it also means re-running this cell does not duplicate rows
score_rows = [
    list(performClassification(model_name, model, features, labels, X_train, y_train, X_test, y_test))
    for model_name, model in candidate_models
]
modelScores = pd.DataFrame(score_rows, columns=modelScores.columns)


In [None]:
#let's check the outcome
print(modelScores)
#the model could be chosen on several of these scores; I go ahead and choose Linear SVM based on the Roc_Auc score

As we can see, the best model is the Linear SVM, with about 91% accuracy.

In [None]:
#let's first check the classification report
y_pred = SVC.predict(X_test)

#unique() lists the classes in order of first appearance, while the report orders them
#by value; sort them and pass them as `labels` too so each name is attached to its own class
class_labels = np.sort(reviews['overall'].unique())

print(metrics.classification_report(y_test, y_pred, labels=class_labels, target_names=class_labels.astype(str)))

In [None]:
#now let's build the confusion matrix and plot the heatmap
#labels=[0, 1] guarantees a 2x2 matrix, which the four-way unpacking below relies on
conf_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_mat.ravel()

#rows are the actual classes and columns the predicted ones
class_names = ['bad (0)', 'good (1)']
fig, ax = plt.subplots()
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion matrix')
plt.show()

(tn, fp, fn, tp)

I have tried to achieve a few things in this notebook:
* run models and validate them using stratified K-fold cross validation
* compare multiple models using F1 score, AUC and accuracy

A few things I will try later:
* use more models, and use hyperparameter tuning for some of these models
* clean up the data and use other ways to generate vectors from the text