# Problem Statement

In this problem we want to study how effectively traditional models can use text data as a predictor variable for machine learning. We are given a dataset of user reviews about certain products, with an overall Score for each product.

We want to predict product score based on the comments provided for that product

# Approach

* The overall scores are binned into 2 labels and named Overall Sentiment - 1 indicating a positive score (>3) and 0 indicating a negative score (<3)
* All scores =3 are removed as they are neutral and won't give any insight into positivity/negativity towards a product
* The text data is also converted into a corpus. Corpus is a collection of sentences (called documents) and is the data type understood by textmining library in R
* Corpus data is then cleaned by removing commonly used symbols and punctuation
* The cleaned corpus is then converted into a Matrix containing TF-IDF values. Term Frequency - Inverse Document Frequency (TF-IDF) is a metric used to identify the relative importance of each word in a sentence based on its frequency of appearance in that sentence and across the corpus
* The words with their IDF weights are used as predictors while the Overall Sentiment is treated as a dependent variable
* The text data is then also fed into Sentiment analyser <code>VaderSentiment</code> that scores each sentence based on a predefined dictionary and labels them into <code>Positive</code>,<code>Negative</code> & <code>Neutral</code> classes
* Since sentiment data renders imbalance class output, we will use <code>SMOTE</code> to balance both classes
* This data is then split into test and train and different classifiers are trained on the training data and used to predict Overall Sentiment score (0 or 1) for test data
* This exercise is done using Traditional as well as Deep learning models
* AUC-ROC (Area Under the Curve of the Receiver Operating Characteristic) is used to pick the final classifier from the lot
* Confusion matrix is then used to validate the final model results

# Solution

## Global Imports

In [None]:
import numpy as np #Linear Algebra
import pandas as pd #Importing and manipulating datasets

#String manipulation and search
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from statistics import mean

from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

#Library to show progress bar for specific operations
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [None]:
# Load the raw reviews CSV; row 0 of the file is used as the header.
reviews = pd.read_csv('../input/reviews/Reviews.csv', header = 0)

In [None]:
# Preview the first two rows of the loaded data.
reviews.head(2)

In [None]:
#Selecting only the relevant columns.
#An explicit copy is taken because new columns are assigned to baseData further down;
#without it pandas treats baseData as a slice of reviews and emits SettingWithCopyWarning.
baseData = reviews[['Text','Score']].copy()

#### Creating datasets for different use-cases

* Positive (4&5 scores) and Negative (1&2 scores) buckets and filtering out Neutral (3 scores)
* Positive (4&5 scores), Neutral (3 scores) and Negative (1&2 scores) buckets
* Multiclass dataset (without any bucketing)
* 3 class dataset (Positive, Negative & Neutral) using <code>VaderSentiment</code>

In [None]:
#Creating a function to bucket score values based on a flag
def bucket(flag,x):
    """Map a numeric review score to a sentiment bucket.

    Parameters
    ----------
    flag : int
        0 selects the two-class scheme, 1 selects the three-class scheme.
    x : number
        Review score to bucket.

    Returns
    -------
    int
        flag 0: 0 (negative, x<3), 1 (positive, x>3), -1 when x==3.
        flag 1: 0 (negative, x<3), 1 (neutral, x==3), 2 (positive, x>3).
        -1 for any other flag, or when no comparison holds (e.g. NaN).
    """
    if flag == 0:
        if x < 3:
            return 0 #Negative
        if x > 3:
            return 1 #Positive
    elif flag == 1:
        if x < 3:
            return 0 #Negative
        if x == 3:
            return 1 #Neutral
        if x > 3:
            return 2 #Positive
    # Unknown flag, a neutral score in the two-class scheme, or an unorderable score
    return -1

In [None]:
# Creating Positive and Negative buckets from data and filtering out Neutral scores
# (bucket() returns -1 for a score of 3 when flag is 0; those rows are dropped below)
print ("Creating Positive/Negative dataset")
%time baseData['PosNegFlag'] = [bucket(0,score) for score in baseData['Score']]
posnegdata = baseData[baseData['PosNegFlag']!=-1][['Text','PosNegFlag']].rename(columns={'PosNegFlag':'Flag'})

# Creating Positive, Neutral & Negative buckets from data
# (with flag 1 the buckets are 0-Negative, 1-Neutral, 2-Positive; no rows are filtered here)
print ("-"*80,"\nCreating Positive/Negative/Neutral dataset")
%time baseData['PosNegNeuFlag'] = [bucket(1,score) for score in baseData['Score']]
posnegneudata = baseData[['Text','PosNegNeuFlag']].rename(columns={'PosNegNeuFlag':'Flag'})

Creating 3 buckets using <code>VADER</code> library

In [None]:
# Install the VADER package from inside the notebook (shell escape), then import its analyzer class.
print("Installing vadersentiment library\n")
%time !pip install vadersentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Creating a function to build sentiment score
_vader_analyzer = None # Shared analyzer, created on first use; building one per review is needlessly slow

def generateSentimentScore(sentence):
    """Return the VADER "compound" polarity score of a piece of text.

    Parameters
    ----------
    sentence : str
        Raw review text; anything matching ``<...>`` is stripped before scoring.

    Returns
    -------
    The value stored under the "compound" key of ``polarity_scores``.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    sentence = re.sub(r"<.*?>", "", sentence) #Removing HTML tags
    sentiment_scores = _vader_analyzer.polarity_scores(sentence)
    return sentiment_scores["compound"]

def bucketScores(score):
    """Turn a compound sentiment score into a class label.

    Returns 2 (positive) for scores of 0.05 and above, 0 (negative) for
    scores of -0.05 and below, and 1 (neutral) for everything else.
    """
    if score >= 0.05:
        return 2 #Positive
    return 0 if score <= -0.05 else 1 #Negative, otherwise Neutral

#Creating a subset of data to run Vader Sentiment on (30000 rows, fixed seed)
vaderData = baseData.sample(30000,random_state=10)
print("-"*80,"\nScoring reviews with Vader sentiment library to generate sentiment scores")
%time vaderData['Sent'] = vaderData['Text'].progress_apply(generateSentimentScore)


print("-"*80,"\nCreating sentiment buckets (0-Negative/1-Neutral/2-Positive)")
%time vaderData['Score_Sent'] = vaderData['Sent'].progress_apply(bucketScores)
# Quantile-based 5-way split of the sentiment score; the +1 shifts the bin numbers 0..4 to 1..5
vaderData['Score_Sent_5'] = pd.qcut(vaderData.Sent,5,labels=False)+1
vaderData.head(3)

In [None]:
from textblob import TextBlob

#Creating a function to build sentiment score
def generateSentimentScore2(sentence):
    """Score a piece of text with TextBlob.

    Anything matching ``<...>`` is stripped first; the first element of
    TextBlob's ``sentiment`` result is returned.
    """
    cleaned = re.sub(r"<.*?>", "", sentence) #Removing HTML tags
    blob = TextBlob(cleaned)
    return blob.sentiment[0]

#Creating a function to bucket sentiment score
def bucketScores2(score):
    """Turn a signed sentiment score into a class label.

    Returns 2 (positive) for scores above zero, 0 (negative) for scores
    below zero, and 1 (neutral) otherwise.
    """
    if score > 0:
        return 2 #Positive
    return 0 if score < 0 else 1 #Negative, otherwise Neutral
    
#Creating a subset of data to run TextBlob on (same size and seed as the Vader subset)
textblobData = baseData.sample(30000,random_state=10)
%time textblobData['Sent'] = textblobData['Text'].progress_apply(generateSentimentScore2)

print("-"*80,"\nCreating sentiment buckets (0-Negative/1-Neutral/2-Positive)")
%time textblobData['Score_Sent'] = textblobData['Sent'].progress_apply(bucketScores2)
# Quantile-based 5-way split of the sentiment score; the +1 shifts the bin numbers 0..4 to 1..5
textblobData['Score_Sent_5'] = pd.qcut(textblobData.Sent,5,labels=False)+1
textblobData.head(3)

### Comparing the effectiveness of Sentiment Analysis libraries

In [None]:
#Normalizing scores: both the review Score and the Vader score are min-max scaled
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Placeholder columns, overwritten by the scaled values two lines below
vaderData['ScoreScaled'] = 0
vaderData['SentScaled'] = 0
vaderData[['ScoreScaled','SentScaled']] = scaler.fit_transform(vaderData[['Score','Sent']])

#Plotting correlations between the two scaled columns as an annotated 2x2 heatmap
plt.figure(figsize=(3.5,2))
sns.heatmap(vaderData[['ScoreScaled','SentScaled']].corr(),annot=True,fmt='.2g',cmap='Blues')
plt.title('Correlation between Base Scores and Vader sentiment scores',size=14)
plt.xticks(rotation=0)
plt.show()

In [None]:
#Normalizing scores: both the review Score and the TextBlob score are min-max scaled
# (MinMaxScaler is imported in the Vader correlation cell)
scaler = MinMaxScaler()
# Placeholder columns, overwritten by the scaled values two lines below
textblobData['ScoreScaled'] = 0
textblobData['SentScaled'] = 0
textblobData[['ScoreScaled','SentScaled']] = scaler.fit_transform(textblobData[['Score','Sent']])

#Plotting correlations between the two scaled columns as an annotated 2x2 heatmap
plt.figure(figsize=(3.5,2))
sns.heatmap(textblobData[['ScoreScaled','SentScaled']].corr(),annot=True,fmt='.2g',cmap='Blues')
plt.title('Correlation between Base Scores and TextBlob sentiment scores',size=14)
plt.xticks(rotation=0)
plt.show()

As seen from the correlation plot, the VaderSentiment outputs slightly higher correlation with the actual score given to the text labels as compared to that of TextBlob.

#### Creating random and stratified samples from the data

In [None]:
#Creating a function to take stratified samples
def stratifiedSample(df,column,samplesize=1000,random_state=None):
    """Draw an (approximately) equal number of rows from every class of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from.
    column : str
        Name of the column holding the class labels.
    samplesize : int, default 1000
        Target total size; each class contributes ``int(samplesize / n_classes)``
        rows, or all of its rows when it has fewer than that.
    random_state : int or None, default None
        Seed handed to ``DataFrame.sample`` for every class. ``None`` keeps the
        sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        Sampled rows with their original index, classes in sorted order.
    """
    # Iterating the groupby (instead of groupby.apply) keeps the label column in every
    # group and avoids the deprecated "apply operated on the grouping columns" path.
    groups = [group for _, group in df.groupby(column)]
    if not groups:
        return df.iloc[0:0]
    per_class = int(samplesize/len(groups))
    sampled = [group.sample(min(len(group), per_class), random_state=random_state) for group in groups]
    return pd.concat(sampled)

In [None]:
# Number of rows drawn for every sample below
samplesize=1000

#Random sampling (fixed seed); the label column is renamed to 'Y' and the index reset to 0..n-1
%time posneg_sample = posnegdata.sample(samplesize,random_state=10).rename(columns={'Flag':'Y'}).reset_index(drop=True)
%time posnegneu_sample = posnegneudata.sample(samplesize,random_state=10).rename(columns={'Flag':'Y'}).reset_index(drop=True)
%time base_sample = baseData[['Text','Score']].sample(samplesize,random_state=10).rename(columns={'Score':'Y'}).reset_index(drop=True)
%time vader_sample = vaderData[['Text','Score_Sent']].sample(samplesize,random_state=10).rename(columns={'Score_Sent':'Y'}).reset_index(drop=True)

#Stratified sampling; same renaming and index reset as above.
# NOTE(review): no seed is passed here, so these samples differ from run to run.
%time posneg_st_sample = stratifiedSample(posnegdata,'Flag',samplesize).rename(columns={'Flag':'Y'}).reset_index(drop=True)
%time posnegneu_st_sample = stratifiedSample(posnegneudata,'Flag',samplesize).rename(columns={'Flag':'Y'}).reset_index(drop=True)
%time base_st_sample = stratifiedSample(baseData[['Text','Score']],'Score',samplesize).rename(columns={'Score':'Y'}).reset_index(drop=True)
%time vader_st_sample = stratifiedSample(vaderData[['Text','Score_Sent']],'Score_Sent',samplesize).rename(columns={'Score_Sent':'Y'}).reset_index(drop=True)

#### Text pre-processing

In [None]:
#Defining functions and variables required for processing of Texts

# Single Porter stemmer instance shared by the text-processing functions
stemmer = PorterStemmer()
# NOTE(review): stopwords is imported but not referenced in the visible code — confirm whether stop-word removal was intended
from nltk.corpus import stopwords

def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem`` applied to every token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Tokenize text with NLTK, stem every token, and re-join with single spaces."""
    return ' '.join(stem_tokens(word_tokenize(text), stemmer))

# Translation table that turns every ASCII punctuation character into a space
symbols = string.punctuation
replacement = " " * len(symbols) # One space per symbol, derived so the two strings can never get out of sync
mappingDict = str.maketrans(symbols, replacement)

# Shared vectorizer objects, both with default settings
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

def processData(data):
    """Turn a Series of raw texts into a frame of the texts plus their TF-IDF features.

    Each text is lower-cased, stripped of non-word characters, tokenized and
    stemmed; the cleaned corpus is then vectorized with the module-level
    ``count_vect`` and ``tfidf_transformer``, which are re-fitted on every call.

    Parameters
    ----------
    data : pandas.Series
        Texts to process.

    Returns
    -------
    pandas.DataFrame
        The input text (index reset) as the first column, followed by one dense
        TF-IDF column per vocabulary term, named 0..n_terms-1.
    """
    corpus = []
    for comment in data:
        comment = str(comment).lower()
        # Replace runs of non-word characters with ONE SPACE. Substituting the empty
        # string also deletes the whitespace, which glues the whole review into a
        # single token and leaves the vectorizer with one "word" per document.
        comment = re.sub(r'[\W_]+', ' ', comment)
        comment = comment.translate(mappingDict)
        corpus.append(tokenize(comment))

    data_counts = count_vect.fit_transform(corpus)
    data_tfidf = tfidf_transformer.fit_transform(data_counts)
    # NOTE: the matrix is densified; with a real per-word vocabulary this is
    # n_documents x n_terms floats, so keep the input size moderate.
    features = pd.DataFrame(data_tfidf.toarray())
    return pd.concat([data.reset_index(drop=True), features], axis=1)

In [None]:
# Running data processing steps on following dataframes one by one.
# Each *_x holds the text plus its TF-IDF columns, each *_y the matching labels.
# processData re-fits the shared vectorizers on every call, so the feature columns of
# different datasets are not comparable with each other.
# 1. posneg_sample
%time posneg_sample_x = processData(posneg_sample['Text'])
posneg_sample_y = posneg_sample['Y']

# 2. posnegneu_sample
%time posnegneu_sample_x = processData(posnegneu_sample['Text'])
posnegneu_sample_y = posnegneu_sample['Y']

# 3. base_sample
%time base_sample_x = processData(base_sample['Text'])
base_sample_y = base_sample['Y']

# 4. vader_sample
%time vader_sample_x = processData(vader_sample['Text'])
vader_sample_y = vader_sample['Y']

# 5. posneg_st_sample
%time posneg_st_sample_x = processData(posneg_st_sample['Text'])
posneg_st_sample_y = posneg_st_sample['Y']

# 6. posnegneu_st_sample
%time posnegneu_st_sample_x = processData(posnegneu_st_sample['Text'])
posnegneu_st_sample_y = posnegneu_st_sample['Y']

# 7. base_st_sample
%time base_st_sample_x = processData(base_st_sample['Text'])
base_st_sample_y = base_st_sample['Y']

# 8. vader_st_sample
%time vader_st_sample_x = processData(vader_st_sample['Text'])
vader_st_sample_y = vader_st_sample['Y']

#### Balancing the data using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

def balanceData(data_x,data_y,label='',text_column='Text'):
    """Oversample the minority classes with SMOTE.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature frame that still contains the raw text column.
    data_y : pandas.Series
        Class labels aligned with ``data_x``.
    label : str, optional
        Unused; kept so existing calls keep working.
    text_column : str, default 'Text'
        Name of the non-numeric column to drop before resampling.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Resampled features (without the text column) and labels.
    """
    sm = SMOTE(random_state=12)
    features = data_x.drop(text_column,axis=1)
    # fit_resample is the current imbalanced-learn API; the older fit_sample alias has been removed
    data_x,data_y = sm.fit_resample(features, np.asarray(data_y))
    data_x = pd.DataFrame(data_x)
    data_y = pd.Series(data_y)
    return data_x,data_y

In [None]:
# SMOTE-balanced versions of the four randomly sampled datasets
%time posneg_smote_x,posneg_smote_y = balanceData(posneg_sample_x,posneg_sample_y,'Pos/Neg')
%time posnegneu_smote_x,posnegneu_smote_y = balanceData(posnegneu_sample_x,posnegneu_sample_y,'Pos/Neg/Neu')
%time base_smote_x,base_smote_y = balanceData(base_sample_x,base_sample_y,'5 Class')
%time vader_smote_x,vader_smote_y = balanceData(vader_sample_x,vader_sample_y,'3 Class Vader')

#### Visualizing the final datasets to be used for the exercise

In [None]:
# Class-count bar chart for every generated dataset, laid out on a 3x4 grid:
# row 0 random samples, row 1 stratified samples, row 2 SMOTE-balanced samples.
panels = [
    (posneg_sample_y, "Pos/Neg Random"),
    (posnegneu_sample_y, "Pos/Neg/Neu Random"),
    (base_sample_y, "5 Class Random"),
    (vader_sample_y, "Vader 3 Class Random"),
    (posneg_st_sample_y, "Pos/Neg Stratified"),
    (posnegneu_st_sample_y, "Pos/Neg/Neu Stratified"),
    (base_st_sample_y, "5 Class Stratified"),
    (vader_st_sample_y, "Vader 3 Class Stratified"),
    (posneg_smote_y, "Pos/Neg with SMOTE"),
    (posnegneu_smote_y, "Pos/Neg/Neu with SMOTE"),
    (base_smote_y, "5 Class with SMOTE"),
    (vader_smote_y, "Vader 3 Class with SMOTE"),
]

fig,axes = plt.subplots(3,4,figsize=(20,15))
# axes.flat walks the grid row by row, matching the order of the panels list.
# The labels are passed as x= explicitly: recent seaborn releases treat a bare
# positional argument as `data`, not as the variable to count.
for ax,(labels,title) in zip(axes.flat, panels):
    sns.countplot(x=labels,ax=ax)
    ax.set(title=title)
plt.suptitle("Distribution of classes in all generated datasets (Sample size="+str(samplesize)+")",size=14)
plt.show()

In [None]:
#Creating some functions for use in modelling

def plot_roc_curve(fprs, tprs):
    """Plot per-fold ROC curves together with their mean and a +/- 1 std band.

    Parameters
    ----------
    fprs : list of array-like
        False positive rates, one array per fold.
    tprs : list of array-like
        True positive rates, one array per fold, aligned with ``fprs``.

    Returns
    -------
    (f, ax)
        The matplotlib figure and axes that were drawn on. The figure is
        shown with ``plt.show()`` before returning.
    """
    
    # Initialize useful lists + the plot axes.
    tprs_interp = []
    aucs = []
    # Common FPR grid (100 points on [0, 1]) onto which every fold is interpolated
    mean_fpr = np.linspace(0, 1, 100)
    f, ax = plt.subplots(figsize=(5,5))
    
    # Plot ROC for each K-Fold + compute AUC scores.
    for i, (fpr, tpr) in enumerate(zip(fprs, tprs)):
        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))
        # Force the interpolated curve to start at TPR 0
        tprs_interp[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        ax.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        
    # Plot the luck line.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Random', alpha=.8)
    
    # Plot the mean ROC.
    mean_tpr = np.mean(tprs_interp, axis=0)
    # Force the mean curve to end at TPR 1
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    
    # Plot the standard deviation around the mean ROC (clipped to [0, 1]).
    std_tpr = np.std(tprs_interp, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    
    # Fine tune and show the plot.
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()
    return (f, ax)

def compute_roc_auc(index,clf,X=None,y=None):
    """Compute the ROC curve and its AUC for a fitted binary classifier on selected rows.

    Parameters
    ----------
    index : array-like of int
        Positional row indices to evaluate on.
    clf : fitted classifier
        Must provide ``predict_proba``; column 1 is used as the score.
    X, y : pandas objects, optional
        Features and labels that ``index`` refers to. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used, which keeps the
        old two-argument call working.

    Returns
    -------
    (fpr, tpr, auc_score)
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    return fpr, tpr, auc_score

def runCrossVal(X_train,y_train,clf):
    """Run seeded, shuffled 10-fold stratified cross-validation for a binary classifier.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows.
    y_train : pandas.Series
        Binary labels aligned with ``X_train``.
    clf : classifier
        Re-fitted on the training part of every fold (left fitted on the last fold).

    Returns
    -------
    (fprs, tprs, scores)
        Per-fold test FPR arrays, per-fold test TPR arrays, and a list of
        ``(train_auc, test_auc)`` tuples, one per fold.
    """
    cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
    fprs, tprs, scores = [], [], []
    for train, test in cv.split(X_train, y_train):
        clf.fit(X_train.iloc[train], y_train.iloc[train])
        # The data is passed explicitly so the scores are computed on the frames given to
        # this function, not on whatever X_train / y_train happen to be at notebook level.
        _, _, auc_score_train = compute_roc_auc(train,clf,X_train,y_train)
        fpr, tpr, auc_score = compute_roc_auc(test,clf,X_train,y_train)
        scores.append((auc_score_train, auc_score))
        fprs.append(fpr)
        tprs.append(tpr)
    return fprs,tprs,scores

def runCrossVal_multiclass(X_train,y_train,clf):
    """Cross-validate a classifier on binarized labels and plot one ROC curve per class.

    The labels are one-hot encoded with ``label_binarize``, 10-fold
    ``cross_val_predict`` is run on the encoded labels, and a ROC curve is drawn
    for every encoded column.

    Returns
    -------
    dict
        Maps the column position (0-based int) to that column's AUC.

    Notes
    -----
    Pipeline, label_binarize, cross_val_predict and cycle must be imported
    before this function is called.
    """
    #Define tasks to be performed
    pipe= Pipeline([('clf', clf)])
    #Binarizing multiple classes
    y_bin = label_binarize(y_train, classes=list(np.unique(y_train)))
    n_classes = y_bin.shape[1]
    
    #Running cross_val
    # NOTE(review): method='predict' yields hard predictions, so each ROC curve below is built
    # from predicted labels rather than continuous scores — confirm this is intended.
    y_score = cross_val_predict(pipe, X_train, y_bin, cv=10 ,method='predict')
    
    #Plotting results
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    colors = cycle(['blue', 'red', 'green','cyan','black'])
    # The legend shows the column position + 1, which is not necessarily the class value
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i+1, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for multi-class data')
    plt.legend(loc="lower right")
    plt.show()
    return roc_auc

def splitData(data_x,data_y):
    """Split features and labels into train/test parts (25% test, fixed seed 12).

    Returns whatever ``train_test_split`` returns for the two inputs.
    """
    parts = train_test_split(data_x, data_y, random_state=12, test_size=0.25)
    return parts

### Model selection using Cross Validation

Now, we have 12 datasets in total:
* 2 class random sample
* 2 class stratified sample
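* 2 class SMOTE balanced sample
* 3 class (Positive/Negative/Neutral) random sample
* 3 class (Positive/Negative/Neutral) stratified sample
* 3 class (Positive/Negative/Neutral) SMOTE balanced sample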
* 3 class Vader Sentiment random sample
* 3 class Vader Sentiment stratified sample
* 3 class Vader Sentiment SMOTE balanced sample
* 5 class random sample
* 5 class stratified sample
* 5 class SMOTE balanced sample


We'll be using a list of binary classifiers: RandomForest, Naive Bayes, LogisticRegression, SVM, KNN and MLP, and a list of multiclass classifiers: RandomForest, KNN and MLP. We'll generate all combinations of 2 class datasets with binary classifiers and of multiclass datasets with multiclass classifiers, run each of them through 10-fold cross validation, and plot the ROC-AUC for every combination.

This will give us a series of graphs to look at and help us decide which Classifier-Dataset combination gives the best result.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# Only the non-default settings are spelled out. Every other argument of the earlier, fully
# enumerated call was a library default, and two of them (min_impurity_split and
# max_features='auto') are rejected by current scikit-learn releases.
clf_rf = RandomForestClassifier(
    n_estimators=50, max_depth=5, n_jobs=-1, random_state=0, class_weight='balanced'
)

clf_nb = BernoulliNB(alpha=1.0)

# Very large C, i.e. almost no regularization
clf_lr = linear_model.LogisticRegression(C=1e5)

# probability=True is needed so that predict_proba is available for the ROC computation
clf_svm = SVC(gamma='auto',probability=True)

clf_knn = KNeighborsClassifier(n_neighbors=2)

clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(25,), random_state=1)

#Creating a list of all classifiers
binary_classifiers = [clf_rf,clf_nb,clf_lr,clf_svm,clf_knn,clf_mlp]
multiclass_classifiers = [clf_rf,clf_knn,clf_mlp]

In [None]:
# Dataset registries: name -> [features, labels].
# Names containing "smote" hold frames without the 'Text' column; the others still carry it.
binary_datasets = {'posneg_sample':[posneg_sample_x,posneg_sample_y],
                      'posneg_st_sample':[posneg_st_sample_x,posneg_st_sample_y],
                      'posneg_smote':[posneg_smote_x,posneg_smote_y]}

multiclass_datasets = {'posnegneu_sample':[posnegneu_sample_x,posnegneu_sample_y],
                       'base_sample':[base_sample_x,base_sample_y],
                       'vader_sample':[vader_sample_x,vader_sample_y],
                       'posnegneu_st_sample':[posnegneu_st_sample_x,posnegneu_st_sample_y],
                       'base_st_sample':[base_st_sample_x,base_st_sample_y],
                       'vader_st_sample':[vader_st_sample_x,vader_st_sample_y],
                       'posnegneu_smote':[posnegneu_smote_x,posnegneu_smote_y],
                       'base_smote':[base_smote_x,base_smote_y],
                       'vader_smote':[vader_smote_x,vader_smote_y]}

#### Binary classification

In [None]:
#Binary classifications

accdf = pd.DataFrame(columns=["Model","Data","AUC"])
i = 0

for clf in binary_classifiers:
    for data,dataxy in binary_datasets.items():
        print(str(clf).split('(')[0]," with ",data)
        if "smote" not in data:
            X_train, X_test, y_train, y_test = splitData(dataxy[0].drop('Text',axis=1),dataxy[1])
        else:
            X_train, X_test, y_train, y_test = splitData(dataxy[0],dataxy[1])
        %time fprs,tprs,score = runCrossVal(X_train,y_train,clf)
        plot_roc_curve(fprs, tprs)
        %time clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print ("Model performance metrics for ",str(clf).split('(')[0]," with ",data)
        print(metrics.classification_report(y_test, y_pred))
        accdf.loc[i] = [str(clf).split('(')[0], data, mean(score[1])]
        i+=1
        print("-"*80)

In [None]:
# Rank the binary model/dataset combinations, best AUC first
accdf.sort_values(by="AUC",ascending=False)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import label_binarize
from sklearn.pipeline import Pipeline
from itertools import cycle

#### Multiclass Classification

In [None]:
accdf = pd.DataFrame(columns=["Model","Data"])
i = 0

for clf in multiclass_classifiers:
    for data,dataxy in multiclass_datasets.items():
        print(str(clf).split('(')[0]," with ",data)
        if "smote" not in data:
            %time score = runCrossVal_multiclass(dataxy[0].drop('Text',axis=1),dataxy[1],clf)
            X_train, X_test, y_train, y_test = splitData(dataxy[0].drop('Text',axis=1),dataxy[1])
        else:
            %time score = runCrossVal_multiclass(dataxy[0],dataxy[1],clf)
            X_train, X_test, y_train, y_test = splitData(dataxy[0],dataxy[1])
        %time clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print ("Model performance metrics for ",str(clf).split('(')[0]," with ",data)
        print(metrics.classification_report(y_test, y_pred))
        accdf = accdf.append(score, ignore_index=True)
        accdf.loc[i,"Model"] = str(clf).split('(')[0]
        accdf.loc[i,"Data"] = data
        i+=1
        print("-"*80)

In [None]:
# Row-wise mean over the per-class AUC columns only (missing classes are skipped).
# The columns are selected by name, so re-running this cell does not fold an existing
# AvgAUC value into the new average.
auc_columns = accdf.drop(columns=["Model","Data","AvgAUC"], errors="ignore")
accdf["AvgAUC"] = auc_columns.astype(float).mean(axis=1)

In [None]:
# Rank the multiclass model/dataset combinations, best average AUC first
accdf.sort_values(by="AvgAUC",ascending=False)

From these results, following combinations gave the best results:

__Binary Classification:__
* <code>LogisticRegression Classifier</code> with 2 class (Positive/Negative) dataset balanced with <code>SMOTE</code> : __AUC 0.99__
* <code>MultiLayerPerceptron Classifier</code> with 2 class (Positive/Negative) dataset balanced with <code>SMOTE</code> : __AUC 0.99__
* <code>RandomForest Classifier</code> with 2 class (Positive/Negative) dataset balanced with <code>SMOTE</code> : __AUC 0.93__
* <code>Bernoulli Naive Bayes Classifier</code> with 2 class (Positive/Negative) dataset balanced with <code>SMOTE</code> : __AUC 0.90__

__Multiclass Classification:__
* <code>MultiLayerPerceptron Classifier</code> with 3 class (Positive/Negative/Neutral) dataset balanced with <code>SMOTE</code> : __AUC 0.81,0.99,0.78__
* <code>KNN Classifier</code> with 3 class (Positive/Negative/Neutral) dataset balanced with <code>SMOTE</code> : __AUC 0.93,0.87,0.50__
* <code>MultiLayerPerceptron Classifier</code> with 3 class <code>VaderSentiment</code> (Positive/Negative/Neutral) dataset balanced with <code>SMOTE</code> : __AUC 0.75,0.95,0.97__
* <code>KNN Classifier</code> with 3 class <code>VaderSentiment</code> (Positive/Negative/Neutral) dataset balanced with <code>SMOTE</code> : __AUC 0.51,0.96,0.83__
* <code>MultiLayerPerceptron Classifier</code> with 5 class dataset balanced with <code>SMOTE</code> : __AUC 0.99,1,0.78,0.97,0.83__
* <code>KNN Classifier</code> with 5 class dataset balanced with <code>SMOTE</code> : __AUC 0.99,0.98,0.99,0.96,0.50__

# Modelling

#### Binary Classification

Let's select the best performing model and plot classification metrics for the test datasets

In [None]:
# MLP with PosNeg SMOTE dataset: 75/25 split, fit on the training part, predict the test part
X_train, X_test, y_train, y_test = splitData(posneg_smote_x,posneg_smote_y)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(25,), random_state=1)
%time clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out test split
print ("Model performance metrics for MLP with 2 class (pos/neg) dataset balanced with SMOTE")
print(metrics.classification_report(y_test, y_pred))

In [None]:
# A ROC curve needs a continuous score per sample. With hard 0/1 predictions the "curve"
# has a single operating point and the AUC is understated, so the predicted probability
# of the positive class (column 1) is used instead.
y_score = clf.predict_proba(X_test)[:,1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.plot(false_positive_rate, true_positive_rate, label='%s: AUC %0.2f'% ("MLP",roc_auc))

plt.title('ROC curve for MLP Classifier with 2 class (Pos/Neg) dataset balanced with SMOTE')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--') # chance line
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Raw confusion matrix (rows = true labels, columns = predictions) ...
cm = confusion_matrix(y_test, y_pred)
# ... and a row-normalized copy, so every row sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
np.set_printoptions(precision=2)

In [None]:
# confusion_matrix orders rows and columns by sorted label value, so position 0 is the
# negative class (0) and position 1 the positive class (1); the tick labels follow that order.
class_names = ['Negative','Positive']
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,3))
ax1.set(title = "Confusion Matrix")
ax2.set(title = "Normalized Confusion Matrix")
cm_plt1 = sns.heatmap(cm, annot=True, ax = ax1, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = class_names, yticklabels = class_names);
cm_plt2 = sns.heatmap(cm_normalized, annot=True, ax = ax2, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = class_names, yticklabels = class_names);
cm_plt1.set_xlabel('Predicted')
cm_plt1.set_ylabel('True')
cm_plt2.set_xlabel('Predicted')
cm_plt2.set_ylabel('True')
plt.suptitle('Confusion matrices for MLP Classifier',size=14).set_position([.5, 1.05])
plt.show()

In [None]:
#Misclassified records
# y_test is indexed by row position in the SMOTE-resampled data, while the review text only
# exists in posneg_sample_x (the data before resampling). Positions at or beyond
# len(posneg_sample_x) are synthetic samples with no source text, so they are skipped
# instead of raising an out-of-range error.
# NOTE(review): this assumes SMOTE keeps the original rows first and appends the synthetic
# ones after them — confirm for the installed imbalanced-learn version.
mis_mask = (y_test != y_pred).to_numpy()
n_original = len(posneg_sample_x)
for i, predicted in zip(y_test.index[mis_mask], y_pred[mis_mask]):
    if i >= n_original:
        continue
    print("Mis-classified record:")
    print(posneg_sample_x.iloc[[i]].Text)
    print("Actual Label : ",y_test.loc[i])
    print("Predicted Label : ",predicted)
    print("-"*80)
    

#### Multiclass classification

In [None]:
# MLP with the 3 class Vader SMOTE dataset: 75/25 split, fit on the training part, predict the test part
X_train, X_test, y_train, y_test = splitData(vader_smote_x,vader_smote_y)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(25,), random_state=1)
%time clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out test split.
# The heading names the dataset actually evaluated here (3 class Vader, not 2 class pos/neg).
print ("Model performance metrics for MLP with 3 class Vader dataset balanced with SMOTE")
print(metrics.classification_report(y_test, y_pred))

In [None]:
#Binarizing multiple classes
# Both vectors are binarized against the SAME class list so that column i means the same
# class in y_test_bin and y_pred_bin; deriving the list from y_pred separately would shift
# the columns whenever the model never predicts one of the classes.
classes = list(np.unique(y_test))
y_test_bin = label_binarize(y_test, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)
n_classes = y_test_bin.shape[1]


#Plotting results: one one-vs-rest ROC curve per class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'red', 'green','cyan','black'])
for i, color in zip(range(n_classes), colors):
    # The legend shows the real class value rather than the column position
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(classes[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()

In [None]:
def plot_cm(cm,cm_normalized,cls):
    """Draw a raw and a row-normalized one-vs-rest confusion matrix side by side.

    Parameters
    ----------
    cm : 2x2 array
        Confusion matrix of one binarized class.
    cm_normalized : 2x2 array
        The same matrix with every row scaled to sum to 1.
    cls :
        Identifier of the class, used in the figure title and tick labels.
    """
    # A matrix built from 0/1 indicator columns has "not this class" at position 0 and
    # "this class" at position 1, so the tick labels are listed in that order.
    tick_labels = ['Rest','Class '+str(cls)]
    fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,3))
    ax1.set(title = "Confusion Matrix")
    ax2.set(title = "Normalized Confusion Matrix")
    cm_plt1 = sns.heatmap(cm, annot=True, ax = ax1, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = tick_labels, yticklabels = tick_labels);
    cm_plt2 = sns.heatmap(cm_normalized, annot=True, ax = ax2, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = tick_labels, yticklabels = tick_labels);
    cm_plt1.set_xlabel('Predicted')
    cm_plt1.set_ylabel('True')
    cm_plt2.set_xlabel('Predicted')
    cm_plt2.set_ylabel('True')
    plt.suptitle('Confusion matrices for MLP Classifier for class '+str(cls),size=14).set_position([.5, 1.05])
    plt.show()

# One one-vs-rest confusion matrix per binarized class column
for i in range(n_classes):
    cm = confusion_matrix(y_test_bin[:, i], y_pred_bin[:, i])
    # Row-normalize so every true-label row sums to 1
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    np.set_printoptions(precision=2)
    # The column position (not the class value) is passed as the class identifier
    plot_cm(cm,cm_normalized,i)

In [None]:
#Misclassified records
# y_test is indexed by row position in the SMOTE-resampled data, while the review text only
# exists in vader_sample_x (the data before resampling). Positions at or beyond
# len(vader_sample_x) are synthetic samples with no source text, so they are skipped
# instead of raising an out-of-range error.
# NOTE(review): this assumes SMOTE keeps the original rows first and appends the synthetic
# ones after them — confirm for the installed imbalanced-learn version.
mis_mask = (y_test != y_pred).to_numpy()
n_original = len(vader_sample_x)
for i, predicted in zip(y_test.index[mis_mask], y_pred[mis_mask]):
    if i >= n_original:
        continue
    print("Mis-classified record:")
    print(vader_sample_x.iloc[[i]].Text)
    print("Actual Label : ",y_test.loc[i])
    print("Predicted Label : ",predicted)
    print("-"*80)

# Interpretation of results

### Traditional Models:

__Binary Classification__: In the case of 2 classes, tree-based classifiers like <code>RandomForest</code> do not perform as well as simple linear classifiers like <code>LogisticRegression</code>. Also, balancing the data with <code>SMOTE</code> improves the accuracy by a large margin.

__Multiclass Classification__: <code>K-Nearest Neighbours</code> classifier performed the best among the selected classifiers in both 3 and 5 class datasets. Balancing the data with <code>SMOTE</code> turns out to be a bonus here as well.

### Deep Learning Models:
<code>MultiLayer Perceptron</code> based neural network classifiers outperformed the traditional classifiers in every aspect. Their performance is aided by presence of class balancing techniques like <code>SMOTE</code>

# 3rd Party dataset

In [None]:
import os
# Print the full path of every file found anywhere below /kaggle/input
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the Yelp reviews and keep only the review text and its star rating
yelpreviews = pd.read_csv('../input/yelp-reviews-dataset/yelp.csv')
yelpreviews = yelpreviews[['text','stars']]
yelpreviews.head(2)

In [None]:
# Features (text + TF-IDF columns) and labels (star ratings) for the Yelp data
%time yelpdata_x = processData(yelpreviews['text'])
yelpdata_y = yelpreviews['stars']

In [None]:
from imblearn.over_sampling import SMOTE

def balanceData(data_x,data_y,label='',text_column='text'):
    """Oversample the minority classes with SMOTE (Yelp variant: text column is 'text').

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature frame that still contains the raw text column.
    data_y : pandas.Series
        Class labels aligned with ``data_x``.
    label : str, optional
        Unused; kept so existing calls keep working.
    text_column : str, default 'text'
        Name of the non-numeric column to drop before resampling.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Resampled features (without the text column) and labels.
    """
    sm = SMOTE(random_state=12)
    features = data_x.drop(text_column,axis=1)
    # fit_resample is the current imbalanced-learn API; the older fit_sample alias has been removed
    data_x,data_y = sm.fit_resample(features, np.asarray(data_y))
    data_x = pd.DataFrame(data_x)
    data_y = pd.Series(data_y)
    return data_x,data_y

In [None]:
# SMOTE-balanced version of the Yelp features and labels
%time yelpdata_smote_x,yelpdata_smote_y = balanceData(yelpdata_x,yelpdata_y,'Yelp 5 class')

In [None]:
# MLP with the Yelp 5 class SMOTE dataset (SGD solver with adaptive learning rate, 35 hidden units)
X_train, X_test, y_train, y_test = splitData(yelpdata_smote_x,yelpdata_smote_y)
clf = MLPClassifier(solver='sgd',learning_rate='adaptive', alpha=1e-5, hidden_layer_sizes=(35,), random_state=1)
%time clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out test split
print ("Model performance metrics for yelp 5 class dataset balanced with SMOTE")
print(metrics.classification_report(y_test, y_pred))

In [None]:
from sklearn import metrics

# NOTE(review): this cell prints the same report as the cell before it — likely an accidental duplicate
print ("Model performance metrics for yelp 5 class dataset balanced with SMOTE")
print(metrics.classification_report(y_test, y_pred))

In [None]:
#Binarizing multiple classes
# Both vectors are binarized against the SAME class list so that column i means the same
# class in y_test_bin and y_pred_bin; deriving the list from y_pred separately would shift
# the columns whenever the model never predicts one of the classes.
classes = list(np.unique(y_test))
y_test_bin = label_binarize(y_test, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)
n_classes = y_test_bin.shape[1]


#Plotting results: one one-vs-rest ROC curve per class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'red', 'green','cyan','black'])
for i, color in zip(range(n_classes), colors):
    # The legend shows the real class value rather than the column position
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(classes[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()

In [None]:
def plot_cm(cm,cm_normalized,cls):
    """Draw a raw and a row-normalized one-vs-rest confusion matrix side by side.

    Parameters
    ----------
    cm : 2x2 array
        Confusion matrix of one binarized class.
    cm_normalized : 2x2 array
        The same matrix with every row scaled to sum to 1.
    cls :
        Identifier of the class, used in the figure title and tick labels.
    """
    # A matrix built from 0/1 indicator columns has "not this class" at position 0 and
    # "this class" at position 1, so the tick labels are listed in that order.
    tick_labels = ['Rest','Class '+str(cls)]
    fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,3))
    ax1.set(title = "Confusion Matrix")
    ax2.set(title = "Normalized Confusion Matrix")
    cm_plt1 = sns.heatmap(cm, annot=True, ax = ax1, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = tick_labels, yticklabels = tick_labels);
    cm_plt2 = sns.heatmap(cm_normalized, annot=True, ax = ax2, cmap='Blues',fmt = 'g',linewidths = 0.4,linecolor='white',xticklabels = tick_labels, yticklabels = tick_labels);
    cm_plt1.set_xlabel('Predicted')
    cm_plt1.set_ylabel('True')
    cm_plt2.set_xlabel('Predicted')
    cm_plt2.set_ylabel('True')
    plt.suptitle('Confusion matrices for MLP Classifier for class '+str(cls),size=14).set_position([.5, 1.05])
    plt.show()

# One one-vs-rest confusion matrix per binarized class column
for i in range(n_classes):
    cm = confusion_matrix(y_test_bin[:, i], y_pred_bin[:, i])
    # Row-normalize so every true-label row sums to 1
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    np.set_printoptions(precision=2)
    # The column position (not the class value) is passed as the class identifier
    plot_cm(cm,cm_normalized,i)

In [None]:
# Index labels of the test rows whose prediction differs from the true label
y_test[y_test!=y_pred].index

In [None]:
#Misclassified records (first 5 that have source text)
# y_test is indexed by row position in the SMOTE-resampled data, while the review text only
# exists in yelpdata_x (the data before resampling). Positions at or beyond len(yelpdata_x)
# are synthetic samples with no source text, so they are skipped instead of raising an
# out-of-range error.
# NOTE(review): this assumes SMOTE keeps the original rows first and appends the synthetic
# ones after them — confirm for the installed imbalanced-learn version.
mis_mask = (y_test != y_pred).to_numpy()
n_original = len(yelpdata_x)
c = 0
for i, predicted in zip(y_test.index[mis_mask], y_pred[mis_mask]):
    if i >= n_original:
        continue
    print("Mis-classified record:")
    print(yelpdata_x.iloc[[i]].text)
    print("Actual Label : ",y_test.loc[i])
    print("Predicted Label : ",predicted)
    c+=1
    print("-"*80)
    if c==5: break