In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
#Load the prepared dataset from the Excel workbook into a DataFrame
inputData = pd.read_excel('BOYCE_Data_Preperation.xlsx')

In [None]:
##Functions to check the class balance of a dataset
#Each prints the number and percentage of rows for each label

#Total_Reads
def check_balance_reads(dataframe):
    """Print the balance between zero and greater-than-zero ``Total_Reads`` rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``Total_Reads`` column.

    Returns
    -------
    None
        The two counts, their share of all rows (as percentages) and a
        trailing empty line are written to stdout.

    Notes
    -----
    An empty frame reports ``0.00%`` for both shares instead of raising
    ``ZeroDivisionError``.
    """
    reads = dataframe['Total_Reads']
    total_rows = len(dataframe)

    #Count each group once instead of re-filtering the frame for every print
    zero_count = int((reads == 0).sum())
    positive_count = int((reads > 0).sum())

    #Guard the denominator: a filtered subset may contain no rows at all
    zero_pct = (zero_count / total_rows) * 100 if total_rows else 0.0
    positive_pct = (positive_count / total_rows) * 100 if total_rows else 0.0

    print('Number of 0 reads: ', zero_count)
    print('Number of greater-than 0 reads: ', positive_count)
    print('The ratio of the dataset is (zero / greater-than zero): %.2f%% / %.2f%%' %
          (zero_pct, positive_pct))
    print("")

#Helpfulness_Label
def check_balance_help(dataframe):
    """Print the balance between the two ``Helpfulness_Label`` classes.

    Label ``0`` is reported as "Helpful" and label ``1`` as "Unhelpful".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``Helpfulness_Label`` column.

    Returns
    -------
    None
        The two counts, their share of all rows (as percentages) and a
        trailing empty line are written to stdout.

    Notes
    -----
    An empty frame reports ``0.00%`` for both shares instead of raising
    ``ZeroDivisionError``.
    """
    labels = dataframe['Helpfulness_Label']
    total_rows = len(dataframe)

    #Count each class once instead of re-filtering the frame for every print
    helpful_count = int((labels == 0).sum())
    unhelpful_count = int((labels == 1).sum())

    #Guard the denominator: a filtered subset may contain no rows at all
    helpful_pct = (helpful_count / total_rows) * 100 if total_rows else 0.0
    unhelpful_pct = (unhelpful_count / total_rows) * 100 if total_rows else 0.0

    print('Number of Helpful: ', helpful_count)
    print('Number of Unhelpful: ', unhelpful_count)
    print('The ratio of the dataset is (helpful / unhelpful): %.2f%% / %.2f%%' %
          (helpful_pct, unhelpful_pct))
    print("")

#Function to rebalance and split the initial dataset (Class/Pred)
def set_balance(inputData, title, desired_helpful_ratio=0.2, random_state=1):
    """Split a dataset into rebalanced classification and prediction frames.

    The classification frame drops ``Total_Reads`` and down-samples the
    unhelpful rows (``Helpfulness_Label == 1``) so that helpful rows
    (``Helpfulness_Label == 0``) make up roughly ``desired_helpful_ratio`` of
    the result. The prediction frame drops ``Helpfulness_Label`` and, when
    zero-read rows are at least as numerous as greater-than-zero rows,
    over-samples the latter (with replacement) to roughly the same count.
    Balance reports are printed before and after each step.

    Parameters
    ----------
    inputData : pandas.DataFrame
        Frame containing ``Total_Reads`` and ``Helpfulness_Label`` columns.
    title : str
        Text appended to the printed section headers.
    desired_helpful_ratio : float, default 0.2
        Target share of helpful rows in the classification frame; must be in
        ``(0, 1]``.
    random_state : int or None, default 1
        Seed used for the sampling steps and for both shuffles, so that
        repeated runs return identical frames. Pass ``None`` for unseeded
        shuffles.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(classData, predData)``, each shuffled and re-indexed from 0.

    Raises
    ------
    ValueError
        If ``desired_helpful_ratio`` is not in ``(0, 1]``.
    """
    if not 0 < desired_helpful_ratio <= 1:
        raise ValueError("desired_helpful_ratio must be in (0, 1], got %r" % (desired_helpful_ratio,))

    ##Prepare data for Classification & Prediction models
    #Remove other Target attribute for each model
    classData = inputData.drop(["Total_Reads"], axis = 1)
    predData = inputData.drop(["Helpfulness_Label"], axis = 1)

    ##Classification data
    #Check initial balance
    print('\033[34m' + '\033[1m' + '\033[4m' + "Helpfulness_Label Balance Check for: "+ title + '\033[0m')
    print('\033[1m' + "Intitial: "+ '\033[0m')
    check_balance_help(classData)

    ##Rebalance classification data
    #Split dataframes
    helpfulData = classData[classData['Helpfulness_Label'] == 0]
    unhelpfulData = classData[classData['Helpfulness_Label'] == 1]
    helpful_count = len(helpfulData)
    unhelpful_count = len(unhelpfulData)

    #Down-sample only when both classes exist; with a missing class there is
    #nothing to balance against (and the fraction would divide by zero).
    if helpful_count and unhelpful_count:
        #Number of unhelpful rows that makes helpful rows the desired share
        wanted_unhelpful = helpful_count / desired_helpful_ratio - helpful_count
        #Cap at 1.0: when helpful rows already exceed the target share we keep
        #every unhelpful row, because sample() rejects frac > 1 without replacement.
        keep_frac = min(wanted_unhelpful / unhelpful_count, 1.0)
        unhelpfulData = unhelpfulData.sample(frac=keep_frac, random_state=random_state)

    #Rejoin
    data_balanced = pd.concat([helpfulData, unhelpfulData])

    #Shuffle dataframe and reset indexs to match number of rows
    classData = data_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    #Check new balance
    print('\033[1m' + "Rebalanced: "+ '\033[0m')
    check_balance_help(classData)

    ##Prediction data
    #Check initial balance
    print('\033[34m' + '\033[1m' + '\033[4m' + "Total_Reads Balance Check for: "+ title + '\033[0m')
    print('\033[1m' + "Intitial: "+ '\033[0m')
    check_balance_reads(predData)

    ##Rebalancing prediction data
    #Split dataframes
    zeroReads = predData[predData['Total_Reads'] == 0]
    largerReads = predData[predData['Total_Reads'] > 0]
    zero_count = len(zeroReads)
    larger_count = len(largerReads)

    #Over-sample the greater-than-zero rows when zero-read rows are at least as
    #numerous; skipped when there are no such rows to sample from.
    if larger_count and zero_count / larger_count >= 1:
        largerReads = largerReads.sample(frac=zero_count / larger_count,
                                         random_state=random_state, replace=True)
    predData = pd.concat([largerReads, zeroReads])

    #Shuffle dataframe and reset indexs to match number of rows
    predData = predData.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print('\033[1m' + "Rebalanced: "+ '\033[0m')
    check_balance_reads(predData)

    return classData, predData

In [None]:
def get_train_test(classData, predData, test_size=0.2, random_state=None):
    """Build scaled train/test splits for the classification and prediction data.

    Each frame is split first; the ``MinMaxScaler`` is then fitted on the
    training features only and applied to both partitions, so no information
    about the test rows (their min/max) leaks into training.

    Parameters
    ----------
    classData : pandas.DataFrame
        Classification frame containing the ``Helpfulness_Label`` target.
    predData : pandas.DataFrame
        Prediction frame containing the ``Total_Reads`` target.
    test_size : float, default 0.2
        Share of rows held out for testing, forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``; ``None`` keeps the split
        unseeded.

    Returns
    -------
    tuple
        ``(class_X_train, class_X_test, class_y_train, class_y_test,
        pred_X_train, pred_X_test, pred_y_train, pred_y_test)``. The ``X``
        entries are the scaler's transformed output; the ``y`` entries are
        the target columns as split by ``train_test_split``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    def _split_then_scale(frame, target):
        #Separate the target column, split, then scale using train-only statistics
        y = frame[target]
        X = frame.drop([target], axis = 1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = test_size, random_state = random_state)
        scaler = MinMaxScaler().fit(X_train)
        return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

    #Classification split is drawn before the prediction split
    class_parts = _split_then_scale(classData, 'Helpfulness_Label')
    pred_parts = _split_then_scale(predData, 'Total_Reads')

    return class_parts + pred_parts

In [None]:
#Function to print results of classification models
def show_results(title, data, prediction):
    """Print a confusion matrix, summary rates and a classification report.

    Parameters
    ----------
    title : str
        Heading printed above the results.
    data : array-like
        True labels.
    prediction : array-like
        Predicted labels, same length as ``data``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    Accuracy, sensitivity and specificity are printed as percentages.
    Sensitivity is the correct share of matrix row 0 and specificity the
    correct share of row 1, so the first label is treated as the positive
    class. Both are only reported when the confusion matrix is 2x2.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    #Print title of model
    print('\033[34m' + '\033[1m' + '\033[4m' + title + '\033[0m')

    #Get confusion matrix (computed once and reused)
    matrix = confusion_matrix(data, prediction)

    #Print confusion matrix
    print('Confusion matrix')
    print(matrix)

    #Accuracy = correctly classified (diagonal) over all samples; valid for any matrix size
    accuracy = matrix.trace() / matrix.sum()
    #Scale to a percentage so the value agrees with the '%' suffix
    print ('Accuracy : ', round(accuracy * 100, 2), '%')

    if matrix.shape == (2, 2):
        sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
        print('Sensitivity : ', round(sensitivity * 100, 2), '%')

        specificity = matrix[1,1]/(matrix[1,0]+matrix[1,1])
        print('Specificity : ', round(specificity * 100, 2), '%')
    else:
        #Only one class (or more than two) present: the binary rates are undefined
        print('Sensitivity / Specificity : n/a (confusion matrix is %dx%d, not 2x2)' % matrix.shape)
    print('')
    print('Classification Report:')
    print(classification_report(data, prediction, zero_division=0))

    
#Function to setup data for ROC curves
def setup_roc(data, model, probability):
    """Compute ROC curve points and the area under the curve for a fitted model.

    Parameters
    ----------
    data : array-like
        True labels passed to ``roc_curve``.
    model : estimator
        Fitted estimator exposing ``predict_proba``.
    probability : array-like
        Input handed to ``model.predict_proba``; the second probability
        column is used as the score.

    Returns
    -------
    tuple
        ``(fpr, tpr, roc_auc)``.
    """
    from sklearn.metrics import auc, roc_curve

    #Score each sample with the probability in column 1
    scores = model.predict_proba(probability)[:, 1]

    #The thresholds returned by roc_curve are not needed by callers
    false_pos_rate, true_pos_rate, _ = roc_curve(data, scores)
    return false_pos_rate, true_pos_rate, auc(false_pos_rate, true_pos_rate)


def get_cluster_classification(title, class_X_train, class_y_train, class_X_test, class_y_test):
    """Train, evaluate and plot three classifiers for one data subset.

    An MLP, a decision tree and a random forest are fitted on the training
    data. For each model the function prints the test-set results (via
    ``show_results``) and a 5-fold cross-validation accuracy on the training
    data, then draws the test-set ROC curves in a single figure.

    Parameters
    ----------
    title : str
        Name of the subset, used in printed headings and the plot title.
    class_X_train, class_y_train
        Training features and labels.
    class_X_test, class_y_test
        Test features and labels.

    Returns
    -------
    None
        Results are printed and the ROC figure is shown.

    Notes
    -----
    The ROC curve of each model is computed once: the models are fitted a
    single time, so repeated evaluation on the same test data yields
    identical curves and averaging them adds nothing.
    """
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.neural_network import MLPClassifier
    from sklearn.tree import DecisionTreeClassifier

    ##Setup Models
    #(short name, name used in the test report heading, estimator)
    #The MLP is seeded like the other two models so reruns are reproducible.
    models = [
        ('NN', 'MLP', MLPClassifier(hidden_layer_sizes=(25, 25, 25), max_iter=10000,
                                    shuffle=True, random_state=42)),
        ('DT', 'DT', DecisionTreeClassifier(random_state=42)),
        ('RF', 'RF', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]

    #Fit models
    for _, _, model in models:
        model.fit(class_X_train, class_y_train)

    #Get predictions
    predictions = [model.predict(class_X_test) for _, _, model in models]

    ##Evaluate data
    #Display results
    for (_, report_name, _), predicted in zip(models, predictions):
        show_results("%s Test of Cluster: %s" % (report_name, title), class_y_test, predicted)

    ##Evaluate Classification Models
    #Evaluate with cross-validation (all scores are computed before anything is printed)
    cv_scores = [cross_val_score(model, class_X_train, class_y_train, cv=5)
                 for _, _, model in models]

    print('\033[34m' + '\033[1m' + '\033[4m' + "Cross-Validation Comparisons for: " + title + '\033[0m')
    for (short_name, _, _), scores in zip(models, cv_scores):
        print("%s Accuracy: %0.2f (+/- %0.2f)" % (short_name, scores.mean(), scores.std() * 2))

    #Evaluate with ROC curves
    plt.figure()
    plt.title('Helpfulness Label Classfication ROC Curves of: %s' % (title))
    for short_name, _, model in models:
        fpr, tpr, roc_auc = setup_roc(class_y_test, model, class_X_test)
        plt.plot(fpr, tpr, label = '%s Test AUC = %0.2f' % (short_name, roc_auc))
    plt.plot([0, 1], [0, 1], 'r--', label='Base AUC = 0.5')
    plt.legend(loc = 'lower right')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('Sensitivity')
    #The x-axis of a ROC curve is the false positive rate, i.e. 1 - specificity
    plt.xlabel('1 - Specificity')
    plt.show()

In [None]:
def get_cluster_prediction(title, pred_X_train, pred_y_train, pred_X_test, pred_y_test):
    """Fit two ``Total_Reads`` regressors, report their RMSE and plot residuals.

    An MLP regressor and a random forest regressor are fitted on the training
    data. Their test predictions are clipped at 0 and rounded to whole
    numbers before the RMSE is computed and printed. A yellowbrick residuals
    plot is then shown for each model.

    Parameters
    ----------
    title : str
        Name of the subset, used in the printed heading.
    pred_X_train, pred_y_train
        Training features and targets.
    pred_X_test, pred_y_test
        Test features and targets.

    Returns
    -------
    None
        RMSE values are printed and the residual plots are shown.

    Notes
    -----
    Each model is evaluated once. Both models are fitted a single time, so
    repeating ``predict`` on the same test data returns identical values and
    a mean over such repetitions equals the single result. The printed
    heading and labels describe exactly what is computed.
    """
    from math import sqrt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.neural_network import MLPRegressor
    from yellowbrick.regressor import ResidualsPlot

    ##Prediction Model (Total_Reads)
    #Set up prediction models; the MLP is seeded like the forest for reproducible reruns
    pred = MLPRegressor(hidden_layer_sizes=(25, 25, 25), max_iter=10000, random_state=1)
    RF_pred = RandomForestRegressor(n_estimators = 100, random_state = 1)

    #Fit models
    pred.fit(pred_X_train, pred_y_train)
    RF_pred.fit(pred_X_train, pred_y_train)

    def _rounded_rmse(model):
        #Negative predictions become 0, then values are rounded to whole numbers
        #(np.rint rounds halves to even, like the built-in round on floats)
        counts = np.rint(np.clip(model.predict(pred_X_test), 0, None))
        return sqrt(mean_squared_error(pred_y_test, counts))

    #Get RMSE
    rmse_NN = _rounded_rmse(pred)
    rmse_RF = _rounded_rmse(RF_pred)

    #Print RMSE values
    print('\033[34m' + '\033[1m' + '\033[4m' + "RMSE of: %s" % (title) + '\033[0m')
    print('NN RMSE:', rmse_NN)
    print('RF RMSE:', rmse_RF)

    ##Initialise residual plots
    #NN Residual
    visualizer = ResidualsPlot(pred, hist=False)

    #Fit data
    visualizer.fit(pred_X_train, pred_y_train)  # Fit the training data to the visualizer
    visualizer.score(pred_X_test, pred_y_test)  # Evaluate the model on the test data
    visualizer.show()                 # Finalize and render the figure

    #Repeat for RF
    visualizer_RF = ResidualsPlot(RF_pred, hist=False)

    visualizer_RF.fit(pred_X_train, pred_y_train)  # Fit the training data to the visualizer
    visualizer_RF.score(pred_X_test, pred_y_test)  # Evaluate the model on the test data
    visualizer_RF.show()                 # Finalize and render the figure

In [None]:
#Build the balanced classification / prediction frames: whole dataset first, then each cluster
classData_cNorm, predData_cNorm = set_balance(inputData, "No Clusters")
classData_c0, predData_c0 = set_balance(
    inputData.loc[inputData['cluster_number'] == 0], "Cluster 0")
classData_c1, predData_c1 = set_balance(
    inputData.loc[inputData['cluster_number'] == 1], "Cluster 1")

#The cluster label is dropped once the data is split (it leads to inaccuracies)
classData_cNorm = classData_cNorm.drop(columns=["cluster_number"])
predData_cNorm = predData_cNorm.drop(columns=["cluster_number"])
classData_c0 = classData_c0.drop(columns=["cluster_number"])
predData_c0 = predData_c0.drop(columns=["cluster_number"])
classData_c1 = classData_c1.drop(columns=["cluster_number"])
predData_c1 = predData_c1.drop(columns=["cluster_number"])

In [None]:
#Build the train / test arrays for the whole dataset and for each cluster
#No Cluster
(class_X_train_cNorm, class_X_test_cNorm, class_y_train_cNorm, class_y_test_cNorm,
 pred_X_train_cNorm, pred_X_test_cNorm, pred_y_train_cNorm, pred_y_test_cNorm) = get_train_test(
    classData_cNorm, predData_cNorm)

#Cluster 0
(class_X_train_c0, class_X_test_c0, class_y_train_c0, class_y_test_c0,
 pred_X_train_c0, pred_X_test_c0, pred_y_train_c0, pred_y_test_c0) = get_train_test(
    classData_c0, predData_c0)

#Cluster 1
(class_X_train_c1, class_X_test_c1, class_y_train_c1, class_y_test_c1,
 pred_X_train_c1, pred_X_test_c1, pred_y_train_c1, pred_y_test_c1) = get_train_test(
    classData_c1, predData_c1)


In [None]:
#Classification on the whole (unclustered) dataset
get_cluster_classification(
    title="No Clusters",
    class_X_train=class_X_train_cNorm,
    class_y_train=class_y_train_cNorm,
    class_X_test=class_X_test_cNorm,
    class_y_test=class_y_test_cNorm,
)

In [None]:
#Classification on cluster 0 only
get_cluster_classification(
    title="Cluster 0",
    class_X_train=class_X_train_c0,
    class_y_train=class_y_train_c0,
    class_X_test=class_X_test_c0,
    class_y_test=class_y_test_c0,
)

In [None]:
#Classification on cluster 1 only
get_cluster_classification(
    title="Cluster 1",
    class_X_train=class_X_train_c1,
    class_y_train=class_y_train_c1,
    class_X_test=class_X_test_c1,
    class_y_test=class_y_test_c1,
)

In [None]:
#Total_Reads prediction on the whole (unclustered) dataset
get_cluster_prediction(
    title="No Clusters",
    pred_X_train=pred_X_train_cNorm,
    pred_y_train=pred_y_train_cNorm,
    pred_X_test=pred_X_test_cNorm,
    pred_y_test=pred_y_test_cNorm,
)

In [None]:
#Total_Reads prediction on cluster 0 only
get_cluster_prediction(
    title="Cluster 0",
    pred_X_train=pred_X_train_c0,
    pred_y_train=pred_y_train_c0,
    pred_X_test=pred_X_test_c0,
    pred_y_test=pred_y_test_c0,
)

In [None]:
#Total_Reads prediction on cluster 1 only
get_cluster_prediction(
    title="Cluster 1",
    pred_X_train=pred_X_train_c1,
    pred_y_train=pred_y_train_c1,
    pred_X_test=pred_X_test_c1,
    pred_y_test=pred_y_test_c1,
)