# The Journey

This section describes the decisions taken to get to our final solution
 

## The Baseline

This section is the code we were given

In [16]:
# requires sciket-learn 0.18
# if required, conda update scikit-learn

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.grid_search import GridSearchCV   #Performing grid search


def readFiles():
    #Reading files
    X = pd.read_csv("trainingData.txt",sep='\t',header=None)
    Y = pd.read_csv("trainingTruth.txt",sep='\t',header=None)
    Y = np.array(Y).ravel()
    
    return (X,Y)


def preprocessData1( X, Y ):
    print('Preprocessing data (1).')
    
    # Replace NAs with 0
    X = X.fillna(0) 
    
    return (X,Y)


def runModel1(X,Y):
    print('Run model')
    
    clf = OneVsRestClassifier(RandomForestClassifier(n_estimators = 10,random_state=25))
    clf.fit(X,Y)
    Y_predict = clf.predict_proba(X)
    
    return Y_predict, clf


def calculateROC(Y, Y_predict):
    print('Calc ROC')
    # Binarize the output
    y_bin = label_binarize(Y, classes=[1, 2, 3,4])

    #Calculate AUC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(4):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], Y_predict[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    return roc_auc

def createSubmission(clf, preprocessData=preprocessData1):
    #Create submission
    Xtest = pd.read_csv("testData.txt",sep="\t",header=None)
    
    (Xtest) = preprocessData(Xtest)
    y_final_prob = clf.predict_proba(Xtest)
    y_final_label = clf.predict(Xtest)

    sample = pd.DataFrame(np.hstack([y_final_prob.round(5),y_final_label.reshape(y_final_prob.shape[0],1)]))
    sample.columns = ["prob1","prob2","prob3","prob4","label"]
    sample.label = sample.label.astype(int)
    
    #Submit this file to dropbox
    sample.to_csv("Johnston_Memic.csv",sep="\t" ,index=False,header=None)
    

# Read the files in.   
(X,Y) = readFiles()


# Clean up the data
(X,Y) = preprocessData1(X,Y)

# Why does this not work?
X.describe()

# Run the model
Y_predict, clf = runModel1(X,Y)

baselineAUC = calculateROC(Y, Y_predict)

print('Baseline AUC is: ', baselineAUC)

#createSubmission(clf)




Preprocessing data (1).
Run model
Calc ROC
Baseline AUC is:  {0: 0.99996212682662722, 1: 0.99993172956870668, 2: 0.99996736488783788, 3: 0.99986086891635195}


## Pre-processing the data



### Analysing NAs

First we analyse the prevalance on NAs in our features. For each feature, we calculate the frequency of NAs. This will help determine whether we think the feature is worth keeping as too many NAs indicates that the feature is not giving us enough information.


In [2]:
from scipy.stats import itemfreq

# Read the files in.   
(X,Y) = readFiles()

sums = {}

total = X.shape[0]

for col in X.columns:
    # Count the NAs
    sums[col] = total - X[col].count()
    
# Do something more clever here? Plot a distribution?
#print(sums)
print('Maximum number of NAs in one column: ', max(sums.values()))
print('Minimum number of NAs in one column: ', min(sums.values()))

# Check the rows to see if there are any rows with excessive NAs
rowSums = X.isnull().sum(axis=1).tolist()
print('Maximum number of NAs in one row: ', max(rowSums))
print('Minimum number of NAs in one row: ', min(rowSums))

X.describe()

# Histogram of each variable?

Maximum number of NAs in one column:  81
Minimum number of NAs in one column:  40
Maximum number of NAs in one row:  7
Minimum number of NAs in one row:  0




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
count,17321.0,17325.0,17312.0,17322.0,17324.0,17324.0,17317.0,17333.0,17326.0,17297.0,...,17321.0,17311.0,17331.0,17316.0,17314.0,17312.0,17310.0,17314.0,17319.0,17334.0
mean,0.264572,0.456596,0.435167,0.263299,0.496509,0.356921,0.36299,0.275144,0.423023,0.454524,...,0.448383,0.481281,0.277807,0.333366,0.474403,0.27357,0.447165,0.487169,0.267626,0.389999
std,0.455943,0.513085,0.509448,0.452074,0.513279,0.495707,0.498107,0.463826,0.50352,0.513399,...,0.512189,0.520883,0.460411,0.485206,0.518349,0.460108,0.51364,0.512102,0.456374,0.503755
min,-1.5608,-1.5064,-1.399,-1.4882,-1.4421,-1.3064,-1.8844,-1.7212,-1.4441,-1.5784,...,-1.4486,-1.3986,-1.5701,-1.8473,-1.6267,-1.8658,-1.4036,-1.6289,-1.6968,-1.4666
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,2.0546,2.5191,2.4137,2.1299,2.3358,2.2518,2.3356,2.086,2.4521,2.4869,...,2.4336,2.3727,2.0284,2.5224,2.3033,2.0208,2.5782,2.4072,2.07,2.3871


In [3]:
# What's the distribution of outcomes?
print('\nDistribution of truth values.')
itemFrequencies = itemfreq(Y)
for freq, count in itemFrequencies:
    print('Frequencies:  ', freq, ': ', count/total, '%')


Distribution of truth values.
Frequencies:   1 :  0.326101968006 %
Frequencies:   2 :  0.224018874439 %
Frequencies:   3 :  0.300207158476 %
Frequencies:   4 :  0.149671999079 %


### NAs Replaced with Mean
We first looked at the treatment of nulls. The first step is, instead of replacing NAs with 0, replace them with the mean of the feature.

In [13]:


def preprocessData3( X ):
    print('Preprocessing data (3).')

    # Q: Normalise data for SVMs - what about decision trees?
    

    #Rewrite this from the HW ipython book
    # Replace any NaN in X with the mean of the column
    # Replacing with the mean gives a better score
    xMean = []
    for col in X.columns:
        xMean = X[col].mean()
        #print(col, ' ', xMean)
        X.loc[X[col].isnull(), col] = xMean
    
    return (X)


# Read the files in.   
(X,Y) = readFiles()


# Clean up the data
(X,Y) = preprocessData3(X,Y)

# Run the model
Y_predict, clf = runModel1(X,Y)

meanNAAUC = calculateROC(Y, Y_predict)

print('Baseline AUC is: ', baselineAUC)
print('Mean NA AUC is: ', meanNAAUC)




TypeError: preprocessData3() takes 1 positional argument but 2 were given

## First Submission

Our first submission on Wednesday was a basic RFC. The intention was to understand the process of sumbitting a file for the homework and to see just how bad a classifier this was!

In [5]:
# This takes 86 min to run. Set to True if you want to run it, otherwise
# the results will be taken from a previous run. The purpose of this was to run
# a Grid Search to find best parameters for a random forest classifier.
runLongRunningTest = False

def runModelWedSubmission(X,Y):
    print('Run model')
    
    model_to_set = OneVsRestClassifier(RandomForestClassifier())

    param_test1 = {'estimator__n_estimators':[10,20,30,40,50], 'estimator__max_depth':[3,6,8,12,24,32], 
               'estimator__min_samples_split':[2,4,6],'estimator__min_samples_leaf':[1,2,4]}
    
        
    gsearch1 = GridSearchCV(estimator = model_to_set, 
                        param_grid = param_test1,n_jobs=8,iid=False, cv=5,verbose=2)
    gsearch1.fit(X,Y)

    Y_predict = gsearch1.predict_proba(X)
    
    print('Best Params: ', gsearch1.best_params_)
    print( 'Best Score: ', gsearch1.best_score_)

    #clf = OneVsRestClassifier(RandomForestClassifier(n_estimators = 10,random_state=25))
    #clf.fit(X,Y)
    #Y_predict = clf.predict_proba(X)
    
    return Y_predict, clf


# Read the files in.   
(X,Y) = readFiles()


# Clean up the data
(X,Y) = preprocessData3(X,Y)



if runLongRunningTest:
    # Run the model
    Y_predict, clf = runModelWedSubmission(X,Y)

    wedSubmissionAAUC = calculateROC(Y, Y_predict)
    createSubmission(clf)
else:
    # Results from previous run:
    #  [Parallel(n_jobs=8)]: Done 1350 out of 1350 | elapsed: 89.6min finished
    # Best Params:  {'estimator__max_depth': 32, 'estimator__min_samples_split': 4, 'estimator__n_estimators': 50, 'estimator__min_samples_leaf': 4}
    # Best Score:  0.6728624493424956
    wedSubmissionAAUC =  {0: 0.99999891511100747, 1: 1.0, 2: 0.99999372674492126, 3: 1.0}


print('Baseline AUC is: ', baselineAUC)
print('Wednesday AUC is: ', wedSubmissionAAUC)



Preprocessing data (3).
Baseline AUC is:  {0: 0.99996212682662722, 1: 0.99993172956870668, 2: 0.99996736488783788, 3: 0.99986086891635195}
Wednesday AUC is:  {0: 0.9999989151110075, 1: 1.0, 2: 0.9999937267449213, 3: 1.0}


## Thursday Submission

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

def runModelThurSubmission(X,Y):
    print('Run model')
    
    # This is taken from the section notebook. The only modification is for the RFC which we have already
    # run a hyperparameter search on.
    
    #Build Model1 - Level 0
    Model1 = OneVsRestClassifier(RandomForestClassifier(
            n_estimators=50, 
            max_depth=32, 
            min_samples_split=4, 
            min_samples_leaf=4))
    Model1.fit(X,Y)

    #Predict on X_train, X_test
    Model1_pred_test = Model1.predict_proba(X)
    #Model1_pred_train = Model1.predict_proba(X_train)

    #Build Model3 - Level 0
    Model3 = OneVsRestClassifier(QuadraticDiscriminantAnalysis())
    Model3.fit(X,Y)
    Model3_pred_test = Model3.predict_proba(X)
    #Model3_pred_train = Model3.predict_proba(X_train)

    #Build Model5 - Level 0
    Model5 = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=15, weights='distance'))
    Model5.fit(X,Y)
    Model5_pred_test = Model5.predict_proba(X)
    #Model5_pred_train = Model5.predict_proba(X_train)
    
    #Model 4 - Level 1 
    #Creating training attributes for Model4 (based on Model1, Model3, Model5 )
    FeaturesTest1 = np.hstack([Model1_pred_test,Model3_pred_test,Model5_pred_test])  
    Model4 = OneVsRestClassifier(LogisticRegression(random_state=49))
    Model4.fit(FeaturesTest1,Y)

    #Creating test attributes for Model3 (based on Model1 and Model2)
    #FeaturesTest1 = np.hstack([Model1_pred_test,Model3_pred_test,Model5_pred_test])

    #Final predictions
    final_pred = Model4.predict_proba(FeaturesTest1)

    #AUC
    #fpr, tpr, thresholds = roc_curve(Y, final_pred[:, 1])
    #roc_auc = auc(fpr, tpr)
    #print("AUC with Stacking: " , roc_auc)
    #Minor improvement with stacking over stand alone models
    
    return final_pred, Model4

    
# Read the files in.   
(X,Y) = readFiles()


# Clean up the data
(X,Y) = preprocessData3(X,Y)

# Run the model
Y_predict, clf = runModelThurSubmission(X,Y)

thurSubmissionAAUC = calculateROC(Y, Y_predict)

createSubmission(clf,preprocessData3)

Preprocessing data (3).
Run model
Calc ROC
Preprocessing data (3).


ValueError: X has 334 features per sample; expecting 12

In [17]:
createSubmission(clf,preprocessData3)

Preprocessing data (3).



# The Final Solution

TBD

NB: When you run the sample (before upgrading) these are the values it produces: {0: 0.95755745500532141, 1: 0.94345356758244103, 2: 0.95754489510952012, 3: 0.935902875654121}