# Model Selection

In [1]:
%matplotlib inline 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC # "Support Vector Classifier"
import sklearn.linear_model
import sklearn.svm

# special matplotlib command for global plot configuration
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
from matplotlib.colors import ListedColormap

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),            
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

cmap_set1 = ListedColormap(['#e41a1c', '#377eb8', '#4daf4a'])
dark2_cmap=ListedColormap(dark2_colors)

def set_mpl_params():
    rcParams['figure.figsize'] = (10, 6)
    rcParams['figure.dpi'] = 150
    rcParams['axes.prop_cycle'].by_key()['color'][1]
    rcParams['lines.linewidth'] = 2
    rcParams['axes.facecolor'] = 'white'
    rcParams['font.size'] = 14
    rcParams['patch.edgecolor'] = 'white'
    rcParams['patch.facecolor'] = dark2_colors[0]
    rcParams['font.family'] = 'StixGeneral'

set_mpl_params()



#First lets reading the dataset in
X = pd.read_csv("trainingData.txt",sep='\t',header=None)
Y = pd.read_csv("trainingTruth.txt",sep='\t',header=None)
Y = np.array(Y).ravel()
#X = X.fillna(0) ## imputing nan's as 0
X.describe()

# Read in test submission file
X_testsub = pd.read_csv("testData.txt",sep="\t",header=None)

# Read in blind submission
X_blindsub = pd.read_csv("blindData.txt",sep="\t",header=None)
X_blindsub.drop(X_blindsub.columns[len(X_blindsub.columns)-1], axis=1, inplace=True)







In [2]:
# Running the estimators takes time 
# set to true to run the code
# set to false and the output from the previous run will printed
runEstimators = False

In [3]:
type(X)

pandas.core.frame.DataFrame

In [4]:
type(Y)

numpy.ndarray

In [5]:
def preprocessFeatures3( X ):
    print('Preprocessing data (3).')

    # Q: Should we take a log of the data?
    
    # Replace any NaN in X with the mean of the column
    # Replacing with the mean gives a better score
    xMean = []
    for col in X.columns:
        xMean = X[col].mean()
        #print(col, ' ', xMean)
        X.loc[X[col].isnull(), col] = xMean
    

    
    # Lets normalize the data to accomodate those classification methods 
    #  that can benefit from it (e.g. SVM)
    #X = (X - X.mean(axis=0)) /  X.std(axis=0)
    
    return (X)


def removeNans(X):
    #print(X.isnull())
    inds = X.isnull().any()
    print('Nans ', inds)
    
    X2 = X[inds]
    print(X.shape)
    print(X2.shape)
    #X = X.dropna()
    
    return X


X = preprocessFeatures3( X )
X_testsub = preprocessFeatures3( X_testsub )
X_blindsub = preprocessFeatures3( X_blindsub )

# Normalize all data to mean 0 and SD of 1
std_scale = StandardScaler().fit(X_testsub)
X_testsub = std_scale.transform(X_testsub)

std_scale = StandardScaler().fit(X_blindsub)
X_blindsub = std_scale.transform(X_blindsub)

Preprocessing data (3).
Preprocessing data (3).
Preprocessing data (3).


In [6]:
X.shape

(17378, 334)

In [7]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
0,0.6343,0.3623,0.435167,0.4771,0.1597,0.2117,0.9309,0.3619,0.5584,0.1471,...,0.7431,0.333,0.1783,-0.7218,1.0397,0.3064,-0.1804,0.5108,-0.7427,0.7402
1,-0.1128,0.2567,-0.315,0.0312,0.4733,0.1741,0.1306,0.1011,0.5484,0.1618,...,0.6729,0.6554,-0.0108,-0.1236,-0.2452,-0.0694,0.185,0.986,0.6855,0.7555
2,-0.1147,-0.2147,1.0796,0.6069,0.3323,0.8456,0.3082,0.7404,0.1146,0.3531,...,-0.1409,-0.0531,0.0121,0.7682,0.506,-0.372,0.0644,0.2841,0.0834,0.146
3,-0.1393,0.3778,0.6667,1.1136,0.697,-1.0491,0.1121,0.855,-0.4056,-0.4072,...,0.3824,1.0743,0.1053,0.4585,-0.399,0.517,-0.0985,0.7276,0.0813,-0.2179
4,-0.1739,-0.2137,0.4118,0.28,0.1626,0.4143,-0.057,0.6324,1.0733,-0.4641,...,0.281,-0.0898,-0.2685,0.8918,-0.316,0.4253,-0.3345,-0.0639,0.2184,0.2293


In [8]:
X.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
17373,0.325,0.964,-0.6586,-0.081,0.8684,-0.1383,0.1966,0.4916,-0.1731,0.3044,...,0.2738,-0.6897,0.2765,0.104,0.0796,-0.2519,0.447165,1.1893,0.6007,0.8424
17374,0.484,1.0259,0.6091,0.4844,0.8739,0.9806,0.3355,-0.5224,1.3731,0.233,...,1.2168,1.5069,0.2802,0.6469,0.4491,1.0596,-0.2852,-0.4714,0.6295,0.3336
17375,0.317,0.4889,0.1417,0.818,1.2458,-0.2427,0.3136,0.3064,0.8588,0.6,...,0.0735,1.1703,-0.3599,-0.6184,0.6256,0.5277,0.2202,0.2174,0.4387,0.2579
17376,-0.1188,0.8574,-0.0358,0.4499,0.9471,-0.1764,0.6991,0.5083,-0.18,0.6998,...,0.2942,0.4487,-0.1378,0.7742,0.0397,-0.2275,0.757,0.2092,0.4246,0.777
17377,-0.3133,0.7629,0.4839,0.5712,-0.4183,0.5916,0.9065,0.1442,0.7223,1.0139,...,0.4759,0.3724,0.3312,0.7245,0.7625,0.8673,1.6282,0.8187,-0.5652,0.4126


In [9]:
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
count,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,...,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0
mean,0.264572,0.456596,0.435167,0.263299,0.496509,0.356921,0.36299,0.275144,0.423023,0.454524,...,0.448383,0.481281,0.277807,0.333366,0.474403,0.27357,0.447165,0.487169,0.267626,0.389999
std,0.455194,0.512302,0.508479,0.451345,0.512481,0.494936,0.497232,0.463225,0.502766,0.512201,...,0.511349,0.519878,0.459788,0.48434,0.517393,0.459233,0.512634,0.511158,0.455598,0.503117
min,-1.5608,-1.5064,-1.399,-1.4882,-1.4421,-1.3064,-1.8844,-1.7212,-1.4441,-1.5784,...,-1.4486,-1.3986,-1.5701,-1.8473,-1.6267,-1.8658,-1.4036,-1.6289,-1.6968,-1.4666
25%,-0.043575,0.106125,0.090225,-0.038925,0.145875,0.0209,0.023125,-0.0339,0.08435,0.098025,...,0.0951,0.12775,-0.0292,0.008125,0.109625,-0.0383,0.1004,0.1341,-0.039325,0.04435
50%,0.2639,0.4537,0.432,0.263299,0.4974,0.3494,0.35465,0.275144,0.41785,0.4511,...,0.4444,0.481281,0.277807,0.3208,0.474403,0.2734,0.4396,0.4821,0.2659,0.3833
75%,0.567975,0.798,0.779375,0.5626,0.8443,0.684075,0.6931,0.5852,0.760375,0.802675,...,0.798675,0.836675,0.5847,0.652475,0.831375,0.584375,0.79335,0.8407,0.572025,0.726225
max,2.0546,2.5191,2.4137,2.1299,2.3358,2.2518,2.3356,2.086,2.4521,2.4869,...,2.4336,2.3727,2.0284,2.5224,2.3033,2.0208,2.5782,2.4072,2.07,2.3871


In [10]:
#X = (X - X.mean(axis=0)) /  X.std(axis=0)
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
count,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,...,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0,17378.0
mean,0.264572,0.456596,0.435167,0.263299,0.496509,0.356921,0.36299,0.275144,0.423023,0.454524,...,0.448383,0.481281,0.277807,0.333366,0.474403,0.27357,0.447165,0.487169,0.267626,0.389999
std,0.455194,0.512302,0.508479,0.451345,0.512481,0.494936,0.497232,0.463225,0.502766,0.512201,...,0.511349,0.519878,0.459788,0.48434,0.517393,0.459233,0.512634,0.511158,0.455598,0.503117
min,-1.5608,-1.5064,-1.399,-1.4882,-1.4421,-1.3064,-1.8844,-1.7212,-1.4441,-1.5784,...,-1.4486,-1.3986,-1.5701,-1.8473,-1.6267,-1.8658,-1.4036,-1.6289,-1.6968,-1.4666
25%,-0.043575,0.106125,0.090225,-0.038925,0.145875,0.0209,0.023125,-0.0339,0.08435,0.098025,...,0.0951,0.12775,-0.0292,0.008125,0.109625,-0.0383,0.1004,0.1341,-0.039325,0.04435
50%,0.2639,0.4537,0.432,0.263299,0.4974,0.3494,0.35465,0.275144,0.41785,0.4511,...,0.4444,0.481281,0.277807,0.3208,0.474403,0.2734,0.4396,0.4821,0.2659,0.3833
75%,0.567975,0.798,0.779375,0.5626,0.8443,0.684075,0.6931,0.5852,0.760375,0.802675,...,0.798675,0.836675,0.5847,0.652475,0.831375,0.584375,0.79335,0.8407,0.572025,0.726225
max,2.0546,2.5191,2.4137,2.1299,2.3358,2.2518,2.3356,2.086,2.4521,2.4869,...,2.4336,2.3727,2.0284,2.5224,2.3033,2.0208,2.5782,2.4072,2.07,2.3871


In [11]:
print(type(Y))

<class 'numpy.ndarray'>


In [12]:
Y1 = np.copy(Y)
Y1[Y1!=1] = 0

Y2 = np.copy(Y)
Y2[Y2!=2] = 0
Y2[Y2==2] = 1

Y3 = np.copy(Y)
Y3[Y3!=3] = 0
Y3[Y3==3] = 1

Y4 = np.copy(Y)
Y4[Y4!=4] = 0
Y4[Y4==4] = 1


<b>LETS TEST WHETHER WE BINARIZED INDIVIDUAL CATEGORIES CORRECTLY:</b>

In [13]:
Y[0:10]

array([3, 3, 3, 1, 3, 1, 4, 2, 1, 3])

In [14]:
Y1[0:10]

array([0, 0, 0, 1, 0, 1, 0, 0, 1, 0])

In [15]:
Y2[0:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [16]:
Y3[0:10]

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1])

In [17]:
Y4[0:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [18]:
#nested cross-validation. In nested cross-validation, there is an outer
#loop over splits of the data into training and test sets. For each of them, a grid search
#is run (which might result in different best parameters for each split in the outer
#loop). Then, for each outer split, the test set score using the best settings is reported.

<b>WORKING WITH THE CLASS  "1": Class 1 vs Rest</b>

In [19]:
# As we are only provided with the "training" set (not taking into acount partial "test" data) a sensible approach to 
# compare efficiency of different models would be to hold-out some of this data for the "testing" purposes. 
# Because hte dataset is relatively large, we decided to leave out .33 of the data and not .50 as it is done in some scenarios

# We are using train_test_split function to hold out 33% or the randomly shuffled data
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y1, test_size=.33, random_state=10) 

print (X.shape, X_train1.shape, X_test1.shape)

(17378, 334) (11643, 334) (5735, 334)


In [20]:
# Normalize all data to mean 0 and SD of 1
std_scale = StandardScaler().fit(X_train1)
X_train1 = std_scale.transform(X_train1)
X_test1 = std_scale.transform(X_test1)

#verification of the scaling
np.mean(X_train1[:,0])

-5.4924715467155387e-17

In [21]:
if runEstimators:
    estimators = [("RandomForest", RandomForestClassifier(n_estimators=10,max_depth=175,min_samples_split=9,min_samples_leaf=1,random_state=1)),
                  ("GaussianNB", GaussianNB()),
                  ("QDA",QuadraticDiscriminantAnalysis()),
                  ("KNN",KNeighborsClassifier(15, weights='distance')),
                  ("LogisticRegression",LogisticRegression(random_state=57)),
                  ("SVC",SVC(C = 10, gamma=0.01, kernel='rbf', probability=True))
                 ]
    for (name, estimator) in estimators:
        Y_pred_test1 = estimator.fit(X_train1,Y_train1).predict_proba(X_test1)
        #AUC
        fpr, tpr, thresholds = roc_curve(Y_test1, Y_pred_test1[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC with",name, " : " ,roc_auc)
else:
    print('AUC with RandomForest  :  0.84063237365')
    print('AUC with GaussianNB  :  0.962701737206')
    print('AUC with QDA  :  0.966974122135')
    print('AUC with KNN  :  0.959179643084')
    print('AUC with LogisticRegression  :  0.964210112601')
    print('AUC with SVC  :  0.968478948574')

AUC with RandomForest  :  0.84063237365
AUC with GaussianNB  :  0.962701737206
AUC with QDA  :  0.966974122135
AUC with KNN  :  0.959179643084
AUC with LogisticRegression  :  0.964210112601
AUC with SVC  :  0.968478948574


<b>WORKING WITH THE CLASS  "2": Class 2 vs Rest</b>

In [22]:
# As we are only provided with the "training" set (not taking into acount partial "test" data) a sensible approach to 
# compare efficiency of different models would be to hold-out some of this data for the "testing" purposes. 
# Because hte dataset is relatively large, we decided to leave out .33 of the data and not .50 as it is done in some scenarios

# We are using train_test_split function to hold out 33% or the randomly shuffled data
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y2, test_size=.33, random_state=10) 

print (X.shape, X_train2.shape, X_test2.shape)

(17378, 334) (11643, 334) (5735, 334)


In [23]:
# Normalize all data to mean 0 and SD of 1
std_scale = StandardScaler().fit(X_train2)
X_train2 = std_scale.transform(X_train2)
X_test2 = std_scale.transform(X_test2)

In [24]:
if runEstimators:
    estimators = [("RandomForest", RandomForestClassifier(n_estimators=10,max_depth=10,min_samples_split=9,min_samples_leaf=1,random_state=1)),
                  ("GaussianNB", GaussianNB()),
                  ("QDA",QuadraticDiscriminantAnalysis()),
                  ("KNN",KNeighborsClassifier(15, weights='distance')),
                  ("LogisticRegression",LogisticRegression(random_state=57)),
                  ("SVC",SVC(C=8.3, gamma=0.0035, kernel='rbf', probability=True))
                 ]
    for (name, estimator) in estimators:
        Y_pred_test2 = estimator.fit(X_train2,Y_train2).predict_proba(X_test2)
        #AUC
        fpr, tpr, thresholds = roc_curve(Y_test2, Y_pred_test2[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC with",name, " : " ,roc_auc)
else:
    print('AUC with RandomForest  :  0.717126395678')
    print('AUC with GaussianNB  :  0.871723143964')
    print('AUC with QDA  :  0.802773929661')
    print('AUC with KNN  :  0.789426180311')
    print('AUC with LogisticRegression  :  0.862200323207')
    print('AUC with SVC  :  0.891752720569')

AUC with RandomForest  :  0.717126395678
AUC with GaussianNB  :  0.871723143964
AUC with QDA  :  0.802773929661
AUC with KNN  :  0.789426180311
AUC with LogisticRegression  :  0.862200323207
AUC with SVC  :  0.891752720569


<b>WORKING WITH THE CLASS  "3": Class 3 vs Rest</b>

In [25]:
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X, Y3, test_size=.33, random_state=10) 

print (X.shape, X_train3.shape, X_test3.shape)

(17378, 334) (11643, 334) (5735, 334)


In [26]:
# Normalize all data to mean 0 and SD of 1
std_scale = StandardScaler().fit(X_train3)
X_train3 = std_scale.transform(X_train3)
X_test3 = std_scale.transform(X_test3)

In [27]:
if runEstimators:
    estimators = [("RandomForest", RandomForestClassifier(n_estimators=10,max_depth=10,min_samples_split=9,min_samples_leaf=1,random_state=1)),
                  ("GaussianNB", GaussianNB()),
                  ("QDA",QuadraticDiscriminantAnalysis()),
                  ("KNN",KNeighborsClassifier(15, weights='distance')),
                  ("LogisticRegression",LogisticRegression(random_state=57)),
                  ("SVC",SVC(C = 10, gamma=0.0033, kernel='rbf', probability=True))
                 ]
    for (name, estimator) in estimators:
        Y_pred_test3 = estimator.fit(X_train3,Y_train3).predict_proba(X_test3)
        #AUC
        fpr, tpr, thresholds = roc_curve(Y_test3, Y_pred_test3[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC with",name, " : " ,roc_auc)
else:
    print('AUC with RandomForest  :  0.842445191369')
    print('AUC with GaussianNB  :  0.958046478008')
    print('AUC with QDA  :  0.91504109292')
    print('AUC with KNN  :  0.938682424374')
    print('AUC with LogisticRegression  :  0.955911525534')
    print('AUC with SVC  :  0.962418446637')    

AUC with RandomForest  :  0.842445191369
AUC with GaussianNB  :  0.958046478008
AUC with QDA  :  0.91504109292
AUC with KNN  :  0.938682424374
AUC with LogisticRegression  :  0.955911525534
AUC with SVC  :  0.962418446637


<b>WORKING WITH THE CLASS  "4": Class 4 vs Rest</b>

In [28]:
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X, Y4, test_size=.33, random_state=10) 

print (X.shape, X_train4.shape, X_test4.shape)

(17378, 334) (11643, 334) (5735, 334)


In [29]:
# Normalize all data to mean 0 and SD of 1
std_scale = StandardScaler().fit(X_train4)
X_train4 = std_scale.transform(X_train4)
X_test4 = std_scale.transform(X_test4)
#verification of the scaling
np.mean(X_train4[:,0])

-5.4924715467155387e-17

In [30]:
if runEstimators:
    estimators = [("RandomForest", RandomForestClassifier(n_estimators=10,max_depth=10,min_samples_split=9,min_samples_leaf=1,random_state=1)),
                  ("GaussianNB", GaussianNB()),
                  ("QDA",QuadraticDiscriminantAnalysis()),
                  ("KNN",KNeighborsClassifier(15, weights='distance')),
                  ("LogisticRegression",LogisticRegression(random_state=57)),
                  ("SVC",SVC(C=3.4, gamma=0.01, kernel='rbf', probability=True))
                 ]
    for (name, estimator) in estimators:
        Y_pred_test4 = estimator.fit(X_train4,Y_train4).predict_proba(X_test4)
        #AUC
        fpr, tpr, thresholds = roc_curve(Y_test4, Y_pred_test4[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC with",name, " : " ,roc_auc)
else:
    print('AUC with RandomForest  :  0.615432098765')
    print('AUC with GaussianNB  :  0.811448324515')
    print('AUC with QDA  :  0.701939329806')
    print('AUC with KNN  :  0.704791887125')
    print('AUC with LogisticRegression  :  0.80424526749')
    print('AUC with SVC  :  0.829348383304')

AUC with RandomForest  :  0.615432098765
AUC with GaussianNB  :  0.811448324515
AUC with QDA  :  0.701939329806
AUC with KNN  :  0.704791887125
AUC with LogisticRegression  :  0.80424526749
AUC with SVC  :  0.829348383304


<b> Combining Classifiers via "Stacking" in order to imporove the accuracy. </b>

<b>Class 1 vs others</b>

In [None]:
%run "StackingAttributes.py"
%run "Ensemble.py"

In [None]:
attrsC1 = StackingAttributes(X_train1, Y_train1, X_test1, Y_test1, X_testsub, X_blindsub)

e1 = Ensemble( attrsC1 )
e1.run(verbose = 1)


Running ensemble
Running Random Forest Classifier
Running SVM Classifier


<b>Class 2 vs others.</b>

In [None]:
attrsC2 = StackingAttributes(X_train2, Y_train2, X_test2, Y_test2, X_testsub, X_blindsub)
attrsC2.rf_use_rfe = True
attrsC2.rf_n_estimators=100
attrsC2.rf_max_depth=50
attrsC2.rf_min_samples_split=2
attrsC2.rf_min_samples_leaf=1

attrsC2.svc_C = 10
attrsC2.svc_gamma=0.0033
attrsC2.svc_kernel='rbf'

attrsC2.kn_n_neighbors = 500

attrsC2.lr_C = 300

e2 = Ensemble( attrsC2 )
e2.run(verbose = 1)

<b>Class 3 vs others.</b>

In [None]:
attrsC3 = StackingAttributes(X_train3, Y_train3, X_test3, Y_test3, X_testsub, X_blindsub)
attrsC3.rf_n_estimators=100
attrsC3.rf_max_depth=20
attrsC3.rf_min_samples_split=10
attrsC3.rf_min_samples_leaf=5

attrsC3.svc_C = 10
attrsC3.svc_gamma=0.0033
attrsC3.svc_kernel='rbf'

attrsC2.kn_n_neighbors = 242

attrsC3.lr_C = 0.0004


e3 = Ensemble( attrsC3 )
e3.run(verbose = 1)

<b>Class 4 vs others.</b>

In [None]:
attrsC4 = StackingAttributes(X_train4, Y_train4, X_test4, Y_test4, X_testsub, X_blindsub)

attrsC4.rf_use_rfe = True
attrsC4.rf_n_estimators=100
attrsC4.rf_max_depth=70
attrsC4.rf_min_samples_split=2
attrsC4.rf_min_samples_leaf=25

attrsC4.svc_C = 3.4
attrsC4.svc_gamma=0.01
attrsC4.svc_kernel='rbf'

attrsC2.kn_n_neighbors = 925

e4 = Ensemble( attrsC4 )
e4.run(verbose = 1)

In [None]:
def submission(filename, y_final_prob):

    y_final_label = np.zeros((y_final_prob.shape[0], 1), dtype=np.float)  

    # Convert back to a class
    y_final_label = np.argmax(y_final_prob, axis=1)
    y_final_label += 1
            
    sample = pd.DataFrame(np.hstack([y_final_prob.round(5),y_final_label.reshape(y_final_prob.shape[0],1)]))
    sample.columns = ["prob1","prob2","prob3","prob4","label"]
    sample.label = sample.label.astype(int)
    
    #Submit this file to dropbox
    sample.to_csv(filename,sep="\t" ,index=False,header=None)


submission("Johnston_Memic_Test3.csv", np.column_stack([attrsC1.final_pred_testsub[:,1],
                                                        attrsC2.final_pred_testsub[:,1],
                                                        attrsC3.final_pred_testsub[:,1],
                                                        attrsC4.final_pred_testsub[:,1]]))
submission("Johnston_Memic_Blind3.csv", np.column_stack([attrsC1.final_pred_blindsub[:,1],
                                                         attrsC2.final_pred_blindsub[:,1],
                                                         attrsC3.final_pred_blindsub[:,1],
                                                         attrsC4.final_pred_blindsub[:,1]]))



# Results

In [None]:
# Plotting code taken from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
%run 'plot_learning_curve.py'

from sklearn.feature_selection import RFECV

def plotFeatureRankings(classifier, grid_scores):
    """ Plot number of features VS. cross-validation scores
    """
    plt.figure()
    plt.title('Classifier {}'.format(classifier))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(grid_scores) + 1), grid_scores)
    plt.show()
        
def createRFEScores(model, modelName, i, X_train, Y_train, X_test, Y_test):
    """
    """

    rfecv = RFECV(model, scoring='roc_auc', verbose=2, n_jobs=-1)
    rfecv.fit(X_train, Y_train[:,i])
    rfecv.score(X_test, Y_test[:,i])
    print('Model {} Number Features {}'.format(i, rfecv.n_features_))
    print('Model {} Number Estimators {}'.format(i, rfecv.ranking_))
    print('Model {} Scores {}'.format(i, rfecv.grid_scores_))
    print('Model {} Support {}'.format(i, rfecv.support_))

    np.save('{}_class{}_ranking.npy'.format(modelName, i+1), rfecv.ranking_)
    np.save('{}_class{}_scores.npy'.format(modelName, i+1), rfecv.grid_scores_)

        
# Set to True to run the RFE (time consuming)
# Set to false to display pre-calculated results
runRFE = True

if runRFE:
    
    createRFEScores(attrsC1.final_model, 'final', 0, attrsC1.X_train, attrsC1.Y_train, attrsC1.X_test, attrsC1.Y_test)
    createRFEScores(attrsC2.final_model, 'final', 1, attrsC2.X_train, attrsC2.Y_train, attrsC2.X_test, attrsC2.Y_test)
    createRFEScores(attrsC3.final_model, 'final', 2, attrsC3.X_train, attrsC3.Y_train, attrsC3.X_test, attrsC3.Y_test)
    createRFEScores(attrsC4.final_model, 'final', 3, attrsC4.X_train, attrsC4.Y_train, attrsC4.X_test, attrsC4.Y_test)
    #createRFEScores(svc_model, 'svc', X_train, Y_train, X_test, Y_test)

else:
    for i in range(4):
        classifier = i+1
        ranking = np.load('final_class{}_ranking.npy'.format(classifier))
        grid_scores = np.load('final_class{}_scores.npy'.format(classifier))
        
        plotFeatureRankings(classifier, grid_scores)
        
    print('Class 1: Best score: ', 158)
    print('Class 2: Best score: ', 147)
    print('Class 3: Best score: ', 153)
    print('Class 4: Best score: ', 64)
    

In [None]:
Module created for script run in IPython
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-37-9f34b449be9e> in <module>()
     36 if runRFE:
     37 
---> 38     createRFEScores(attrsC1.final_model, 'final', 0, attrsC1.X_train, attrsC1.Y_train, attrsC1.X_test, attrsC1.Y_test)
     39     createRFEScores(attrsC2.final_model, 'final', 1, attrsC2.X_train, attrsC2.Y_train, attrsC2.X_test, attrsC2.Y_test)
     40     createRFEScores(attrsC3.final_model, 'final', 2, attrsC3.X_train, attrsC3.Y_train, attrsC3.X_test, attrsC3.Y_test)

<ipython-input-37-9f34b449be9e> in createRFEScores(model, modelName, i, X_train, Y_train, X_test, Y_test)
     19 
     20     rfecv = RFECV(model, scoring='roc_auc', verbose=2, n_jobs=-1)
---> 21     rfecv.fit(X_train, Y_train[:,i])
     22     rfecv.score(X_test, Y_test[:,i])
     23     print('Model {} Number Features {}'.format(i, rfecv.n_features_))

IndexError: too many indices for array