# Cancer Prognosis
## Necessary Imports

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, Math, Latex
from sklearn import svm, neural_network
from itertools import combinations, permutations
from sklearn.metrics import accuracy_score

import operator as op
from functools import reduce

def ncr(n, r):
    """Return the binomial coefficient "n choose r" as an exact integer.

    Parameters
    ----------
    n : int
        Size of the set being chosen from.
    r : int
        Number of items chosen.

    Returns
    -------
    int
        Number of r-element subsets of an n-element set. Returns 0 when
        ``r`` is negative or larger than ``n`` (no such subset exists).
    """
    if r < 0 or r > n:
        return 0
    # C(n, r) == C(n, n - r); using the smaller value shortens both products.
    r = min(r, n - r)
    numer = reduce(op.mul, range(n, n - r, -1), 1)
    denom = reduce(op.mul, range(1, r + 1), 1)
    # The quotient is always a whole number, so floor division is exact and
    # avoids the rounding that float division introduces for large n.
    return numer // denom

## Loading the datasets

In [2]:
# Both files are tab-separated and are parsed with '\r' as the row terminator.
train_df = pd.read_csv("Datasets/Assign1_Training_Data.txt", sep='\t', lineterminator='\r')
test_df  = pd.read_csv("Datasets/Assign1_Testing_Data.txt", sep='\t', lineterminator='\r')

# Features are the contiguous block of columns from 'AL080059' to 'FLJ11190';
# the class label lives in the 'Label' column.
train_gene_columns = train_df.loc[:, 'AL080059':'FLJ11190'].columns.tolist()
X_df, Y_df = train_df[train_gene_columns], train_df['Label']

test_gene_columns = test_df.loc[:, 'AL080059':'FLJ11190'].columns.tolist()
X_test_df, Y_test_df = test_df[test_gene_columns], test_df['Label']

all_genes = X_df.columns.tolist()

# Quick look at the training data: first rows, then summary statistics.
display(train_df.head(), train_df.describe())

Unnamed: 0,Sample_Number,AL080059,Contig63649_RC,Contig46218_RC,LOC51203,AA555029_RC,ALDH4,Contig38288_RC,FGF18,Contig28552_RC,...,AKAP2,Contig63102_RC,PRC1,Contig20217_RC,CENPA,SM.20,CCNE2,ESM1,FLJ11190,Label
0,138,-0.227,-0.107,-0.086,-0.057,0.073,0.021,-0.002,0.135,-0.071,...,0.121,-0.187,-0.2,-0.127,-0.277,0.036,-0.095,0.123,-0.211,0
1,184,0.044,-0.031,0.381,0.226,-0.038,-0.167,0.103,-0.33,0.42,...,0.029,0.033,0.333,0.303,0.058,0.406,0.266,-0.129,-0.219,0
2,127,0.151,-0.21,0.034,0.037,-0.065,-0.048,-0.026,-0.425,0.204,...,-0.2,-0.07,0.166,-0.002,0.076,-0.027,0.278,-0.16,-0.144,0
3,166,0.335,-0.031,0.177,0.165,-0.372,0.34,0.112,-0.506,0.213,...,-0.251,-0.04,0.058,0.074,0.309,-0.227,0.186,0.104,0.214,0
4,318,-0.098,-0.492,-0.307,-0.097,-0.024,0.197,-0.001,0.369,-0.318,...,-0.003,0.178,-0.453,-0.285,-0.265,-0.237,-0.335,-0.603,-0.116,0


Unnamed: 0,Sample_Number,AL080059,Contig63649_RC,Contig46218_RC,LOC51203,AA555029_RC,ALDH4,Contig38288_RC,FGF18,Contig28552_RC,...,AKAP2,Contig63102_RC,PRC1,Contig20217_RC,CENPA,SM.20,CCNE2,ESM1,FLJ11190,Label
count,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,...,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0
mean,222.1625,-0.105275,-0.047012,-0.019113,-0.019087,-0.047725,-0.014562,0.020312,-0.102825,0.006362,...,-0.02515,-0.009275,-0.040638,-0.024913,-0.08035,0.000113,-0.041987,-0.070638,-0.110325,0.5
std,109.631886,0.336355,0.241922,0.240881,0.257494,0.181599,0.158567,0.241117,0.286405,0.249487,...,0.184635,0.188017,0.269357,0.229327,0.320232,0.182607,0.274181,0.250606,0.182423,0.503155
min,9.0,-1.083,-0.492,-0.508,-0.669,-0.372,-0.366,-0.386,-0.624,-0.61,...,-0.346,-0.401,-0.602,-0.409,-0.743,-0.501,-0.668,-0.897,-0.447,0.0
25%,140.75,-0.33025,-0.23525,-0.18125,-0.203,-0.177,-0.134,-0.13225,-0.32725,-0.164,...,-0.1785,-0.107,-0.26725,-0.19875,-0.31275,-0.13125,-0.232,-0.1875,-0.23775,0.0
50%,216.5,-0.074,-0.069,-0.028,0.005,-0.066,-0.019,-0.0175,-0.0925,0.0165,...,-0.059,-0.0235,-0.007,-0.052,-0.14,0.0005,-0.035,-0.074,-0.135,0.5
75%,324.0,0.135,0.085,0.1505,0.179,0.08325,0.06675,0.165,0.103,0.184,...,0.10075,0.10075,0.1835,0.145,0.11375,0.11975,0.16725,0.11,0.022,1.0
max,398.0,0.632,0.557,0.618,0.507,0.405,0.479,0.731,0.705,0.655,...,0.45,0.444,0.476,0.623,0.806,0.453,0.602,0.495,0.385,1.0


$$(a+b)^{2} = a^{2}+2ab+b^{2}$$
$(a+b)^{2} = a^{2}+2ab+b^{2}$

## Defining classifiers
1. Linear SVM, C=1
2. Non-Linear SVM with RBF Kernel, C=10
3. Neural Network with 2 hidden layers of 5 neurons each and logistic non-linearities.

In [3]:
# 1. Linear SVM with C=1
clf_1 = svm.LinearSVC(C=1)

# 2. RBF-kernel SVM with C=10
clf_2 = svm.SVC(
    C=10,
    kernel='rbf',
    gamma='auto',
)

# 3. MLP with two hidden layers of 5 logistic units each (seeded for repeatability)
clf_3 = neural_network.MLPClassifier(
    hidden_layer_sizes=(5, 5),
    activation='logistic',
    solver='lbfgs',
    random_state=0,
)

## Observations   
   1. Dimensionality - 70 features (genes)
   2. Clearly an exhaustive search for all subsets is not possible ($2^{70}-1$ possibilities)

## Task 1 - Find top 2 genes (exhaustive search)
1. Search criterion - Resubstitution error estimate (This is a wrapper approach, as it depends on the classification rule)
2. Search space: $\binom{70}{2} = 2415$ combinations

In [4]:
ncr(n=70, r=2)  # number of gene pairs to be searched

2415.0

## Generating all feature subsets of size 2

In [5]:
# Names of every gene column, in file order.
all_feature_list = train_df.loc[:, 'AL080059':'FLJ11190'].columns.tolist()

# Materialise every unordered pair of genes (all subsets of size 2).
feature_subset_2 = combinations(all_feature_list, 2)
f_sub_2_list = list(feature_subset_2)

## Computing Resubstitution errors for all size 2 subsets for all 3 classifiers

In [6]:
Y = train_df.loc[:, 'Label']
train_size = len(train_df.index)

# One error list per classifier, filled in the same order as f_sub_2_list.
errors_c1, errors_c2, errors_c3 = [], [], []
classifier_errors = (
    (clf_1, errors_c1),
    (clf_2, errors_c2),
    (clf_3, errors_c3),
)

for each_comb in f_sub_2_list:
    X = train_df.loc[:, each_comb]

    for clf, errors in classifier_errors:
        # Resubstitution estimate: fit and evaluate on the same training data.
        predictions = clf.fit(X, Y).predict(X)
        misclassified = train_size - sum(train_df['Label'] == predictions)
        errors.append(misclassified / train_size)

### Identifying the gene subset(s) that give the minimum Resubstitution error (Top 2 exhaustive)

In [7]:
# One row per gene pair, one resubstitution-error column per classifier.
Resub_df = pd.DataFrame({
    'Subset': f_sub_2_list,
    'Resub_Lin_SVM': errors_c1,
    'Resub_Non_lin_SVM': errors_c2,
    'Resub_MLP': errors_c3,
})

Lin_SVM_min_error = Resub_df['Resub_Lin_SVM'].min()
Non_Lin_SVM_min_error = Resub_df['Resub_Non_lin_SVM'].min()
MLP_min_error = Resub_df['Resub_MLP'].min()

# Show every pair that attains the minimum error (ties produce several rows).
best_pair_reports = (
    ('Top 2 genes for Linear SVM:', 'Resub_Lin_SVM', Lin_SVM_min_error),
    ('Top 2 genes for Non-Linear SVM:', 'Resub_Non_lin_SVM', Non_Lin_SVM_min_error),
    ('Top 2 genes for MLP:', 'Resub_MLP', MLP_min_error),
)
for heading, error_column, lowest_error in best_pair_reports:
    print(heading)
    display(Resub_df.loc[Resub_df[error_column] == lowest_error])


Top 2 genes for Linear SVM:


Unnamed: 0,Subset,Resub_Lin_SVM,Resub_Non_lin_SVM,Resub_MLP
2367,"(IGFBP5.1, CCNE2)",0.2,0.2625,0.1625


Top 2 genes for Non-Linear SVM:


Unnamed: 0,Subset,Resub_Lin_SVM,Resub_Non_lin_SVM,Resub_MLP
797,"(KIAA1442, ORC6L)",0.425,0.2125,0.175
812,"(KIAA1442, PRC1)",0.3375,0.2125,0.1375


Top 2 genes for MLP:


Unnamed: 0,Subset,Resub_Lin_SVM,Resub_Non_lin_SVM,Resub_MLP
230,"(LOC51203, Contig32185_RC)",0.2625,0.225,0.0625


## Observation:
1. It appears there is a tie between two subsets for the Non-Linear SVM classifier (rows 797 and 812 above). We break the tie by selecting the subset with the minimum feature index.

In [8]:
# Sanity check: compare the hand-written exhaustive search with mlxtend's.
# cv=0 makes mlxtend score on the training data, i.e. the same
# resubstitution criterion used above. Swap clf_2 for clf_1 / clf_3 to
# check the other models.

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

efs1 = EFS(
    clf_2,
    min_features=2,
    max_features=2,
    scoring='accuracy',
    print_progress=True,
    cv=0,
).fit(X_df, Y_df)

print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 2415/2415

Best subset (corresponding names): ('KIAA1442', 'ORC6L')


The above code block gives the following output for the 3 classifier models: (along with random_state = 0 for MLP)
1. Linear SVM - Best subset (corresponding names): ('IGFBP5.1', 'CCNE2')
2. Non-Linear SVM - Best subset (corresponding names): ('LOC51203', 'KIAA1442')
3. MLP - Best subset (corresponding names): ('LOC51203', 'Contig32185_RC')

## Top 2 Exhaustive search Results
    1. Note: values[0] picks the first element (to break tie by minimum feature index)

In [9]:
def _first_min_subset(error_column, lowest_error):
    """Return, as a list, the first gene pair in Resub_df whose error equals lowest_error."""
    attains_min = Resub_df[error_column] == lowest_error
    return list(Resub_df.loc[attains_min, 'Subset'].values[0])


t2_lin_svm = _first_min_subset('Resub_Lin_SVM', Lin_SVM_min_error)
t2_non_lin_svm = _first_min_subset('Resub_Non_lin_SVM', Non_Lin_SVM_min_error)
t2_mlp = _first_min_subset('Resub_MLP', MLP_min_error)

for top2_genes in (t2_lin_svm, t2_non_lin_svm, t2_mlp):
    print(top2_genes)

['IGFBP5.1', 'CCNE2']
['KIAA1442', 'ORC6L']
['LOC51203', 'Contig32185_RC']


In [10]:
# Persist the full table of pairwise resubstitution errors.
Resub_df.to_csv(path_or_buf="Resub errors.csv")

## Sequential Forward Search to find Top 3, 4, 5 genes
1. Start with empty set (But for this project, we start with top 2 genes, found previously from exhaustive search)
2. Find the best gene along with the current set and add to the set
3. Repeat until number of features == k

## Let's define a function for Sequential Forward Search that takes the following arguments:
    1. Number of Features
    2. X_Training_data
    3. Y_Training_data
    4. Classifier model
    5. Pre-feed feature set (Starts the algorithm with this feature set)
    
and returns:
    1. a set of top features of size 'num_features'

In [11]:
def seq_fwd_search(num_features, X_train_df, Y_train, model, pre_feed_list=None):
    """Greedy sequential forward selection driven by resubstitution error.

    Starting from ``pre_feed_list``, repeatedly adds the single remaining
    column of ``X_train_df`` whose inclusion gives the lowest training
    (resubstitution) error for ``model``, until ``num_features`` columns have
    been selected. Ties are broken in favour of the column that appears
    first in ``X_train_df``.

    Side effect: after each round the candidate subsets and their errors are
    written to ``sfs<round>.csv`` in the working directory (existing files
    with that name are overwritten).

    Parameters
    ----------
    num_features : int
        Target size of the selected feature set.
    X_train_df : pandas.DataFrame
        Training features; column names must be strings.
    Y_train : array-like
        Training labels, one per row of ``X_train_df``.
    model : object
        Classifier exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        for every candidate subset.
    pre_feed_list : sequence of str, optional
        Features to start from. The input is copied, never modified.

    Returns
    -------
    (list, float or None)
        The selected feature names (pre-fed features first) and the
        resubstitution error of that final set. The error is ``None`` only
        when the final set is empty.

    Raises
    ------
    AssertionError
        If ``pre_feed_list`` already holds more than ``num_features`` names.
    ValueError
        If ``X_train_df`` does not have enough remaining columns.
    """
    final_list = list(pre_feed_list) if pre_feed_list is not None else []
    assert len(final_list) <= num_features, \
        "pre_feed_list holds more features than num_features"

    # Pre-fed features are excluded from the candidate pool so that a column
    # can never be selected (and duplicated) a second time.
    master_feature_list = [
        feature for feature in X_train_df.columns.tolist()
        if feature not in final_list
    ]
    if num_features - len(final_list) > len(master_feature_list):
        raise ValueError(
            "num_features exceeds the number of available features in X_train_df"
        )

    y_true = np.asarray(Y_train)
    train_size = len(X_train_df.index)

    def _resub_error(feature_list):
        # Fit and score on the same rows of the data that was passed in
        # (not on any notebook-level global frame).
        X_current_df = X_train_df.loc[:, feature_list]
        model.fit(X_current_df, Y_train)
        predictions = np.asarray(model.predict(X_current_df))
        return np.count_nonzero(y_true != predictions) / train_size

    # Nothing to add: report the error of the pre-fed set itself so that the
    # return value is always defined.
    min_error = None
    if final_list and len(final_list) == num_features:
        min_error = _resub_error(final_list)

    round_number = 0
    while len(final_list) < num_features:
        candidate_sets = [final_list + [feature] for feature in master_feature_list]
        error_list = [_resub_error(candidate) for candidate in candidate_sets]

        # Per-round log of every candidate subset and its error.
        round_number += 1
        df = pd.DataFrame()
        df['feat'] = [' '.join(candidate) for candidate in candidate_sets]
        df['error'] = error_list
        df.to_csv('sfs' + str(round_number) + '.csv')

        # list.index returns the first minimum, i.e. the earliest column wins ties.
        min_error = min(error_list)
        selected_feature = master_feature_list[error_list.index(min_error)]
        final_list.append(selected_feature)
        master_feature_list.remove(selected_feature)
    return final_list, min_error

In [12]:
# Computing the Top 3, 4, 5 genes

# Every search is seeded with the classifier's own top-2 pair from the exhaustive search.
clf_1_pre_feed, clf_2_pre_feed, clf_3_pre_feed = t2_lin_svm, t2_non_lin_svm, t2_mlp


def _grow_to_sizes(model, pre_feed, sizes=(3, 4, 5)):
    """Run seq_fwd_search once per target size; return the (features, error) pairs in order."""
    return [seq_fwd_search(size, X_df, Y_df, model, pre_feed) for size in sizes]


((t3_lin_svm, t3_lin_svm_err),
 (t4_lin_svm, t4_lin_svm_err),
 (t5_lin_svm, t5_lin_svm_err)) = _grow_to_sizes(clf_1, clf_1_pre_feed)
lin_svm_feats = [t2_lin_svm, t3_lin_svm, t4_lin_svm, t5_lin_svm]

((t3_non_lin_svm, t3_non_lin_svm_err),
 (t4_non_lin_svm, t4_non_lin_svm_err),
 (t5_non_lin_svm, t5_non_lin_svm_err)) = _grow_to_sizes(clf_2, clf_2_pre_feed)
non_lin_svm_feats = [t2_non_lin_svm, t3_non_lin_svm, t4_non_lin_svm, t5_non_lin_svm]

((t3_mlp, t3_mlp_err),
 (t4_mlp, t4_mlp_err),
 (t5_mlp, t5_mlp_err)) = _grow_to_sizes(clf_3, clf_3_pre_feed)
mlp_feats = [t2_mlp, t3_mlp, t4_mlp, t5_mlp]


In [13]:
print('Top 5 genes using Sequential Forward Search: (Using first 2 genes from Exhaustive Search)\n')

# (label, selected genes, tab padding, resubstitution error) for each classifier
top5_report = (
    ('Linear SVM        : ', t5_lin_svm, '\t\t', t5_lin_svm_err),
    ('Non-Linear SVM    : ', t5_non_lin_svm, '\t', t5_non_lin_svm_err),
    ('MLP Neural Network: ', t5_mlp, '\t', t5_mlp_err),
)
for label, genes, padding, resub_error in top5_report:
    print(label, genes, padding, 'Resub error = ', resub_error)

Top 5 genes using Sequential Forward Search: (Using first 2 genes from Exhaustive Search)

Linear SVM        :  ['IGFBP5.1', 'CCNE2', 'AL080059', 'OXCT', 'KIAA1067'] 		 Resub error =  0.1875
Non-Linear SVM    :  ['KIAA1442', 'ORC6L', 'Contig48328_RC', 'Contig63649_RC', 'HEC'] 	 Resub error =  0.1375
MLP Neural Network:  ['LOC51203', 'Contig32185_RC', 'KIAA1442', 'Contig55377_RC', 'Contig40831_RC'] 	 Resub error =  0.0125


## Storing the Feature subsets in a DataFrame

In [14]:
subset_sizes = ['Top 2', 'Top 3', 'Top 4', 'Top 5', 'all genes']
models = ['Lin SVM', 'Non-Lin SVM', 'MLP']

subset_df = pd.DataFrame(index=subset_sizes, columns=models)

# Build each column from a fresh list rather than appending to the
# *_feats lists in place: in-place appends make the cell fail with a length
# mismatch when it is run a second time. The slice keeps only the selected
# subsets (Top 2 .. Top 5) before the full gene list is added as the last row.
n_selected_subsets = len(subset_sizes) - 1

subset_df['Lin SVM'] = lin_svm_feats[:n_selected_subsets] + [all_genes]
subset_df['Non-Lin SVM'] = non_lin_svm_feats[:n_selected_subsets] + [all_genes]
subset_df['MLP'] = mlp_feats[:n_selected_subsets] + [all_genes]

display(subset_df)


Unnamed: 0,Lin SVM,Non-Lin SVM,MLP
Top 2,"[IGFBP5.1, CCNE2]","[KIAA1442, ORC6L]","[LOC51203, Contig32185_RC]"
Top 3,"[IGFBP5.1, CCNE2, AL080059]","[KIAA1442, ORC6L, Contig48328_RC]","[LOC51203, Contig32185_RC, KIAA1442]"
Top 4,"[IGFBP5.1, CCNE2, AL080059, OXCT]","[KIAA1442, ORC6L, Contig48328_RC, Contig63649_RC]","[LOC51203, Contig32185_RC, KIAA1442, Contig553..."
Top 5,"[IGFBP5.1, CCNE2, AL080059, OXCT, KIAA1067]","[KIAA1442, ORC6L, Contig48328_RC, Contig63649_...","[LOC51203, Contig32185_RC, KIAA1442, Contig553..."
all genes,"[AL080059, Contig63649_RC, Contig46218_RC, LOC...","[AL080059, Contig63649_RC, Contig46218_RC, LOC...","[AL080059, Contig63649_RC, Contig46218_RC, LOC..."


In [15]:
# Optional cross-check of seq_fwd_search against mlxtend's
# SequentialFeatureSelector. The code is disabled by wrapping it in a string
# literal; note that it runs without a pre-fed feature set, so its results
# are not directly comparable to the pre-fed searches above.
# Note: for MLP the results differ (possibly due to the ordering of the
# input feature vector).
'''
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

sfs1 = sfs(clf_1,
           k_features=5,
           forward=True,
           floating=False,
           verbose=0,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(X_df, Y_df)
feat_cols = list(sfs1.k_feature_idx_)
feat_names = list(sfs1.k_feature_names_)
print(feat_names)
'''

# Feature sets noted for each model (presumably recorded from earlier runs
# of the disabled code above -- TODO confirm):
# LIN SVM - ['AL080059', 'Contig46218_RC', 'MMP9', 'L2DTL', 'PECI.1']
# Non-Lin SVM - ['Contig38288_RC', 'KIAA1442', 'ECT2', 'Contig46223_RC', 'PRC1']
# MLP - ['AL080059', 'LOC51203', 'KIAA1442', 'MMP9', 'Contig32185_RC']


"\nfrom mlxtend.feature_selection import SequentialFeatureSelector as sfs\n\nsfs1 = sfs(clf_1,\n           k_features=5,\n           forward=True,\n           floating=False,\n           verbose=0,\n           scoring='accuracy',\n           cv=0)\n\nsfs1 = sfs1.fit(X_df, Y_df)\nfeat_cols = list(sfs1.k_feature_idx_)\nfeat_names = list(sfs1.k_feature_names_)\nprint(feat_names)\n"

## Training the models with selected Features

In [16]:
# Empty result tables (rows: subset sizes, columns: models) for the overall
# test error, the training error, and the per-class test errors.
test_error_df, train_error_df, test_error0_df, test_error1_df = (
    pd.DataFrame(index=subset_sizes, columns=models) for _ in range(4)
)

def get_error(model, X_train, Y_train, X_test, Y_test, f_list):
    """Fit ``model`` on the ``f_list`` columns and return its error rate on the test data.

    The model is trained on ``X_train[f_list]`` / ``Y_train`` and evaluated on
    ``X_test[f_list]`` / ``Y_test``; the result is the fraction of test rows
    that are misclassified.
    """
    # Restrict both frames to the requested features before any fitting.
    train_view, test_view = X_train.loc[:, f_list], X_test.loc[:, f_list]

    model.fit(train_view, Y_train)
    n_correct = sum(model.predict(test_view) == Y_test)

    n_test = len(X_test.index)
    return (n_test - n_correct) / n_test

def _errors_per_model(X_eval, Y_eval):
    """Error of each classifier on (X_eval, Y_eval) for every subset in its subset_df column.

    Each entry comes from get_error, which retrains the classifier on the
    training data restricted to that subset before evaluating.
    """
    named_classifiers = (('Lin SVM', clf_1), ('Non-Lin SVM', clf_2), ('MLP', clf_3))
    return {
        column: [get_error(clf, X_df, Y_df, X_eval, Y_eval, genes) for genes in subset_df[column]]
        for column, clf in named_classifiers
    }


def _fill_table(error_df, errors_by_model):
    """Copy one list of errors per model into the matching column of error_df."""
    for column, errors in errors_by_model.items():
        error_df[column] = errors


# Total test error
_fill_table(test_error_df, _errors_per_model(X_test_df, Y_test_df))

# Total training error (evaluate on the training data itself)
_fill_table(train_error_df, _errors_per_model(X_df, Y_df))

display('Training error: ', train_error_df)
display('Test error: ', test_error_df)

# Split the test set by true class to get per-class test errors.
test_df_0 = test_df.loc[test_df['Label'] == 0]
test_df_1 = test_df.loc[test_df['Label'] == 1]

X_test_0, Y_test_0 = test_df_0.loc[:, all_genes], test_df_0.loc[:, 'Label']
X_test_1, Y_test_1 = test_df_1.loc[:, all_genes], test_df_1.loc[:, 'Label']

# Test error on class 0, then on class 1
_fill_table(test_error0_df, _errors_per_model(X_test_0, Y_test_0))
_fill_table(test_error1_df, _errors_per_model(X_test_1, Y_test_1))

display('Test error on class 0: ', test_error0_df)
display('Test error on class 1: ', test_error1_df)



'Training error: '

Unnamed: 0,Lin SVM,Non-Lin SVM,MLP
Top 2,0.2,0.2125,0.0625
Top 3,0.2,0.175,0.05
Top 4,0.1875,0.15,0.0125
Top 5,0.1875,0.1375,0.0125
all genes,0.0,0.0625,0.0


'Test error: '

Unnamed: 0,Lin SVM,Non-Lin SVM,MLP
Top 2,0.330233,0.446512,0.413953
Top 3,0.334884,0.423256,0.427907
Top 4,0.339535,0.395349,0.432558
Top 5,0.339535,0.362791,0.47907
all genes,0.339535,0.344186,0.35814


'Test error on class 0: '

Unnamed: 0,Lin SVM,Non-Lin SVM,MLP
Top 2,0.538462,0.205128,0.538462
Top 3,0.538462,0.205128,0.384615
Top 4,0.538462,0.25641,0.358974
Top 5,0.538462,0.25641,0.384615
all genes,0.358974,0.384615,0.282051


'Test error on class 1: '

Unnamed: 0,Lin SVM,Non-Lin SVM,MLP
Top 2,0.284091,0.5,0.386364
Top 3,0.289773,0.471591,0.4375
Top 4,0.295455,0.426136,0.448864
Top 5,0.295455,0.386364,0.5
all genes,0.335227,0.335227,0.375


## First Impressions
    1. Training errors are lower than test errors --> the resubstitution error estimator is optimistically biased, as expected
    2. Test error on class 1 is lower than on class 0, as expected (because of the larger sample size in class 1)
    3. Based on dimensionality
        a. Linear SVM error decreases with increasing dimensions
        b. Non-Linear SVM error increases and then decreases with increasing dimensions
        c. MLP error increases and then decreases with increasing dimensions

In [17]:
    
    
# Manual re-check of the 'Top 2' test errors: train each classifier on its own
# top-2 genes and score it on the full test set.
Y = train_df.loc[:, 'Label']

test_size = len(test_df.index)
print(test_size)

top2_checks = (
    ('Lin SVM test error: ', clf_1, lin_svm_feats[0]),
    ('Non-Lin SVM test error: ', clf_2, non_lin_svm_feats[0]),
    ('MLP test error: ', clf_3, mlp_feats[0]),
)
for label, clf, top2_genes in top2_checks:
    clf.fit(train_df.loc[:, top2_genes], Y)
    test_predictions = clf.predict(test_df.loc[:, top2_genes])

    error_count = test_size - sum(test_df['Label'] == test_predictions)
    print(label, error_count/test_size)


215
Lin SVM test error:  0.3302325581395349
Non-Lin SVM test error:  0.44651162790697674
MLP test error:  0.413953488372093


## Observation:
1. The neural network picks different subsets and coefficients every time (possibly because of optimization differences?)

In [18]:
# All features

#Linear SVM
# Manual re-check of the 'all genes' test errors: train each classifier on
# every gene and score it on the full test set.
X_train = train_df.loc[:, all_feature_list]
X_test = test_df.loc[:, all_feature_list]
Y = train_df.loc[:, 'Label']

test_size = len(test_df.index)
print(test_size)

all_gene_checks = (
    ('Lin SVM test error: ', clf_1),
    ('Non-Lin SVM test error: ', clf_2),
    ('MLP test error: ', clf_3),
)
for label, clf in all_gene_checks:
    clf.fit(X_train, Y)
    test_predictions = clf.predict(X_test)

    error_count = test_size - sum(test_df['Label'] == test_predictions)
    print(label, error_count/test_size)

215
Lin SVM test error:  0.3395348837209302
Non-Lin SVM test error:  0.34418604651162793
MLP test error:  0.3581395348837209
