In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from itertools import combinations
from sklearn.neural_network import MLPClassifier
import math

In [2]:
# Both files are tab-separated and are read with header=None, so columns keep
# integer labels and the first row (index 0) holds the column names.
# That row is kept aside in data_col for look-ups and removed from the frames.
data = pd.read_csv('data/Assign1_Training_Data.txt', sep="\t", header=None)
data_col = data.iloc[0]
data = data.iloc[1:]

# The test file has the same layout; its name row is simply discarded.
data_test = pd.read_csv('data/Assign1_Testing_Data.txt', sep="\t", header=None).iloc[1:]

In [3]:
# Every unordered pair of feature column labels (1..70), materialised as a
# list. A bare combinations() object is a one-shot iterator: after the first
# loop over it, re-running the search cell would iterate over nothing.
perm = list(combinations(range(1, 71), 2))

In [4]:
def get_all_error(model, data, fea):
    """Compute class-conditional and overall misclassification rates.

    Rows of ``data`` are split by the label held in column ``71`` (compared
    against the strings ``'0'`` and ``'1'``) and ``model.predict`` is called
    on the ``fea`` columns of each group separately.

    Parameters
    ----------
    model : object exposing ``predict(X)``; predictions are compared against
        the strings ``'0'`` and ``'1'``.
    data : DataFrame containing the ``fea`` columns and label column ``71``.
    fea : list of column labels passed to the model.

    Returns
    -------
    (E0, E1, E) : tuple of float
        E0 - fraction of class-'0' rows predicted as '1'.
        E1 - fraction of class-'1' rows predicted as '0'.
        E  - both kinds of mistakes over all rows of the two classes.
        A rate whose denominator is zero (class absent from ``data``) is
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    def _mistakes(true_label, wrong_label):
        # Returns (number of wrong predictions, number of rows) for one class.
        rows = data.loc[data[71] == true_label]
        if len(rows) == 0:
            # Nothing to predict; many estimators reject an empty input.
            return 0, 0
        predicted = np.asarray(model.predict(rows[fea]))
        return int(np.count_nonzero(predicted == wrong_label)), int(predicted.size)

    def _rate(wrong, total):
        return wrong / total if total else float("nan")

    wrong_0, total_0 = _mistakes('0', '1')
    wrong_1, total_1 = _mistakes('1', '0')

    E0 = _rate(wrong_0, total_0)
    E1 = _rate(wrong_1, total_1)
    E = _rate(wrong_0 + wrong_1, total_0 + total_1)

    return E0, E1, E

def linear_svm(data=None, feature=None, test_data=None, class_spc_error = None):
    """Fit a linear-kernel SVC on ``data`` and evaluate it on ``test_data``.

    The classifier is trained on the ``feature`` columns of ``data`` with
    column ``71`` as the target. When ``class_spc_error`` equals ``False``
    the accuracy from ``score`` is returned; for any other value (including
    the default ``None``) the ``(E0, E1, E)`` triple from ``get_all_error``
    is returned.
    """
    classifier = svm.SVC(kernel='linear', C=1, gamma='auto', random_state=0)
    classifier.fit(data[feature], data[71])

    # Deliberately an equality test: None must NOT select the accuracy branch.
    if class_spc_error == False:
        return classifier.score(test_data[feature], test_data[71])
    return get_all_error(classifier, test_data, feature)

def rbf_svm(data=None, feature=None, test_data=None, class_spc_error = None):
    """Fit an RBF-kernel SVC (C=10) on ``data`` and evaluate it on ``test_data``.

    The classifier is trained on the ``feature`` columns of ``data`` with
    column ``71`` as the target. When ``class_spc_error`` equals ``False``
    the accuracy from ``score`` is returned; for any other value (including
    the default ``None``) the ``(E0, E1, E)`` triple from ``get_all_error``
    is returned.
    """
    classifier = svm.SVC(kernel='rbf', C=10, gamma='auto', random_state=0)
    classifier.fit(data[feature], data[71])

    # Deliberately an equality test: None must NOT select the accuracy branch.
    if class_spc_error == False:
        return classifier.score(test_data[feature], test_data[71])
    return get_all_error(classifier, test_data, feature)
    
                       
def nn(data=None, feature=None, test_data=None, class_spc_error = None):
    """Fit a small MLP on ``data`` and evaluate it on ``test_data``.

    The network has two hidden layers of 5 units, logistic activation and
    the lbfgs solver. It is trained on the ``feature`` columns of ``data``
    with column ``71`` as the target. When ``class_spc_error`` equals
    ``False`` the accuracy from ``score`` is returned; for any other value
    (including the default ``None``) the ``(E0, E1, E)`` triple from
    ``get_all_error`` is returned.
    """
    network = MLPClassifier(
        hidden_layer_sizes=(5, 5),
        activation='logistic',
        solver='lbfgs',
        max_iter=200,
        random_state=0,
    )
    network.fit(data[feature], data[71])

    # Deliberately an equality test: None must NOT select the accuracy branch.
    if class_spc_error == False:
        return network.score(test_data[feature], test_data[71])
    return get_all_error(network, test_data, feature)

In [5]:
# Exhaustive search over the feature pairs in `perm`. Each classifier is
# trained and scored on the training frame itself, and for each classifier
# the first pair reaching the highest accuracy is kept (the strict '<' means
# a later pair with an equal score does not replace it).
best_pair_lin_svm = []
best_pair_rbf_svm = []
best_pair_nn = []
lin_svm_score = 0
rbf_svm_score = 0
nn_score = 0

for i in perm:
    pair = list(i)

    tmp_score = linear_svm(data, pair, data, False)
    if lin_svm_score < tmp_score:
        best_pair_lin_svm, lin_svm_score = pair, tmp_score

    tmp_score = rbf_svm(data, pair, data, False)
    if rbf_svm_score < tmp_score:
        best_pair_rbf_svm, rbf_svm_score = pair, tmp_score

    tmp_score = nn(data, pair, data, False)
    if nn_score < tmp_score:
        best_pair_nn, nn_score = pair, tmp_score

# The stored scores are accuracies; report them as error rates.
print("linear svm error : ", 1.0 - lin_svm_score)
print("rbf svm error : ", 1.0 - rbf_svm_score)
# Label corrected: the neural network is not an SVM.
print("nn error : ", 1.0 - nn_score)

print("linear svm best pair : ", best_pair_lin_svm)
print("rbf svm best pair : ", best_pair_rbf_svm)
print("nn best pair : ", best_pair_nn)

linear svm error :  0.21250000000000002
rbf svm error :  0.21250000000000002
nn svm error :  0.07499999999999996
linear svm best pair :  [4, 20]
rbf svm best pair :  [13, 49]
nn best pair :  [4, 31]


In [7]:
# Resubstitution error for best pair

# Each classifier is refitted on its own best pair and evaluated on the same
# training frame it was fitted on.
for heading, classifier, pair in (
    ("Linear SVM resubstitution error : ", linear_svm, best_pair_lin_svm),
    ("RBF SVM resubstitution error : ", rbf_svm, best_pair_rbf_svm),
    ("NN resubstitution error : ", nn, best_pair_nn),
):
    E0, E1, E = classifier(data, pair, data, True)
    print(heading)
    print("Best pair of features : \n", data_col[pair], " indexed at :", pair)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM resubstitution error : 
Best pair of features : 
 4     LOC51203
20        MMP9
Name: 0, dtype: object  indexed at : [4, 20]
E0 :  0.05  E1 :  0.375  E :  0.2125 

RBF SVM resubstitution error : 
Best pair of features : 
 13    KIAA1442
49       ORC6L
Name: 0, dtype: object  indexed at : [13, 49]
E0 :  0.2  E1 :  0.225  E :  0.2125 

NN resubstitution error : 
Best pair of features : 
 4           LOC51203
31    Contig32185_RC
Name: 0, dtype: object  indexed at : [4, 31]
E0 :  0.1  E1 :  0.05  E :  0.075 



In [8]:
# Classification performance with best pair of features on test set

# Each classifier is fitted on the training frame with its best pair and
# evaluated on the held-out test frame.
for heading, classifier, pair in (
    ("Linear SVM error on test data : ", linear_svm, best_pair_lin_svm),
    ("RBF SVM error on test data : ", rbf_svm, best_pair_rbf_svm),
    ("NN error on test data :", nn, best_pair_nn),
):
    E0, E1, E = classifier(data, pair, data_test, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM error on test data : 
E0 :  0.20512820512820512  E1 :  0.5113636363636364  E :  0.4558139534883721 

RBF SVM error on test data : 
E0 :  0.20512820512820512  E1 :  0.5  E :  0.44651162790697674 

NN error on test data :
E0 :  0.5897435897435898  E1 :  0.36363636363636365  E :  0.4046511627906977 



In [9]:
def _greedy_forward_select(score_fn, data, seed, n_steps, candidates, prepend):
    """Grow ``seed`` by one feature per step, keeping the best-scoring subset.

    ``score_fn(data, subset, data, False)`` must return an accuracy. Returns
    ``(selected_features, accuracy_of_selected_features)``.
    """
    selected = list(seed)
    accuracy = None
    for _ in range(n_steps):
        best_subset, best_accuracy = [], 0
        for cand in candidates:
            if cand in selected:
                continue
            # Column order is kept per classifier (see the caller).
            subset = [cand] + selected if prepend else selected + [cand]
            trial = score_fn(data, subset, data, False)
            # Strict '<': the first subset reaching the best accuracy wins.
            if best_accuracy < trial:
                best_subset, best_accuracy = subset, trial
        if not best_subset:
            # No candidate left (or none scored above 0): keep what we have
            # rather than replacing the selection with an empty list.
            break
        selected, accuracy = best_subset, best_accuracy
    if accuracy is None:
        # No step was taken (e.g. d <= 2): score the seed itself so the caller
        # reports its real error instead of a placeholder of 1.0.
        accuracy = score_fn(data, selected, data, False) if selected else 0
    return selected, accuracy


def best_d_features(data, d):
    """Sequential forward selection of ``d`` features for each classifier.

    Starts from the module-level best pairs (``best_pair_lin_svm``,
    ``best_pair_rbf_svm``, ``best_pair_nn``) and adds ``d - 2`` features one
    at a time from columns 1..70. At every step the candidate giving the
    highest accuracy on ``data`` itself (resubstitution) is kept.

    Parameters
    ----------
    data : training DataFrame, used both for fitting and for scoring.
    d : target number of features; for ``d <= 2`` the seed pairs are returned
        together with their own resubstitution errors.

    Returns
    -------
    tuple
        ``(lin_features, rbf_features, nn_features,
        lin_error, rbf_error, nn_error)`` where each error is
        ``1.0 - accuracy`` of the corresponding final feature list.
    """
    n_steps = d - 2
    candidates = list(range(1, 71))

    # The SVM searches put the new feature in front, the NN search at the end.
    # The order is kept as-is because column order may affect the fitted
    # network.
    lin_features, lin_accuracy = _greedy_forward_select(
        linear_svm, data, best_pair_lin_svm, n_steps, candidates, prepend=True)
    rbf_features, rbf_accuracy = _greedy_forward_select(
        rbf_svm, data, best_pair_rbf_svm, n_steps, candidates, prepend=True)
    nn_features, nn_accuracy = _greedy_forward_select(
        nn, data, best_pair_nn, n_steps, candidates, prepend=False)

    return (lin_features, rbf_features, nn_features,
            1.0 - lin_accuracy, 1.0 - rbf_accuracy, 1.0 - nn_accuracy)

In [10]:
# Run the greedy search once per target subset size (3, 4, then 5); each
# result is the 6-tuple returned by best_d_features.
best_features_for_3, best_features_for_4, best_features_for_5 = (
    best_d_features(data, size) for size in (3, 4, 5)
)

In [11]:
# Dump the raw search results: three feature lists followed by the three
# values (1.0 - accuracy) returned by best_d_features.
for caption, outcome in (
    ("Best 3 Features for linear, rbf and nn, and their scores : ", best_features_for_3),
    ("Best 4 Features for linear, rbf and nn, and their scores : ", best_features_for_4),
    ("Best 5 Features for linear, rbf and nn, and their scores : ", best_features_for_5),
):
    print(caption, outcome)

Best 3 Features for linear, rbf and nn, and their scores :  ([28, 4, 20], [33, 13, 49], [4, 31, 13], 0.19999999999999996, 0.17500000000000004, 0.050000000000000044)
Best 4 Features for linear, rbf and nn, and their scores :  ([6, 28, 4, 20], [2, 33, 13, 49], [4, 31, 13, 21], 0.1875, 0.15000000000000002, 0.012499999999999956)
Best 5 Features for linear, rbf and nn, and their scores :  ([45, 6, 28, 4, 20], [25, 2, 33, 13, 49], [4, 31, 13, 21, 38], 0.1875, 0.13749999999999996, 0.012499999999999956)


In [12]:
# Resubstitution error with 3 features

# Entries 0, 1, 2 of best_features_for_3 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted and evaluated on the training frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM Resubstitution for 3 features : ", linear_svm),
    ("RBF SVM Resubstitution error for 3 features : ", rbf_svm),
    ("NN Resubstitution error for 3 features : ", nn),
)):
    chosen = best_features_for_3[slot]
    E0, E1, E = classifier(data, chosen, data, True)
    print(heading)
    print("Seq 3 features : \n", data_col[chosen], " indexed at :", chosen)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM Resubstitution for 3 features : 
Seq 3 features : 
 28      SERF1A
4     LOC51203
20        MMP9
Name: 0, dtype: object  indexed at : [28, 4, 20]
E0 :  0.05  E1 :  0.35  E :  0.2 

RBF SVM Resubstitution error for 3 features : 
Seq 3 features : 
 33    Contig48328_RC
13          KIAA1442
49             ORC6L
Name: 0, dtype: object  indexed at : [33, 13, 49]
E0 :  0.125  E1 :  0.225  E :  0.175 

NN Resubstitution error for 3 features : 
Seq 3 features : 
 4           LOC51203
31    Contig32185_RC
13          KIAA1442
Name: 0, dtype: object  indexed at : [4, 31, 13]
E0 :  0.025  E1 :  0.075  E :  0.05 



In [13]:
# Classification performance with 3 features on test set

# Entries 0, 1, 2 of best_features_for_3 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted on the training frame and evaluated on
# the test frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM error on test data for 3 features : ", linear_svm),
    ("RBF SVM error on test data for 3 features : ", rbf_svm),
    ("NN error on test data for 3 features : ", nn),
)):
    E0, E1, E = classifier(data, best_features_for_3[slot], data_test, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM error on test data for 3 features : 
E0 :  0.20512820512820512  E1 :  0.48863636363636365  E :  0.4372093023255814 

RBF SVM error on test data for 3 features : 
E0 :  0.20512820512820512  E1 :  0.4715909090909091  E :  0.4232558139534884 

NN error on test data for 3 features : 
E0 :  0.358974358974359  E1 :  0.45454545454545453  E :  0.4372093023255814 



In [14]:
# Resubstitution error with 4 features

# Entries 0, 1, 2 of best_features_for_4 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted and evaluated on the training frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM Resubstitution for 4 features :  ", linear_svm),
    ("RBF SVM Resubstitution for 4 features : ", rbf_svm),
    ("NN Resubstitution for 4 features : ", nn),
)):
    chosen = best_features_for_4[slot]
    E0, E1, E = classifier(data, chosen, data, True)
    print(heading)
    print("Seq 4 features : \n", data_col[chosen], " indexed at :", chosen)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM Resubstitution for 4 features :  
Seq 4 features : 
 6        ALDH4
28      SERF1A
4     LOC51203
20        MMP9
Name: 0, dtype: object  indexed at : [6, 28, 4, 20]
E0 :  0.05  E1 :  0.325  E :  0.1875 

RBF SVM Resubstitution for 4 features : 
Seq 4 features : 
 2     Contig63649_RC
33    Contig48328_RC
13          KIAA1442
49             ORC6L
Name: 0, dtype: object  indexed at : [2, 33, 13, 49]
E0 :  0.15  E1 :  0.15  E :  0.15 

NN Resubstitution for 4 features : 
Seq 4 features : 
 4           LOC51203
31    Contig32185_RC
13          KIAA1442
21    Contig55377_RC
Name: 0, dtype: object  indexed at : [4, 31, 13, 21]
E0 :  0.0  E1 :  0.025  E :  0.0125 



In [15]:
# Classification performance with 4 features on test set

# Entries 0, 1, 2 of best_features_for_4 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted on the training frame and evaluated on
# the test frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM error on test data for 4 features : ", linear_svm),
    ("RBF SVM error on test data for 4 features : ", rbf_svm),
    ("NN error on test data for 4 features : ", nn),
)):
    E0, E1, E = classifier(data, best_features_for_4[slot], data_test, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM error on test data for 4 features : 
E0 :  0.1794871794871795  E1 :  0.5056818181818182  E :  0.44651162790697674 

RBF SVM error on test data for 4 features : 
E0 :  0.2564102564102564  E1 :  0.42613636363636365  E :  0.3953488372093023 

NN error on test data for 4 features : 
E0 :  0.41025641025641024  E1 :  0.4318181818181818  E :  0.42790697674418604 



In [16]:
# Resubstitution error with 5 features

# Entries 0, 1, 2 of best_features_for_5 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted and evaluated on the training frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM Resubstitution for 5 features :  ", linear_svm),
    ("RBF SVM Resubstitution for 5 features : ", rbf_svm),
    ("NN Resubstitution for 5 features : ", nn),
)):
    chosen = best_features_for_5[slot]
    E0, E1, E = classifier(data, chosen, data, True)
    print(heading)
    print("Seq 5 features : \n", data_col[chosen], " indexed at :", chosen)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM Resubstitution for 5 features :  
Seq 5 features : 
 45    Contig51464_RC
6              ALDH4
28            SERF1A
4           LOC51203
20              MMP9
Name: 0, dtype: object  indexed at : [45, 6, 28, 4, 20]
E0 :  0.05  E1 :  0.325  E :  0.1875 

RBF SVM Resubstitution for 5 features : 
Seq 5 features : 
 25               HEC
2     Contig63649_RC
33    Contig48328_RC
13          KIAA1442
49             ORC6L
Name: 0, dtype: object  indexed at : [25, 2, 33, 13, 49]
E0 :  0.175  E1 :  0.1  E :  0.1375 

NN Resubstitution for 5 features : 
Seq 5 features : 
 4           LOC51203
31    Contig32185_RC
13          KIAA1442
21    Contig55377_RC
38    Contig40831_RC
Name: 0, dtype: object  indexed at : [4, 31, 13, 21, 38]
E0 :  0.025  E1 :  0.0  E :  0.0125 



In [17]:
# Classification performance with 5 features on test set

# Entries 0, 1, 2 of best_features_for_5 are the linear-SVM, RBF-SVM and NN
# feature lists; each model is fitted on the training frame and evaluated on
# the test frame.
for slot, (heading, classifier) in enumerate((
    ("Linear SVM error on test data for 5 features : ", linear_svm),
    ("RBF SVM error on test data for 5 features : ", rbf_svm),
    ("NN error on test data for 5 features: ", nn),
)):
    E0, E1, E = classifier(data, best_features_for_5[slot], data_test, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Linear SVM error on test data for 5 features : 
E0 :  0.1794871794871795  E1 :  0.5113636363636364  E :  0.4511627906976744 

RBF SVM error on test data for 5 features : 
E0 :  0.2564102564102564  E1 :  0.38636363636363635  E :  0.3627906976744186 

NN error on test data for 5 features: 
E0 :  0.38461538461538464  E1 :  0.5  E :  0.4790697674418605 



In [18]:
# Resubstitution error for all features

# Baseline without feature selection: every classifier is fitted on, and
# evaluated against, the training frame using all columns 1..70.
for heading, classifier in (
    ("Resubstitution error with all features linear svm : ", linear_svm),
    ("Resubstitution error with all features rbf svm : ", rbf_svm),
    ("Resubstitution error with all features nn : ", nn),
):
    E0, E1, E = classifier(data, list(range(1, 71)), data, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")


Resubstitution error with all features linear svm : 
E0 :  0.075  E1 :  0.025  E :  0.05 

Resubstitution error with all features rbf svm : 
E0 :  0.1  E1 :  0.025  E :  0.0625 

Resubstitution error with all features nn : 
E0 :  0.0  E1 :  0.0  E :  0.0 



In [19]:
# Baseline without feature selection: every classifier is fitted on the
# training frame with all columns 1..70 and evaluated on the test frame.
for heading, classifier in (
    ("Error with all features linear svm on test set : ", linear_svm),
    ("Error with all features rbf svm on test set : ", rbf_svm),
    ("Error with all features nn on test set : ", nn),
):
    E0, E1, E = classifier(data, list(range(1, 71)), data_test, True)
    print(heading)
    print("E0 : ", E0, " E1 : ", E1, " E : ", E, "\n")

Error with all features linear svm on test set : 
E0 :  0.38461538461538464  E1 :  0.32954545454545453  E :  0.3395348837209302 

Error with all features rbf svm on test set : 
E0 :  0.38461538461538464  E1 :  0.3352272727272727  E :  0.34418604651162793 

Error with all features nn on test set : 
E0 :  0.23076923076923078  E1 :  0.38636363636363635  E :  0.3581395348837209 

