# Using gene expression data (2308 genes) to predict tumor types

#### Data consists of a number of tissue samples corresponding to four distinct types of small round blue cell tumors. For each tissue sample, 2308 gene expression measurements are available.

In [40]:
import numpy as np
from matplotlib.pyplot import subplots, cm
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from sklearn.svm import SVC
from ISLP.svm import plot as plot_svm
from sklearn.metrics import RocCurveDisplay

In [41]:
# Shorthand alias for building a ROC display directly from a fitted estimator.
# NOTE(review): no cell visible in this notebook references `roc_curve`.
roc_curve = RocCurveDisplay.from_estimator

In [42]:
# Load the Khan dataset through ISLP. Later cells index the result with the
# keys 'xtrain', 'ytrain', 'xtest' and 'ytest'.
Khan = load_data('Khan')
# Bare expression: displays the whole loaded object as the cell output.
Khan

{'xtest':        G0001     G0002     G0003     G0004     G0005     G0006     G0007  \
 0   0.139501 -1.168927  0.564973 -3.366796 -1.323132 -0.692547  2.327395   
 1   1.164275 -2.018158  1.103533 -2.165435 -1.440117 -0.437420  2.661587   
 2   0.841093  0.254720 -0.208748 -2.148149 -1.512765 -1.263723  2.946642   
 3   0.685065 -1.927579 -0.233068 -1.640413 -1.008954  0.774451  1.617168   
 4  -1.956163 -2.234926  0.281563 -2.695628 -1.214697 -1.059872  2.498070   
 5  -0.258641 -1.684700  0.175800 -2.323809 -1.692276 -0.008637  2.302135   
 6  -1.109875 -1.046969 -0.853786 -2.607752 -1.770781 -1.259133  1.426380   
 7   1.471485 -1.751578 -0.256700 -1.899122 -1.364924 -1.198654  2.489878   
 8  -0.396159 -1.191386  0.696691 -1.862397 -1.312672  0.744980  1.762708   
 9  -2.136224 -2.236797 -0.946492 -2.777400 -1.822631 -0.455233  2.547514   
 10 -0.190676 -1.513219  0.824439 -2.391416 -1.112610  0.591945  2.101032   
 11  1.321409  0.495367  0.041526 -1.541779 -1.627093 -1.184170  2.

## Fitting support vector classifier using $C=0.01$

Since the number of predictors (2308) is much greater than the number of observations, the classes are very likely to be linearly separable, so a linear kernel (a separating hyperplane) is a natural first choice.

In [43]:
# Build a linear-kernel support vector classifier with a small cost (C=0.01)
# and fit it on the training split; fit() returns the estimator itself, so
# construction and fitting are chained into one statement.
khan_linear = SVC(C=0.01, kernel='linear').fit(Khan['xtrain'], Khan['ytrain'])
# Training-set confusion table: predicted labels first, true labels second.
confusion_table(khan_linear.predict(Khan['xtrain']), Khan['ytrain'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8,0,0,0
2,0,23,0,0
3,0,0,12,0
4,0,0,0,20


100% training accuracy, easy to find linear hyperplanes 

In [44]:
# n_support_ holds the number of support vectors found for each class.
n_support_vectors = khan_linear.n_support_
# Add the per-class counts together to get the overall number.
total_support_vectors = np.sum(n_support_vectors)
total_support_vectors

54

The fitted classifier relies on 54 support vectors (out of 63 training observations) when predicting the test set

In [45]:
# Confusion table for the linear SVC on the held-out test split
# (predicted labels first, true labels second).
confusion_table(khan_linear.predict(Khan['xtest']), Khan['ytest'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,0,0,0
2,0,6,2,0
3,0,0,4,0
4,0,0,0,5


In [46]:
# Test accuracy: fraction of test observations whose predicted label equals
# the true label.
np.mean(khan_linear.predict(Khan['xtest']) == Khan['ytest'])

0.9

90% test accuracy, which is very good

Using cross-validation to select optimal $C$, considering values between 0.01 and 10

In [47]:
# 5-fold cross-validation with shuffling; the fixed seed keeps folds reproducible.
kfold = skm.KFold(n_splits=5, shuffle=True, random_state=0)
# Search the cost parameter C over the candidate values, scoring by accuracy.
# refit=True refits the best setting on the full training data afterwards.
grid = skm.GridSearchCV(
    estimator=khan_linear,
    param_grid={'C': [0.01, 0.1, 0.5, 1, 5, 10]},
    scoring='accuracy',
    cv=kfold,
    refit=True,
)
grid.fit(Khan['xtrain'], Khan['ytrain'])
grid.best_params_

{'C': 0.01}

In [48]:
# Display the full cross-validation results dict for the linear-kernel search
# (timings plus per-split and mean test scores for every candidate C).
grid.cv_results_

{'mean_fit_time': array([0.01486921, 0.01357942, 0.01455412, 0.01264629, 0.01513677,
        0.01107826]),
 'std_fit_time': array([0.01234886, 0.00478367, 0.00678149, 0.00162415, 0.00780924,
        0.00274526]),
 'mean_score_time': array([0.01164556, 0.01163435, 0.00609822, 0.00901084, 0.0084631 ,
        0.01105523]),
 'std_score_time': array([0.00633483, 0.00265277, 0.00562294, 0.00484461, 0.0068779 ,
        0.00628109]),
 'param_C': masked_array(data=[0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
              mask=[False, False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'C': 0.01},
  {'C': 0.1},
  {'C': 0.5},
  {'C': 1},
  {'C': 5},
  {'C': 10}],
 'split0_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split1_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split2_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split3_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split4_test_score': array([1., 1., 1., 1., 1., 1.]),
 'mean_test_score': array([1., 1., 1., 1., 1., 1.]

All values tested have the exact same accuracy, 100%

This can probably be attributed to the large number of predictors compared with the small number of observations, which makes it easy for the SVC to find an exactly separating hyperplane regardless of $C$

## Using radial kernel

Equation for radial kernel: $$K(x, x') = \exp\left(-\gamma\lVert x-x'\rVert^2\right)$$

where $\gamma = \dfrac{1}{2\sigma^2}$.

Using default value for gamma ($\gamma$)

In [49]:
# Radial (RBF) kernel SVC with the same small cost C=0.01 and the default
# gamma; fit() returns the estimator, so it is chained onto the constructor.
khan_radial = SVC(C=0.01, kernel='rbf').fit(Khan['xtrain'], Khan['ytrain'])
# Training-set confusion table: predicted labels first, true labels second.
confusion_table(khan_radial.predict(Khan['xtrain']), Khan['ytrain'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,0
2,8,23,12,20
3,0,0,0,0
4,0,0,0,0


In [50]:
# Training accuracy of the radial-kernel SVC: fraction of training
# observations whose predicted label equals the true label.
np.mean(khan_radial.predict(Khan['xtrain']) == Khan['ytrain'])

0.36507936507936506

About 64% training error (36.5% accuracy), which is very high: the classifier predicts class 2 for every observation <br>
Have to see how it performs on test

In [51]:
# Confusion table for the radial-kernel SVC (C=0.01) on the test split
# (predicted labels first, true labels second).
confusion_table(khan_radial.predict(Khan['xtest']), Khan['ytest'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,0
2,3,6,6,5
3,0,0,0,0
4,0,0,0,0


In [52]:
# Test accuracy of the radial-kernel SVC (C=0.01): fraction of test
# observations whose predicted label equals the true label.
np.mean(khan_radial.predict(Khan['xtest']) == Khan['ytest'])

0.3

Finding optimal C

In [55]:
# Same 5-fold shuffled split (seed 0) as used for the linear-kernel search.
kfold = skm.KFold(n_splits=5, shuffle=True, random_state=0)
# Cross-validate the radial-kernel SVC over the candidate C values, scoring
# by accuracy; refit=True refits the best setting on the full training data.
grid_radial = skm.GridSearchCV(
    estimator=khan_radial,
    param_grid={'C': [0.01, 0.1, 0.5, 1, 5, 10]},
    scoring='accuracy',
    cv=kfold,
    refit=True,
)
grid_radial.fit(Khan['xtrain'], Khan['ytrain'])
grid_radial.best_params_

{'C': 5}

In [56]:
# Display the full cross-validation results dict for the radial-kernel search
# (timings plus per-split test scores for every candidate C).
grid_radial.cv_results_

{'mean_fit_time': array([0.02846694, 0.01195331, 0.01430697, 0.0160079 , 0.01380916,
        0.01259203]),
 'std_fit_time': array([0.01483245, 0.00452431, 0.00795887, 0.00387411, 0.00781784,
        0.0062985 ]),
 'mean_score_time': array([0.0300921 , 0.0082572 , 0.00779409, 0.007514  , 0.00776   ,
        0.00939374]),
 'std_score_time': array([0.02657239, 0.0070439 , 0.00699291, 0.0064028 , 0.00699093,
        0.00766998]),
 'param_C': masked_array(data=[0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
              mask=[False, False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'C': 0.01},
  {'C': 0.1},
  {'C': 0.5},
  {'C': 1},
  {'C': 5},
  {'C': 10}],
 'split0_test_score': array([0.23076923, 0.23076923, 0.46153846, 1.        , 1.        ,
        1.        ]),
 'split1_test_score': array([0.30769231, 0.30769231, 0.46153846, 1.        , 1.        ,
        1.        ]),
 'split2_test_score': array([0.23076923, 0.23076923, 0.69230769, 1.        , 1.        ,
        1.    

100% CV accuracy for $C=5$

In [57]:
# Reuse the estimator that the radial-kernel grid search already refit on the
# full training set with the best C (it was built with refit=True), rather
# than hard-coding C=5 and fitting a second time. This keeps the evaluated
# model in sync with whatever the cross-validation selects.
khan_radial_best = grid_radial.best_estimator_
# Test-set confusion table: predicted labels first, true labels second.
confusion_table(khan_radial_best.predict(Khan['xtest']), Khan['ytest'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,0,0,0
2,0,6,3,0
3,0,0,2,0
4,0,0,1,5


In [58]:
# Test accuracy of the tuned radial-kernel SVC: fraction of test observations
# whose predicted label equals the true label.
np.mean(khan_radial_best.predict(Khan['xtest']) == Khan['ytest'])

0.8

80% test accuracy when $C=5$, gamma could also be adjusted

## Using polynomial kernel

Equation for polynomial kernel: $$K(x, x') = \left( \gamma \langle x, x' \rangle + \text{coef0} \right)^{\text{degree}}$$


In [59]:
# Quadratic (degree-2 polynomial) kernel SVC with small cost C=0.01;
# fit() returns the estimator, so it is chained onto the constructor.
khan_poly = SVC(C=0.01, kernel='poly', degree=2).fit(Khan['xtrain'], Khan['ytrain'])
# Training-set confusion table: predicted labels first, true labels second.
confusion_table(khan_poly.predict(Khan['xtrain']), Khan['ytrain'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,0
2,8,23,12,20
3,0,0,0,0
4,0,0,0,0


In [60]:
# Training accuracy of the quadratic-kernel SVC (C=0.01): fraction of
# training observations whose predicted label equals the true label.
np.mean(khan_poly.predict(Khan['xtrain']) == Khan['ytrain'])

0.36507936507936506

36% training accuracy, not very good

In [61]:
# Test accuracy of the quadratic-kernel SVC (C=0.01): fraction of test
# observations whose predicted label equals the true label.
np.mean(khan_poly.predict(Khan['xtest']) == Khan['ytest'])

0.3

30% test accuracy

Looking for better parameter C

In [63]:
# Cross-validate the quadratic-kernel SVC over the candidate C values using
# the existing `kfold` splitter, scoring by accuracy; refit=True refits the
# best setting on the full training data.
grid_poly = skm.GridSearchCV(
    estimator=khan_poly,
    param_grid={'C': [0.01, 0.1, 0.5, 1, 5, 10]},
    scoring='accuracy',
    cv=kfold,
    refit=True,
)
grid_poly.fit(Khan['xtrain'], Khan['ytrain'])
grid_poly.best_params_

{'C': 1}

In [64]:
# Display the full cross-validation results dict for the quadratic-kernel
# search (timings plus per-split test scores for every candidate C).
grid_poly.cv_results_

{'mean_fit_time': array([0.02580051, 0.00850949, 0.01019621, 0.00897174, 0.01242247,
        0.01061854]),
 'std_fit_time': array([0.00290515, 0.00635024, 0.00677448, 0.0073434 , 0.0062151 ,
        0.00647041]),
 'mean_score_time': array([0.01904278, 0.0112381 , 0.01181879, 0.00849214, 0.00937157,
        0.00827184]),
 'std_score_time': array([0.00930432, 0.00632565, 0.0060673 , 0.00891031, 0.00765185,
        0.00704896]),
 'param_C': masked_array(data=[0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
              mask=[False, False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'C': 0.01},
  {'C': 0.1},
  {'C': 0.5},
  {'C': 1},
  {'C': 5},
  {'C': 10}],
 'split0_test_score': array([0.23076923, 0.23076923, 0.92307692, 1.        , 1.        ,
        1.        ]),
 'split1_test_score': array([0.30769231, 0.30769231, 0.92307692, 1.        , 1.        ,
        1.        ]),
 'split2_test_score': array([0.23076923, 0.23076923, 0.92307692, 1.        , 1.        ,
        1.    

Using $C=1$

In [72]:
# Reuse the estimator that the quadratic-kernel grid search already refit on
# the full training set with the best C (it was built with refit=True), rather
# than hard-coding C=1 and fitting a second time. This keeps the evaluated
# model in sync with whatever the cross-validation selects.
khan_poly_best = grid_poly.best_estimator_
# Test-set confusion table: predicted labels first, true labels second.
confusion_table(khan_poly_best.predict(Khan['xtest']), Khan['ytest'])

Truth,1,2,3,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,0,0,0
2,0,6,2,0
3,0,0,4,0
4,0,0,0,5


In [73]:
# Test accuracy of the tuned quadratic-kernel SVC: fraction of test
# observations whose predicted label equals the true label.
np.mean(khan_poly_best.predict(Khan['xtest']) == Khan['ytest'])

0.9

90% test accuracy using C=1 for quadratic kernel

The best test accuracy (90%) occurred when a **linear kernel** was used at **$C=0.01, 0.1, 0.5, 1, 5, 10$**, and when a **quadratic kernel** was used with **$C=1, 5, 10$**.

Many of the confusion tables in this analysis were identical because the number of predictors is vast compared with the very small number of observations (2308 predictors to 83 observations, training and test combined). This made it easy for the SVMs to find clear, consistent decision boundaries.