In [16]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score

In [2]:
tcga = pd.read_csv('TCGA_data.csv')
tcga.drop('cancer', axis = 1, inplace = True)
tcga = tcga.sort_index(axis=1)
print(tcga.shape)
tcga.head()

(20715, 787)


Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q32,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,y
0,-0.1608,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,...,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,BLCA
1,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,...,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,Normal
2,-0.2733,0.0403,0.0403,0.0403,0.0356,0.0356,0.0356,0.0356,0.0356,0.0356,...,-0.2716,-0.2716,-0.2716,-0.2716,-0.3456,-0.3456,0.2685,-0.3587,-0.3587,BLCA
3,0.0013,0.0013,0.0013,0.0013,-0.0057,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,Normal
4,-0.0003,-0.0003,-0.0003,-0.0003,-0.0003,-0.0002,-0.0002,-0.0002,-0.0002,-0.0002,...,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,Normal


In [3]:
print(len(tcga.y.value_counts()))
tcga.y.value_counts()

28


Normal    10170
BRCA       1079
OV          582
GBM         573
UCEC        539
KIRC        528
HNSC        522
LUAD        516
LGG         512
LUSC        501
THCA        499
PRAD        492
SKCM        469
COAD        449
STAD        441
BLCA        408
LIHC        370
CESC        295
KIRP        288
SARC        257
LAML        191
PAAD        184
ESCA        184
READ        165
PCPG        162
TGCT        150
THYM        123
KICH         66
Name: y, dtype: int64

In [4]:
X = tcga.drop(['y'], axis=1)
Y = tcga['y']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3)

### random forest

In [5]:
# modeling
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
accuracy_score(Y_test, rf.predict(X_test))

0.7142397425583267

In [21]:
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

CV_rfc.fit(X_train,Y_train)
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [24]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 200, max_depth=4, criterion='gini')
rfc1.fit(X_train, Y_train)
accuracy_score(Y_test, rfc1.predict(X_test))

0.5604183427192276

In [32]:
rf_best = RandomForestClassifier(random_state=42,n_jobs = -1, max_depth=4, min_samples_leaf=4, min_samples_split=4, n_estimators = 200)
rf_best.fit(X_train, Y_train)
pred = rf_best.predict(X_test)
accuracy_score(Y_test, pred)

0.5605792437650845

### LDA

In [25]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=1)  
lda = lda.fit(X_train, Y_train)  
print(accuracy_score(Y_test, lda.predict(X_test)))
print(classification_report(Y_test, lda.predict(X_test), pred, digits=3))

0.5390185036202735
             precision    recall  f1-score   support

       BRCA      0.231     0.557     0.327       325
     Normal      0.578     1.000     0.732      3030
       BRCA      0.231     0.557     0.327       325
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
       BRCA      0.231     0.557     0.327       325
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578     1.000     0.732      3030
     Normal      0.578    

### QDA

In [19]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qud = qda.fit(X_train, Y_train)
accuracy_score(Y_test, qda.predict(X_test))

0.5525341914722446