In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import time
from pandas import DataFrame
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import GridSearchCV

In [2]:
tcga = pd.read_csv('TCGA_data.csv')
tcga.drop('cancer', axis = 1, inplace = True)
tcga = tcga.sort_index(axis=1)
print(tcga.shape)
tcga.head(3)

(20715, 787)


Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q32,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,y
0,-0.1608,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,...,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,BLCA
1,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,...,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,Normal
2,-0.2733,0.0403,0.0403,0.0403,0.0356,0.0356,0.0356,0.0356,0.0356,0.0356,...,-0.2716,-0.2716,-0.2716,-0.2716,-0.3456,-0.3456,0.2685,-0.3587,-0.3587,BLCA


In [3]:
# PC : 'KIRC', 'KIRP', 'KICH'
tcga.loc[tcga.y=='KIRC', 'y'] = 'RC'
tcga.loc[tcga.y=='KIRP', 'y'] = 'RC'
tcga.loc[tcga.y=='KICH', 'y'] = 'RC'

In [4]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
tcga["y_encode"] = lb_make.fit_transform(tcga["y"])
tcga[["y","y_encode"]].head()

Unnamed: 0,y,y_encode
0,BLCA,0
1,Normal,12
2,BLCA,0
3,Normal,12
4,Normal,12


In [5]:
tcga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20715 entries, 0 to 20714
Columns: 788 entries, cyto.10p11.1 to y_encode
dtypes: float64(786), int64(1), object(1)
memory usage: 124.5+ MB


In [6]:
print(len(tcga.y.value_counts()))
tcga.y.value_counts()  

26


Normal    10170
BRCA       1079
RC          882
OV          582
GBM         573
UCEC        539
HNSC        522
LUAD        516
LGG         512
LUSC        501
THCA        499
PRAD        492
SKCM        469
COAD        449
STAD        441
BLCA        408
LIHC        370
CESC        295
SARC        257
LAML        191
ESCA        184
PAAD        184
READ        165
PCPG        162
TGCT        150
THYM        123
Name: y, dtype: int64

In [7]:
X = tcga.drop(['y','y_encode'], axis=1)
Y = tcga['y_encode']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3)

In [8]:
tcga.head()

Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,y,y_encode
0,-0.1608,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,...,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,BLCA,0
1,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,...,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,Normal,12
2,-0.2733,0.0403,0.0403,0.0403,0.0356,0.0356,0.0356,0.0356,0.0356,0.0356,...,-0.2716,-0.2716,-0.2716,-0.3456,-0.3456,0.2685,-0.3587,-0.3587,BLCA,0
3,0.0013,0.0013,0.0013,0.0013,-0.0057,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,0.0015,Normal,12
4,-0.0003,-0.0003,-0.0003,-0.0003,-0.0003,-0.0002,-0.0002,-0.0002,-0.0002,-0.0002,...,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,-0.0024,Normal,12


### random forest

In [15]:
# modeling
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
accuracy_score(Y_test, rf.predict(X_test))

0.7259855189058729

In [19]:
import time
start_time = time.time()
rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [300,500],
    'max_depth' : [75,100]
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=-1)

CV_rfc.fit(X_train,Y_train)
print('sec : ',time.time()-start_time)
CV_rfc.best_params_

sec :  813.526594877243


{'max_depth': 100, 'n_estimators': 500}

In [20]:
rf_best = CV_rfc.best_estimator_
pred = rf_best.predict(X_test)
accuracy = accuracy_score(Y_test, pred)
print('decision tree accuracy : {0:.4f}'.format(accuracy))

decision tree accuracy : 0.7807


## LightGBM

In [9]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=200)

evals = [(X_test, Y_test)]
lgbm_clf.fit(X_train, Y_train, early_stopping_rounds=30, eval_metric="logloss", eval_set=evals,
                verbose=50)

Training until validation scores don't improve for 30 rounds.
[50]	valid_0's multi_logloss: 0.814804	valid_0's multi_logloss: 0.814804
[100]	valid_0's multi_logloss: 0.786545	valid_0's multi_logloss: 0.786545
Early stopping, best iteration is:
[83]	valid_0's multi_logloss: 0.779703	valid_0's multi_logloss: 0.779703


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [12]:
pred = lgbm_clf.predict(X_test)
accuracy = accuracy_score(Y_test, pred)
print('LightGBM accuracy : {0:.4f}'.format(accuracy))

LightGBM accuracy : 0.8018


In [7]:
from lightgbm import LGBMClassifier

LGBM_clf = LGBMClassifier(n_estimators=200)
evals = [(X_train,Y_train),(X_test, Y_test)]
params = {'num_leaves': [32, 64 ],
          'max_depth':[128, 160],
          'min_child_samples':[40, 60, 100],
          'subsample':[0.8, 1]}

# 하이퍼 파라미터 테스트의 수행속도를 향상 시키기 위해 cv 를 지정하지 않습니다. 
gridcv = GridSearchCV(LGBM_clf, param_grid=params)
gridcv.fit(X_train, Y_train, early_stopping_rounds=30, eval_metric="logloss", eval_set=evals,verbose=100)

print('GridSearchCV 최적 파라미터:', gridcv.best_params_)
pred = gridcv.predict(X_test)
accuracy = accuracy_score(Y_test, pred)
print('LightGBM accuracy : {0:.4f}'.format(accuracy))

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[65]	valid_0's multi_logloss: 0.295724	valid_0's multi_logloss: 0.295724	valid_1's multi_logloss: 0.888712	valid_1's multi_logloss: 0.888712
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[64]	valid_0's multi_logloss: 0.288231	valid_0's multi_logloss: 0.288231	valid_1's multi_logloss: 0.876504	valid_1's multi_logloss: 0.876504
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[61]	valid_0's multi_logloss: 0.298406	valid_0's multi_logloss: 0.298406	valid_1's multi_logloss: 0.873047	valid_1's multi_logloss: 0.873047
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[65]	valid_0's multi_logloss: 0.295724	valid_0's multi_logloss: 0.295724	valid_1's multi_logloss: 0.888712	valid_1's multi_logloss: 0.888712
Training until validation scores don't improve f

Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.264908	valid_0's multi_logloss: 0.264908	valid_1's multi_logloss: 0.835105	valid_1's multi_logloss: 0.835105
Early stopping, best iteration is:
[74]	valid_0's multi_logloss: 0.284859	valid_0's multi_logloss: 0.284859	valid_1's multi_logloss: 0.819375	valid_1's multi_logloss: 0.819375
Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.254235	valid_0's multi_logloss: 0.254235	valid_1's multi_logloss: 0.81816	valid_1's multi_logloss: 0.81816
Early stopping, best iteration is:
[77]	valid_0's multi_logloss: 0.270898	valid_0's multi_logloss: 0.270898	valid_1's multi_logloss: 0.805526	valid_1's multi_logloss: 0.805526
Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.263256	valid_0's multi_logloss: 0.263256	valid_1's multi_logloss: 0.819938	valid_1's multi_logloss: 0.819938
Early stopping, best iteration is:
[77

Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.264187	valid_0's multi_logloss: 0.264187	valid_1's multi_logloss: 0.831652	valid_1's multi_logloss: 0.831652
Early stopping, best iteration is:
[78]	valid_0's multi_logloss: 0.279921	valid_0's multi_logloss: 0.279921	valid_1's multi_logloss: 0.817174	valid_1's multi_logloss: 0.817174
Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.254273	valid_0's multi_logloss: 0.254273	valid_1's multi_logloss: 0.817237	valid_1's multi_logloss: 0.817237
Early stopping, best iteration is:
[83]	valid_0's multi_logloss: 0.2647	valid_0's multi_logloss: 0.2647	valid_1's multi_logloss: 0.806519	valid_1's multi_logloss: 0.806519
Training until validation scores don't improve for 30 rounds.
[100]	valid_0's multi_logloss: 0.261792	valid_0's multi_logloss: 0.261792	valid_1's multi_logloss: 0.816826	valid_1's multi_logloss: 0.816826
Early stopping, best iteration is:
[79]	

### Support Vector Machine

In [8]:
from sklearn import svm
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, Y_train)
accuracy_score(Y_test, svm.predict(X_test))

0.7478680611423975

In [None]:
# Grid Search
# Parameter Grid
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001, 0.00001, 10]}
 
# Make grid search classifier
clf_grid = GridSearchCV(svm.SVC(), param_grid, verbose=1)
 
# Train the classifier
clf_grid.fit(X_train, Y_train)
 
# clf = grid.best_estimator_()
print("Best Parameters:\n", clf_grid.best_params_)
print("Best Estimators:\n", clf_grid.best_estimator_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


## AdaBoost

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier
pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', AdaBoostClassifier())])
pipeline.fit(X_train, Y_train)
accuracy = accuracy_score(Y_test, pipeline.predict(X_test))
print('AdaBoost accuracy : {0:.4f}'.format(accuracy))

AdaBoost accuracy : 0.5718


## GBM

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', GradientBoostingClassifier())])
pipeline.fit(X_train, Y_train)
accuracy = accuracy_score(Y_test, pipeline.predict(X_test))
print('GBM accuracy : {0:.4f}'.format(accuracy))

GBM accuracy : 0.8047
