In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import time
from pandas import DataFrame
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report, precision_score,accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
tcga = pd.read_csv('TCGA_data.csv')
tcga.drop('cancer', axis = 1, inplace = True)
tcga = tcga.sort_index(axis=1)

from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
tcga["y_encode"] = lb_make.fit_transform(tcga["y"])
print('TCGA data shape:',tcga.shape)
tcga.head(3)

TCGA data shape: (20715, 788)


Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,y,y_encode
0,-0.1608,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,0.2213,...,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,-0.1732,BLCA,0
1,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,...,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,0.0009,Normal,15
2,-0.2733,0.0403,0.0403,0.0403,0.0356,0.0356,0.0356,0.0356,0.0356,0.0356,...,-0.2716,-0.2716,-0.2716,-0.3456,-0.3456,0.2685,-0.3587,-0.3587,BLCA,0


In [3]:
urine = pd.read_csv('urine_k15_0419.csv')
urine.drop(['Unnamed: 0','X'],axis=1,inplace=True)
urine = urine.sort_index(axis=1)
urine = urine.loc[urine['y']!='RCC_100k']
urine = urine.replace(['BLCA_15k','NL_15k','PRAD_15k'],[ 0,15,19 ])
urine = urine.reset_index(drop = True)
print('Urine data shape:',urine.shape)
urine.head(3)

Urine data shape: (83, 787)


Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q32,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,y
0,0.0476,0.024783,0.022472,0.07248,0.015994,-0.021031,0.040992,0.04525,-0.095944,-0.008191,...,-0.179049,-0.036678,-0.06217,-0.09068,-0.100512,-0.334829,-0.111079,-0.110344,-0.125893,0
1,-0.16784,-0.009222,0.001799,0.010304,-0.001134,-0.147352,-0.041152,-0.044125,-0.110519,0.031379,...,-0.051297,-0.214,-0.098335,0.086731,0.0561,0.110971,0.076008,-0.007611,-0.061152,0
2,0.17316,0.027507,0.067975,0.04651,0.060755,0.080742,0.049664,0.087875,0.218056,0.31106,...,-0.096214,-0.037703,-0.088706,-0.100768,-0.109234,-0.082571,-0.102937,-0.1316,-0.07617,0


In [4]:
urine.y.value_counts()

0     42
15    21
19    20
Name: y, dtype: int64

In [4]:
X = tcga.drop(['y','y_encode'], axis=1)
y = tcga['y_encode']

In [9]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5, random_state=0, shuffle = True)

In [25]:
def accuracy(model) :
    pred = model.predict(urine.drop('y', axis = 1))
    df_pred = pd.DataFrame({'actual':urine['y'],'predict':pred})
    
    BC_pred = df_pred.loc[df_pred['predict']==0]
    NL_pred = df_pred.loc[df_pred['predict']==15]
    PC_pred = df_pred.loc[df_pred['predict']==19]
    
    print('BC accuracy : {0:.4f}'.format(BC_pred.shape[0]/42))
    print('Normal accuracy : {0:.4f}'.format(NL_pred.shape[0]/21))
    print('PC accuracy : {0:.4f}'.format(PC_pred.shape[0]/20))

# Logistic

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.linear_model import LogisticRegression

standardsc = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()

lr = LogisticRegression()

In [12]:
def model_pipeline(model) :
    pipeline1 = Pipeline([('StandardScaler', standardsc), ('Logistic', model)])
    pipeline2 = Pipeline([('MinMaxScaler', mms), ('Logistic', model)])
    pipeline3 = Pipeline([('MaxAbsScaler', mas), ('Logistic', model)])

    cv_acc = []
    pip1_acc = []
    pip2_acc = []
    pip3_acc = []

    for i, (tr_ind, te_ind) in enumerate(skf.split(X,y)) :

        X_train, X_test = X.iloc[tr_ind], X.iloc[te_ind]
        y_train, y_test = y[tr_ind], y[te_ind]

        model.fit(X_train, y_train)
        pipeline1.fit(X_train, y_train)
        pipeline2.fit(X_train, y_train)
        pipeline3.fit(X_train, y_train)

        cv_acc.append(accuracy_score(y[te_ind], model.predict(X_test)))
        pip1_acc.append(accuracy_score(y[te_ind], pipeline1.predict(X_test)))
        pip2_acc.append(accuracy_score(y[te_ind], pipeline2.predict(X_test)))
        pip3_acc.append(accuracy_score(y[te_ind], pipeline3.predict(X_test)))

        print('{0} 번째 accuracy non_scale : {1:.4f}% StandardScale : {2:.4f}% MinMax : {3:.4f}% MaxAbs : {4:.4f}%'.format(i,cv_acc[i],pip1_acc[i],pip2_acc[i],pip3_acc[i]))

    print('\n mean accuracy non_scale : {0:.4f}% StandardScale : {1:.4f}% MinMax : {2:.4f}% MaxAbs : {3:.4f}%'.format(np.mean(cv_acc),np.mean(pip1_acc),np.mean(pip2_acc),np.mean(pip3_acc)))

In [None]:
model_pipeline(lr)

0 번째 accuracy non_scale : 0.8048% StandardScale : 0.7896% MinMax : 0.5000% MaxAbs : 0.7944%
1 번째 accuracy non_scale : 0.7990% StandardScale : 0.7821% MinMax : 0.4958% MaxAbs : 0.7886%
2 번째 accuracy non_scale : 0.7999% StandardScale : 0.7801% MinMax : 0.4975% MaxAbs : 0.7929%
3 번째 accuracy non_scale : 0.8035% StandardScale : 0.7786% MinMax : 0.4978% MaxAbs : 0.7876%


In [15]:
from sklearn.linear_model import LogisticRegression
start_time = time.time()
lr = LogisticRegression(n_jobs=-1)
lr.fit(X,y)
print('sec : ',time.time()-start_time) # 2 minutes

sec :  109.51327610015869


In [16]:
pred = lr.predict(urine.drop('y', axis = 1))
accuracy = accuracy_score(urine['y'],pred)
print('Logistic accuracy : {0:.4f}'.format(accuracy))

Logistic accuracy : 0.3253


In [23]:
pd.DataFrame({'actual':urine['y'],'predict':pred}).head(3)

Unnamed: 0,actual,predict
0,0,13
1,0,2
2,0,19


In [26]:
accuracy(lr)

BC accuracy : 0.1190
Normal accuracy : 2.1905
PC accuracy : 0.2000


# LightGBM

In [10]:
from lightgbm import LGBMClassifier
start_time = time.time()

lgbm_clf = LGBMClassifier(n_estimators=200, n_jobs=-1)
lgbm_clf.fit(X, y, eval_metric="logloss", verbose=50)
print(time.time()-start_time) # 4 minutes

244.83253026008606


In [11]:
pred = lgbm_clf.predict(urine.drop('y', axis = 1))
accuracy = accuracy_score(urine['y'], pred)
print('LightGBM accuracy : {0:.4f}'.format(accuracy))

LightGBM accuracy : 0.3373


In [27]:
accuracy(lgbm_clf)

BC accuracy : 0.1667
Normal accuracy : 1.8571
PC accuracy : 0.0000
