In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

## TCGA data

In [3]:
TCGA = pd.read_csv('TCGA_data.csv')
TCGA = TCGA.loc[(TCGA.cancer == 'BLCA') | (TCGA.cancer == 'PAAD')| (TCGA.cancer == 'KICH')| (TCGA.cancer == 'KIRC')| (TCGA.cancer == 'KIRP')]
TCGA.loc[(TCGA.cancer == 'BLCA') & (TCGA.y != 'Normal'),'y'] = 'BC'
TCGA.loc[(TCGA.cancer == 'PAAD') & (TCGA.y != 'Normal'),'y'] = 'PC'
TCGA.loc[(TCGA.cancer == 'KICH') & (TCGA.y != 'Normal'),'y'] = 'RC'
TCGA.loc[(TCGA.cancer == 'KIRC') & (TCGA.y != 'Normal'),'y'] = 'RC'
TCGA.loc[(TCGA.cancer == 'KIRP') & (TCGA.y != 'Normal'),'y'] = 'RC'

## urine data

In [7]:
BC_100 = pd.read_csv('BC_32ea_k100_cyto.csv')
Normal_100 = pd.read_csv('Normal_21ea_k100_cyto.csv')
PC_100 = pd.read_csv('PC_20ea_k100_cyto.csv')
RC_100 = pd.read_csv('RC_20ea_k100_cyto.csv')

#drop person id
BC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
Normal_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
PC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
RC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)

BC_100['cancer'] = 'BC'
Normal_100['cancer'] = 'normal'
PC_100['cancer'] = 'PC'
RC_100['cancer'] = "RC"

K100 = pd.concat([BC_100,Normal_100,PC_100,RC_100], axis = 0)

del [BC_100,Normal_100,PC_100,RC_100]

K100 = K100.dropna(axis=0)
K100.isna().sum().sum()

0

## 공통 col 추출

In [8]:
a = list(TCGA.columns)
b = list(K100.columns)

new_col = []
for col in a :
    for col2 in b :
        if col==col2 :
            new_col.append(col)       

In [9]:
TCGA_new = TCGA[new_col]
K100_new = K100[new_col]

## 1. scale : 기본 스케일. 평균과 표준편차 사용

In [19]:
X = TCGA_new.drop(['cancer'], axis=1)
X = scale(X)
Y = TCGA['y']
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

rf = RandomForestClassifier()
rf.fit(X_train,y_train)

predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.909


In [20]:
X = K100_new.drop(['cancer'], axis=1)
X = scale(X)
Y = K100['cancer']

predicted = rf.predict(X)
accuracy = accuracy_score(Y, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.352


## 2. robust_scale : 중앙값과 IQR사용. 아웃라이어의 영향을 최소화

In [21]:
X = TCGA_new.drop(['cancer'], axis=1)
X = robust_scale(X)
Y = TCGA['y']
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

rf = RandomForestClassifier()
rf.fit(X_train,y_train)

predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.911


In [22]:
X = K100_new.drop(['cancer'], axis=1)
X = robust_scale(X)
Y = K100['cancer']

predicted = rf.predict(X)
accuracy = accuracy_score(Y, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.227


## 3. minmax_scale : 최대/최소값이 각각 1, 0이 되도록 스케일링

In [23]:
X = TCGA_new.drop(['cancer'], axis=1)
X = minmax_scale(X)
Y = TCGA['y']
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

rf = RandomForestClassifier()
rf.fit(X_train,y_train)

predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.91


In [24]:
X = K100_new.drop(['cancer'], axis=1)
X = minmax_scale(X)
Y = K100['cancer']

predicted = rf.predict(X)
accuracy = accuracy_score(Y, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.284


## 4. maxabs_scale :  최대절대값과 0이 각각 1, 0이 되도록 스케일링

In [25]:
X = TCGA_new.drop(['cancer'], axis=1)
X = maxabs_scale(X)
Y = TCGA['y']
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

rf = RandomForestClassifier()
rf.fit(X_train,y_train)

predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.914


In [26]:
X = K100_new.drop(['cancer'], axis=1)
X = maxabs_scale(X)
Y = K100['cancer']

predicted = rf.predict(X)
accuracy = accuracy_score(Y, predicted)

print(f'accuracy score: {accuracy:.3}')

accuracy score: 0.42
