In [14]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Load data : Normal - BC

In [2]:
### TCGA ###
TCGA = pd.read_csv('TCGA_data.csv')
TCGA = TCGA.loc[(TCGA.cancer == 'BLCA')]
TCGA.loc[(TCGA.cancer == 'BLCA') & (TCGA.y != 'Normal'),'y'] = 'BC'

### Urine ###
BC_100 = pd.read_csv('BC_32ea_k15_cyto.csv')
Normal_100 = pd.read_csv('Normal_21ea_k15_cyto.csv')

BC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
Normal_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)

BC_100['cancer'] = 'BC'
Normal_100['cancer'] = 'normal'

K100 = pd.concat([BC_100,Normal_100], axis = 0)
del [BC_100,Normal_100]

TCGA.y.value_counts()

BC        408
Normal    375
Name: y, dtype: int64

## 공통 col 추출

In [4]:
a = list(TCGA.columns)
b = list(K100.columns)

new_col = []
for col in a :
    for col2 in b :
        if col==col2 :
            new_col.append(col)       

In [5]:
TCGA_new = TCGA[new_col]
K100_new = K100[new_col]

In [6]:
X_train = TCGA_new.drop(['cancer'], axis=1)
X_train = scale(X_train)
Y_train = TCGA['y']

X_test = K100_new.drop(['cancer'], axis=1)
X_test = scale(X_test)
Y_test = K100['cancer']

### random forest

In [19]:
# modeling
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
accuracy11 = accuracy_score(Y_test, rf.predict(X_test))

0.6037735849056604 0.9987228607918263


In [20]:
confusion_matrix(Y_test, rf.predict(X_test))

array([[32,  0],
       [21,  0]], dtype=int64)

### logistic

In [21]:
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
lr1.fit(X_train,Y_train)
accuracy_score(Y_test, rf.predict(X_test))

0.6037735849056604

In [16]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0).fit(X_train,Y_train)
accuracy_score(Y_test, clf.predict(X_test))

0.5283018867924528

### Adaboost

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier

pipeline = Pipeline([('scaler', MaxAbsScaler()), ('classifier', AdaBoostClassifier())])
pipeline.fit(X_train, Y_train)
accuracy_score(Y_test, pipeline.predict(X_test))

0.6037735849056604

In [9]:
confusion_matrix(Y_test, pipeline.predict(X_test))

array([[32,  0],
       [21,  0]], dtype=int64)

### xgboost

In [15]:
from xgboost import XGBClassifier
pipeline = Pipeline([('scaler', MaxAbsScaler()), ('classifier', XGBClassifier())])
pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
accuracy_score(Y_test, y_pred)

0.6037735849056604

In [16]:
confusion_matrix(Y_test, pipeline.predict(X_test))

array([[32,  0],
       [21,  0]], dtype=int64)

## Load data : Normal - PC

In [20]:
### TCGA ###
TCGA = pd.read_csv('TCGA_data.csv')
TCGA = TCGA.loc[(TCGA.cancer == 'PRAD')]
TCGA.loc[(TCGA.cancer == 'PRAD') & (TCGA.y != 'Normal'),'y'] = 'PC'

### Urine ###
PC_15 = pd.read_csv('PC_20ea_k15_cyto.csv')
Normal_15 = pd.read_csv('Normal_21ea_k15_cyto.csv')

PC_15.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
Normal_15.drop(['Unnamed: 0','id'], axis = 1, inplace = True)

PC_15['cancer'] = 'PC'
Normal_15['cancer'] = 'normal'

K15 = pd.concat([PC_15,Normal_15], axis = 0)
del [PC_15,Normal_15]

TCGA.y.value_counts()

PC        492
Normal    478
Name: y, dtype: int64

## 공통 col 추출

In [21]:
a = list(TCGA.columns)
b = list(K15.columns)

new_col = []
for col in a :
    for col2 in b :
        if col==col2 :
            new_col.append(col)       

In [22]:
TCGA_new = TCGA[new_col]
K15_new = K15[new_col]

In [29]:
K15_new = K15_new.fillna(-1022)

In [30]:
X_train = TCGA_new.drop(['cancer'], axis=1)
X_train = scale(X_train)
Y_train = TCGA['y']

X_test = K15_new.drop(['cancer'], axis=1)
X_test = scale(X_test)
Y_test = K15['cancer']

### random forest

In [34]:
# modeling
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
accuracy11 = accuracy_score(Y_test, rf.predict(X_test))
accuracy11

0.4634146341463415

In [35]:
Y_test.value_counts()

normal    21
PC        20
Name: cancer, dtype: int64

In [32]:
confusion_matrix(Y_test, rf.predict(X_test))

array([[ 0,  0,  0],
       [ 1, 19,  0],
       [ 0, 21,  0]], dtype=int64)

### logistic

In [41]:
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression()
lr1.fit(X_train,Y_train)
accuracy_score(Y_test, rf.predict(X_test))

0.4634146341463415

In [42]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0).fit(X_train,Y_train)
accuracy_score(Y_test, clf.predict(X_test))

0.4878048780487805

### Adaboost

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier

pipeline = Pipeline([('scaler', MaxAbsScaler()), ('classifier', AdaBoostClassifier())])
pipeline.fit(X_train, Y_train)
accuracy_score(Y_test, pipeline.predict(X_test))

0.4878048780487805

In [44]:
confusion_matrix(Y_test, pipeline.predict(X_test))

array([[20,  0],
       [21,  0]], dtype=int64)

### xgboost

In [45]:
from xgboost import XGBClassifier
pipeline = Pipeline([('scaler', MaxAbsScaler()), ('classifier', XGBClassifier())])
pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
accuracy_score(Y_test, y_pred)

0.4878048780487805

In [46]:
confusion_matrix(Y_test, pipeline.predict(X_test))

array([[20,  0],
       [21,  0]], dtype=int64)