In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

## K100

In [13]:
BC_100 = pd.read_csv('BC_32ea_k100_cyto.csv')
Normal_100 = pd.read_csv('Normal_21ea_k100_cyto.csv')
PC_100 = pd.read_csv('PC_20ea_k100_cyto.csv')
RC_100 = pd.read_csv('RC_20ea_k100_cyto.csv')

In [16]:
#drop person id
BC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
Normal_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
PC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)
RC_100.drop(['Unnamed: 0','id'], axis = 1, inplace = True)

In [18]:
BC_100['cancer'] = 'BC'
Normal_100['cancer'] = 'normal'
PC_100['cancer'] = 'PC'
RC_100['cancer'] = "RC"

K100 = pd.concat([BC_100,Normal_100,PC_100,RC_100], axis = 0)

In [19]:
del [BC_100,Normal_100,PC_100,RC_100]

In [20]:
K100.shape

(94, 766)

In [21]:
K100.cancer.value_counts()

BC        32
RC        21
normal    21
PC        20
Name: cancer, dtype: int64

In [31]:
K100.isna().sum().sum()

647

In [30]:
K100_drop = K100.dropna(axis=0)

In [37]:
K100_drop.isna().sum().sum()

0

In [34]:
K100_drop.cancer.value_counts()

BC        32
normal    21
PC        19
RC        16
Name: cancer, dtype: int64

In [32]:
print(K100.shape)
print(K100_drop.shape)

(94, 766)
(88, 766)


In [33]:
K100_drop.head()

Unnamed: 0,cyto.10p11.1,cyto.10p11.21,cyto.10p11.22,cyto.10p11.23,cyto.10p12.1,cyto.10p12.2,cyto.10p12.31,cyto.10p12.32,cyto.10p12.33,cyto.10p13,...,cyto.9q33.1,cyto.9q33.2,cyto.9q33.3,cyto.9q34.11,cyto.9q34.12,cyto.9q34.13,cyto.9q34.2,cyto.9q34.3,<NA>,cancer
0,0.181,-0.019967,0.044536,-0.026,0.018065,0.0049,0.048,0.084,-0.042857,0.0196,...,0.015217,-0.044152,-0.004636,0.014333,0.0116,-0.004737,-0.019571,-0.022065,-0.745,BC
1,0.127,0.010733,0.03125,0.0486,0.084413,0.0516,0.092889,0.102,-0.012857,0.07528,...,-0.043478,-0.046,-0.007386,-0.009233,-0.0136,-0.060474,0.027714,0.00229,0.255,BC
2,0.004,0.007567,0.024964,0.0812,-0.011391,0.03605,-0.050194,-0.093,-0.059857,-0.01634,...,-0.158783,-0.193576,-0.075977,-0.106667,-0.0114,-0.072421,-0.127714,-0.168903,-0.136,BC
3,0.135667,0.061467,-0.061286,-0.071733,0.070522,0.0627,0.020556,-0.002,0.050857,-0.00038,...,-0.019326,0.012515,-0.009591,0.016667,0.0438,0.019947,0.0115,0.031387,0.01,BC
4,-0.251,-0.1347,-0.17575,-0.1658,-0.112522,-0.0667,-0.127972,0.026,-0.198429,-0.00408,...,-0.074109,0.024758,0.097,0.018667,0.117,0.058368,0.038571,0.020065,-0.4595,BC


## 1.1 Classification : KNN 

In [40]:
#대대분류 삭제
K100_cluter = K100_drop.drop('cancer', axis = 1)
K100_cluter.shape

(88, 765)

In [42]:
mms = MinMaxScaler()
mms.fit(K100_cluter) #대대분류 뺀 column들만 갖고
cluster_transformed = mms.transform(K100_cluter)

In [47]:
# original 
k=4
km = KMeans(n_clusters = k, algorithm = 'auto')
km.fit(K100_cluter) # 데이터를 이용하여 클러스터링을 위한 학습 시작하고 중심점 추출
km_predict = pd.DataFrame(km.predict(K100_cluter)) # 학습된 모델이 맞춰 라벨 리턴
km_predict.columns = ['predict']
km_predict.predict.value_counts()

0    77
1     9
3     1
2     1
Name: predict, dtype: int64

In [48]:
# scale data
km = KMeans(n_clusters = k, algorithm = 'auto')
km.fit(cluster_transformed) # 데이터를 이용하여 클러스터링을 위한 학습 시작하고 중심점 추출
km_predict_scale = pd.DataFrame(km.predict(cluster_transformed)) # 학습된 모델이 맞춰 라벨 리턴
km_predict_scale.columns = ['predict']
km_predict_scale.predict.value_counts()

0    76
2    10
3     1
1     1
Name: predict, dtype: int64

## 1.2 Classification : Modeling

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier()
from sklearn.svm import SVC
svm1 = SVC()

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True, random_state=0)

In [60]:
for i, (idx_train, idx_cv) in enumerate(cv.split(K100_drop)):
    df_train = K100_drop.iloc[idx_train]
    df_cv = K100_drop.iloc[idx_cv]
    
    x_train = df_train.drop(['cancer'], axis=1)
    y_train = df_train['cancer']
    
    x_cv = df_cv.drop(['cancer'], axis=1)
    y_cv = df_cv['cancer']
    
    print('[K =',i,']')
    print('-----------svm1-----------')
    svm1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, svm1.predict(x_train)), accuracy_score(y_cv, svm1.predict(x_cv))))
    
    print('--------rf1--------')
    rf1.fit(x_train, y_train)
    print("train accurcacy = {}, cv accurcacy = {}".format(accuracy_score(y_train, rf1.predict(x_train)), accuracy_score(y_cv, rf1.predict(x_cv))))
    

[K = 0 ]
-----------svm1-----------
train accurcacy = 0.34285714285714286, cv accurcacy = 0.4444444444444444
--------rf1--------
train accurcacy = 1.0, cv accurcacy = 0.3333333333333333
[K = 1 ]
-----------svm1-----------
train accurcacy = 0.38571428571428573, cv accurcacy = 0.2777777777777778
--------rf1--------
train accurcacy = 0.9857142857142858, cv accurcacy = 0.2222222222222222
[K = 2 ]
-----------svm1-----------
train accurcacy = 0.34285714285714286, cv accurcacy = 0.4444444444444444
--------rf1--------
train accurcacy = 0.9857142857142858, cv accurcacy = 0.3888888888888889
[K = 3 ]
-----------svm1-----------
train accurcacy = 0.352112676056338, cv accurcacy = 0.4117647058823529
--------rf1--------
train accurcacy = 1.0, cv accurcacy = 0.5294117647058824
[K = 4 ]
-----------svm1-----------
train accurcacy = 0.39436619718309857, cv accurcacy = 0.23529411764705882
--------rf1--------
train accurcacy = 0.971830985915493, cv accurcacy = 0.17647058823529413
