In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import KMeans

import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

from math import sqrt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR



In [5]:
clinical = pd.read_csv('./trainData/train_data_clinical_patient.csv', usecols=['PATIENT_ID', 'CANCER_TYPE_ACRONYM'])

In [6]:
clinical

Unnamed: 0,PATIENT_ID,CANCER_TYPE_ACRONYM
0,TCGA-3M-AB46,STAD
1,TCGA-3M-AB47,STAD
2,TCGA-B7-5816,STAD
3,TCGA-B7-5818,STAD
4,TCGA-B7-A5TI,STAD
...,...,...
2692,TCGA-WY-A85B,LGG
2693,TCGA-WY-A85C,LGG
2694,TCGA-WY-A85D,LGG
2695,TCGA-WY-A85E,LGG


In [7]:
first_df = pd.read_csv("./trainData/train_data_rna-seq.csv", encoding='utf-8')
first_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-3M-AB46-01,TCGA-3M-AB47-01,TCGA-B7-5816-01,TCGA-B7-5818-01,TCGA-B7-A5TI-01,TCGA-B7-A5TJ-01,TCGA-B7-A5TK-01,TCGA-B7-A5TN-01,...,TCGA-VW-A8FI-01,TCGA-W9-A837-01,TCGA-WH-A86K-01,TCGA-WY-A858-01,TCGA-WY-A859-01,TCGA-WY-A85A-01,TCGA-WY-A85B-01,TCGA-WY-A85C-01,TCGA-WY-A85D-01,TCGA-WY-A85E-01
0,,100130426,,,,,,,,,...,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197
1,,100133144,-0.6912,-0.4056,-2.0249,-1.7479,-0.6720,0.2729,-1.6346,-0.0469,...,1.1529,0.5938,0.4161,-0.0182,-1.0264,-0.4623,0.0507,0.4778,-0.4763,0.2876
2,,100134869,-0.0569,-1.1995,-0.5616,1.2902,-0.0517,0.1636,-3.7799,0.0582,...,0.7471,1.0861,0.0440,0.2981,-0.4821,0.2481,0.5676,0.4846,-0.6968,-0.9707
3,,10357,0.7839,-0.0397,-2.7656,-1.0076,0.1231,1.2847,0.6795,0.6711,...,-0.3517,-0.8943,1.0942,0.7643,-0.4045,-0.4197,-0.2125,-0.5183,-0.8822,-0.0090
4,,10431,2.6285,0.1811,-0.2540,-0.2604,1.3377,-0.1858,1.2572,0.4134,...,-0.3810,-0.1261,-0.3370,0.7580,1.2928,0.2058,0.2314,-0.6588,0.4685,0.6967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYG11A,440590,0.6274,-0.5807,-1.4317,-1.4317,-0.8730,-0.7320,-0.7461,-0.9315,...,0.7317,-1.2143,-1.2143,-0.4660,0.1100,-1.2143,-0.2231,0.9422,-1.2143,1.1519
20527,ZYG11B,79699,0.5821,0.3698,-0.8969,-1.6536,-0.9848,-0.9928,0.7091,-1.2326,...,-1.3630,0.1147,0.0871,-0.9503,0.8521,0.4425,0.3485,1.0692,-0.1277,0.7383
20528,ZYX,7791,-0.8533,-0.1465,-0.0919,0.3913,-0.5291,0.1615,0.8531,0.4881,...,2.3670,-1.0349,-0.1840,0.3662,-0.4420,-0.9402,-0.8402,-0.4389,-0.6813,-0.3227
20529,ZZEF1,23140,-1.6618,-0.0619,-0.3589,-1.5988,-0.6019,-0.1565,-1.6103,-1.1189,...,-1.7330,0.2662,-0.5119,-0.9516,-0.6513,-0.1893,0.1042,0.7413,-0.5001,-0.4875


In [8]:
#Nan제거
second_df = first_df.dropna(subset=['Hugo_Symbol'])
second_df = second_df.reset_index(drop=True)
second_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-3M-AB46-01,TCGA-3M-AB47-01,TCGA-B7-5816-01,TCGA-B7-5818-01,TCGA-B7-A5TI-01,TCGA-B7-A5TJ-01,TCGA-B7-A5TK-01,TCGA-B7-A5TN-01,...,TCGA-VW-A8FI-01,TCGA-W9-A837-01,TCGA-WH-A86K-01,TCGA-WY-A858-01,TCGA-WY-A859-01,TCGA-WY-A85A-01,TCGA-WY-A85B-01,TCGA-WY-A85C-01,TCGA-WY-A85D-01,TCGA-WY-A85E-01
0,A1BG,1,0.2076,0.4276,-0.9922,-1.0179,-0.4527,-0.5642,0.5899,-0.1583,...,0.8868,-1.4281,2.0103,1.3660,-0.1550,0.0365,0.1544,0.6591,1.8078,0.6973
1,A1CF,29974,-1.0518,0.5072,-2.1072,-2.1278,0.6810,1.2261,-1.1916,-0.5322,...,-2.3298,-2.3298,-2.3298,-2.3298,-2.3298,-2.3298,-2.3298,-0.2885,-2.3298,-2.3298
2,A2BP1,54715,-0.8329,2.1122,-1.0945,-1.0945,-1.0945,-0.1976,-1.0945,-1.0945,...,-1.1720,1.3013,-0.5875,-2.0743,1.7416,0.0690,-0.8346,-0.8072,-1.1571,0.8931
3,A2LD1,87769,1.2071,0.6262,0.3858,-2.6991,0.8135,2.8214,-1.3555,0.3870,...,1.4632,-0.3344,0.2268,0.3680,-0.4358,-0.0906,0.0154,-0.3613,-0.9304,-1.1419
4,A2M,2,-0.7818,0.9275,-0.4149,-1.2313,0.4039,-0.7086,1.1713,0.4539,...,-0.6719,-0.9471,0.2347,2.3168,-1.4888,-0.3067,0.8500,-0.0253,0.0875,-1.1735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20497,ZYG11A,440590,0.6274,-0.5807,-1.4317,-1.4317,-0.8730,-0.7320,-0.7461,-0.9315,...,0.7317,-1.2143,-1.2143,-0.4660,0.1100,-1.2143,-0.2231,0.9422,-1.2143,1.1519
20498,ZYG11B,79699,0.5821,0.3698,-0.8969,-1.6536,-0.9848,-0.9928,0.7091,-1.2326,...,-1.3630,0.1147,0.0871,-0.9503,0.8521,0.4425,0.3485,1.0692,-0.1277,0.7383
20499,ZYX,7791,-0.8533,-0.1465,-0.0919,0.3913,-0.5291,0.1615,0.8531,0.4881,...,2.3670,-1.0349,-0.1840,0.3662,-0.4420,-0.9402,-0.8402,-0.4389,-0.6813,-0.3227
20500,ZZEF1,23140,-1.6618,-0.0619,-0.3589,-1.5988,-0.6019,-0.1565,-1.6103,-1.1189,...,-1.7330,0.2662,-0.5119,-0.9516,-0.6513,-0.1893,0.1042,0.7413,-0.5001,-0.4875


In [46]:
first_df[['TCGA-BR-4280-01']]

Unnamed: 0,TCGA-BR-4280-01
0,
1,-0.0802
2,-0.1003
3,0.9334
4,0.9662
...,...
20526,-1.2122
20527,-0.9524
20528,-0.7196
20529,0.3103


In [9]:
second_df = second_df.drop(['Hugo_Symbol'],axis=1)

second_df=second_df.transpose()

second_df = second_df.rename(columns=second_df.iloc[0])

second_df.index = second_df.index.str.rstrip('-01')

second_df = second_df.drop(second_df.index[0])
second_df

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,55055.0,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0
TCGA-3M-AB46,0.2076,-1.0518,-0.8329,1.2071,-0.7818,-0.8253,-0.9673,-1.5546,0.7883,0.0646,...,1.2552,2.6983,-1.0061,-0.4243,-1.1151,0.6274,0.5821,-0.8533,-1.6618,0.2748
TCGA-3M-AB47,0.4276,0.5072,2.1122,0.6262,0.9275,-0.6354,0.4636,-1.0286,-1.4886,-0.5795,...,0.1266,-0.5029,-1.1278,0.6149,-0.7507,-0.5807,0.3698,-0.1465,-0.0619,0.5492
TCGA-B7-5816,-0.9922,-2.1072,-1.0945,0.3858,-0.4149,-0.4633,1.1193,0.3950,-0.7351,-0.6825,...,-0.9013,0.1416,-0.0425,-0.9111,-1.1933,-1.4317,-0.8969,-0.0919,-0.3589,-1.4745
TCGA-B7-5818,-1.0179,-2.1278,-1.0945,-2.6991,-1.2313,0.8780,-0.5319,1.9559,-1.4886,-0.0726,...,-1.4085,0.7458,-2.5996,-5.5211,1.3570,-1.4317,-1.6536,0.3913,-1.5988,-2.7521
TCGA-B7-A5TI,-0.4527,0.6810,-1.0945,0.8135,0.4039,0.6896,-0.0799,1.5994,-0.1381,0.0263,...,0.3886,1.0328,-1.6170,-1.0720,-0.4373,-0.8730,-0.9848,-0.5291,-0.6019,0.0214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,0.0365,-2.3298,0.0690,-0.0906,-0.3067,0.9726,-0.7398,-0.6719,-1.3060,0.8323,...,-0.5834,-0.4914,-0.3775,-0.6975,0.0466,-1.2143,0.4425,-0.9402,-0.1893,0.4236
TCGA-WY-A85B,0.1544,-2.3298,-0.8346,0.0154,0.8500,-0.8925,-1.0079,0.9052,-1.3060,-0.0945,...,1.0486,1.1282,-0.1515,0.5935,0.3402,-0.2231,0.3485,-0.8402,0.1042,1.0122
TCGA-WY-A85C,0.6591,-0.2885,-0.8072,-0.3613,-0.0253,-0.9871,0.1267,0.3564,-1.3060,0.8454,...,-0.6817,-0.7663,0.3284,0.7869,0.9922,0.9422,1.0692,-0.4389,0.7413,1.2262
TCGA-WY-A85D,1.8078,-2.3298,-1.1571,-0.9304,0.0875,0.8196,-0.3884,-1.8299,-1.3060,0.9347,...,-1.9247,-0.3125,-1.2386,-1.1186,0.6909,-1.2143,-0.1277,-0.6813,-0.5001,0.8035


In [10]:
# 백업용
third_df=second_df.copy()

# cna_dfset에 Cancer 열 추가
third_df['Cancer'] = ''

# clinical 데이터프레임을 순회하며 cna_dfset에 값 할당
for index, row in third_df.iterrows():
    for i, pat in clinical.iterrows():
        if(index==pat['PATIENT_ID']):
            cancer_type = pat['CANCER_TYPE_ACRONYM']
            third_df.at[index, 'Cancer'] = cancer_type
            break
third_df

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0,Cancer
TCGA-3M-AB46,0.2076,-1.0518,-0.8329,1.2071,-0.7818,-0.8253,-0.9673,-1.5546,0.7883,0.0646,...,2.6983,-1.0061,-0.4243,-1.1151,0.6274,0.5821,-0.8533,-1.6618,0.2748,STAD
TCGA-3M-AB47,0.4276,0.5072,2.1122,0.6262,0.9275,-0.6354,0.4636,-1.0286,-1.4886,-0.5795,...,-0.5029,-1.1278,0.6149,-0.7507,-0.5807,0.3698,-0.1465,-0.0619,0.5492,STAD
TCGA-B7-5816,-0.9922,-2.1072,-1.0945,0.3858,-0.4149,-0.4633,1.1193,0.3950,-0.7351,-0.6825,...,0.1416,-0.0425,-0.9111,-1.1933,-1.4317,-0.8969,-0.0919,-0.3589,-1.4745,STAD
TCGA-B7-5818,-1.0179,-2.1278,-1.0945,-2.6991,-1.2313,0.8780,-0.5319,1.9559,-1.4886,-0.0726,...,0.7458,-2.5996,-5.5211,1.3570,-1.4317,-1.6536,0.3913,-1.5988,-2.7521,STAD
TCGA-B7-A5TI,-0.4527,0.6810,-1.0945,0.8135,0.4039,0.6896,-0.0799,1.5994,-0.1381,0.0263,...,1.0328,-1.6170,-1.0720,-0.4373,-0.8730,-0.9848,-0.5291,-0.6019,0.0214,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,0.0365,-2.3298,0.0690,-0.0906,-0.3067,0.9726,-0.7398,-0.6719,-1.3060,0.8323,...,-0.4914,-0.3775,-0.6975,0.0466,-1.2143,0.4425,-0.9402,-0.1893,0.4236,LGG
TCGA-WY-A85B,0.1544,-2.3298,-0.8346,0.0154,0.8500,-0.8925,-1.0079,0.9052,-1.3060,-0.0945,...,1.1282,-0.1515,0.5935,0.3402,-0.2231,0.3485,-0.8402,0.1042,1.0122,LGG
TCGA-WY-A85C,0.6591,-0.2885,-0.8072,-0.3613,-0.0253,-0.9871,0.1267,0.3564,-1.3060,0.8454,...,-0.7663,0.3284,0.7869,0.9922,0.9422,1.0692,-0.4389,0.7413,1.2262,LGG
TCGA-WY-A85D,1.8078,-2.3298,-1.1571,-0.9304,0.0875,0.8196,-0.3884,-1.8299,-1.3060,0.9347,...,-0.3125,-1.2386,-1.1186,0.6909,-1.2143,-0.1277,-0.6813,-0.5001,0.8035,LGG


In [11]:
#만일 Cancer에 값이 안들어간 경우 해당 행은 제외
fourth_df = third_df[third_df['Cancer'] != '']
fourth_df

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0,Cancer
TCGA-3M-AB46,0.2076,-1.0518,-0.8329,1.2071,-0.7818,-0.8253,-0.9673,-1.5546,0.7883,0.0646,...,2.6983,-1.0061,-0.4243,-1.1151,0.6274,0.5821,-0.8533,-1.6618,0.2748,STAD
TCGA-3M-AB47,0.4276,0.5072,2.1122,0.6262,0.9275,-0.6354,0.4636,-1.0286,-1.4886,-0.5795,...,-0.5029,-1.1278,0.6149,-0.7507,-0.5807,0.3698,-0.1465,-0.0619,0.5492,STAD
TCGA-B7-5816,-0.9922,-2.1072,-1.0945,0.3858,-0.4149,-0.4633,1.1193,0.3950,-0.7351,-0.6825,...,0.1416,-0.0425,-0.9111,-1.1933,-1.4317,-0.8969,-0.0919,-0.3589,-1.4745,STAD
TCGA-B7-5818,-1.0179,-2.1278,-1.0945,-2.6991,-1.2313,0.8780,-0.5319,1.9559,-1.4886,-0.0726,...,0.7458,-2.5996,-5.5211,1.3570,-1.4317,-1.6536,0.3913,-1.5988,-2.7521,STAD
TCGA-B7-A5TI,-0.4527,0.6810,-1.0945,0.8135,0.4039,0.6896,-0.0799,1.5994,-0.1381,0.0263,...,1.0328,-1.6170,-1.0720,-0.4373,-0.8730,-0.9848,-0.5291,-0.6019,0.0214,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,0.0365,-2.3298,0.0690,-0.0906,-0.3067,0.9726,-0.7398,-0.6719,-1.3060,0.8323,...,-0.4914,-0.3775,-0.6975,0.0466,-1.2143,0.4425,-0.9402,-0.1893,0.4236,LGG
TCGA-WY-A85B,0.1544,-2.3298,-0.8346,0.0154,0.8500,-0.8925,-1.0079,0.9052,-1.3060,-0.0945,...,1.1282,-0.1515,0.5935,0.3402,-0.2231,0.3485,-0.8402,0.1042,1.0122,LGG
TCGA-WY-A85C,0.6591,-0.2885,-0.8072,-0.3613,-0.0253,-0.9871,0.1267,0.3564,-1.3060,0.8454,...,-0.7663,0.3284,0.7869,0.9922,0.9422,1.0692,-0.4389,0.7413,1.2262,LGG
TCGA-WY-A85D,1.8078,-2.3298,-1.1571,-0.9304,0.0875,0.8196,-0.3884,-1.8299,-1.3060,0.9347,...,-0.3125,-1.2386,-1.1186,0.6909,-1.2143,-0.1277,-0.6813,-0.5001,0.8035,LGG


In [12]:
#데이터를 0으로 변겅하는 프로세스
fifth_df = fourth_df.fillna(0)
fifth_df


Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0,Cancer
TCGA-3M-AB46,0.2076,-1.0518,-0.8329,1.2071,-0.7818,-0.8253,-0.9673,-1.5546,0.7883,0.0646,...,2.6983,-1.0061,-0.4243,-1.1151,0.6274,0.5821,-0.8533,-1.6618,0.2748,STAD
TCGA-3M-AB47,0.4276,0.5072,2.1122,0.6262,0.9275,-0.6354,0.4636,-1.0286,-1.4886,-0.5795,...,-0.5029,-1.1278,0.6149,-0.7507,-0.5807,0.3698,-0.1465,-0.0619,0.5492,STAD
TCGA-B7-5816,-0.9922,-2.1072,-1.0945,0.3858,-0.4149,-0.4633,1.1193,0.3950,-0.7351,-0.6825,...,0.1416,-0.0425,-0.9111,-1.1933,-1.4317,-0.8969,-0.0919,-0.3589,-1.4745,STAD
TCGA-B7-5818,-1.0179,-2.1278,-1.0945,-2.6991,-1.2313,0.8780,-0.5319,1.9559,-1.4886,-0.0726,...,0.7458,-2.5996,-5.5211,1.3570,-1.4317,-1.6536,0.3913,-1.5988,-2.7521,STAD
TCGA-B7-A5TI,-0.4527,0.6810,-1.0945,0.8135,0.4039,0.6896,-0.0799,1.5994,-0.1381,0.0263,...,1.0328,-1.6170,-1.0720,-0.4373,-0.8730,-0.9848,-0.5291,-0.6019,0.0214,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,0.0365,-2.3298,0.0690,-0.0906,-0.3067,0.9726,-0.7398,-0.6719,-1.3060,0.8323,...,-0.4914,-0.3775,-0.6975,0.0466,-1.2143,0.4425,-0.9402,-0.1893,0.4236,LGG
TCGA-WY-A85B,0.1544,-2.3298,-0.8346,0.0154,0.8500,-0.8925,-1.0079,0.9052,-1.3060,-0.0945,...,1.1282,-0.1515,0.5935,0.3402,-0.2231,0.3485,-0.8402,0.1042,1.0122,LGG
TCGA-WY-A85C,0.6591,-0.2885,-0.8072,-0.3613,-0.0253,-0.9871,0.1267,0.3564,-1.3060,0.8454,...,-0.7663,0.3284,0.7869,0.9922,0.9422,1.0692,-0.4389,0.7413,1.2262,LGG
TCGA-WY-A85D,1.8078,-2.3298,-1.1571,-0.9304,0.0875,0.8196,-0.3884,-1.8299,-1.3060,0.9347,...,-0.3125,-1.2386,-1.1186,0.6909,-1.2143,-0.1277,-0.6813,-0.5001,0.8035,LGG


In [37]:
testval = fifth_df.isin(['MARCH11']).any().any()
testval

False

In [13]:
fifth_df.isnull().sum()

1.0        0
29974.0    0
54715.0    0
87769.0    0
2.0        0
          ..
79699.0    0
7791.0     0
23140.0    0
26009.0    0
Cancer     0
Length: 20503, dtype: int64

In [14]:
X = fifth_df.drop("Cancer", axis=1)
y = fifth_df["Cancer"]

In [15]:
X

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,55055.0,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0
TCGA-3M-AB46,0.2076,-1.0518,-0.8329,1.2071,-0.7818,-0.8253,-0.9673,-1.5546,0.7883,0.0646,...,1.2552,2.6983,-1.0061,-0.4243,-1.1151,0.6274,0.5821,-0.8533,-1.6618,0.2748
TCGA-3M-AB47,0.4276,0.5072,2.1122,0.6262,0.9275,-0.6354,0.4636,-1.0286,-1.4886,-0.5795,...,0.1266,-0.5029,-1.1278,0.6149,-0.7507,-0.5807,0.3698,-0.1465,-0.0619,0.5492
TCGA-B7-5816,-0.9922,-2.1072,-1.0945,0.3858,-0.4149,-0.4633,1.1193,0.3950,-0.7351,-0.6825,...,-0.9013,0.1416,-0.0425,-0.9111,-1.1933,-1.4317,-0.8969,-0.0919,-0.3589,-1.4745
TCGA-B7-5818,-1.0179,-2.1278,-1.0945,-2.6991,-1.2313,0.8780,-0.5319,1.9559,-1.4886,-0.0726,...,-1.4085,0.7458,-2.5996,-5.5211,1.3570,-1.4317,-1.6536,0.3913,-1.5988,-2.7521
TCGA-B7-A5TI,-0.4527,0.6810,-1.0945,0.8135,0.4039,0.6896,-0.0799,1.5994,-0.1381,0.0263,...,0.3886,1.0328,-1.6170,-1.0720,-0.4373,-0.8730,-0.9848,-0.5291,-0.6019,0.0214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,0.0365,-2.3298,0.0690,-0.0906,-0.3067,0.9726,-0.7398,-0.6719,-1.3060,0.8323,...,-0.5834,-0.4914,-0.3775,-0.6975,0.0466,-1.2143,0.4425,-0.9402,-0.1893,0.4236
TCGA-WY-A85B,0.1544,-2.3298,-0.8346,0.0154,0.8500,-0.8925,-1.0079,0.9052,-1.3060,-0.0945,...,1.0486,1.1282,-0.1515,0.5935,0.3402,-0.2231,0.3485,-0.8402,0.1042,1.0122
TCGA-WY-A85C,0.6591,-0.2885,-0.8072,-0.3613,-0.0253,-0.9871,0.1267,0.3564,-1.3060,0.8454,...,-0.6817,-0.7663,0.3284,0.7869,0.9922,0.9422,1.0692,-0.4389,0.7413,1.2262
TCGA-WY-A85D,1.8078,-2.3298,-1.1571,-0.9304,0.0875,0.8196,-0.3884,-1.8299,-1.3060,0.9347,...,-1.9247,-0.3125,-1.2386,-1.1186,0.6909,-1.2143,-0.1277,-0.6813,-0.5001,0.8035


In [16]:
y

TCGA-3M-AB46    STAD
TCGA-3M-AB47    STAD
TCGA-B7-5816    STAD
TCGA-B7-5818    STAD
TCGA-B7-A5TI    STAD
                ... 
TCGA-WY-A85A     LGG
TCGA-WY-A85B     LGG
TCGA-WY-A85C     LGG
TCGA-WY-A85D     LGG
TCGA-WY-A85E     LGG
Name: Cancer, Length: 1962, dtype: object

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

In [19]:
final_model = LogisticRegression(max_iter=400)
final_model.fit(X, y)

In [20]:
#train, test split해서 학습시킨 model로 점수 구하였을 때...
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [21]:
y_pred

array(['STAD', 'COAD', 'COAD', 'LUAD', 'LUAD', 'STAD', 'COAD', 'STAD',
       'STAD', 'LGG', 'LIHC', 'LUAD', 'LUAD', 'LGG', 'STAD', 'STAD',
       'LGG', 'LIHC', 'LIHC', 'COAD', 'LIHC', 'COAD', 'LIHC', 'LIHC',
       'LIHC', 'COAD', 'STAD', 'STAD', 'LIHC', 'LIHC', 'LGG', 'STAD',
       'LIHC', 'LGG', 'SKCM', 'LUAD', 'LGG', 'LIHC', 'STAD', 'LUAD',
       'LGG', 'LUAD', 'LIHC', 'LIHC', 'LUAD', 'STAD', 'LGG', 'LGG', 'LGG',
       'LUAD', 'COAD', 'LUAD', 'STAD', 'LIHC', 'LUAD', 'LUAD', 'LGG',
       'LGG', 'LUAD', 'LUAD', 'STAD', 'STAD', 'STAD', 'LUAD', 'LUAD',
       'LGG', 'LUAD', 'LUAD', 'LGG', 'LIHC', 'COAD', 'STAD', 'STAD',
       'COAD', 'LUAD', 'LGG', 'LGG', 'LUAD', 'LGG', 'LGG', 'LGG', 'LGG',
       'LIHC', 'LUAD', 'LIHC', 'COAD', 'LUAD', 'LUAD', 'LIHC', 'LGG',
       'COAD', 'LIHC', 'LUAD', 'LGG', 'LGG', 'COAD', 'COAD', 'LUAD',
       'COAD', 'COAD', 'LGG', 'LUAD', 'COAD', 'LGG', 'STAD', 'LUAD',
       'LGG', 'LGG', 'STAD', 'STAD', 'LIHC', 'LUAD', 'LUAD', 'LGG', 'LGG',
       'COA

### 여기서부터는 테스트 데이터 결과 도출입니다

In [22]:
test_rna_df = pd.read_csv("./testData/test_data_rna-seq_v1.csv", encoding='utf-8')
test_rna_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,patient_34,patient_35,patient_5,patient_65,patient_7,patient_8,patient_9,patient_1,...,patient_52,patient_53,patient_54,patient_28,patient_68,patient_57,patient_58,patient_59,patient_72,patient_61
0,,100130426,,,,,,,,,...,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197
1,,100133144,0.0313,-1.1888,-0.6275,-2.8225,-0.9355,0.2755,-0.5159,-3.6531,...,-0.0104,-0.4469,-0.2674,-1.4728,-1.1543,-0.0359,-3.1094,-0.7935,-3.1094,-1.4103
2,,100134869,-1.9899,0.0618,-0.6568,-2.0187,-0.2693,1.4479,1.0652,-1.2866,...,-0.6447,-0.7510,-0.4568,-0.9079,-0.1149,-0.9998,-0.6871,0.8822,-0.4106,0.4940
3,,10357,0.8538,-1.4139,-1.6290,-1.6207,-2.0586,-1.8798,-0.6720,-0.8844,...,0.2089,1.8635,1.5797,0.0796,0.3014,0.0396,-1.1425,-0.5699,-0.2267,0.2357
4,,10431,0.0726,-1.4780,-1.4820,0.1970,-1.0236,0.1647,0.5723,1.0053,...,0.3580,0.8525,0.2391,0.5402,0.6718,0.2211,2.4127,0.6833,-0.5026,1.6788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYG11A,440590,-1.4317,1.1030,-0.6893,-0.3313,-1.0208,-1.4317,-0.2368,1.8149,...,0.6812,-0.3444,-1.2143,-1.2143,-0.1383,-0.5836,-0.0645,-0.6352,0.5606,-1.2143
20527,ZYG11B,79699,-0.4268,-1.1815,-0.9811,-1.2863,-0.3284,-1.4403,-1.2965,0.7038,...,-0.0309,1.1275,0.8424,0.6774,1.8079,-1.3332,-3.6953,-0.6294,-0.7373,-2.0814
20528,ZYX,7791,-0.0860,0.4660,0.7733,1.8049,1.3232,0.5540,-0.2708,1.1159,...,-1.1755,-0.5240,-1.6614,-0.2644,-0.6075,0.5624,1.3264,-0.5177,-0.3267,0.6660
20529,ZZEF1,23140,-0.2940,-1.3591,-1.4425,0.3220,0.5374,-2.5013,-0.5578,-0.3484,...,-0.8474,-0.8440,0.0883,0.2331,-2.0918,-1.0157,-0.9135,-0.4802,-1.3742,-1.6413


In [23]:
#Nan제거
test_rna_df = test_rna_df.dropna(subset=['Hugo_Symbol'])
test_rna_df = test_rna_df.reset_index(drop=True)
test_rna_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,patient_34,patient_35,patient_5,patient_65,patient_7,patient_8,patient_9,patient_1,...,patient_52,patient_53,patient_54,patient_28,patient_68,patient_57,patient_58,patient_59,patient_72,patient_61
0,A1BG,1,-0.7970,-1.4861,0.1948,1.3791,0.8012,-1.3639,-1.2248,0.3409,...,0.2190,-2.0785,-0.4279,-1.4864,-0.4417,0.9471,-0.7556,0.1781,0.9657,0.3516
1,A1CF,29974,-1.6064,-0.5171,0.7332,0.6536,0.5584,-0.2447,0.2907,,...,-2.3298,-2.3298,-2.3298,-2.3298,0.0101,-2.3298,-2.3298,-2.3298,3.1993,-2.3298
2,A2BP1,54715,-1.0945,-1.0945,-0.8868,-0.4227,0.9259,-0.9107,-0.9458,,...,-0.8648,0.5075,0.0062,0.0683,1.6677,0.1720,-0.3173,0.1840,-0.3776,-0.9338
3,A2LD1,87769,-0.4542,-0.2880,-0.2676,0.4978,0.5735,-0.5469,0.3127,-0.4666,...,-0.3464,-0.0686,0.1903,-0.1848,-0.1319,0.9772,-0.5544,-0.8007,1.8816,1.4004
4,A2M,2,0.4107,0.4235,0.8486,1.6071,1.6144,0.1215,-0.5271,1.5034,...,0.8221,0.7054,0.0560,1.7167,-0.9979,0.2529,1.2668,-0.9036,0.1425,-0.4349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20497,ZYG11A,440590,-1.4317,1.1030,-0.6893,-0.3313,-1.0208,-1.4317,-0.2368,1.8149,...,0.6812,-0.3444,-1.2143,-1.2143,-0.1383,-0.5836,-0.0645,-0.6352,0.5606,-1.2143
20498,ZYG11B,79699,-0.4268,-1.1815,-0.9811,-1.2863,-0.3284,-1.4403,-1.2965,0.7038,...,-0.0309,1.1275,0.8424,0.6774,1.8079,-1.3332,-3.6953,-0.6294,-0.7373,-2.0814
20499,ZYX,7791,-0.0860,0.4660,0.7733,1.8049,1.3232,0.5540,-0.2708,1.1159,...,-1.1755,-0.5240,-1.6614,-0.2644,-0.6075,0.5624,1.3264,-0.5177,-0.3267,0.6660
20500,ZZEF1,23140,-0.2940,-1.3591,-1.4425,0.3220,0.5374,-2.5013,-0.5578,-0.3484,...,-0.8474,-0.8440,0.0883,0.2331,-2.0918,-1.0157,-0.9135,-0.4802,-1.3742,-1.6413


In [24]:
test_rna_set = test_rna_df.drop(['Hugo_Symbol'],axis=1)

test_rna_set=test_rna_set.transpose()

test_rna_set = test_rna_set.rename(columns=test_rna_set.iloc[0])

test_rna_set = test_rna_set.drop(test_rna_set.index[0])
test_rna_set

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,55055.0,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0
patient_34,-0.7970,-1.6064,-1.0945,-0.4542,0.4107,0.4798,-0.4802,-1.1262,-1.4886,-0.6607,...,0.6668,0.1833,-0.2432,-0.3486,0.1757,-1.4317,-0.4268,-0.0860,-0.2940,-0.0350
patient_35,-1.4861,-0.5171,-1.0945,-0.2880,0.4235,-0.8253,1.1270,-0.3289,-1.4886,-0.2843,...,-1.4181,-1.4764,0.4286,-0.2136,-0.4854,1.1030,-1.1815,0.4660,-1.3591,-0.9253
patient_5,0.1948,0.7332,-0.8868,-0.2676,0.8486,-0.7489,0.8944,2.0952,-1.4886,-0.5063,...,-1.1046,-0.9107,0.4796,0.2064,-0.6044,-0.6893,-0.9811,0.7733,-1.4425,-0.7750
patient_65,1.3791,0.6536,-0.4227,0.4978,1.6071,-0.8253,1.9917,-1.1048,-1.4886,1.0861,...,-2.2675,-0.4971,0.8220,-0.0229,-0.5947,-0.3313,-1.2863,1.8049,0.3220,-1.9503
patient_7,0.8012,0.5584,0.9259,0.5735,1.6144,0.8943,1.2996,0.6000,-0.8817,-0.5246,...,-0.8591,-0.4480,0.3806,-0.2777,-0.7029,-1.0208,-0.3284,1.3232,0.5374,-0.6608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_57,0.9471,-2.3298,0.1720,0.9772,0.2529,1.5292,0.0796,-1.8299,-1.3060,-0.8843,...,0.0831,-0.9160,-1.0609,-0.9301,0.3490,-0.5836,-1.3332,0.5624,-1.0157,0.6091
patient_58,-0.7556,-2.3298,-0.3173,-0.5544,1.2668,-1.4734,2.3380,-1.8299,-1.3060,2.0684,...,-0.4510,1.2286,-1.5066,-1.0752,-0.7429,-0.0645,-3.6953,1.3264,-0.9135,-4.3633
patient_59,0.1781,-2.3298,0.1840,-0.8007,-0.9036,0.5082,-0.3016,-1.8299,-0.4040,0.6352,...,-1.9418,-0.5863,-0.5323,0.0291,0.1092,-0.6352,-0.6294,-0.5177,-0.4802,-1.3504
patient_72,0.9657,3.1993,-0.3776,1.8816,0.1425,1.9203,0.2481,-0.7041,-1.3060,-0.5267,...,-0.8721,-2.1572,-1.5062,-1.5347,-0.2171,0.5606,-0.7373,-0.3267,-1.3742,0.0142


In [25]:
#데이터를 0으로 변겅하는 프로세스
test_rna_set = test_rna_set.fillna(0)
test_rna_set


Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,55055.0,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0
patient_34,-0.7970,-1.6064,-1.0945,-0.4542,0.4107,0.4798,-0.4802,-1.1262,-1.4886,-0.6607,...,0.6668,0.1833,-0.2432,-0.3486,0.1757,-1.4317,-0.4268,-0.0860,-0.2940,-0.0350
patient_35,-1.4861,-0.5171,-1.0945,-0.2880,0.4235,-0.8253,1.1270,-0.3289,-1.4886,-0.2843,...,-1.4181,-1.4764,0.4286,-0.2136,-0.4854,1.1030,-1.1815,0.4660,-1.3591,-0.9253
patient_5,0.1948,0.7332,-0.8868,-0.2676,0.8486,-0.7489,0.8944,2.0952,-1.4886,-0.5063,...,-1.1046,-0.9107,0.4796,0.2064,-0.6044,-0.6893,-0.9811,0.7733,-1.4425,-0.7750
patient_65,1.3791,0.6536,-0.4227,0.4978,1.6071,-0.8253,1.9917,-1.1048,-1.4886,1.0861,...,-2.2675,-0.4971,0.8220,-0.0229,-0.5947,-0.3313,-1.2863,1.8049,0.3220,-1.9503
patient_7,0.8012,0.5584,0.9259,0.5735,1.6144,0.8943,1.2996,0.6000,-0.8817,-0.5246,...,-0.8591,-0.4480,0.3806,-0.2777,-0.7029,-1.0208,-0.3284,1.3232,0.5374,-0.6608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_57,0.9471,-2.3298,0.1720,0.9772,0.2529,1.5292,0.0796,-1.8299,-1.3060,-0.8843,...,0.0831,-0.9160,-1.0609,-0.9301,0.3490,-0.5836,-1.3332,0.5624,-1.0157,0.6091
patient_58,-0.7556,-2.3298,-0.3173,-0.5544,1.2668,-1.4734,2.3380,-1.8299,-1.3060,2.0684,...,-0.4510,1.2286,-1.5066,-1.0752,-0.7429,-0.0645,-3.6953,1.3264,-0.9135,-4.3633
patient_59,0.1781,-2.3298,0.1840,-0.8007,-0.9036,0.5082,-0.3016,-1.8299,-0.4040,0.6352,...,-1.9418,-0.5863,-0.5323,0.0291,0.1092,-0.6352,-0.6294,-0.5177,-0.4802,-1.3504
patient_72,0.9657,3.1993,-0.3776,1.8816,0.1425,1.9203,0.2481,-0.7041,-1.3060,-0.5267,...,-0.8721,-2.1572,-1.5062,-1.5347,-0.2171,0.5606,-0.7373,-0.3267,-1.3742,0.0142


In [26]:
#환자1부터 차례대로 있게 하는것
test_rna_set = test_rna_set.iloc[test_rna_set.index.astype(str).str.extract('(\d+)').astype(int).sort_values(by=0).index]
test_rna_set

Unnamed: 0,1.0,29974.0,54715.0,87769.0,2.0,144568.0,53947.0,51146.0,404744.0,8086.0,...,55055.0,11130.0,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0
patient_1,0.3409,0.0000,0.0000,-0.4666,1.5034,-0.8245,1.3905,-0.2116,0.0000,-0.1563,...,-0.7690,-0.1299,0.1915,-0.1904,-1.1761,1.8149,0.7038,1.1159,-0.3484,-0.4009
patient_2,-0.6584,-0.9931,-0.8520,-0.2783,0.6657,-0.5695,-0.8448,-0.3670,-0.0778,-0.3129,...,-0.0649,-0.0885,-0.8805,-0.5958,0.5515,0.3507,0.3513,-0.1165,-0.2731,0.0095
patient_3,-0.4427,0.7887,-1.0359,0.1624,0.1326,-1.5105,0.9635,-1.1665,1.5832,0.5078,...,1.2754,1.3721,1.2591,1.9395,1.5085,1.7182,2.0320,0.1193,0.9679,1.0774
patient_4,-1.0651,0.8219,-1.0359,1.2859,-1.9703,-1.5105,-1.3483,-1.1665,-1.5794,0.0386,...,1.3948,1.0977,2.1222,0.8577,2.7827,0.4183,0.8452,-2.0517,1.1303,0.4846
patient_5,0.1948,0.7332,-0.8868,-0.2676,0.8486,-0.7489,0.8944,2.0952,-1.4886,-0.5063,...,-1.1046,-0.9107,0.4796,0.2064,-0.6044,-0.6893,-0.9811,0.7733,-1.4425,-0.7750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_70,-0.0754,-1.2801,-0.4983,-0.0559,-1.1749,-0.9136,1.4859,-0.0936,-4.6136,0.6735,...,1.4717,1.9149,-1.2712,-0.5375,-1.5697,-0.4334,-0.7383,1.5243,-1.5691,-0.6089
patient_71,-0.3020,-1.2801,-0.9874,-1.0931,-1.8139,-0.9136,-0.4788,0.2504,-4.6136,0.7004,...,1.3178,0.9137,-0.8435,0.6213,-0.8221,1.9176,0.6194,0.9272,-2.2363,-0.6841
patient_72,0.9657,3.1993,-0.3776,1.8816,0.1425,1.9203,0.2481,-0.7041,-1.3060,-0.5267,...,-0.8721,-2.1572,-1.5062,-1.5347,-0.2171,0.5606,-0.7373,-0.3267,-1.3742,0.0142
patient_73,0.1144,0.6345,-0.9874,0.7102,0.4054,0.1779,-0.6599,-0.3626,-4.6136,2.3964,...,-2.2624,-0.7311,-1.5147,-1.2047,-2.3282,-0.1411,-0.7221,0.8746,-0.7372,-3.0246


In [27]:
test_pred = final_model.predict(test_rna_set)

In [28]:
test_pred

array(['STAD', 'LUAD', 'LIHC', 'LIHC', 'STAD', 'LIHC', 'STAD', 'STAD',
       'STAD', 'LUAD', 'LUAD', 'LUAD', 'SKCM', 'SKCM', 'LUAD', 'COAD',
       'LUAD', 'STAD', 'LUAD', 'LUAD', 'LIHC', 'LUAD', 'LUAD', 'LIHC',
       'LUAD', 'LIHC', 'LIHC', 'LGG', 'LIHC', 'LIHC', 'COAD', 'LIHC',
       'LIHC', 'STAD', 'STAD', 'LIHC', 'COAD', 'COAD', 'COAD', 'COAD',
       'LUAD', 'COAD', 'LGG', 'SKCM', 'LUAD', 'COAD', 'COAD', 'LGG',
       'LGG', 'SKCM', 'LGG', 'LGG', 'LGG', 'LGG', 'LIHC', 'COAD', 'LGG',
       'LGG', 'LGG', 'COAD', 'LGG', 'SKCM', 'SKCM', 'SKCM', 'STAD',
       'SKCM', 'SKCM', 'LGG', 'LUAD', 'SKCM', 'SKCM', 'LGG', 'SKCM',
       'SKCM'], dtype=object)

In [44]:
data = {'id': ['patient_' + str(i) for i in range(1, len(test_pred)+1)],
        'expected': test_pred}

df = pd.DataFrame(data)
df.to_csv('rna_predictions.csv', index=False)

### 데이터 학습에 영향을 끼친 column 찾기

In [29]:
# 회귀 계수 출력
coef = final_model.coef_
print(coef[0])

[ 0.00180909  0.00106825 -0.00162253 ... -0.00029034 -0.00076311
  0.00016087]


In [33]:
feature_importance = abs(model.coef_[0])
most_influential_column_index = np.argmax(feature_importance)
most_influential_column = X_train.columns[most_influential_column_index]
print("Most influential column:", most_influential_column)

Most influential column: 390321.0


In [34]:
feature_importance = abs(model.coef_[1])
most_influential_column_index = np.argmax(feature_importance)
most_influential_column = X_train.columns[most_influential_column_index]
print("Most influential column 2st:", most_influential_column)

Most influential column 2st: 403278.0


In [35]:
feature_importance = abs(model.coef_[2])
most_influential_column_index = np.argmax(feature_importance)
most_influential_column = X_train.columns[most_influential_column_index]
print("Most influential column 3st:", most_influential_column)

Most influential column 3st: 641451.0
