In [308]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import KMeans

import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

from math import sqrt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import joblib


In [309]:
clinical = pd.read_csv('./trainData/train_data_clinical_patient.csv', usecols=['PATIENT_ID', 'CANCER_TYPE_ACRONYM'])

In [310]:
clinical

Unnamed: 0,PATIENT_ID,CANCER_TYPE_ACRONYM
0,TCGA-3M-AB46,STAD
1,TCGA-3M-AB47,STAD
2,TCGA-B7-5816,STAD
3,TCGA-B7-5818,STAD
4,TCGA-B7-A5TI,STAD
...,...,...
2692,TCGA-WY-A85B,LGG
2693,TCGA-WY-A85C,LGG
2694,TCGA-WY-A85D,LGG
2695,TCGA-WY-A85E,LGG


In [311]:
mutation_df = pd.read_csv("./trainData/train_data_mutations.csv", encoding='utf-8')
mutation_df

  mutation_df = pd.read_csv("./trainData/train_data_mutations.csv", encoding='utf-8')


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,ENSP,ExAC_AF,CENTERS,CCDS,EXON,ExAC_AF_OTH,SAS_MAF,Exon_Number,MINIMISED,PUBMED
0,ABCC2,0,.,GRCh37,10,101560185,101560185,+,missense_variant,Missense_Mutation,...,ENSP00000359478,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7484.1,Sep-32,.,.,Sep-32,.,.
1,UCMA,0,.,GRCh37,10,13264119,13264119,+,missense_variant,Missense_Mutation,...,ENSP00000367952,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS31147.1,5-May,.,.,5-May,.,.
2,STAM,0,.,GRCh37,10,17737155,17737155,+,missense_variant,Missense_Mutation,...,ENSP00000366746,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7122.1,14-Jul,.,.,14-Jul,.,.
3,RASGEF1A,0,.,GRCh37,10,43693490,43693490,+,missense_variant,Missense_Mutation,...,ENSP00000379154,.,MUTECT|MUSE|VARSCANS,CCDS7202.2,13-Oct,.,.,13-Oct,.,.
4,AGAP4,0,.,GRCh37,10,46321916,46321916,+,missense_variant,Missense_Mutation,...,ENSP00000392513,.,RADIA|VARSCANS,CCDS7215.1,7-Jul,.,.,7-Jul,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442693,DCAF12L2,0,.,GRCh37,X,125299773,125299773,+,synonymous_variant,Silent,...,ENSP00000441489,.,MUTECT|SOMATICSNIPER|MUSE|VARSCANS,CCDS43991.1,2/2,.,.,2/2,.,.
1442694,IQSEC2,0,.,GRCh37,X,53277942,53277942,+,missense_variant,Missense_Mutation,...,ENSP00000379712,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS48130.1,6/15,.,.,6/15,.,.
1442695,ITIH6,0,.,GRCh37,X,54783821,54783821,+,missense_variant,Missense_Mutation,...,ENSP00000218436,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS14361.1,8/13,.,.,8/13,.,.
1442696,ATRX,0,.,GRCh37,X,76938297,76938297,+,stop_gained,Nonsense_Mutation,...,ENSP00000362441,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS14434.1,9/35,.,.,9/35,.,.


In [312]:
# 'Matched_Norm_Sample_Barcode' 열 추출
matched_norm_sample_barcode = mutation_df['Matched_Norm_Sample_Barcode']

# 데이터 프레임에서 'Matched_Norm_Sample_Barcode' 열 삭제
mutation_df = mutation_df.drop(columns=['Matched_Norm_Sample_Barcode'])

# 'Matched_Norm_Sample_Barcode' 열을 첫 번째 열로 추가
mutation_df.insert(0, 'Matched_Norm_Sample_Barcode', matched_norm_sample_barcode)
mutation_df

Unnamed: 0,Matched_Norm_Sample_Barcode,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,...,ENSP,ExAC_AF,CENTERS,CCDS,EXON,ExAC_AF_OTH,SAS_MAF,Exon_Number,MINIMISED,PUBMED
0,TCGA-3M-AB46-10,ABCC2,0,.,GRCh37,10,101560185,101560185,+,missense_variant,...,ENSP00000359478,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7484.1,Sep-32,.,.,Sep-32,.,.
1,TCGA-3M-AB46-10,UCMA,0,.,GRCh37,10,13264119,13264119,+,missense_variant,...,ENSP00000367952,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS31147.1,5-May,.,.,5-May,.,.
2,TCGA-3M-AB46-10,STAM,0,.,GRCh37,10,17737155,17737155,+,missense_variant,...,ENSP00000366746,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7122.1,14-Jul,.,.,14-Jul,.,.
3,TCGA-3M-AB46-10,RASGEF1A,0,.,GRCh37,10,43693490,43693490,+,missense_variant,...,ENSP00000379154,.,MUTECT|MUSE|VARSCANS,CCDS7202.2,13-Oct,.,.,13-Oct,.,.
4,TCGA-3M-AB46-10,AGAP4,0,.,GRCh37,10,46321916,46321916,+,missense_variant,...,ENSP00000392513,.,RADIA|VARSCANS,CCDS7215.1,7-Jul,.,.,7-Jul,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442693,TCGA-WY-A85E-10,DCAF12L2,0,.,GRCh37,X,125299773,125299773,+,synonymous_variant,...,ENSP00000441489,.,MUTECT|SOMATICSNIPER|MUSE|VARSCANS,CCDS43991.1,2/2,.,.,2/2,.,.
1442694,TCGA-WY-A85E-10,IQSEC2,0,.,GRCh37,X,53277942,53277942,+,missense_variant,...,ENSP00000379712,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS48130.1,6/15,.,.,6/15,.,.
1442695,TCGA-WY-A85E-10,ITIH6,0,.,GRCh37,X,54783821,54783821,+,missense_variant,...,ENSP00000218436,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS14361.1,8/13,.,.,8/13,.,.
1442696,TCGA-WY-A85E-10,ATRX,0,.,GRCh37,X,76938297,76938297,+,stop_gained,...,ENSP00000362441,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS14434.1,9/35,.,.,9/35,.,.


In [313]:
mutation_df['Matched_Norm_Sample_Barcode'] = mutation_df['Matched_Norm_Sample_Barcode'].str.rstrip('-01')
mutation_df

Unnamed: 0,Matched_Norm_Sample_Barcode,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,...,ENSP,ExAC_AF,CENTERS,CCDS,EXON,ExAC_AF_OTH,SAS_MAF,Exon_Number,MINIMISED,PUBMED
0,TCGA-3M-AB46,ABCC2,0,.,GRCh37,10,101560185,101560185,+,missense_variant,...,ENSP00000359478,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7484.1,Sep-32,.,.,Sep-32,.,.
1,TCGA-3M-AB46,UCMA,0,.,GRCh37,10,13264119,13264119,+,missense_variant,...,ENSP00000367952,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS31147.1,5-May,.,.,5-May,.,.
2,TCGA-3M-AB46,STAM,0,.,GRCh37,10,17737155,17737155,+,missense_variant,...,ENSP00000366746,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS7122.1,14-Jul,.,.,14-Jul,.,.
3,TCGA-3M-AB46,RASGEF1A,0,.,GRCh37,10,43693490,43693490,+,missense_variant,...,ENSP00000379154,.,MUTECT|MUSE|VARSCANS,CCDS7202.2,13-Oct,.,.,13-Oct,.,.
4,TCGA-3M-AB46,AGAP4,0,.,GRCh37,10,46321916,46321916,+,missense_variant,...,ENSP00000392513,.,RADIA|VARSCANS,CCDS7215.1,7-Jul,.,.,7-Jul,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442693,TCGA-WY-A85E,DCAF12L2,0,.,GRCh37,X,125299773,125299773,+,synonymous_variant,...,ENSP00000441489,.,MUTECT|SOMATICSNIPER|MUSE|VARSCANS,CCDS43991.1,2/2,.,.,2/2,.,.
1442694,TCGA-WY-A85E,IQSEC2,0,.,GRCh37,X,53277942,53277942,+,missense_variant,...,ENSP00000379712,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS48130.1,6/15,.,.,6/15,.,.
1442695,TCGA-WY-A85E,ITIH6,0,.,GRCh37,X,54783821,54783821,+,missense_variant,...,ENSP00000218436,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS14361.1,8/13,.,.,8/13,.,.
1442696,TCGA-WY-A85E,ATRX,0,.,GRCh37,X,76938297,76938297,+,stop_gained,...,ENSP00000362441,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS14434.1,9/35,.,.,9/35,.,.


In [317]:
mutation_dfset=mutation_df.copy()

mutation_dfset=mutation_dfset[['Matched_Norm_Sample_Barcode','Chromosome','Start_Position','End_Position','Variant_Type','t_ref_count','t_alt_count','n_ref_count','n_alt_count','Protein_position','NCALLERS','n_depth','t_depth','cDNA_position','CDS_position']]
mutation_dfset=mutation_dfset.transpose()

mutation_dfset = mutation_dfset.rename(columns=mutation_dfset.iloc[0])
mutation_dfset = mutation_dfset.drop(mutation_dfset.index[0])
mutation_dfset=mutation_dfset.transpose()

mutation_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
TCGA-3M-AB46,10,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187,1074
TCGA-3M-AB46,10,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474,401
TCGA-3M-AB46,10,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858,643
TCGA-3M-AB46,10,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693,1186
TCGA-3M-AB46,10,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565,1439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,X,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216,135
TCGA-WY-A85E,X,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621,2420
TCGA-WY-A85E,X,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716,2686
TCGA-WY-A85E,X,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666,2451


In [318]:
for i in range(0, clinical.shape[0]-1):   
    abcd=mutation_dfset.index.isin([clinical.at[i,'PATIENT_ID']])
    mutation_dfset.loc[abcd, 'Cancer'] = clinical.at[i,'CANCER_TYPE_ACRONYM']
mutation_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187,1074,STAD
TCGA-3M-AB46,10,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474,401,STAD
TCGA-3M-AB46,10,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858,643,STAD
TCGA-3M-AB46,10,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693,1186,STAD
TCGA-3M-AB46,10,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565,1439,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,X,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216,135,LGG
TCGA-WY-A85E,X,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621,2420,LGG
TCGA-WY-A85E,X,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716,2686,LGG
TCGA-WY-A85E,X,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666,2451,LGG


In [319]:
#데이터를 0으로 변겅하는 프로세스
mutation_dfset = mutation_dfset.fillna(0)
mutation_dfset
#데이터를 삭제하면 40만개가 사라져서  nan값이 들어있는 행을 없애지 않고 0으로 변경
# mutation_dfset=mutation_dfset.dropna(axis=0)
# mutation_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187,1074,STAD
TCGA-3M-AB46,10,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474,401,STAD
TCGA-3M-AB46,10,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858,643,STAD
TCGA-3M-AB46,10,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693,1186,STAD
TCGA-3M-AB46,10,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565,1439,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,X,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216,135,LGG
TCGA-WY-A85E,X,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621,2420,LGG
TCGA-WY-A85E,X,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716,2686,LGG
TCGA-WY-A85E,X,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666,2451,LGG


In [320]:
#cancer쪽에 답이 들어있는 것만 데이터 프레임으로 만들어야 학습이 됨
mutation_dfset = mutation_dfset[mutation_dfset['Cancer'] != '']
mutation_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187,1074,STAD
TCGA-3M-AB46,10,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474,401,STAD
TCGA-3M-AB46,10,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858,643,STAD
TCGA-3M-AB46,10,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693,1186,STAD
TCGA-3M-AB46,10,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565,1439,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,X,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216,135,LGG
TCGA-WY-A85E,X,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621,2420,LGG
TCGA-WY-A85E,X,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716,2686,LGG
TCGA-WY-A85E,X,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666,2451,LGG


In [None]:
# 숫자만 추출하는 정규식 패턴
pattern = r"\d+\.\d+"

# 정규식을 이용하여 숫자 추출
result = re.findall(pattern, mutation_dfset['PolyPhen'])

# 숫자만 추출하는 정규식 패턴
pattern = r"\d+"

# 정규식을 이용하여 숫자 추출
result = re.findall(pattern, data)


### 여기서부터는 데이터 정제 끝내고 학습 준비시작합니다

In [116]:
#원한 잇코딩으로 하다가 결과 안좋아서 다른것으로 진행

# # Hugo_Symbol 열과 Consequence 열을 기준으로 원핫 인코딩 적용
# one_hot_encoding = pd.get_dummies(mutation_dfset[['Variant_Type']])

# # 원핫 인코딩 결과를 기존 데이터프레임에 추가
# df_encoded = pd.concat([mutation_dfset, one_hot_encoding], axis=1)


In [117]:
# df_encoded = df_encoded.drop(['Variant_Type'],axis=1)
# df_encoded

Unnamed: 0,Start_Position,End_Position,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer,Variant_Type_DEL,Variant_Type_INS,Variant_Type_ONP,Variant_Type_SNP,Variant_Type_TNP
TCGA-3M-AB46,101560185,101560185,191,110,182,0,358.0,5,182,302,1187,1074,STAD,0,0,0,1,0
TCGA-3M-AB46,13264119,13264119,9,5,14,0,134.0,5,14,14,474,401,STAD,0,0,0,1,0
TCGA-3M-AB46,17737155,17737155,50,37,55,0,215.0,5,55,87,858,643,STAD,0,0,0,1,0
TCGA-3M-AB46,43693490,43693490,166,15,84,0,396.0,3,84,181,3693,1186,STAD,0,0,0,1,0
TCGA-3M-AB46,46321916,46321916,19,15,14,0,480.0,2,14,35,1565,1439,STAD,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,118223155,118223155,27,5,38,0,680.0,5,38,33,2038,2038,LGG,0,0,0,1,0
TCGA-WY-A85E,125299773,125299773,29,14,35,0,45.0,4,35,43,216,135,LGG,0,0,0,1,0
TCGA-WY-A85E,53277942,53277942,40,26,62,0,807.0,5,62,66,2621,2420,LGG,0,0,0,1,0
TCGA-WY-A85E,54783821,54783821,72,20,63,0,896.0,5,63,92,2716,2686,LGG,0,0,0,1,0


In [118]:
filtered_df = df_encoded[pd.to_numeric(df_encoded['cDNA_position'], errors='coerce').notnull()]

print(filtered_df)

             Start_Position End_Position t_ref_count t_alt_count n_ref_count  \
TCGA-3M-AB46      101560185    101560185         191         110         182   
TCGA-3M-AB46       13264119     13264119           9           5          14   
TCGA-3M-AB46       17737155     17737155          50          37          55   
TCGA-3M-AB46       43693490     43693490         166          15          84   
TCGA-3M-AB46       46321916     46321916          19          15          14   
...                     ...          ...         ...         ...         ...   
TCGA-WY-A85E      118223155    118223155          27           5          38   
TCGA-WY-A85E      125299773    125299773          29          14          35   
TCGA-WY-A85E       53277942     53277942          40          26          62   
TCGA-WY-A85E       54783821     54783821          72          20          63   
TCGA-WY-A85E       76938297     76938297         110          76         190   

             n_alt_count Protein_positi

In [119]:
filtered_df

Unnamed: 0,Start_Position,End_Position,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer,Variant_Type_DEL,Variant_Type_INS,Variant_Type_ONP,Variant_Type_SNP,Variant_Type_TNP
TCGA-3M-AB46,101560185,101560185,191,110,182,0,358.0,5,182,302,1187,1074,STAD,0,0,0,1,0
TCGA-3M-AB46,13264119,13264119,9,5,14,0,134.0,5,14,14,474,401,STAD,0,0,0,1,0
TCGA-3M-AB46,17737155,17737155,50,37,55,0,215.0,5,55,87,858,643,STAD,0,0,0,1,0
TCGA-3M-AB46,43693490,43693490,166,15,84,0,396.0,3,84,181,3693,1186,STAD,0,0,0,1,0
TCGA-3M-AB46,46321916,46321916,19,15,14,0,480.0,2,14,35,1565,1439,STAD,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,118223155,118223155,27,5,38,0,680.0,5,38,33,2038,2038,LGG,0,0,0,1,0
TCGA-WY-A85E,125299773,125299773,29,14,35,0,45.0,4,35,43,216,135,LGG,0,0,0,1,0
TCGA-WY-A85E,53277942,53277942,40,26,62,0,807.0,5,62,66,2621,2420,LGG,0,0,0,1,0
TCGA-WY-A85E,54783821,54783821,72,20,63,0,896.0,5,63,92,2716,2686,LGG,0,0,0,1,0


In [321]:
test_df=mutation_dfset.copy()
test_df

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187,1074,STAD
TCGA-3M-AB46,10,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474,401,STAD
TCGA-3M-AB46,10,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858,643,STAD
TCGA-3M-AB46,10,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693,1186,STAD
TCGA-3M-AB46,10,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565,1439,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,X,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216,135,LGG
TCGA-WY-A85E,X,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621,2420,LGG
TCGA-WY-A85E,X,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716,2686,LGG
TCGA-WY-A85E,X,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666,2451,LGG


In [327]:
test_df=mutation_dfset.copy()

test_df['Chromosome'] = pd.to_numeric(test_df['Chromosome'], errors='coerce')
test_df['Protein_position'] = pd.to_numeric(test_df['Protein_position'], errors='coerce')
test_df['Start_Position'] = pd.to_numeric(test_df['Start_Position'], errors='coerce')
test_df['End_Position'] = pd.to_numeric(test_df['End_Position'], errors='coerce')
test_df['t_ref_count'] = pd.to_numeric(test_df['t_ref_count'], errors='coerce')
test_df['t_alt_count'] = pd.to_numeric(test_df['t_alt_count'], errors='coerce')
test_df['n_ref_count'] = pd.to_numeric(test_df['n_ref_count'], errors='coerce')
test_df['n_alt_count'] = pd.to_numeric(test_df['n_alt_count'], errors='coerce')
test_df['NCALLERS'] = pd.to_numeric(test_df['NCALLERS'], errors='coerce')
test_df['n_depth'] = pd.to_numeric(test_df['n_depth'], errors='coerce')
test_df['t_depth'] = pd.to_numeric(test_df['t_depth'], errors='coerce')
test_df['cDNA_position'] = pd.to_numeric(test_df['cDNA_position'], errors='coerce')
test_df['CDS_position'] = pd.to_numeric(test_df['CDS_position'], errors='coerce')

In [328]:
test_df

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10.0,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187.0,1074.0,STAD
TCGA-3M-AB46,10.0,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474.0,401.0,STAD
TCGA-3M-AB46,10.0,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858.0,643.0,STAD
TCGA-3M-AB46,10.0,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693.0,1186.0,STAD
TCGA-3M-AB46,10.0,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565.0,1439.0,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216.0,135.0,LGG
TCGA-WY-A85E,,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621.0,2420.0,LGG
TCGA-WY-A85E,,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716.0,2686.0,LGG
TCGA-WY-A85E,,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666.0,2451.0,LGG


In [329]:
#데이터를 0으로 변겅하는 프로세스
test_df = test_df.fillna(0)
test_df

#얘 하니까 너무 많이 사라져서 조치
# test_df=test_df.dropna(axis=0)
# test_df

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10.0,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187.0,1074.0,STAD
TCGA-3M-AB46,10.0,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474.0,401.0,STAD
TCGA-3M-AB46,10.0,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858.0,643.0,STAD
TCGA-3M-AB46,10.0,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693.0,1186.0,STAD
TCGA-3M-AB46,10.0,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565.0,1439.0,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,0.0,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216.0,135.0,LGG
TCGA-WY-A85E,0.0,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621.0,2420.0,LGG
TCGA-WY-A85E,0.0,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716.0,2686.0,LGG
TCGA-WY-A85E,0.0,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666.0,2451.0,LGG


In [359]:
test_df = test_df[test_df['Cancer'] != 0]
test_df

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position,Cancer
TCGA-3M-AB46,10.0,101560185,101560185,SNP,191,110,182,0,358.0,5,182,302,1187.0,1074.0,STAD
TCGA-3M-AB46,10.0,13264119,13264119,SNP,9,5,14,0,134.0,5,14,14,474.0,401.0,STAD
TCGA-3M-AB46,10.0,17737155,17737155,SNP,50,37,55,0,215.0,5,55,87,858.0,643.0,STAD
TCGA-3M-AB46,10.0,43693490,43693490,SNP,166,15,84,0,396.0,3,84,181,3693.0,1186.0,STAD
TCGA-3M-AB46,10.0,46321916,46321916,SNP,19,15,14,0,480.0,2,14,35,1565.0,1439.0,STAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,0.0,125299773,125299773,SNP,29,14,35,0,45.0,4,35,43,216.0,135.0,LGG
TCGA-WY-A85E,0.0,53277942,53277942,SNP,40,26,62,0,807.0,5,62,66,2621.0,2420.0,LGG
TCGA-WY-A85E,0.0,54783821,54783821,SNP,72,20,63,0,896.0,5,63,92,2716.0,2686.0,LGG
TCGA-WY-A85E,0.0,76938297,76938297,SNP,110,76,190,0,817.0,5,190,186,2666.0,2451.0,LGG


In [371]:
X = test_df.drop("Cancer", axis=1)
y = test_df["Cancer"]

In [378]:
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 생성

# X_train데이터를 이용 피팅하고 라벨숫자로 변환한다
label_encoder = LabelEncoder()
label_encoder.fit(X['Variant_Type'])

X['Variant_Type']=label_encoder.transform(X['Variant_Type'])


In [379]:
X

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
TCGA-3M-AB46,10.0,101560185,101560185,3,191,110,182,0,358.0,5,182,302,1187.0,1074.0
TCGA-3M-AB46,10.0,13264119,13264119,3,9,5,14,0,134.0,5,14,14,474.0,401.0
TCGA-3M-AB46,10.0,17737155,17737155,3,50,37,55,0,215.0,5,55,87,858.0,643.0
TCGA-3M-AB46,10.0,43693490,43693490,3,166,15,84,0,396.0,3,84,181,3693.0,1186.0
TCGA-3M-AB46,10.0,46321916,46321916,3,19,15,14,0,480.0,2,14,35,1565.0,1439.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85E,0.0,125299773,125299773,3,29,14,35,0,45.0,4,35,43,216.0,135.0
TCGA-WY-A85E,0.0,53277942,53277942,3,40,26,62,0,807.0,5,62,66,2621.0,2420.0
TCGA-WY-A85E,0.0,54783821,54783821,3,72,20,63,0,896.0,5,63,92,2716.0,2686.0
TCGA-WY-A85E,0.0,76938297,76938297,3,110,76,190,0,817.0,5,190,186,2666.0,2451.0


In [380]:
# 정규화 함수 정의
def normalize_column(column):
    min_value = column.min()
    max_value = column.max()
    normalized_column = (column - min_value) / (max_value - min_value)
    return normalized_column

# 정규화할 숫자 열 선택
numeric_columns = ['Chromosome','Start_Position', 'End_Position', 'Variant_Type','t_ref_count', 't_alt_count',
                   'n_ref_count', 'n_alt_count', 'Protein_position', 'NCALLERS',
                   'n_depth', 't_depth', 'cDNA_position', 'CDS_position']

# 선택한 열들을 정규화
X[numeric_columns] = X[numeric_columns].apply(normalize_column)

# 정규화된 데이터 확인
print(X)

              Chromosome  Start_Position  End_Position  Variant_Type  \
TCGA-3M-AB46    0.454545        0.407509      0.407509          0.75   
TCGA-3M-AB46    0.454545        0.053201      0.053201          0.75   
TCGA-3M-AB46    0.454545        0.071150      0.071150          0.75   
TCGA-3M-AB46    0.454545        0.175306      0.175306          0.75   
TCGA-3M-AB46    0.454545        0.185853      0.185853          0.75   
...                  ...             ...           ...           ...   
TCGA-WY-A85E    0.000000        0.502770      0.502770          0.75   
TCGA-WY-A85E    0.000000        0.213766      0.213766          0.75   
TCGA-WY-A85E    0.000000        0.219808      0.219808          0.75   
TCGA-WY-A85E    0.000000        0.308708      0.308708          0.75   
TCGA-DU-7306    0.863636        0.091658      0.091744          0.00   

              t_ref_count  t_alt_count  n_ref_count  n_alt_count  \
TCGA-3M-AB46     0.025484     0.020270     0.036345          0.0   

In [357]:
X['Start_Position'].dtype

dtype('float64')

In [381]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [234]:
# 결측값을 평균으로 대체하는 SimpleImputer 생성
imputer = SimpleImputer(strategy='mean')

# X 데이터에 대해 결측값 대체 수행
X = imputer.fit_transform(X)

In [382]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [383]:
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [384]:
y_pred = model.predict(X_test)

In [385]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4553007616012813


In [386]:
y_test

TCGA-55-1592    LUAD
TCGA-17-Z005    LUAD
TCGA-EE-A2GT    SKCM
TCGA-55-7283    LUAD
TCGA-17-Z049    LUAD
                ... 
TCGA-AY-A54L    COAD
TCGA-05-4396    LUAD
TCGA-AD-6889    COAD
TCGA-AA-3947    COAD
TCGA-EB-A24D    SKCM
Name: Cancer, Length: 239758, dtype: object

In [387]:
y_pred[205000:205046]

array(['COAD', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM',
       'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'STAD', 'SKCM', 'SKCM',
       'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM',
       'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'COAD', 'SKCM', 'COAD',
       'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'STAD', 'SKCM', 'SKCM',
       'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM', 'SKCM'], dtype=object)

### 여기서부터는 테스트 데이터 돌리기

In [388]:
test_mu = pd.read_csv("./testData/test_data_mutations.csv", encoding='utf-8')

test_mu

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,ENSP,ExAC_AF,CENTERS,CCDS,EXON,ExAC_AF_OTH,SAS_MAF,Exon_Number,MINIMISED,PUBMED
0,LOXL4,0,.,GRCh37,10,100015439,100015439,+,missense_variant,Missense_Mutation,...,ENSP00000260702,.,RADIA|MUTECT|VARSCANS,CCDS7473.1,15-Oct,.,.,15-Oct,.,.
1,HPSE2,0,.,GRCh37,10,100481463,100481463,+,missense_variant,Missense_Mutation,...,ENSP00000359583,.,RADIA|MUTECT|VARSCANS,CCDS7477.1,12-May,.,.,12-May,.,.
2,CNNM2,0,.,GRCh37,10,104679833,104679833,+,synonymous_variant,Silent,...,ENSP00000358894,.,RADIA|MUTECT|MUSE|VARSCANS,CCDS44474.1,08-Jan,.,.,08-Jan,.,.
3,SLK,0,.,GRCh37,10,105750529,105750529,+,synonymous_variant,Silent,...,ENSP00000358770,.,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CCDS7553.1,19-Feb,.,.,19-Feb,.,.
4,BTBD16,0,.,GRCh37,10,124089067,124089067,+,synonymous_variant,Silent,...,ENSP00000260723,.,MUTECT|MUSE,CCDS31301.1,16-Nov,.,.,16-Nov,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32088,MTBP,0,.,GRCh37,8,121528321,121528321,+,frameshift_variant,Frame_Shift_Del,...,ENSP00000303398,.,INDELOCATOR*|VARSCANI*|PINDEL,CCDS6333.1,18/22,.,.,18/22,.,.
32089,SLC39A14,0,.,GRCh37,8,22273317,22273317,+,synonymous_variant,Silent,...,ENSP00000352779,.,MUTECT|MUSE|VARSCANS,CCDS6030.1,06-Sep,.,.,06-Sep,.,.
32090,IFNA21,0,.,GRCh37,9,21166376,21166376,+,missense_variant,Missense_Mutation,...,ENSP00000369574,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS6497.1,01-Jan,.,.,01-Jan,.,.
32091,HCFC1,0,.,GRCh37,X,153222852,153222852,+,missense_variant,Missense_Mutation,...,ENSP00000309555,.,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS,CCDS44020.1,13/26,.,.,13/26,.,.


In [390]:
test_dfset=test_mu.copy()

test_dfset=test_dfset[['Tumor_Sample_Barcode','Chromosome','Start_Position','End_Position','Variant_Type','t_ref_count','t_alt_count','n_ref_count','n_alt_count','Protein_position','NCALLERS','n_depth','t_depth','cDNA_position','CDS_position']]
test_dfset=test_dfset.transpose()

test_dfset = test_dfset.rename(columns=test_dfset.iloc[0])
test_dfset = test_dfset.drop(test_dfset.index[0])
test_dfset=test_dfset.transpose()

test_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
patient_1,10,100015439,100015439,SNP,42,4,45,0,496.0,3,45,46,1637,1486
patient_1,10,100481463,100481463,SNP,56,5,70,0,303.0,3,70,61,967,907
patient_1,10,104679833,104679833,SNP,166,25,140,0,532.0,4,140,191,1784,1596
patient_1,10,105750529,105750529,SNP,50,15,62,0,83.0,5,62,66,792,247
patient_1,10,124089067,124089067,SNP,41,4,46,0,328.0,2,46,45,1235,984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_61,8,121528321,121528321,DEL,113,31,118,0,712.0,3,118,144,2181,2136
patient_61,8,22273317,22273317,SNP,75,7,81,0,262.0,3,81,82,961,786
patient_61,9,21166376,21166376,SNP,117,56,212,1,79.0,5,213,173,284,236
patient_61,X,153222852,153222852,SNP,214,100,337,1,756.0,5,338,315,3233,2266


In [391]:
#학습할 때 사용했던 라벨 데이터를 그대로 적용시켜서 테스트가 가능토록 진행
test_dfset['Variant_Type']=label_encoder.transform(test_dfset['Variant_Type'])
test_dfset

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
patient_1,10,100015439,100015439,3,42,4,45,0,496.0,3,45,46,1637,1486
patient_1,10,100481463,100481463,3,56,5,70,0,303.0,3,70,61,967,907
patient_1,10,104679833,104679833,3,166,25,140,0,532.0,4,140,191,1784,1596
patient_1,10,105750529,105750529,3,50,15,62,0,83.0,5,62,66,792,247
patient_1,10,124089067,124089067,3,41,4,46,0,328.0,2,46,45,1235,984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_61,8,121528321,121528321,0,113,31,118,0,712.0,3,118,144,2181,2136
patient_61,8,22273317,22273317,3,75,7,81,0,262.0,3,81,82,961,786
patient_61,9,21166376,21166376,3,117,56,212,1,79.0,5,213,173,284,236
patient_61,X,153222852,153222852,3,214,100,337,1,756.0,5,338,315,3233,2266


In [392]:
test_final=test_dfset.copy()

test_final['Chromosome'] = pd.to_numeric(test_final['Chromosome'], errors='coerce')
test_final['Protein_position'] = pd.to_numeric(test_final['Protein_position'], errors='coerce')
test_final['Start_Position'] = pd.to_numeric(test_final['Start_Position'], errors='coerce')
test_final['End_Position'] = pd.to_numeric(test_final['End_Position'], errors='coerce')
test_final['t_ref_count'] = pd.to_numeric(test_final['t_ref_count'], errors='coerce')
test_final['t_alt_count'] = pd.to_numeric(test_final['t_alt_count'], errors='coerce')
test_final['n_ref_count'] = pd.to_numeric(test_final['n_ref_count'], errors='coerce')
test_final['n_alt_count'] = pd.to_numeric(test_final['n_alt_count'], errors='coerce')
test_final['NCALLERS'] = pd.to_numeric(test_final['NCALLERS'], errors='coerce')
test_final['n_depth'] = pd.to_numeric(test_final['n_depth'], errors='coerce')
test_final['t_depth'] = pd.to_numeric(test_final['t_depth'], errors='coerce')
test_final['cDNA_position'] = pd.to_numeric(test_final['cDNA_position'], errors='coerce')
test_final['CDS_position'] = pd.to_numeric(test_final['CDS_position'], errors='coerce')

#데이터를 0으로 변겅하는 프로세스
test_final = test_final.fillna(0)
test_final

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
patient_1,10.0,100015439,100015439,3,42,4,45,0,496.0,3,45,46,1637.0,1486.0
patient_1,10.0,100481463,100481463,3,56,5,70,0,303.0,3,70,61,967.0,907.0
patient_1,10.0,104679833,104679833,3,166,25,140,0,532.0,4,140,191,1784.0,1596.0
patient_1,10.0,105750529,105750529,3,50,15,62,0,83.0,5,62,66,792.0,247.0
patient_1,10.0,124089067,124089067,3,41,4,46,0,328.0,2,46,45,1235.0,984.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_61,8.0,121528321,121528321,0,113,31,118,0,712.0,3,118,144,2181.0,2136.0
patient_61,8.0,22273317,22273317,3,75,7,81,0,262.0,3,81,82,961.0,786.0
patient_61,9.0,21166376,21166376,3,117,56,212,1,79.0,5,213,173,284.0,236.0
patient_61,0.0,153222852,153222852,3,214,100,337,1,756.0,5,338,315,3233.0,2266.0


In [393]:
#아까 학습 때 사용했던 정규화 함수를 그대로 사용
test_final[numeric_columns] = test_final[numeric_columns].apply(normalize_column)
test_final

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
patient_1,0.454545,0.401216,0.401216,1.0,0.019783,0.002125,0.014026,0.000000,0.014687,0.333333,0.014026,0.017391,0.015377,0.013988
patient_1,0.454545,0.403087,0.403087,1.0,0.026378,0.003188,0.023503,0.000000,0.008972,0.333333,0.023503,0.024256,0.009083,0.008538
patient_1,0.454545,0.419936,0.419936,1.0,0.078191,0.024442,0.050038,0.000000,0.015753,0.666667,0.050038,0.083753,0.016758,0.015023
patient_1,0.454545,0.424234,0.424234,1.0,0.023552,0.013815,0.020470,0.000000,0.002458,1.000000,0.020470,0.026545,0.007439,0.002325
patient_1,0.454545,0.497833,0.497833,1.0,0.019312,0.002125,0.014405,0.000000,0.009712,0.000000,0.014405,0.016934,0.011601,0.009263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
patient_61,0.363636,0.487556,0.487556,0.0,0.053227,0.030818,0.041698,0.000000,0.021083,0.333333,0.041698,0.062243,0.020487,0.020107
patient_61,0.363636,0.089207,0.089207,1.0,0.035327,0.005313,0.027672,0.000000,0.007758,0.333333,0.027672,0.033867,0.009027,0.007399
patient_61,0.409091,0.084764,0.084764,1.0,0.055111,0.057386,0.077331,0.333333,0.002339,1.000000,0.077710,0.075515,0.002668,0.002222
patient_61,0.000000,0.614759,0.614759,1.0,0.100801,0.104145,0.124716,0.333333,0.022386,1.000000,0.125095,0.140503,0.030368,0.021330


In [394]:
testdata_pred = model.predict(test_final)

In [395]:
testdata_pred

array(['SKCM', 'SKCM', 'COAD', ..., 'COAD', 'COAD', 'LUAD'], dtype=object)

In [396]:
data = {'id': ['patient_' + str(i) for i in range(1, len(testdata_pred)+1)],
        'expected': testdata_pred}

df = pd.DataFrame(data)
df.to_csv('predictions.csv', index=False)

In [398]:
test_final.tail(20)

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Type,t_ref_count,t_alt_count,n_ref_count,n_alt_count,Protein_position,NCALLERS,n_depth,t_depth,cDNA_position,CDS_position
patient_72,0.363636,0.472529,0.472529,1.0,0.031559,0.006376,0.029947,0.0,0.006455,0.666667,0.029947,0.030664,0.007083,0.006147
patient_72,0.363636,0.396913,0.396913,1.0,0.017428,0.001063,0.01213,0.333333,0.001954,0.0,0.012509,0.014645,0.003015,0.001854
patient_72,0.409091,0.496978,0.496978,1.0,0.016015,0.006376,0.019333,0.0,0.013266,0.0,0.019333,0.015561,0.015048,0.012632
patient_72,0.0,0.618401,0.618401,1.0,0.012718,0.017003,0.017817,0.0,0.059222,1.0,0.017817,0.016934,0.058238,0.05647
patient_61,0.5,0.239135,0.239135,1.0,0.029675,0.035069,0.036391,0.0,0.002724,1.0,0.036391,0.041648,0.003569,0.002589
patient_61,0.545455,0.254842,0.254842,1.0,0.072539,0.090329,0.092494,0.0,0.007344,1.0,0.092494,0.106636,0.007975,0.006994
patient_61,0.681818,0.358616,0.358616,1.0,0.093264,0.054198,0.101971,0.0,0.05028,1.0,0.101971,0.111213,0.051353,0.047941
patient_61,0.727273,0.340441,0.340441,1.0,0.024965,0.042508,0.038666,0.0,0.001629,1.0,0.038666,0.040275,0.003635,0.001534
patient_61,0.772727,0.153625,0.153625,1.0,0.071126,0.090329,0.100076,0.0,0.01676,1.0,0.100076,0.105263,0.018073,0.015984
patient_61,0.863636,0.013973,0.013973,1.0,0.016957,0.014878,0.031084,0.333333,0.00536,1.0,0.031463,0.020137,0.005091,0.005102
