In [30]:
!ls
# Numerical Imports 
import pandas as pd
import numpy as np
import scipy 

# Plotting 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# %matplotlib notebook

# Python 
import os

# sklearn 
from sklearn.metrics import f1_score # f1_score(y_true, y_pred)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel

# scipy
from scipy.cluster import hierarchy as hc # for dendograms 

Full.ipynb	     NA with RNA-Seq.ipynb  sum_tab_1.csv  train_cli.tsv
Gender Visual.ipynb  Proteomics.ipynb	    sum_tab_2.csv  train_pro.tsv
Import.ipynb	     README.txt		    test_cli.tsv   train_rna.tsv
MSI Visual.ipynb     RNA-Seq.ipynb	    test_pro.tsv   visuals.ipynb
NA.ipynb	     start.ipynb	    test_rna.tsv


In [31]:
def _read_whitespace_table(filename, transpose=False):
    """Read a whitespace-delimited table from the current working directory.

    Parameters
    ----------
    filename : str
        File name, resolved against ``os.getcwd()``.
    transpose : bool, default False
        When True, return the transposed frame (file columns become rows).

    Returns
    -------
    pandas.DataFrame
        The parsed (and optionally transposed) table.
    """
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
    # is deprecated in recent pandas releases.
    table = pd.read_csv(os.path.join(os.getcwd(), filename),
                        sep=r'\s+',
                        low_memory=False)
    return table.T if transpose else table


# Omics tables are transposed right after loading (file columns become rows).
# NOTE(review): presumably this puts one sample per row - confirm against the files.
df_train_pro = _read_whitespace_table('train_pro.tsv', transpose=True)
df_test_pro = _read_whitespace_table('test_pro.tsv', transpose=True)
df_train_rna = _read_whitespace_table('train_rna.tsv', transpose=True)
df_test_rna = _read_whitespace_table('test_rna.tsv', transpose=True)

# Clinical tables are kept in their on-disk orientation.
df_train_cli = _read_whitespace_table('train_cli.tsv')
df_test_cli = _read_whitespace_table('test_cli.tsv')

# sum_tab_1.csv is read with the default comma separator.
df_train_mislabel = pd.read_csv(os.path.join(os.getcwd(), 'sum_tab_1.csv'),
                                low_memory=False)

# Widen the display limits so large frames are not elided when shown.
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 250)

In [32]:
# df_train_pro
# df_test_pro
# df_train_rna
# df_test_rna
# df_train_cli
# df_test_cli
# df_train_mislabel

In [33]:
# TODO: come back to the NA handling - a more sophisticated approach is
# expected to improve results by at least 5%.
# Median imputation (frame.fillna(frame.median())) is currently disabled for
# all four tables, so missing values pass through unchanged.

# Each working table is a deep copy of the loaded frame with its index named 'sample'.
train_pro = df_train_pro.copy(deep=True).rename_axis('sample')
test_pro = df_test_pro.copy(deep=True).rename_axis('sample')

train_rna = df_train_rna.copy(deep=True).rename_axis('sample')
test_rna = df_test_rna.copy(deep=True).rename_axis('sample')

In [34]:
# df_train_pro
# train_pro
# df_test_pro
# test_pro

In [35]:
# df_train_rna
# train_rna
# df_test_rna
# test_rna

In [36]:
# Per-column value replacements shared by the train and test clinical tables.
cli_value_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'msi': {'MSI-Low/MSS': 0, 'MSI-High': 1},
}

# Deep-copy the loaded frame, index it by 'sample', then apply the replacements.
train_cli = (
    df_train_cli
    .copy(deep=True)
    .set_index('sample')
    .replace(cli_value_codes)
)

test_cli = (
    df_test_cli
    .copy(deep=True)
    .set_index('sample')
    .replace(cli_value_codes)
)

In [37]:
# df_train_cli
# train_cli
# df_test_cli
# test_cli

In [38]:
# Deep copy of the loaded mislabel table, indexed by its 'sample' column.
train_mislabel = df_train_mislabel.copy(deep=True).set_index('sample')

In [39]:
# df_train_mislabel
# train_mislabel

In [40]:
# Replace every training table's index with a fresh 0..n-1 RangeIndex;
# drop=True discards the previous index labels instead of keeping them as a column.
# The names are rebound rather than mutated with inplace=True, so this cell
# never modifies an object produced by an earlier cell.
# NOTE(review): once the labels are gone, a column-wise concat pairs rows by
# position only - this assumes all four tables list samples in the same
# order; confirm against the input files.
train_pro = train_pro.reset_index(drop=True)
train_rna = train_rna.reset_index(drop=True)
train_cli = train_cli.reset_index(drop=True)
train_mislabel = train_mislabel.reset_index(drop=True)

In [41]:
# train_pro
# train_rna
# train_cli
# train_mislabel

In [42]:
# Label/clinical frames that lead every combined table.
label_frames = [train_mislabel, train_cli]

# Column-wise concatenation: labels first, then the feature table(s).
train_pro_combined = pd.concat(label_frames + [train_pro], axis='columns')
train_rna_combined = pd.concat(label_frames + [train_rna], axis='columns')

# RNA columns come before proteomics columns here; a gene label present in
# both tables therefore appears twice in train_combined.
train_combined = pd.concat(label_frames + [train_rna, train_pro], axis='columns')

In [43]:
# train_pro_combined
# train_rna_combined
# Preview only the first rows rather than rendering the whole wide frame.
train_combined.head(10)

Unnamed: 0,mismatch,gender,msi,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A4GALT,AAAS,AACS,AADAC,AADAT,AAED1,AAGAB,AAK1,AAMDC,AAMP,AANAT,AAR2,AARD,AARS,AARS2,AARSD1,AASDH,AASDHPPT,AASS,AATF,AATK,ABAT,ABCA1,ABCA10,ABCA11P,ABCA12,ABCA13,ABCA17P,ABCA2,ABCA3,ABCA4,ABCA5,ABCA6,ABCA7,ABCA8,ABCA9,ABCB1,ABCB10,ABCB11,ABCB4,ABCB5,ABCB6,ABCB7,ABCB8,ABCB9,ABCC1,ABCC10,ABCC11,ABCC13,ABCC2,ABCC3,ABCC4,ABCC5,ABCC6,ABCC6P1,ABCC6P2,ABCC8,ABCC9,ABCD1,ABCD2,ABCD3,ABCD4,ABCE1,ABCF1,ABCF2,ABCF3,ABCG1,ABCG2,ABCG4,ABCG5,ABCG8,ABHD1,ABHD10,ABHD11,ABHD12,ABHD12B,ABHD13,ABHD14A,ABHD14B,ABHD15,ABHD16A,ABHD16B,ABHD2,ABHD3,ABHD4,ABHD5,ABHD6,ABHD8,ABI1,ABI2,ABI3,ABI3BP,ABL1,ABL2,ABLIM1,ABLIM2,ABLIM3,ABO,ABR,ABRACL,ABT1,ABTB1,ABTB2,ACAA1,ACAA2,ACACA,ACACB,ACAD10,ACAD11,ACAD8,ACAD9,ACADM,ACADS,ACADSB,ACADVL,ACAN,ACAP1,...,VPS18,VPS25,VPS26A,VPS26B,VPS28,VPS29,VPS33A,VPS33B,VPS35,VPS36,VPS37B,VPS41,VPS45,VPS4A,VPS4B,VPS50,VPS51,VPS52,VPS53,VRK1,VSIG4,VTA1,VTI1B,VTN,VWA1,VWA5A,VWA8,VWF,WAPL,WARS,WARS2,WAS,WASF2,WASHC1,WASHC3,WASHC4,WASHC5,WASL,WBP11,WBP2,WDFY1,WDR1,WDR11,WDR12,WDR13,WDR18,WDR26,WDR3,WDR36,WDR4,WDR43,WDR44,WDR45,WDR5,WDR61,WDR74,WDR75,WDR77,WDR82,WFS1,WRNIP1,WTAP,WWC3,XAB2,XIAP,XPC,XPNPEP1,XPNPEP2,XPNPEP3,XPO1,XPO4,XPO5,XPO7,XPOT,XRCC1,XRCC4,XRCC5,XRCC6,XRN1,XRN2,YARS,YARS2,YBX1,YBX3,YIPF4,YIPF5,YIPF6,YKT6,YLPM1,YME1L1,YTHDF2,YTHDF3,YWHAB,YWHAE,YWHAG,YWHAH,YWHAQ,YWHAZ,ZADH2,ZAK,ZBED1,ZBTB7A,ZC3H11A,ZC3H13,ZC3H14,ZC3H15,ZC3H4,ZC3HAV1,ZC3HAV1L,ZCCHC8,ZFPL1,ZFR,ZG16,ZMPSTE24,ZMYM3,ZNF185,ZNF207,ZNF280C,ZNF326,ZNF638,ZNF706,ZPR1,ZW10,ZYX,ZZEF1
0,0,1,0,0.810976,0.294094,2.898144,6.548415,1.004192,2.519397,5.749835,5.04326,0.297011,4.504254,2.024335,5.403343,1.831152,5.165514,7.240806,,5.403007,0.409724,6.551073,3.100041,5.277979,3.298451,5.04593,1.0013,5.689962,0.969101,2.982777,2.366695,0.245197,3.066077,1.492358,0.053929,0.031171,3.255512,0.51914,0.096053,2.246003,0.349446,1.483484,1.022936,0.511371,3.912004,3.71496,0.155912,0.653988,0.175236,3.802488,4.574414,3.433086,1.912922,3.343587,3.305114,0.103883,1.015897,4.427634,4.611711,4.095616,3.138293,3.54397,1.999173,2.875465,0.046088,3.028601,2.024893,0.196113,5.713165,4.111305,6.101022,4.169641,5.450756,4.993056,3.230541,1.144649,0.160974,0.173802,0.14458,0.341795,4.883685,6.468786,6.234869,,3.294638,4.411262,5.705954,3.380753,4.866446,1.859904,3.859852,5.033883,4.344119,3.032352,3.096348,1.994397,5.443316,2.963861,2.109574,0.99648,4.188414,2.307384,4.654947,2.092253,1.161089,3.701283,5.096069,7.270606,5.695157,4.313367,2.516734,7.325857,8.451549,4.3584,1.960287,3.472469,2.684599,4.044159,4.867411,5.169815,5.370055,3.156983,7.553656,0.447867,1.687745,...,1.874889,2.102446,3.644706,0.999049,2.352649,1.355122,1.700971,0.986234,5.201797,1.694419,2.016128,1.037863,2.12976,,2.439649,1.121213,2.570043,1.978001,2.120013,1.037239,,1.985583,1.402441,3.587076,3.139336,1.604568,2.042865,3.276686,1.032778,5.45048,1.551416,1.061254,2.249946,,1.03255,2.360181,3.720805,1.909891,,0.984047,2.031655,6.0683,2.420489,2.353787,1.001043,1.00463,1.065994,2.22194,3.515957,1.023543,1.869788,,,2.177536,3.296747,1.004654,1.794322,1.570204,2.553235,1.925916,,0.991065,1.295359,1.03123,,,3.648148,,2.108976,4.665956,1.133668,3.241074,3.263931,3.196612,2.676144,0.96984,5.853167,5.692673,,3.632066,4.390283,2.785913,2.733676,2.34125,,1.006993,,2.698599,1.473819,1.917234,1.455458,1.493388,4.193716,5.192265,3.821649,4.017592,4.665611,6.300753,1.915149,1.620717,,1.030688,0.992511,1.652999,,1.630672,1.05126,2.915539,1.624048,1.040985,2.097838,2.657478,5.774241,2.280993,,,2.078553,,1.505944,1
.007971,1.005565,1.016313,1.859706,3.834986,
1,1,1,0,0.340575,0.286721,2.22373,7.355751,0.967359,2.042327,5.949647,5.540432,0.633366,3.143702,2.85246,5.214069,1.350142,4.827544,7.442804,0.07773,4.881468,0.157388,6.859313,3.773362,4.681637,2.354399,3.245056,2.188932,6.038234,0.801714,1.874068,2.420736,0.830577,2.926069,0.687222,0.172355,,3.612654,1.402553,0.166862,2.521407,0.832623,2.273459,1.123911,0.736438,1.451913,3.560877,0.057962,0.930209,0.608913,4.382873,4.144631,3.51681,1.537415,4.876979,3.928401,0.039895,0.490589,0.165121,6.785603,2.946119,3.803974,2.549305,0.748726,1.027996,0.15281,2.215813,2.746132,0.126538,5.283263,4.277419,4.183235,4.597771,5.216281,5.105845,3.29482,0.854548,0.267521,0.309996,0.218923,0.188975,4.345775,6.593631,5.110704,0.104853,2.656222,4.738079,6.407237,2.392053,4.807543,1.111387,5.724492,5.851426,4.363283,2.930195,2.852994,2.143993,5.571885,4.374626,1.824362,2.907845,5.302708,2.388016,7.010714,4.22131,2.051842,5.293758,6.752665,5.809694,4.625485,5.330238,1.102896,5.854552,6.346346,4.604084,3.148955,3.85879,3.109879,3.724043,5.341813,4.568501,5.063238,3.097221,8.74212,1.652846,1.365372,...,,3.109412,2.380084,,0.905782,2.062183,1.060592,0.990402,5.488938,,,1.041954,1.763433,0.997776,0.996438,,1.624536,,1.491358,,,2.521443,1.434329,3.204487,1.393413,2.559929,3.611227,3.993474,,4.765848,,,2.580698,1.007797,1.036454,2.395179,3.149886,1.232844,1.429274,0.987323,2.068335,5.445641,1.933701,2.171754,1.0047,1.619178,,,2.849889,,1.54539,1.016185,,2.442678,2.356311,1.008733,1.842537,,1.776815,1.073962,,0.995106,,2.185865,,,3.347044,,1.019143,4.369491,1.1379,2.253928,3.139802,2.177914,2.08564,,5.759085,6.105171,0.993673,3.202251,2.857531,2.834447,3.542736,2.390906,,1.010723,,2.132064,2.41396,2.751047,,1.529682,5.019476,5.06274,3.819669,3.993814,4.38987,6.139485,1.94507,1.662572,,1.034917,0.995823,1.680459,,3.392088,2.212228,2.967906,,1.927286,0.970673,1.792636,,1.879562,,,2.560404,,2.51563,1.51679,1.009703,,1.465056,4.156957,
2,0,0,0,1.612506,0.347598,2.20656,7.045954,2.028455,2.829612,5.230059,4.25725,0.61435,3.0635,3.015768,5.307434,2.00704,5.81631,6.877159,0.110958,5.428458,0.526734,6.06038,3.069175,4.679611,2.54995,4.86269,2.13148,5.540457,2.389167,3.140561,2.544298,0.459123,2.113906,0.217318,0.454064,0.089015,3.405832,1.049774,0.094002,2.719581,0.838956,3.027581,0.836651,0.53788,3.91683,3.583882,0.174002,0.732249,0.567775,3.133188,4.176748,3.412255,1.462576,4.01802,3.442805,0.115537,0.688189,0.671391,5.335491,2.908932,3.759258,3.049696,1.414518,2.542534,0.099227,2.581783,2.613845,0.314135,4.857029,3.735537,5.420539,4.132577,5.514917,4.755915,4.153202,2.427634,0.17833,0.292972,,,3.954731,7.263489,5.583831,4.430649,3.84586,4.252372,6.291409,3.223986,4.397446,3.158116,5.187287,4.907994,3.71961,3.138278,3.330321,2.592238,5.021664,3.823034,2.526062,2.757239,4.892917,2.460347,4.777808,3.192402,2.64391,3.98536,5.682361,7.511192,5.241501,4.399818,2.812487,6.84046,7.053297,3.437229,2.31342,3.303087,2.755707,3.500622,4.569392,4.260654,5.213649,2.866477,7.388616,0.900672,2.446984,...,,1.150936,2.583221,,1.666417,2.084077,,,5.13104,0.935247,2.089437,1.041954,1.178319,1.986515,2.690108,,1.633379,1.025122,1.49926,2.311238,,2.834932,0.981792,3.238147,3.334579,2.050865,1.691969,2.977732,,5.062525,1.040445,,2.820295,,,2.725473,2.744512,1.232844,1.436125,1.439645,1.400227,6.160113,1.173862,2.391981,1.0047,,,1.80906,1.94759,,2.404426,,,2.012335,2.985397,,,1.619377,2.132841,,,1.567598,,1.034952,,1.585332,3.682018,1.097587,1.523819,4.691421,2.885033,2.282044,2.136803,3.064393,0.862361,0.974529,5.658831,5.743094,,2.816889,3.84977,1.258,2.404759,1.602509,1.006134,,,3.002017,0.986676,,,,4.772247,5.047297,3.978143,3.574884,4.230233,5.876627,0.947357,1.072991,,1.034917,,1.703609,1.741504,2.703586,1.056045,3.298531,,,1.727459,2.115843,4.629868,2.353838,,1.019584,2.122499,,1.5545,,1.009703,1.019879,1.93292,4.303349,
3,0,1,0,0.768138,0.251778,2.996356,6.451277,1.142773,1.849382,5.526629,4.904821,0.721755,3.025762,2.844898,5.477457,1.477467,6.195029,6.779318,0.08148,5.484737,0.18094,6.266849,2.820478,4.246034,3.250814,4.857285,1.586813,5.56359,1.749705,2.60039,2.150795,0.358562,2.517248,,0.173244,0.194729,3.170614,1.066757,0.10336,2.310197,0.442474,2.06424,0.579675,0.309876,3.692298,4.08226,,0.608184,0.32563,2.35298,4.961576,3.735129,1.953723,3.373161,3.188174,0.044356,0.857686,0.232968,5.489861,4.03768,3.754474,4.075045,1.795847,3.890242,0.053599,1.874852,2.909666,0.26037,5.154072,3.435753,5.808019,3.469754,5.714793,4.326157,4.729395,0.660431,0.110811,0.367033,0.19298,0.27604,5.038556,6.584836,6.557013,0.107923,4.009564,4.268579,6.111598,3.570122,4.455314,2.3076,5.382653,5.236057,4.477853,3.611746,4.418856,2.417644,5.756634,3.482352,2.51509,1.515171,4.617153,2.00702,5.646916,2.092172,2.331457,4.04826,5.656388,7.488311,5.128368,3.576527,2.543819,5.953383,7.060906,3.47226,1.615436,3.123835,2.901394,3.577114,4.43454,4.332251,5.107847,3.634826,7.480069,0.604092,1.759912,...,2.057208,1.165767,3.038106,1.014884,1.903141,2.270899,1.077535,1.67562,4.471561,1.764703,1.723113,1.056133,1.861394,1.01172,2.459934,,2.38182,2.066282,1.58002,,,2.041771,0.993285,4.364993,3.313562,3.187111,2.610699,4.263347,,5.111046,1.054258,,2.54809,,,2.443209,2.723586,2.889669,2.015298,1.507958,2.278978,5.545925,,1.604628,,,,,1.014872,,2.203565,1.6687,,1.739148,2.329611,,1.179531,1.707813,2.923116,1.087222,,1.009116,1.313745,,,1.059527,3.115744,,1.906348,4.537447,2.165044,2.748947,3.659253,3.893942,1.8195,,5.698497,5.65579,,2.972644,4.141407,2.314568,2.888977,1.618239,,,,2.869346,0.999796,,1.011424,1.616613,3.784756,5.332163,2.703492,3.271345,4.095585,5.595029,,2.636959,,,1.52197,2.265474,,1.753413,1.847031,1.588121,1.056122,1.652611,0.981701,1.83899,3.44728,1.987014,,,1.888041,,1.993309,1.024392,,1.589198,1.569086,5.218254,
4,0,1,0,1.03618,0.295748,3.540528,4.952865,1.817558,2.204899,4.598606,4.638229,0.238909,3.138107,2.403575,5.49743,0.935659,5.692919,6.689885,,5.209626,0.437897,5.897469,3.359287,5.076434,3.899793,4.807734,0.571727,5.237851,2.026172,2.467114,2.536564,0.231202,3.223858,0.30294,0.217116,0.150535,3.61249,0.746864,,1.788909,0.394983,1.702453,0.692445,0.176502,3.610089,4.140734,,0.273111,0.064842,3.384769,4.173698,2.893223,1.440728,3.674492,3.158719,0.229017,0.756948,1.959424,4.248929,2.925176,3.421125,4.18226,1.866187,3.681646,,1.629176,1.872852,0.387186,5.794133,4.243787,5.4592,3.785907,5.15667,4.65837,3.979211,1.244521,0.047697,0.903793,0.815708,0.085383,5.173913,5.18997,7.467404,0.164706,4.169751,4.842522,5.733179,3.819551,4.503505,2.908169,4.739522,4.054044,4.63641,3.231717,3.961462,2.117164,5.504303,2.784826,1.785326,0.551532,5.095646,2.057171,4.352692,4.222319,0.867251,2.465279,4.301612,7.167394,5.743122,4.515306,2.237181,6.486581,7.025624,3.413887,0.934749,3.403427,2.762262,3.778876,4.618209,5.253233,5.257573,2.968465,6.98774,0.681891,1.58422,...,1.000956,1.007623,2.468072,,,1.253679,1.747518,,5.242863,1.238659,1.117222,1.095483,1.720368,2.34731,2.377951,1.028815,1.092676,1.107547,,1.099506,,2.092249,,3.601896,3.524648,3.433307,1.039664,2.493115,,4.687618,1.098661,1.079596,3.126376,,1.103,2.30785,2.153438,0.966396,2.148476,,,4.980792,1.635016,1.968695,,,,2.109407,1.932765,,1.113075,,,2.131409,,,1.004156,1.853276,,1.887381,1.066349,,,,,,2.517715,,,4.773365,1.763394,2.068975,2.21536,1.6932,1.256594,,5.484836,5.612621,,4.024215,3.074826,3.47974,3.234496,1.387073,1.106513,1.131864,1.274783,2.010923,,2.723817,1.152584,2.444187,5.339172,4.462588,3.435482,4.177548,4.714756,6.506306,,1.806512,,1.099285,1.172282,1.254199,,2.743619,1.072966,2.440604,,,1.21666,1.229703,2.881849,0.950156,,,2.196551,,1.90974,1.13458,1.120853,1.129109,1.205099,4.190499,1.756951
5,0,0,1,1.617613,0.09642,2.064141,8.016981,2.282282,3.955794,4.943246,4.878066,2.405706,2.148362,2.833991,5.403447,1.631213,5.445221,6.916681,0.340626,5.473349,0.203619,5.933623,3.210151,4.25996,3.400047,4.335494,1.268851,5.41332,2.337906,3.011764,3.228863,0.739657,2.628333,,0.118698,,2.87954,0.819851,0.070096,2.170534,1.079657,2.070282,1.423225,0.30171,4.412655,3.901511,0.0694,1.072676,0.337249,3.848418,4.715514,3.068407,2.112057,3.638455,3.078606,0.109196,0.431589,1.381605,5.535816,3.974728,3.310352,3.491547,1.861015,2.56268,0.177501,2.069875,3.862889,0.348597,5.439788,4.406068,5.616706,4.127932,5.200658,4.980527,4.688729,3.365606,0.118781,1.434984,1.792684,0.208695,4.661321,5.770085,5.934842,0.116503,4.140999,3.905493,5.662318,2.846541,4.791476,3.374937,6.894614,4.649768,5.482819,2.886289,3.258695,2.795727,5.014632,3.22633,3.257285,2.120316,4.907314,3.417398,4.814516,3.693615,3.051036,3.378421,5.677883,5.51455,4.985137,4.576441,3.251455,6.090799,6.410729,3.385198,2.362005,3.671045,2.317498,4.423911,4.979344,5.409542,5.50819,2.686283,7.318063,1.075792,2.158477,...,1.153464,1.133735,2.415956,1.699685,2.257361,2.332542,2.057343,1.409354,5.210131,1.600666,1.44337,1.025525,1.16102,1.80572,2.083222,1.680099,2.657955,1.008402,1.719669,1.025207,,2.838251,1.630556,3.986907,2.255812,2.770229,2.674141,3.832766,1.556346,5.631913,,1.871693,3.200075,1.402483,1.020778,1.840754,2.596804,1.211641,2.379656,1.6257,2.156737,5.321504,2.267226,2.25622,0.990012,,1.461721,,2.498136,1.009742,1.402496,1.000059,,1.83729,1.302808,1.41655,3.250633,1.02448,2.604815,,1.038465,1.399712,,2.740672,,,4.008988,,1.377358,4.536532,1.965133,3.11081,3.422338,1.215538,1.923381,,4.709402,5.327016,1.53279,2.992445,4.598634,2.421719,3.901918,2.502207,,1.380022,,2.22006,1.363564,2.364574,2.519584,2.261525,4.554425,5.416583,3.70583,4.20865,4.610811,5.980216,1.264775,1.055871,,1.017938,2.082728,1.255584,0.988936,3.107293,1.539306,3.841584,1.022575,2.302779,1.822195,2.181409,4.091659,1.701099,,,2.701314,,2.099
915,2.237601,0.993088,1.005552,3.137286,3.861595,2.032095
6,0,1,1,1.292022,0.85471,0.164002,6.701985,1.195557,3.259684,5.11496,4.006365,,2.439376,3.688518,5.575236,1.695901,6.56282,7.297054,0.243849,4.960043,0.099956,5.762179,3.485102,5.454825,3.045438,4.716457,3.21154,6.110312,2.587598,0.920195,2.22704,1.025644,2.028412,0.751835,0.135471,0.067311,3.184354,3.424328,,2.224978,0.788189,2.767517,0.220149,0.28813,2.255226,4.172041,,0.418997,0.296705,2.790324,5.120751,3.727765,1.250155,3.412921,2.664348,,0.078911,0.115762,3.615957,1.783403,2.654533,2.130897,1.147462,1.655144,,1.846737,3.767532,1.27371,4.973634,3.847265,5.804981,3.545446,5.267264,5.079504,3.42339,0.239407,,0.077875,,0.502653,4.801022,6.280931,5.970001,0.313912,3.1319,3.778051,5.529233,3.854713,4.547908,2.263419,4.706074,4.814131,3.281737,2.927902,3.168165,1.990173,4.751829,3.408092,4.209402,1.515947,4.591681,2.281657,4.944312,2.778098,1.99848,0.252914,5.171653,6.666207,4.829336,4.772917,1.641795,5.741651,5.982876,3.503825,1.502776,3.298701,2.47138,3.671775,5.146644,4.60424,5.617775,3.576653,7.589436,1.045813,4.300335,...,1.173173,1.729235,2.850659,1.001331,1.846007,1.744129,2.221726,,5.059449,0.93405,1.002613,1.040494,1.757605,2.435905,2.956037,,1.04185,,0.971417,1.586015,,2.584055,0.980608,3.618463,3.66483,,,3.810672,,6.668646,,2.673831,2.47536,1.006384,1.035061,2.852405,2.165148,1.23096,0.973535,1.428494,2.218614,6.036922,1.171924,2.119386,,,,,1.901742,,1.032877,1.567865,,1.326977,2.067512,1.007277,1.160755,1.604956,2.676969,,,1.55324,,1.884273,,1.044743,4.219692,,1.017847,4.691337,1.709933,3.203116,3.770991,,1.382615,,5.328892,5.629418,,3.735753,4.241311,1.256413,3.158732,1.97301,,,,2.281598,2.127734,1.552448,1.485259,1.015648,4.28242,5.264883,4.295902,4.238073,4.069143,5.902384,,,,,0.994641,2.190186,,1.661751,,3.535914,,1.043426,0.969536,1.4178,,2.840855,,,2.440054,,1.540254,1.511706,,,0.873647,3.942143,1.722888
7,0,1,1,0.962942,0.471148,0.214592,7.123389,1.399497,2.394045,5.813972,4.874615,,4.053062,3.580359,5.118055,1.079981,5.478693,7.782304,0.275665,5.077706,0.146809,6.196643,3.853859,4.851122,3.110362,4.719676,1.477681,5.957405,1.08623,2.348207,3.190603,0.063169,3.086619,1.050855,,0.05115,4.59506,3.179593,,1.185653,0.264245,2.763523,0.359072,0.192891,1.590719,3.878682,,0.38067,0.12181,3.789153,4.381405,3.609682,1.711803,3.770074,2.582244,0.048551,0.088632,0.129787,4.544348,3.948243,3.154342,2.609212,1.575292,1.554865,,1.454945,3.255827,0.804516,4.436246,3.973566,5.80336,3.48352,5.480825,5.289868,4.087704,0.669884,0.101679,0.744416,0.849156,0.467505,4.444747,6.362265,6.318395,,3.415389,4.77502,6.371437,3.855671,5.425951,2.205888,5.184095,4.227673,3.971163,2.871426,3.873735,1.768165,5.379527,3.518031,3.815292,2.403268,5.193893,2.335259,5.363449,2.964098,1.653075,3.555972,5.51354,6.684382,5.425824,5.387349,2.844697,5.608599,7.25395,3.190277,1.828936,4.491832,2.333145,4.335316,5.349823,4.700886,5.98851,3.442891,7.561668,0.160363,3.28773,...,,2.454151,2.69407,,1.885075,2.0557,,,5.127586,0.941933,1.667866,1.050105,1.186893,2.492242,3.358091,,1.051689,,1.533739,,,2.460424,0.988401,3.401379,3.278681,2.319247,2.603939,2.945626,,6.495823,,2.328955,3.145868,1.015686,1.044228,2.696921,1.199161,1.243365,0.981349,1.758321,2.061792,6.025777,1.184681,1.558889,,,,,2.429481,,1.04164,,,1.373458,2.115142,,1.897741,1.049191,2.101533,,,,,1.585842,,,3.72654,,1.026375,4.763167,1.146329,1.923367,2.674142,1.840063,1.43192,,5.337624,5.629782,,2.333447,3.678074,1.904573,2.698236,2.195361,,,,2.335012,1.900912,1.008941,1.529996,1.02441,4.350423,4.925461,3.631536,4.095313,4.326252,5.814367,1.423599,,,1.043344,1.002418,0.949012,1.019091,2.299141,1.065584,3.268597,,1.606766,1.438623,1.465551,3.267372,2.66329,,,2.952627,,,,1.01795,1.026981,1.922123,4.399934,
8,0,1,0,0.994111,0.409193,3.628532,7.69246,1.743556,2.444568,5.286536,4.811481,1.224631,1.658102,2.817046,4.050684,2.939371,5.969437,7.647143,0.07394,5.939598,0.150886,6.213449,2.953738,4.536174,2.880462,4.214769,2.164575,6.013429,1.667501,4.707455,3.293676,0.538522,2.70396,,0.252983,0.083648,4.879264,0.927178,0.077749,2.607108,0.715752,2.012365,0.775564,0.583123,3.426716,3.835934,,1.366551,0.128454,4.217744,4.849622,3.40835,1.363262,5.072292,3.728286,0.038098,0.140615,2.235561,4.985795,3.394869,4.542797,3.767226,1.655396,3.368993,,2.930193,3.214481,0.174819,4.492371,3.140837,4.964494,4.186289,5.118419,4.991787,5.373015,0.870998,0.065239,0.094383,0.096143,0.40532,4.11371,6.098775,7.53006,0.100609,3.527568,3.19955,6.280617,3.092545,4.511417,2.997651,5.360261,3.507543,4.288919,2.538451,3.321272,2.795011,5.500955,4.571151,2.693374,1.874152,5.07089,3.004903,5.653289,4.959937,1.577672,4.157987,5.653864,6.468568,4.575819,4.543298,2.347715,5.975269,6.092342,4.073489,2.716728,3.965437,2.856945,4.305015,5.130818,4.249112,4.67358,3.498097,7.327795,0.911155,1.810467,...,1.087986,2.744357,3.504047,2.098382,1.150561,1.36599,1.542383,,5.409243,1.362949,1.620501,0.979172,2.078729,1.667099,2.53521,1.475353,2.628536,0.992523,,1.943756,,2.157318,1.316368,4.469275,3.457566,3.51155,4.441007,4.183765,2.062493,4.711884,1.63932,,2.984233,1.66904,1.209935,2.667176,3.653089,2.620478,1.326418,1.316727,2.690936,5.895718,2.88845,3.221748,,1.619338,1.177206,2.43519,2.970708,2.129179,1.691406,2.495075,,2.514989,1.774392,1.006062,1.471134,0.980504,2.100793,1.957694,2.243784,1.238695,0.747493,2.018072,1.688807,0.973082,4.097842,1.167638,2.395549,4.74165,2.569506,3.017122,4.13282,3.739247,2.662423,1.225007,5.871323,6.094574,2.063536,4.442338,4.032097,2.62797,1.470099,1.513805,1.197103,1.006086,,1.767175,,2.265848,1.75369,1.239855,4.607458,4.915403,3.800562,4.11737,4.696124,6.113459,1.091232,2.783798,1.64437,1.199765,1.027013,1.101827,1.192087,2.72931,2.799537,2.81172,1.187927,0.975082,1.353798,2.42
5342,0.999442,2.193851,1.163372,,2.555313,,1.974749,2.375034,,1.246549,1.719412,4.573103,2.475476
9,1,1,1,0.583729,0.289036,1.617814,6.112131,0.487632,2.112826,5.618431,4.637327,0.866772,1.688684,2.821458,4.994349,1.519175,4.243395,7.611099,0.667207,4.560651,0.217713,6.776007,4.242079,4.788322,2.860937,4.113082,0.849154,6.041453,2.946173,1.586798,2.78749,0.701581,2.308585,1.46045,0.289524,,4.56307,1.571398,,2.393643,0.232933,3.206757,0.071037,0.398605,2.351791,3.787123,0.215625,0.488908,,4.456304,4.457344,4.713444,1.267877,4.597224,3.971874,0.178641,0.528961,1.484296,6.714635,2.72256,3.386115,3.439783,1.363117,1.276604,0.115325,1.500696,4.345539,0.31006,5.133697,4.073343,5.234958,4.346308,6.039025,5.655751,4.726231,1.162414,0.072081,0.887337,0.43064,,3.95493,8.529938,5.690221,0.210335,3.348833,3.989518,6.15835,3.818539,5.51115,2.606996,7.026392,5.178089,4.63907,2.139331,2.637822,1.596169,5.266087,3.606903,3.413779,0.418192,5.86934,2.112934,5.865914,3.403968,1.90951,5.133406,6.950665,5.997124,5.180908,3.978695,2.850311,6.549563,7.058935,4.457862,1.927594,4.611932,2.631833,4.086622,5.305084,5.112411,6.292622,3.049787,8.526183,3.009896,3.150603,...,2.629346,3.631209,3.185915,,1.150561,1.125846,1.717945,1.845401,5.335441,2.681521,2.772957,1.131402,2.374682,2.344082,2.37466,2.225383,2.643678,2.500604,1.636753,1.136186,,2.534533,1.043708,2.883635,3.299577,2.304521,4.434263,3.749995,1.115616,4.86032,1.8421,0.96297,2.976398,1.164334,1.555934,2.809689,3.498521,2.320333,1.055024,,2.916354,5.895778,2.740608,2.448564,1.62243,2.049413,1.519482,2.301306,2.615878,1.766972,1.90907,1.854315,,2.368238,3.058207,1.006062,1.388098,1.529549,2.473986,2.967594,2.292666,1.572271,1.288256,1.588068,2.052367,,4.266631,1.521646,2.238425,4.904939,2.585043,3.406713,3.733376,4.00158,2.27864,1.791069,6.019674,6.309361,0.996952,3.480409,4.941155,2.447041,1.398189,2.561799,1.003202,1.170771,1.314948,2.24452,1.192856,2.726591,1.193246,,4.417688,5.411796,4.380385,3.519256,4.500865,5.997311,,2.448339,1.64437,1.134384,1.027013,1.301448,0.99597,3.440328,1.95594,3.921188,1.498873,1.137235,1.80097,3.081
801,1.167034,2.246885,1.163372,1.165423,2.418353,,2.1331,1.174086,,0.99715,1.908546,4.518285,2.153076


In [44]:
# Candidate marker genes, grouped by target and data source.

# msi proteomics
msi_pro_genes = [
    'TAP1', 'APOL2', 'LCP1', 'PTPN6', 'CASK', 'UBE2L6', 'ICAM1', 'ITGB2',
    'SDF2L1', 'CKB', 'LAP3', 'IFI30', 'PTPRC', 'HSDL2', 'RFC2', 'WARS',
    'IFI35', 'TYMP', 'CSRP2', 'TAPBP', 'ERMP1', 'ANP32E', 'HP', 'HK3',
    'ROCK2', 'ADPGK', 'HSPA4L', 'CNDP2', 'RFTN1', 'GBP1', 'GBP2', 'FMNL1',
    'NCF2', 'YARS2', 'RPL3', 'SPTLC2', 'ENO1', 'SNX12', 'DMBT1', 'ARL3',
]

# msi rna-seq
msi_rna_genes = [
    'EPDR1', 'APOL3', 'POU5F1B', 'CFTR', 'CIITA', 'RAB32', 'MAX',
    'PRSS23', 'FABP6', 'GABRP', 'LAP3', 'LY6G6D', 'SLC19A3', 'WARS',
    'DLGAP1-AS5', 'GBP1', 'RAMP1', 'AREG', 'GSPT2', 'TNFAIP2',
    'EREG', 'TNNC2', 'ANKRD27', 'PLCL2', 'TFCP2L1', 'LAG3', 'GRM8',
    'BEX2', 'DEFB1', 'FABP1', 'IRF1', 'CCL4', 'SLC25A48', 'SLC51B',
    'TRIM72', 'GBP4', 'HPSE',
]

# gender proteomics
gender_pro_genes = [
    'MYO9B', 'EMILIN1', 'MRPL16', 'EIF4G2', 'TNXB', 'ARFIP1', 'GAR1', 'SLC35A2',
    'GADD45GIP1', 'TJP2', 'ALDH1A1', 'PGM2', 'LTBP1', 'NNMT', 'COX7C', 'FBN1',
    'SERPINB1', 'U2AF1L5', 'COG3', 'ASPN', 'METTL1', 'DDT', 'FBLN1', 'FLNC',
    'NAP1L1', 'AGRN', 'RCC2', 'ZNF706', 'C4A', 'COL4A2', 'S100A14',
]

# gender rna-seq
gender_rna_genes = [
    'RPS4Y1', 'ZFX', 'DDX3Y', 'UTY', 'SRSF6', 'GYG2P1', 'ZRSR2',
    'EVPL', 'LUC7L2', 'SRGAP2B', 'NAT8', 'ZNF862', 'ODAM', 'RGS4',
    'RNASEH2A', 'EIF1AY', 'FOXD2', 'PMS2P3', 'MND1', 'FAM3D', 'DDB2',
    'DDX3X', 'NINJ2', 'NLRX1', 'COCH', 'IRF5', 'FHDC1', 'ZNF606',
    'FBXW9', 'RPL21P28', 'TLX1', 'HEBP2', 'TRIM29', 'ZNF273', 'KCNJ2',
    'CRABP2', 'ABCA6', 'MMP17', 'PCNA', 'S100A16', 'UBA6', 'EMX1', 'EHBP1', 'THG1L',
]

# Keep the original ordering, including names listed in more than one group.
marker_genes = msi_pro_genes + msi_rna_genes + gender_pro_genes + gender_rna_genes

# Missing-value count per selected column. A label that occurs more than once
# in train_combined contributes one row per matching column.
train_combined[marker_genes].isna().sum()

TAP1           0
TAP1           0
APOL2          0
APOL2         15
LCP1           0
LCP1           0
PTPN6          0
PTPN6          0
CASK           0
CASK           2
UBE2L6         0
UBE2L6        15
ICAM1          0
ICAM1          1
ITGB2          0
ITGB2          1
SDF2L1         0
SDF2L1        24
CKB            0
CKB            3
LAP3           0
LAP3           0
IFI30          0
IFI30         36
PTPRC          0
PTPRC          4
HSDL2          0
HSDL2          0
RFC2           0
RFC2           9
WARS           0
WARS           0
IFI35          0
IFI35          5
TYMP           0
TYMP           0
CSRP2          0
CSRP2         16
TAPBP          0
TAPBP          2
ERMP1          0
ERMP1          1
ANP32E         0
ANP32E         0
HP            10
HP             0
HK3            0
HK3           10
ROCK2          0
ROCK2          2
ADPGK          0
ADPGK         14
HSPA4L         1
HSPA4L        37
CNDP2          0
CNDP2          0
RFTN1          0
RFTN1          7
GBP1          

In [46]:
# Reduced marker lists, keyed by "<target> <data source>".
reduced_marker_groups = {
    'msi proteomics': [
        'TAP1', 'LCP1', 'PTPN6', 'CASK', 'ICAM1', 'ITGB2',
        'CKB', 'LAP3', 'PTPRC', 'HSDL2', 'WARS',
        'IFI35', 'TYMP', 'TAPBP', 'ERMP1', 'ANP32E',
        'ROCK2', 'CNDP2', 'RFTN1', 'GBP1',
        'NCF2', 'YARS2', 'RPL3', 'ENO1', 'SNX12', 'ARL3',
    ],
    'msi rna-seq': [
        'EPDR1', 'APOL3', 'POU5F1B', 'CFTR', 'CIITA', 'MAX',
        'PRSS23', 'FABP6', 'GABRP', 'LAP3', 'SLC19A3', 'WARS',
        'GBP1', 'RAMP1', 'AREG',
        'EREG', 'TNNC2', 'ANKRD27', 'PLCL2', 'TFCP2L1', 'LAG3', 'GRM8',
        'BEX2', 'DEFB1', 'IRF1', 'CCL4', 'SLC51B',
        'GBP4', 'HPSE',
    ],
    'gender proteomics': [
        'EMILIN1', 'EIF4G2', 'ARFIP1', 'GAR1',
        'TJP2', 'ALDH1A1', 'PGM2', 'LTBP1', 'NNMT', 'COX7C', 'FBN1',
        'SERPINB1', 'U2AF1L5', 'COG3', 'FBLN1', 'FLNC',
        'NAP1L1', 'AGRN', 'RCC2', 'COL4A2', 'S100A14',
    ],
    'gender rna-seq': [
        'ZFX', 'SRSF6', 'ZRSR2',
        'EVPL', 'LUC7L2', 'ZNF862', 'ODAM', 'RGS4',
        'FOXD2', 'PMS2P3', 'MND1', 'FAM3D', 'DDB2',
        'DDX3X', 'NINJ2', 'COCH', 'IRF5', 'FHDC1', 'ZNF606',
        'FBXW9', 'RPL21P28', 'TLX1', 'HEBP2', 'TRIM29', 'ZNF273', 'KCNJ2',
        'CRABP2', 'ABCA6', 'MMP17', 'PCNA', 'S100A16', 'UBA6', 'EHBP1', 'THG1L',
    ],
}

# Flatten in insertion order so the selection matches the grouped listing,
# including names that appear in more than one group.
reduced_markers = [
    gene
    for group_genes in reduced_marker_groups.values()
    for gene in group_genes
]

# Missing-value count per selected column. A label that occurs more than once
# in train_combined contributes one row per matching column.
train_combined[reduced_markers].isna().sum()

TAP1        0
TAP1        0
LCP1        0
LCP1        0
PTPN6       0
PTPN6       0
CASK        0
CASK        2
ICAM1       0
ICAM1       1
ITGB2       0
ITGB2       1
CKB         0
CKB         3
LAP3        0
LAP3        0
PTPRC       0
PTPRC       4
HSDL2       0
HSDL2       0
WARS        0
WARS        0
IFI35       0
IFI35       5
TYMP        0
TYMP        0
TAPBP       0
TAPBP       2
ERMP1       0
ERMP1       1
ANP32E      0
ANP32E      0
ROCK2       0
ROCK2       2
CNDP2       0
CNDP2       0
RFTN1       0
RFTN1       7
GBP1        0
GBP1        3
NCF2        0
NCF2        4
YARS2       0
YARS2       0
RPL3        0
RPL3        0
ENO1        0
ENO1        0
SNX12       0
SNX12       3
ARL3        0
ARL3        6
EPDR1       0
APOL3       0
POU5F1B     1
CFTR        0
CIITA       0
MAX         0
PRSS23      0
FABP6       0
GABRP       2
LAP3        0
LAP3        0
SLC19A3     0
WARS        0
WARS        0
GBP1        0
GBP1        3
RAMP1       0
AREG        0
EREG        0
TNNC2 