In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the 974 differentially expressed probe sets that serve as AML predictors.
aml_predictors_974_probsets = pd.read_csv(
    filepath_or_buffer="974_probsets_aml_predictors.csv"
)

In [3]:
# Dimensions of the predictor table as (rows, columns).
aml_predictors_974_probsets.shape

(974, 8)

In [4]:
# Preview the first rows of the predictor table.
aml_predictors_974_probsets.head()

Unnamed: 0,Gene symbol,ProbeSet,Disease state,Disease state mean difference,Disease state lwr,Disease state upr,Disease state p-adj,Gene title
0,APP,200602_at,AML-Healthy,-0.266954,0.171985,0.361923,3.79e-08,amyloid beta (A4) precursor protein
1,CAPRIN1,200722_s_at,AML-Healthy,0.064151,-0.08159,-0.046711,0.0,cell cycle associated protein 1
2,ANXA5,200782_at,AML-Healthy,-0.122165,0.058745,0.185585,0.000162026,annexin A5
3,SCD,200832_s_at,AML-Healthy,-0.131307,0.084378,0.178236,4.38e-08,stearoyl-CoA desaturase (delta-9-desaturase)
4,GLUD1,200946_x_at,AML-Healthy,0.073083,-0.092724,-0.053443,0.0,glutamate dehydrogenase 1


In [5]:
# Keep only the probe-set identifiers and move them into the index. The
# resulting frame has no data columns; it acts purely as a set of row labels.
extracted_predictors_probsets = (
    aml_predictors_974_probsets
    .loc[:, ["ProbeSet"]]
    .set_index("ProbeSet")
)

In [6]:
# Dimensions of the probe-set lookup frame (an index with zero data columns).
extracted_predictors_probsets.shape

(974, 0)

In [7]:
# Preview the probe-set identifiers now held in the index.
extracted_predictors_probsets.head()

200602_at
200722_s_at
200782_at
200832_s_at
200946_x_at


In [8]:
# Expression data for 2761 subjects (2213 AML and 548 healthy).
# Per the original note, the data were corrected dataset-wise for batch effect
# (RMA-normalized, log2-transformed, Z-score standardized) with respect to
# sample source and disease state (AML vs. healthy).
# The first CSV column is used as the row index.
all_data_2761_subjects = pd.read_csv(
    "All_Factors_Corrected_Data_2761_Subjects_2213_AML_548_Healthy.csv",
    index_col=[0],
)



In [9]:
# Dimensions of the full expression matrix as (rows, columns).
all_data_2761_subjects.shape

(44754, 2761)

In [10]:
# Preview the first rows of the full expression matrix.
all_data_2761_subjects.head()

Unnamed: 0_level_0,GSM259087,GSM259088,GSM259089,GSM259090,GSM259091,GSM259096,GSM259100,GSM259101,GSM262030,GSM262031,...,GSM493822,GSM493823,GSM493824,GSM493825,GSM493826,GSM493827,GSM493828,GSM493829,GSM493830,GSM493831
ProbeSet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,0.161281,-0.228633,0.127034,0.14457,-0.094745,0.171211,0.233032,0.465901,0.456043,0.043278,...,0.190142,0.218486,0.278154,0.098316,-0.07027,0.15632,0.370477,0.24513,0.239496,0.098133
1053_at,1.041374,1.070913,0.388446,0.865756,0.90733,0.948387,0.873951,0.580145,0.675983,0.646015,...,0.847127,0.828324,0.676728,0.955802,0.456537,0.677795,0.573524,0.812244,0.678431,0.336091
117_at,-0.034644,-0.036937,-0.206103,-0.020064,-0.022168,0.469018,0.142008,-0.029783,0.545955,-0.141553,...,-0.002396,0.529174,-0.200546,0.417786,0.249252,-0.174052,0.355376,0.713864,0.647331,-0.089443
121_at,0.795588,0.82676,0.905458,0.762044,0.741079,1.006717,1.041507,1.20507,1.024954,1.127734,...,0.703615,1.092292,0.935975,0.887921,0.6158,0.986255,1.054906,1.194923,1.114506,0.81603
1255_g_at,-1.330335,-1.363635,-1.512532,-1.444253,-1.573762,-1.315563,-1.472113,-1.625229,-1.470817,-1.615853,...,-1.514659,-1.400306,-1.569716,-1.507001,-1.410055,-1.526909,-1.721585,-1.492023,-1.362162,-1.566337


In [11]:
# Row filter expressed as an inner join: concatenating column-wise with the
# column-less predictor frame keeps only the rows whose label appears in both
# indexes, and adds no new columns.
probsets_2761_subjects_concat = pd.concat(
    objs=[all_data_2761_subjects, extracted_predictors_probsets],
    axis="columns",
    join="inner",
    sort=False,
)

In [12]:
# Dimensions after restricting the rows to the predictor probe sets.
probsets_2761_subjects_concat.shape

(974, 2761)

In [13]:
# Preview the filtered expression matrix.
probsets_2761_subjects_concat.head()

Unnamed: 0_level_0,GSM259087,GSM259088,GSM259089,GSM259090,GSM259091,GSM259096,GSM259100,GSM259101,GSM262030,GSM262031,...,GSM493822,GSM493823,GSM493824,GSM493825,GSM493826,GSM493827,GSM493828,GSM493829,GSM493830,GSM493831
ProbeSet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200602_at,1.297539,-1.37891,0.924651,0.824415,0.934866,1.021136,0.835133,0.759041,1.387909,1.496609,...,1.396606,1.304211,1.794907,1.171386,1.676921,2.177924,2.303967,1.145742,1.344041,0.404112
200722_s_at,1.392409,1.424218,1.382827,1.282069,1.396415,1.295529,1.4333,0.848982,1.508736,1.662617,...,1.12928,1.026279,0.913368,1.173596,0.955293,0.772798,1.156402,0.955061,1.075669,0.971523
200782_at,1.027064,1.192688,1.511259,0.01503,1.014507,1.442003,1.544493,1.345384,1.605631,-0.418933,...,1.484623,2.009642,1.488396,1.824245,1.431485,1.379323,2.263255,1.530482,1.807943,1.456095
200832_s_at,0.111663,-0.18903,0.958691,0.145804,-0.076499,1.095383,0.985151,-0.000545,0.445235,0.340515,...,0.563403,0.774552,1.972799,0.225612,0.414192,0.376793,0.616376,1.345123,1.087492,1.02648
200946_x_at,1.48336,1.472979,1.452452,1.317093,1.460708,1.371052,1.343237,0.750398,1.495441,1.341197,...,1.204417,1.130008,1.141774,1.116476,0.905636,0.595948,1.421084,0.849171,1.191834,0.874248


In [14]:
# Clear the row-index name by assigning a renamed copy of the index.
probsets_2761_subjects_concat.index = probsets_2761_subjects_concat.index.rename(None)

In [15]:
# Preview again now that the index name has been cleared.
probsets_2761_subjects_concat.head()

Unnamed: 0,GSM259087,GSM259088,GSM259089,GSM259090,GSM259091,GSM259096,GSM259100,GSM259101,GSM262030,GSM262031,...,GSM493822,GSM493823,GSM493824,GSM493825,GSM493826,GSM493827,GSM493828,GSM493829,GSM493830,GSM493831
200602_at,1.297539,-1.37891,0.924651,0.824415,0.934866,1.021136,0.835133,0.759041,1.387909,1.496609,...,1.396606,1.304211,1.794907,1.171386,1.676921,2.177924,2.303967,1.145742,1.344041,0.404112
200722_s_at,1.392409,1.424218,1.382827,1.282069,1.396415,1.295529,1.4333,0.848982,1.508736,1.662617,...,1.12928,1.026279,0.913368,1.173596,0.955293,0.772798,1.156402,0.955061,1.075669,0.971523
200782_at,1.027064,1.192688,1.511259,0.01503,1.014507,1.442003,1.544493,1.345384,1.605631,-0.418933,...,1.484623,2.009642,1.488396,1.824245,1.431485,1.379323,2.263255,1.530482,1.807943,1.456095
200832_s_at,0.111663,-0.18903,0.958691,0.145804,-0.076499,1.095383,0.985151,-0.000545,0.445235,0.340515,...,0.563403,0.774552,1.972799,0.225612,0.414192,0.376793,0.616376,1.345123,1.087492,1.02648
200946_x_at,1.48336,1.472979,1.452452,1.317093,1.460708,1.371052,1.343237,0.750398,1.495441,1.341197,...,1.204417,1.130008,1.141774,1.116476,0.905636,0.595948,1.421084,0.849171,1.191834,0.874248


In [16]:
# Flip the matrix so the former column labels become the row index, and name
# that row index "ID_REF".
probsets_2761_subjects_concat_transpose = (
    probsets_2761_subjects_concat
    .T
    .rename_axis(index="ID_REF")
)

In [17]:
# Dimensions after transposing (rows and columns swapped).
probsets_2761_subjects_concat_transpose.shape

(2761, 974)

In [18]:
# Preview the transposed matrix.
probsets_2761_subjects_concat_transpose.head()

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,242888_at,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM259087,1.297539,1.392409,1.027064,0.111663,1.48336,1.449332,1.092168,1.051632,0.393156,1.148478,...,-0.774488,-0.867879,0.619717,-1.187252,-0.776018,0.641581,-0.13487,0.194044,-0.456224,-0.112474
GSM259088,-1.37891,1.424218,1.192688,-0.18903,1.472979,1.29681,1.998511,1.453404,0.880878,0.738508,...,-0.19025,-0.945929,0.177261,-1.125785,-0.778146,0.900601,-0.751019,0.858652,-0.025243,-0.723126
GSM259089,0.924651,1.382827,1.511259,0.958691,1.452452,0.53482,1.406378,1.00945,0.736137,1.239491,...,-0.643756,-0.933845,-0.091583,-0.888849,-0.574294,0.733291,-0.525242,0.158801,-0.14856,-0.301435
GSM259090,0.824415,1.282069,0.01503,0.145804,1.317093,1.005495,0.498802,-0.004537,0.091547,1.226556,...,-0.718261,-0.659499,0.952519,-1.081747,-0.640571,0.786002,-0.572304,-0.349141,-0.511744,-0.274177
GSM259091,0.934866,1.396415,1.014507,-0.076499,1.460708,1.163035,1.480175,0.979271,0.59728,0.986748,...,-0.667744,-0.738587,0.203649,-1.144351,-0.60503,0.863905,-0.462335,-0.139304,-0.245095,-0.614119


In [19]:
# Longer preview (up to 20 rows) of the transposed matrix.
probsets_2761_subjects_concat_transpose.head(20)

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,242888_at,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM259087,1.297539,1.392409,1.027064,0.111663,1.48336,1.449332,1.092168,1.051632,0.393156,1.148478,...,-0.774488,-0.867879,0.619717,-1.187252,-0.776018,0.641581,-0.13487,0.194044,-0.456224,-0.112474
GSM259088,-1.37891,1.424218,1.192688,-0.18903,1.472979,1.29681,1.998511,1.453404,0.880878,0.738508,...,-0.19025,-0.945929,0.177261,-1.125785,-0.778146,0.900601,-0.751019,0.858652,-0.025243,-0.723126
GSM259089,0.924651,1.382827,1.511259,0.958691,1.452452,0.53482,1.406378,1.00945,0.736137,1.239491,...,-0.643756,-0.933845,-0.091583,-0.888849,-0.574294,0.733291,-0.525242,0.158801,-0.14856,-0.301435
GSM259090,0.824415,1.282069,0.01503,0.145804,1.317093,1.005495,0.498802,-0.004537,0.091547,1.226556,...,-0.718261,-0.659499,0.952519,-1.081747,-0.640571,0.786002,-0.572304,-0.349141,-0.511744,-0.274177
GSM259091,0.934866,1.396415,1.014507,-0.076499,1.460708,1.163035,1.480175,0.979271,0.59728,0.986748,...,-0.667744,-0.738587,0.203649,-1.144351,-0.60503,0.863905,-0.462335,-0.139304,-0.245095,-0.614119
GSM259096,1.021136,1.295529,1.442003,1.095383,1.371052,0.90491,1.70321,0.480758,0.804232,0.75632,...,-0.636273,-1.029896,0.197118,-1.101218,-0.71682,0.094377,-0.70888,0.146812,-0.346413,-0.472607
GSM259100,0.835133,1.4333,1.544493,0.985151,1.343237,1.179395,1.098126,0.728658,1.354265,0.918272,...,-0.0411,-1.237078,0.370807,-1.064741,-0.497846,1.111345,-0.213408,-0.019413,-0.765516,-0.141824
GSM259101,0.759041,0.848982,1.345384,-0.000545,0.750398,0.360325,1.226277,0.09596,1.569649,0.395147,...,-0.636574,-0.693307,0.239105,-0.6366,-0.165007,-0.624785,-0.57442,-0.186702,0.184921,-1.028991
GSM262030,1.387909,1.508736,1.605631,0.445235,1.495441,0.961326,1.226845,-0.164048,0.584732,1.365628,...,-0.214568,-0.795158,0.80274,-0.860013,-0.466079,0.383732,-0.135858,-0.262643,-0.83803,-0.337494
GSM262031,1.496609,1.662617,-0.418933,0.340515,1.341197,0.993248,0.331714,-0.339634,0.168774,1.488161,...,-0.483179,-0.758559,0.58011,-0.785063,-0.101637,-0.076572,0.254018,-0.077207,-0.712226,-0.150745


In [20]:
# Labels for the "All_Factors_Corrected_Data_2761_Subjects_2213_AML_548_Healthy"
# data, which includes 2213 AML and 548 healthy subjects.
# The first CSV column is used as the row index.
all_data_2761_subjects_labels = pd.read_csv(
    "All_Factors_Corrected_Data_2761_Subjects_2213_AML_548_Healthy_Labels.csv",
    index_col=[0],
)

In [21]:
# Dimensions of the label table as (rows, columns).
all_data_2761_subjects_labels.shape

(2761, 6)

In [22]:
# Preview up to 20 rows of the label table.
all_data_2761_subjects_labels.head(20)

Unnamed: 0_level_0,Studies_Datasets,Age,Sex,Sample_source,Disease_state,Age_group
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GSM259087,GSE10258,46.0,Male,Bone Marrow (BM),AML,40 to 49
GSM259088,GSE10258,77.0,Female,Bone Marrow (BM),AML,70 to 79
GSM259089,GSE10258,59.0,Female,Bone Marrow (BM),AML,50 to 59
GSM259090,GSE10258,95.0,Female,Bone Marrow (BM),AML,80 to 100
GSM259091,GSE10258,69.0,Female,Bone Marrow (BM),AML,60 to 69
GSM259096,GSE10258,63.0,Female,Bone Marrow (BM),AML,60 to 69
GSM259100,GSE10258,69.0,Female,Bone Marrow (BM),AML,60 to 69
GSM259101,GSE10258,71.0,Male,Bone Marrow (BM),AML,70 to 79
GSM262030,GSE10358,77.0,Male,Bone Marrow (BM),AML,70 to 79
GSM262031,GSE10358,46.0,Male,Bone Marrow (BM),AML,40 to 49


In [23]:
# Keep only the Disease_state column, as a one-column DataFrame that retains
# the label table's row index.
all_data_2761_subjects_idref_diseasestate = all_data_2761_subjects_labels.loc[
    :, ["Disease_state"]
]

In [24]:
# Dimensions of the one-column disease-state frame.
all_data_2761_subjects_idref_diseasestate.shape

(2761, 1)

In [25]:
# Preview the disease-state labels.
all_data_2761_subjects_idref_diseasestate.head()

Unnamed: 0_level_0,Disease_state
ID_REF,Unnamed: 1_level_1
GSM259087,AML
GSM259088,AML
GSM259089,AML
GSM259090,AML
GSM259091,AML


In [26]:
# Append the Disease_state column to the expression features. The column-wise
# concat aligns the two frames on their row index; with the default outer join,
# a row label present in only one frame would yield NaN entries.
all_data_2761_subjects_probsets_predictors_with_diseasestate = pd.concat(
    objs=[
        probsets_2761_subjects_concat_transpose,
        all_data_2761_subjects_idref_diseasestate,
    ],
    axis="columns",
    sort=False,
)

In [27]:
# Dimensions after appending the Disease_state column.
all_data_2761_subjects_probsets_predictors_with_diseasestate.shape

(2761, 975)

In [28]:
# Preview the first rows of the combined feature/label table.
all_data_2761_subjects_probsets_predictors_with_diseasestate.head()

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at,Disease_state
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM259087,1.297539,1.392409,1.027064,0.111663,1.48336,1.449332,1.092168,1.051632,0.393156,1.148478,...,-0.867879,0.619717,-1.187252,-0.776018,0.641581,-0.13487,0.194044,-0.456224,-0.112474,AML
GSM259088,-1.37891,1.424218,1.192688,-0.18903,1.472979,1.29681,1.998511,1.453404,0.880878,0.738508,...,-0.945929,0.177261,-1.125785,-0.778146,0.900601,-0.751019,0.858652,-0.025243,-0.723126,AML
GSM259089,0.924651,1.382827,1.511259,0.958691,1.452452,0.53482,1.406378,1.00945,0.736137,1.239491,...,-0.933845,-0.091583,-0.888849,-0.574294,0.733291,-0.525242,0.158801,-0.14856,-0.301435,AML
GSM259090,0.824415,1.282069,0.01503,0.145804,1.317093,1.005495,0.498802,-0.004537,0.091547,1.226556,...,-0.659499,0.952519,-1.081747,-0.640571,0.786002,-0.572304,-0.349141,-0.511744,-0.274177,AML
GSM259091,0.934866,1.396415,1.014507,-0.076499,1.460708,1.163035,1.480175,0.979271,0.59728,0.986748,...,-0.738587,0.203649,-1.144351,-0.60503,0.863905,-0.462335,-0.139304,-0.245095,-0.614119,AML


In [29]:
# Inspect the last rows of the combined feature/label table as well.
all_data_2761_subjects_probsets_predictors_with_diseasestate.tail()

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at,Disease_state
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM493827,2.177924,0.772798,1.379323,0.376793,0.595948,0.386725,0.989377,0.039451,1.714442,-0.229545,...,-0.826432,0.086053,-1.019571,-1.003414,-0.320319,-1.008114,0.216513,1.386816,-0.855844,Healthy
GSM493828,2.303967,1.156402,2.263255,0.616376,1.421084,1.186721,1.16374,0.127837,1.818683,0.631109,...,-1.385255,-0.912503,-0.394909,-0.479038,0.458512,-0.564428,-0.158401,-0.047358,-0.652894,Healthy
GSM493829,1.145742,0.955061,1.530482,1.345123,0.849171,1.022034,2.030401,0.980312,1.273175,0.747698,...,-1.026028,-0.635666,-0.947722,-0.664534,-0.843833,-0.635313,-0.453725,-0.290744,-1.068347,Healthy
GSM493830,1.344041,1.075669,1.807943,1.087492,1.191834,0.346187,1.371615,-0.207461,1.574367,0.094657,...,-0.782958,-0.735661,-1.153608,-0.515021,0.533538,-0.47683,-0.347121,-0.237164,-0.462761,Healthy
GSM493831,0.404112,0.971523,1.456095,1.02648,0.874248,-0.078279,1.217053,0.396528,1.029868,-0.281646,...,-1.000471,-0.007861,-0.848438,-0.474011,0.456377,-0.636405,-0.357066,0.720439,-0.824468,Healthy


In [30]:
# Encode the class label numerically: AML -> 0, Healthy -> 1.
# An explicit, validated map is used instead of DataFrame.replace so that an
# unexpected or missing label fails loudly rather than passing through
# unencoded, and so the result does not depend on implicit object-to-int
# downcasting.
disease_state_codes = {"AML": 0, "Healthy": 1}
disease_state_labels = all_data_2761_subjects_probsets_predictors_with_diseasestate["Disease_state"]

unexpected_disease_states = set(disease_state_labels.unique()) - set(disease_state_codes)
if unexpected_disease_states:
    raise ValueError(
        f"Unexpected Disease_state values: {sorted(map(str, unexpected_disease_states))}"
    )

# assign() replaces the existing column in place, so the column order is kept.
all_data_2761_subjects_probsets_with_0AML_1Healthy = (
    all_data_2761_subjects_probsets_predictors_with_diseasestate.assign(
        Disease_state=disease_state_labels.map(disease_state_codes).astype("int64")
    )
)



In [31]:
# Preview the table after encoding Disease_state as 0 (AML) / 1 (Healthy).
all_data_2761_subjects_probsets_with_0AML_1Healthy.head()

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at,Disease_state
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM259087,1.297539,1.392409,1.027064,0.111663,1.48336,1.449332,1.092168,1.051632,0.393156,1.148478,...,-0.867879,0.619717,-1.187252,-0.776018,0.641581,-0.13487,0.194044,-0.456224,-0.112474,0
GSM259088,-1.37891,1.424218,1.192688,-0.18903,1.472979,1.29681,1.998511,1.453404,0.880878,0.738508,...,-0.945929,0.177261,-1.125785,-0.778146,0.900601,-0.751019,0.858652,-0.025243,-0.723126,0
GSM259089,0.924651,1.382827,1.511259,0.958691,1.452452,0.53482,1.406378,1.00945,0.736137,1.239491,...,-0.933845,-0.091583,-0.888849,-0.574294,0.733291,-0.525242,0.158801,-0.14856,-0.301435,0
GSM259090,0.824415,1.282069,0.01503,0.145804,1.317093,1.005495,0.498802,-0.004537,0.091547,1.226556,...,-0.659499,0.952519,-1.081747,-0.640571,0.786002,-0.572304,-0.349141,-0.511744,-0.274177,0
GSM259091,0.934866,1.396415,1.014507,-0.076499,1.460708,1.163035,1.480175,0.979271,0.59728,0.986748,...,-0.738587,0.203649,-1.144351,-0.60503,0.863905,-0.462335,-0.139304,-0.245095,-0.614119,0


In [32]:

# Pull the encoded label out as a one-column DataFrame.
target_data_diseasestate = (
    all_data_2761_subjects_probsets_with_0AML_1Healthy["Disease_state"].to_frame()
)


In [33]:
# Relabel the single column as "target".
target_data_diseasestate = target_data_diseasestate.set_axis(["target"], axis="columns")

In [34]:
# Dimensions of the target frame.
target_data_diseasestate.shape

(2761, 1)

In [35]:
# Preview the encoded target column.
target_data_diseasestate.head()

Unnamed: 0_level_0,target
ID_REF,Unnamed: 1_level_1
GSM259087,0
GSM259088,0
GSM259089,0
GSM259090,0
GSM259091,0


In [36]:
# Feature matrix: every column except the Disease_state label.
# The column is dropped by keyword because passing `axis` positionally
# (`drop("Disease_state", 1)`) triggers a FutureWarning on pandas 1.3+ and is
# rejected by pandas 2.0+.
# NOTE(review): despite the "dependent" name, this frame holds the predictor
# (feature) columns; the name is kept because later cells and output files use it.
dependent_data_diseasestate = all_data_2761_subjects_probsets_with_0AML_1Healthy.drop(
    columns="Disease_state"
)



  dependent_data_diseasestate = all_data_2761_subjects_probsets_with_0AML_1Healthy.drop("Disease_state",1)


In [37]:
# Dimensions of the feature matrix after removing the label column.
dependent_data_diseasestate.shape

(2761, 974)

In [38]:
# Preview the feature matrix.
dependent_data_diseasestate.head()

Unnamed: 0_level_0,200602_at,200722_s_at,200782_at,200832_s_at,200946_x_at,200974_at,200983_x_at,200986_at,200998_s_at,201015_s_at,...,242888_at,243001_at,243092_at,243384_at,243706_at,243797_at,244030_at,244043_at,244297_at,37079_at
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM259087,1.297539,1.392409,1.027064,0.111663,1.48336,1.449332,1.092168,1.051632,0.393156,1.148478,...,-0.774488,-0.867879,0.619717,-1.187252,-0.776018,0.641581,-0.13487,0.194044,-0.456224,-0.112474
GSM259088,-1.37891,1.424218,1.192688,-0.18903,1.472979,1.29681,1.998511,1.453404,0.880878,0.738508,...,-0.19025,-0.945929,0.177261,-1.125785,-0.778146,0.900601,-0.751019,0.858652,-0.025243,-0.723126
GSM259089,0.924651,1.382827,1.511259,0.958691,1.452452,0.53482,1.406378,1.00945,0.736137,1.239491,...,-0.643756,-0.933845,-0.091583,-0.888849,-0.574294,0.733291,-0.525242,0.158801,-0.14856,-0.301435
GSM259090,0.824415,1.282069,0.01503,0.145804,1.317093,1.005495,0.498802,-0.004537,0.091547,1.226556,...,-0.718261,-0.659499,0.952519,-1.081747,-0.640571,0.786002,-0.572304,-0.349141,-0.511744,-0.274177
GSM259091,0.934866,1.396415,1.014507,-0.076499,1.460708,1.163035,1.480175,0.979271,0.59728,0.986748,...,-0.667744,-0.738587,0.203649,-1.144351,-0.60503,0.863905,-0.462335,-0.139304,-0.245095,-0.614119


In [39]:
# Persist the encoded target column, keeping the row index in the file.
target_data_diseasestate.to_csv(
    "Target_Data_All_Factors_Corrected_Data_2761_Subjects_2213_AML_548_Healthy_974_AML_probsets_Predictors.csv",
    index=True,
    encoding='utf-8',
)

# Persist the feature matrix transposed back, so the former columns become the
# rows of the written file.
dependent_data_diseasestate.T.to_csv(
    "Dependent_Data_All_Factors_Corrected_Data_2761_Subjects_2213_AML_548_Healthy_974_AML_probsets_Predictors.csv",
    index=True,
    encoding='utf-8',
)

In [40]:
# Load the held-out test expression data; the first CSV column becomes the row index.
testing_data_diseasestate = pd.read_csv(
    "Test_Data_All_Factors_Corrected_Data_613_Subjects.csv",
    index_col=[0],
)

In [41]:
# Dimensions of the test expression matrix as (rows, columns).
testing_data_diseasestate.shape


(44754, 613)

In [42]:
# Preview the test expression matrix.
testing_data_diseasestate.head()

Unnamed: 0,GSM2884491,GSM2884492,GSM2884499,GSM1664972,GSM1664976,GSM1664980,GSM1664983,GSM1664986,GSM1664990,GSM1664994,...,GSM376922,GSM376923,GSM376924,GSM376925,GSM376926,GSM376927,GSM376928,GSM376929,GSM376930,GSM376931
1007_s_at,0.185789,0.20492,0.127759,-0.592914,0.280449,0.223089,-0.225795,0.123038,0.501379,-0.2401,...,0.135366,0.167147,0.01581,-0.006813,0.157455,0.113765,-0.001656,-0.13664,0.084627,0.143818
1053_at,-0.076002,-0.147236,0.094784,0.958736,0.476758,0.696128,0.790149,0.859926,0.464457,0.741036,...,0.929959,0.914503,0.922456,1.057455,0.826698,0.859063,0.736877,0.99429,0.766523,0.742823
117_at,0.387563,0.409717,0.188417,0.107779,1.865739,-0.159259,1.312951,0.059105,0.290527,-0.044187,...,-0.065008,0.064108,0.636297,-0.159768,0.022635,0.066194,-0.042692,0.176942,0.160465,-0.139263
121_at,1.373841,1.511071,1.223651,0.058481,0.936253,0.296598,0.313196,0.500946,0.639195,-0.225725,...,0.878026,0.765092,0.795955,0.82742,0.876129,1.055261,0.883866,0.689586,1.008582,0.891654
1255_g_at,-1.041606,-1.040864,-1.007226,-1.480532,-1.500173,-1.366609,-1.517149,-1.681056,-1.236217,-1.731728,...,-1.478983,-1.521176,-1.662529,-1.356728,-1.497778,-1.640361,-1.622765,-1.361585,-1.323178,-1.50116


In [43]:
# Restrict the test matrix to the predictor probe sets: an inner-join concat
# with the column-less predictor frame keeps only rows present in both indexes.
testing_data_diseasestate_concat = pd.concat(
    objs=[testing_data_diseasestate, extracted_predictors_probsets],
    axis="columns",
    join="inner",
)



In [44]:
# Dimensions of the test matrix after restricting to the predictor probe sets.
testing_data_diseasestate_concat.shape


(974, 613)

In [45]:
# Preview the filtered test matrix.
testing_data_diseasestate_concat.head()

Unnamed: 0,GSM2884491,GSM2884492,GSM2884499,GSM1664972,GSM1664976,GSM1664980,GSM1664983,GSM1664986,GSM1664990,GSM1664994,...,GSM376922,GSM376923,GSM376924,GSM376925,GSM376926,GSM376927,GSM376928,GSM376929,GSM376930,GSM376931
200602_at,2.026919,2.049811,1.709657,-1.044179,1.151768,1.687574,1.617826,-1.996655,1.898017,-0.998463,...,1.564659,1.558346,1.645205,1.445314,1.623301,1.486411,1.511975,1.465835,1.799389,1.66136
200722_s_at,0.124883,0.06074,-0.259119,0.800182,0.550376,0.829392,0.960425,0.991897,0.849874,0.980783,...,1.054061,1.210309,1.349245,1.394589,1.243525,1.188862,1.049569,0.974627,1.216194,1.01687
200782_at,2.163627,2.200357,1.737634,-1.455042,1.595869,-1.643454,1.388912,0.241918,-1.446513,-1.612265,...,1.912012,1.755833,1.831788,1.677988,1.821006,1.805425,1.787862,1.541202,1.815979,1.839551
200832_s_at,0.551225,-0.165847,0.844776,1.018332,-0.403719,1.566858,0.530625,-0.744727,0.373864,0.010042,...,0.763336,1.053646,1.174593,0.955716,1.202944,0.94392,1.009638,0.658432,1.373627,1.270815
200946_x_at,0.523429,0.093164,-0.27451,0.423954,0.506949,0.828788,0.724154,0.993947,0.747143,0.870918,...,1.29348,1.116778,1.187397,1.09942,1.350748,0.914676,0.936274,0.906215,1.009757,1.201025


In [46]:
# Persist the filtered test matrix, keeping the row index in the file.
# NOTE(review): the file name ends in ".csv.csv", which looks like an accidental
# doubled extension. It is kept unchanged here because other code may already
# read this exact name; confirm before renaming.
testing_data_diseasestate_concat.to_csv(
    "Test_Data_All_Factors_Corrected_Data_613_Subjects_974_AML_probsets_Predictors.csv.csv",
    index=True,
    encoding='utf-8',
)