In [95]:
import pandas as pd
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa

Загружаем данные (CCLE 23Q2). Оставляем релевантные колонки

In [96]:
# Columns kept from the GDSC1 fitted dose-response table: cell-line identifiers,
# drug annotation, the LN_IC50 target and the AUC / RMSE columns used later
# to choose between repeated measurements.
target_columns = [
    'COSMIC_ID', 'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'DRUG_NAME',
    'PATHWAY_NAME', 'LN_IC50', 'AUC', 'RMSE',
]
targets_df = pd.read_excel('data/gdsc/GDSC1_fitted_dose_response_27Oct23.xlsx')
targets_df = targets_df.loc[:, target_columns]

targets_df

Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,DRUG_NAME,PATHWAY_NAME,LN_IC50,AUC,RMSE
0,684057,ES5,SIDM00263,Erlotinib,EGFR signaling,3.966813,0.985678,0.026081
1,684059,ES7,SIDM00269,Erlotinib,EGFR signaling,2.692090,0.972690,0.110059
2,684062,EW-11,SIDM00203,Erlotinib,EGFR signaling,2.477990,0.944459,0.087019
3,684072,SK-ES-1,SIDM01111,Erlotinib,EGFR signaling,2.033564,0.950758,0.016290
4,687448,COLO-829,SIDM00909,Erlotinib,EGFR signaling,2.966007,0.954778,0.180255
...,...,...,...,...,...,...,...,...
333156,1659823,SNU-1040,SIDM00217,I-CBP112,Chromatin histone acetylation,5.085294,0.972251,0.040661
333157,1660035,SNU-61,SIDM00194,I-CBP112,Chromatin histone acetylation,5.725399,0.976109,0.045453
333158,1660036,SNU-81,SIDM00193,I-CBP112,Chromatin histone acetylation,4.930753,0.970851,0.038612
333159,1674021,SNU-C5,SIDM00498,I-CBP112,Chromatin histone acetylation,4.551784,0.972330,0.042649


In [97]:
# Per-column check: True means the column holds at least one missing value.
targets_df.isna().any(axis=0)

COSMIC_ID          False
CELL_LINE_NAME     False
SANGER_MODEL_ID    False
DRUG_NAME          False
PATHWAY_NAME       False
LN_IC50            False
AUC                False
RMSE               False
dtype: bool

Отбираем таргеты для 1 лекарства с наибольшим кол-вом сэмплов. Это Cisplatin


In [98]:
# value_counts() orders drugs by number of rows (descending), so the first
# label is the drug with the most measurements.
most_screened_drug = targets_df.DRUG_NAME.value_counts().index[0]

# .copy() detaches the subset from the full table: later cells add a column to
# this frame, which on a plain boolean-mask slice raises SettingWithCopyWarning.
targets_df = targets_df[targets_df.DRUG_NAME == most_screened_drug].copy()
targets_df

Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,DRUG_NAME,PATHWAY_NAME,LN_IC50,AUC,RMSE
178031,683667,PFSK-1,SIDM01132,Cisplatin,DNA replication,2.413061,0.887261,0.130150
178032,684052,A673,SIDM00848,Cisplatin,DNA replication,0.931652,0.725126,0.085855
178033,684057,ES5,SIDM00263,Cisplatin,DNA replication,1.139422,0.724618,0.095790
178034,684059,ES7,SIDM00269,Cisplatin,DNA replication,0.981337,0.716411,0.108396
178035,684062,EW-11,SIDM00203,Cisplatin,DNA replication,1.354476,0.739942,0.180735
...,...,...,...,...,...,...,...,...
326428,1660034,SNU-407,SIDM00214,Cisplatin,DNA replication,1.616463,0.893384,0.045506
326429,1660035,SNU-61,SIDM00194,Cisplatin,DNA replication,3.834480,0.962849,0.052188
326430,1660036,SNU-81,SIDM00193,Cisplatin,DNA replication,2.752124,0.942381,0.065188
326431,1674021,SNU-C5,SIDM00498,Cisplatin,DNA replication,1.465027,0.887056,0.047462


In [99]:
# keep=False flags every copy of a repeated COSMIC_ID (not only the later ones);
# sorting puts the copies of the same cell line next to each other.
is_repeated_cosmic_id = targets_df.duplicated(subset=['COSMIC_ID'], keep=False)
targets_df.loc[is_repeated_cosmic_id].sort_values(by='COSMIC_ID')

Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,DRUG_NAME,PATHWAY_NAME,LN_IC50,AUC,RMSE
325514,683667,PFSK-1,SIDM01132,Cisplatin,DNA replication,2.010083,0.941060,0.025012
178031,683667,PFSK-1,SIDM01132,Cisplatin,DNA replication,2.413061,0.887261,0.130150
325515,684052,A673,SIDM00848,Cisplatin,DNA replication,-0.666270,0.664762,0.032636
178032,684052,A673,SIDM00848,Cisplatin,DNA replication,0.931652,0.725126,0.085855
325516,684057,ES5,SIDM00263,Cisplatin,DNA replication,1.149465,0.854437,0.058883
...,...,...,...,...,...,...,...,...
178962,1660036,SNU-81,SIDM00193,Cisplatin,DNA replication,4.109470,0.948090,0.036118
178963,1674021,SNU-C5,SIDM00498,Cisplatin,DNA replication,2.764805,0.894065,0.087766
326431,1674021,SNU-C5,SIDM00498,Cisplatin,DNA replication,1.465027,0.887056,0.047462
326432,1789883,DiFi,SIDM00049,Cisplatin,DNA replication,2.777253,0.964621,0.044126


Видим, что почти все образцы с дубликатами. Чтобы не выбрасывать дубликат случайно, воспользуемся метриками качества - среди дубликатов будем брать образец с наибольшей средней метрикой (между AUC и RMSE)

In [100]:
# Score each measurement by the row-wise mean of AUC and RMSE.
# assign() builds a new frame instead of writing a column into a filtered
# subset, which avoids SettingWithCopyWarning; targets_df still ends up with
# the Average_Score column.
targets_df = targets_df.assign(Average_Score=targets_df[['AUC', 'RMSE']].mean(axis=1))

# NOTE(review): RMSE looks like a curve-fit error (lower = better fit), so
# averaging it with AUC and keeping the highest score may prefer worse fits —
# confirm the intended criterion against the GDSC column definitions.

# Within each COSMIC_ID the highest score comes first, so keep='first'
# retains exactly one measurement per cell line.
targets_df_sorted = targets_df.sort_values(
    by=['COSMIC_ID', 'Average_Score'],
    ascending=[True, False],
)
targets_df_unique = (
    targets_df_sorted
    .drop_duplicates(subset='COSMIC_ID', keep='first')
    .drop(columns=['Average_Score', 'AUC', 'RMSE'])
)

targets_df_unique

Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,DRUG_NAME,PATHWAY_NAME,LN_IC50
178031,683667,PFSK-1,SIDM01132,Cisplatin,DNA replication,2.413061
178032,684052,A673,SIDM00848,Cisplatin,DNA replication,0.931652
325516,684057,ES5,SIDM00263,Cisplatin,DNA replication,1.149465
178034,684059,ES7,SIDM00269,Cisplatin,DNA replication,0.981337
178035,684062,EW-11,SIDM00203,Cisplatin,DNA replication,1.354476
...,...,...,...,...,...,...
326428,1660034,SNU-407,SIDM00214,Cisplatin,DNA replication,1.616463
326429,1660035,SNU-61,SIDM00194,Cisplatin,DNA replication,3.834480
326430,1660036,SNU-81,SIDM00193,Cisplatin,DNA replication,2.752124
178963,1674021,SNU-C5,SIDM00498,Cisplatin,DNA replication,2.764805


In [101]:
# Sample annotation table. The first CSV column is read as the index and then
# turned back into a regular column by reset_index() (shown as DepMap_ID in the
# output). Rows are dropped only when all three identifier columns are missing;
# the identifier columns are renamed to match the names used in targets_df.
sample_df = (
    pd.read_csv('data/gdsc/sample_info.csv', index_col=0)
    .loc[:, ['cell_line_name', 'CCLE_Name', 'COSMICID', 'sex', 'age', 'Sanger_Model_ID']]
    .dropna(subset=['cell_line_name', 'COSMICID', 'Sanger_Model_ID'], how='all')
    .rename(columns={
        'COSMICID': 'COSMIC_ID',
        'Sanger_Model_ID': 'SANGER_MODEL_ID',
        'cell_line_name': 'CELL_LINE_NAME',
    })
    .reset_index()
)

sample_df

Unnamed: 0,DepMap_ID,CELL_LINE_NAME,CCLE_Name,COSMIC_ID,sex,age,SANGER_MODEL_ID
0,ACH-000016,SLR 21,SLR21_KIDNEY,,,,
1,ACH-000032,MHH-CALL-3,MHHCALL3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Female,11,
2,ACH-000033,NCI-H1819,NCIH1819_LUNG,,Female,55,
3,ACH-000043,Hs 895.T,HS895T_FIBROBLAST,,Female,48,
4,ACH-000049,HEK TE,HEKTE_KIDNEY,,,,
...,...,...,...,...,...,...,...
1772,ACH-002393,CRO-AP3,CROAP3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Male,42,SIDM00070
1773,ACH-002394,GEO,GEO_LARGE_INTESTINE,,,,SIDM00068
1774,ACH-002395,HuH-6 Clone 5,HUH6CLONE5_LIVER,,Male,1,SIDM01257
1775,ACH-002396,Sarc9371,SARC9371_BONE,,,,SIDM00033


Матчим таргеты LN_IC50 с DepMap_ID по общему идентификатору SANGER_MODEL_ID

In [109]:
# Attach the drug-response targets to the sample annotation through the shared
# SANGER_MODEL_ID; the inner join keeps only cell lines present in both tables.
# Columns that exist in both frames (COSMIC_ID, CELL_LINE_NAME) get the default
# _x / _y suffixes.
merge_on_sanger_model_id = sample_df.merge(
    targets_df_unique,
    on='SANGER_MODEL_ID',
    how='inner',
)

# Remove exact (DepMap_ID, LN_IC50) repeats and index the result by DepMap_ID.
final_df = (
    merge_on_sanger_model_id
    .drop_duplicates(subset=['DepMap_ID', 'LN_IC50'])
    .reset_index(drop=True)
    .set_index('DepMap_ID')
)

# Sort by DepMap_ID and drop every column that still has a missing value.
final_target_df = final_df.sort_index().dropna(axis=1, how='any')

final_target_df

Unnamed: 0_level_0,CCLE_Name,SANGER_MODEL_ID,COSMIC_ID_y,CELL_LINE_NAME_y,DRUG_NAME,PATHWAY_NAME,LN_IC50
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACH-000001,NIHOVCAR3_OVARY,SIDM00105,905933,OVCAR-3,Cisplatin,DNA replication,2.240063
ACH-000002,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00829,905938,HL-60,Cisplatin,DNA replication,2.031773
ACH-000004,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00594,907053,HEL,Cisplatin,DNA replication,0.503415
ACH-000006,MONOMAC6_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM01023,908148,MONO-MAC-6,Cisplatin,DNA replication,3.429265
ACH-000007,LS513_LARGE_INTESTINE,SIDM00677,907795,LS-513,Cisplatin,DNA replication,3.795200
...,...,...,...,...,...,...,...
ACH-002316,WIL2NS_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM01102,1331049,WIL2-NS,Cisplatin,DNA replication,2.227209
ACH-002317,YT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00410,946358,YT,Cisplatin,DNA replication,2.879614
ACH-002339,MOT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00623,908149,Mo-T,Cisplatin,DNA replication,2.926873
ACH-002345,CAR1_LARGE_INTESTINE,SIDM00488,924108,CaR-1,Cisplatin,DNA replication,0.859573


In [115]:
# Sanity check on the selected drug's targets (before de-duplication):
# list rows that share an identical LN_IC50 value.
has_repeated_ic50 = targets_df.duplicated(subset=['LN_IC50'], keep=False)
targets_df.loc[has_repeated_ic50].sort_values(by='LN_IC50')

Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,DRUG_NAME,PATHWAY_NAME,LN_IC50,AUC,RMSE,Average_Score


In [116]:
# Same check after the merge: identical LN_IC50 values here point to one
# target row that was matched to more than one DepMap_ID.
matched_more_than_once = final_target_df.duplicated(subset=['LN_IC50'], keep=False)
final_target_df.loc[matched_more_than_once].sort_values(by='LN_IC50')

Unnamed: 0_level_0,CCLE_Name,SANGER_MODEL_ID,COSMIC_ID_y,CELL_LINE_NAME_y,DRUG_NAME,PATHWAY_NAME,LN_IC50
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACH-001543,KOSC2CL343_UPPER_AERODIGESTIVE_TRACT,SIDM00603,753570,KOSC-2,Cisplatin,DNA replication,1.258019
ACH-002260,[Merged to ACH-001543] KOSC2_UPPER_AERODIGESTI...,SIDM00603,753570,KOSC-2,Cisplatin,DNA replication,1.258019


Так как исходно в таргете по этому лекарству не было дубликатов по значению LN_IC50, нужно убрать 1 лишний сэмпл, который заматчился дважды по SANGER_MODEL_ID

In [117]:
# ACH-002260 is the row whose CCLE_Name starts with "[Merged to ACH-001543]";
# it shares SANGER_MODEL_ID and LN_IC50 with ACH-001543, so only ACH-001543 is kept.
# errors='ignore' keeps the cell idempotent: final_target_df is rebound here,
# so a second run would otherwise fail with KeyError on the missing label.
final_target_df = final_target_df.drop(index='ACH-002260', errors='ignore')
final_target_df

Unnamed: 0_level_0,CCLE_Name,SANGER_MODEL_ID,COSMIC_ID_y,CELL_LINE_NAME_y,DRUG_NAME,PATHWAY_NAME,LN_IC50
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACH-000001,NIHOVCAR3_OVARY,SIDM00105,905933,OVCAR-3,Cisplatin,DNA replication,2.240063
ACH-000002,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00829,905938,HL-60,Cisplatin,DNA replication,2.031773
ACH-000004,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00594,907053,HEL,Cisplatin,DNA replication,0.503415
ACH-000006,MONOMAC6_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM01023,908148,MONO-MAC-6,Cisplatin,DNA replication,3.429265
ACH-000007,LS513_LARGE_INTESTINE,SIDM00677,907795,LS-513,Cisplatin,DNA replication,3.795200
...,...,...,...,...,...,...,...
ACH-002316,WIL2NS_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM01102,1331049,WIL2-NS,Cisplatin,DNA replication,2.227209
ACH-002317,YT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00410,946358,YT,Cisplatin,DNA replication,2.879614
ACH-002339,MOT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00623,908149,Mo-T,Cisplatin,DNA replication,2.926873
ACH-002345,CAR1_LARGE_INTESTINE,SIDM00488,924108,CaR-1,Cisplatin,DNA replication,0.859573


Итого остались данные по Cisplatin для 941 клеточной линии

In [104]:
# Gene-level copy-number table; the first CSV column (model IDs such as
# ACH-000667) is used as the row index.
copy_number_path = 'data/gdsc/OmicsCNGene-2.csv'
copy_number_df = pd.read_csv(copy_number_path, index_col=0)
copy_number_df

Unnamed: 0,DDX11L1 (84771),WASH7P (653635),MIR6859-1 (102466751),MIR1302-2 (100302278),FAM138A (645520),OR4F5 (79501),WASH9P (102723897),MIR6859-2 (102465909),OR4F29 (729759),AL669831.3 (101928626),...,BPY2C (442868),TTTY4C (474150),CSPG4P1Y (114758),CDY1 (9085),TTTY3 (114760),SNORD38B (94163),SCARNA4 (677771),SNORA50A (677830),SNORD3D (780854),POLR2J3 (548644)
ACH-000667,2.114107,2.114107,2.114107,2.114107,2.114107,3.880916e-08,1.417715,1.417715,1.417715,1.417715,...,,,,,,,,,,
ACH-001148,1.019314,1.019314,1.019314,1.019314,1.019314,1.019314e+00,1.019314,1.019314,1.019314,1.019314,...,,,,,,,,,,
ACH-000159,0.977269,0.977269,0.977269,0.977269,0.977269,9.772694e-01,0.977269,0.977269,0.977269,0.977269,...,0.729053,0.729053,0.729053,0.729053,0.729053,,,,,
ACH-001675,2.476574,2.476574,2.476574,2.476574,2.476574,2.476574e+00,0.796649,0.702516,0.702516,0.702516,...,,,,,,,,,,
ACH-000947,1.241205,1.241205,1.241205,1.241205,1.241205,1.241205e+00,1.241205,1.241205,0.064566,0.842838,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000185,1.000350,1.000350,1.000350,1.000350,1.000350,1.000350e+00,1.000350,1.000350,1.000350,1.000350,...,0.047106,0.047106,0.155943,0.089672,0.089672,1.000950,1.040914,1.003705,0.995258,1.008223
ACH-001044,0.542823,0.542823,0.542823,0.542823,0.542823,5.428233e-01,1.043950,1.043950,1.043950,1.043950,...,0.711068,0.711068,0.711068,0.711068,0.711068,1.057242,1.067719,0.852284,0.994610,1.258299
ACH-000494,1.005360,1.005360,1.005360,1.005360,1.005360,1.005360e+00,1.005360,1.005360,1.005360,1.005360,...,0.078468,0.078468,0.140482,0.140482,0.140482,1.008726,0.990482,1.003003,1.002252,1.005009
ACH-001087,1.038911,1.038911,1.038911,1.038911,1.038911,1.038911e+00,1.038911,1.038911,1.038911,1.038911,...,0.218842,0.218842,0.218842,0.218842,0.218842,1.045753,1.025980,1.090470,1.036552,1.074846


In [105]:
# Protein-coding gene expression table; the first CSV column (model IDs) is
# used as the row index.
expression_path = 'data/gdsc/OmicsExpressionProteinCodingGenesTPMLogp1-2.csv'
expression_df = pd.read_csv(expression_path, index_col=0)
expression_df

Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
ACH-001113,4.331992,0.000000,7.364660,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.000000,0.275007,0.0,0.000000,0.000000
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.504620,0.000000,0.189034,3.813525,4.221877,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.000000,0.014355,0.0,0.000000,0.000000
ACH-001339,3.150560,0.000000,7.379118,2.333424,4.228049,0.056584,1.310340,6.687201,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.000000,0.084064,0.0,0.000000,0.042644
ACH-001538,5.085340,0.000000,7.154211,2.545968,3.084064,0.000000,5.868390,6.165309,4.489928,3.956986,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.000000,0.028569,0.0,0.000000,0.000000
ACH-000242,6.729417,0.000000,6.537917,2.456806,3.867896,0.799087,7.208478,5.570159,7.127117,4.568032,...,1.117695,2.358959,0.084064,1.910733,0.000000,0.000000,0.464668,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000285,0.056584,0.000000,6.604368,3.266037,4.973152,0.411426,0.097611,0.704872,4.829850,5.178715,...,2.229588,0.084064,1.310340,3.039138,0.344828,0.000000,0.000000,0.0,0.475085,0.042644
ACH-002669,3.111031,0.000000,7.031329,1.541019,3.664483,0.014355,3.624101,6.805421,4.472488,4.397118,...,0.189034,0.400538,0.356144,1.327687,0.000000,0.000000,0.014355,0.0,0.000000,0.000000
ACH-001858,4.390943,0.000000,7.013239,1.887525,3.252476,0.028569,3.286881,6.902194,5.410748,3.401903,...,1.097611,0.400538,0.613532,1.992768,0.704872,0.000000,1.464668,0.0,0.000000,0.526069
ACH-001997,5.057450,0.000000,7.815191,2.538538,3.893362,0.028569,4.079805,6.971659,4.469886,3.463361,...,0.831877,0.847997,1.292782,2.153805,0.687061,0.000000,0.000000,0.0,0.000000,0.000000


In [106]:
# Somatic mutation calls in long format (one row per variant per model).
# The frame is re-indexed by ModelID and reduced to the variant coordinates
# (Chrom, Pos, Ref, Alt) plus the genotype column GT.
mutations_df = (
    pd.read_csv('data/gdsc/OmicsSomaticMutations-2.csv', index_col=0)
    .reset_index()
    .set_index('ModelID')
    .loc[:, ['Chrom', 'Pos', 'Ref', 'Alt', 'GT']]
)

mutations_df

Unnamed: 0_level_0,Chrom,Pos,Ref,Alt,GT
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACH-000839,chr1,1242864,GC,CT,0/1
ACH-000839,chr1,10647969,A,G,0|1
ACH-000839,chr1,10648097,T,G,0/1
ACH-000839,chr1,13198424,G,A,0/1
ACH-000839,chr1,13225068,A,G,0/1
...,...,...,...,...,...
ACH-000840,chrX,139562030,C,T,1|1
ACH-000840,chrX,141897189,C,G,1|1
ACH-000840,chrX,143509150,G,A,1|1
ACH-000840,chrM,13651,A,G,0/1


Выбираем клеточные линии, имеющиеся для выбранного таргета во всех 3х модальностях

In [118]:
# Cell lines present in the target table and in every omics table: start from
# the target index and intersect it with each modality's index in turn.
common_indices = final_target_df.index
for modality_index in (expression_df.index, mutations_df.index, copy_number_df.index):
    common_indices = common_indices.intersection(modality_index)

# Restrict every table to the shared cell lines.
copy_number_df_common = copy_number_df.loc[common_indices]
expression_df_common = expression_df.loc[common_indices]
mutations_df_common = mutations_df.loc[common_indices]
final_target_df_common = final_target_df.loc[common_indices]

final_target_df_common.to_csv('data/gdsc/cisplatin/target.csv')

In [124]:
# Number of cell lines shared by the target and all three modalities.
print('Итого количество клеточных линий для Cisplatin-a: ', len(common_indices))

Итого количество клеточных линий для Cisplatin-a:  680


Преобразовываем модальности. Мутации - создаем сводную таблицу, где для каждой клеточной линии (ModelID) будут заданы мутации, определяемые комбинацией Chrom, Pos, Ref, Alt, а её значение - колонкой GT. Если встречаются дубликаты, то берется первое встречающееся значение. Также, учитывая, что на выходе получается более миллиона признаков, оставляем только те, которые встретились более чем у 2х клеточных линий. Пропущенные значения заменяем классом 0|0 (мутации нет, оба аллеля референсные). Все остальные значения GT, кроме 1|1 (например, 0/1, 0|1, 1|0), объединяем в один класс 0|1. Сортируем индекс. Делаем one-hot encoding для моделей, которые не умеют обрабатывать категориальные признаки

In [125]:
# Wide genotype table: one row per cell line, one column per variant
# (Chrom, Pos, Ref, Alt); when a variant is listed several times for the same
# cell line, the first GT value is taken.
variant_key = ['Chrom', 'Pos', 'Ref', 'Alt']
mutations_pivot = mutations_df_common.pivot_table(
    index=mutations_df_common.index,
    columns=variant_key,
    values='GT',
    aggfunc='first',
)

In [126]:
# Keep variants with a genotype in at least 3 cell lines (thresh counts
# non-missing values per column), then mark the remaining gaps as '0|0'.
mutations_prepared = (
    mutations_pivot
    .dropna(axis='columns', thresh=3)
    .fillna('0|0')
)

In [127]:
mutation_labels = mutations_df['GT'].unique()

# Labels that are collapsed into '0|1': every GT value seen in the raw table
# except '1|1'. A set gives constant-time membership tests instead of scanning
# the NumPy array for every cell; missing values are left out so the lookup
# can never match NaN.
_collapsed_labels = frozenset(
    label for label in mutation_labels
    if pd.notna(label) and label != '1|1'
)


def transform_label(l):
    """Collapse a genotype label into the shared '0|1' class where applicable.

    Args:
        l: a single cell value of the pivoted genotype table.

    Returns:
        '0|1' when ``l`` is one of the GT labels found in ``mutations_df``
        other than '1|1'; otherwise ``l`` unchanged.
    """
    if l in _collapsed_labels:
        return '0|1'
    return l


# NOTE(review): any raw label other than the exact string '1|1' is collapsed,
# which would include an unphased '1/1' if the data contains it — confirm.

# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# pandas >= 2.1; Series.map is available in every pandas version.
mutations_prepared = mutations_prepared.apply(lambda column: column.map(transform_label))

In [128]:
# Order rows by cell-line ID so all modalities share the same row order.
mutations_prepared = mutations_prepared.sort_index()

# Flatten the (Chrom, Pos, Ref, Alt) column tuples into single names such as
# 'chr1_935889_A_C'.
new_column_names = [
    '_'.join(str(part) for part in col)
    for col in mutations_prepared.columns
]
mutations_prepared.columns = new_column_names

mutations_prepared.to_csv('data/gdsc/cisplatin/mutations.csv')
mutations_prepared.head()

Unnamed: 0,chr1_935889_A_C,chr1_965180_A_C,chr1_974557_G_C,chr1_979866_T_G,chr1_999417_TC_GG,chr1_1046480_G_GC,chr1_1051505_T_G,chr1_1086003_C_T,chr1_1206512_G_A,chr1_1242864_GC_CT,...,chrY_9549396_G_T,chrY_12726767_C_T,chrY_12790481_G_A,chrY_12842354_T_C,chrY_13355944_A_G,chrY_14723127_A_G,chrY_14824201_C_T,chrY_15551598_G_T,chrY_19735435_T_C,chrY_26587559_G_A
ACH-000001,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
ACH-000002,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
ACH-000004,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,1|1,0|0,1|1,0|0,0|0,0|0
ACH-000006,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
ACH-000007,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [129]:
# One-hot encode the genotype classes of every variant column.
# drop_first=True removes the first category of each column, so a variant is
# described by indicator columns for its remaining classes only.
# dtype is pinned because pandas >= 2.0 returns bool dummies by default, while
# older versions return uint8; this keeps 0/1 integer indicators (as in the
# recorded output and the exported CSV) on every pandas version.
mutations_prepared_one_hot = pd.get_dummies(
    mutations_prepared,
    prefix_sep='_',
    drop_first=True,
    dtype='uint8',
)
mutations_prepared_one_hot.to_csv('data/gdsc/cisplatin/mutations-one-hot.csv')

mutations_prepared_one_hot.head()

Unnamed: 0,chr1_935889_A_C_0|1,chr1_965180_A_C_0|1,chr1_974557_G_C_0|1,chr1_979866_T_G_0|1,chr1_999417_TC_GG_0|1,chr1_1046480_G_GC_0|1,chr1_1051505_T_G_0|1,chr1_1086003_C_T_0|1,chr1_1086003_C_T_1|1,chr1_1206512_G_A_0|1,...,chrY_13355944_A_G_0|1,chrY_13355944_A_G_1|1,chrY_14723127_A_G_1|1,chrY_14824201_C_T_0|1,chrY_14824201_C_T_1|1,chrY_15551598_G_T_0|1,chrY_15551598_G_T_1|1,chrY_19735435_T_C_0|1,chrY_19735435_T_C_1|1,chrY_26587559_G_A_1|1
ACH-000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000004,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
ACH-000006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Copy Number - делаем стандартную предобработку: выбрасываем высокодисперсные признаки, признаки с нулевой дисперсией и полностью нулевые, применяем стандартизацию. Сортируем индекс

In [130]:
# Order rows by cell-line ID, then run the shared project preprocessing helper.
# NOTE(review): preprocess_data_for_mofa is defined in src.mofa_utils; its exact
# filtering/scaling steps are not visible here.
copy_number_df_common = copy_number_df_common.sort_index(axis=0)
copy_number_prepared = preprocess_data_for_mofa(copy_number_df_common)

copy_number_prepared.to_csv('data/gdsc/cisplatin/copy-number.csv')
copy_number_prepared.head(5)

Unnamed: 0,WASH9P (102723897),MIR6859-2 (102465909),OR4F29 (729759),AL669831.3 (101928626),OR4F16 (81399),LINC01409 (105378580),FAM87B (400728),LINC01128 (643837),LINC00115 (79854),AL669831.7 (107984850),...,F8A2 (474383),MIR1184-2 (100422985),F8A3 (474384),MIR1184-3 (100422977),H2AB3 (83740),TMLHE-AS1 (100507404),TMLHE (55217),SPRY3 (10251),VAMP7 (6845),IL9R (3581)
ACH-000001,2.327387,2.297834,2.269431,2.604793,2.390284,2.70157,2.703685,2.709489,2.703685,2.710027,...,-0.231426,-0.231426,-0.231426,-0.231426,-0.231426,-0.231426,-0.2295,-0.223739,-0.230811,-0.219968
ACH-000002,0.11307,0.116412,0.180748,0.185584,0.158814,0.198621,0.196568,0.195278,0.196568,0.194778,...,-1.039235,-1.039235,-1.039235,-1.039235,-1.039235,-1.039235,-1.039124,-1.031197,-0.990086,-0.978734
ACH-000004,1.873567,1.850756,1.841359,2.108981,1.932948,2.188596,2.189857,2.194206,2.189857,2.194531,...,-1.581348,-1.581348,-1.581348,-1.581348,-1.581348,-1.581348,-1.582454,-1.573074,-1.499628,-1.487934
ACH-000006,0.232484,0.234053,0.293387,0.316048,0.279153,0.333601,0.331773,0.330865,0.331773,0.330421,...,-1.057338,-1.057338,-1.057338,-1.057338,-1.057338,-1.057338,-1.057268,-1.049293,-1.007102,-0.995738
ACH-000007,-0.116733,-0.109977,-0.036017,-0.065483,-0.07277,-0.061136,-0.063622,-0.065649,-0.063622,-0.066257,...,-1.024087,-1.024087,-1.024087,-1.024087,-1.024087,-1.024087,-1.023942,-1.016056,-0.975848,-0.964505


Expression - делаем стандартную предобработку: выбрасываем высокодисперсные признаки, признаки с нулевой дисперсией и полностью нулевые, применяем стандартизацию. Сортируем индекс

In [131]:
# Order rows by cell-line ID, then run the shared project preprocessing helper.
# NOTE(review): preprocess_data_for_mofa is defined in src.mofa_utils; its exact
# filtering/scaling steps are not visible here.
expression_df_common = expression_df_common.sort_index(axis=0)
expression_df_prepared = preprocess_data_for_mofa(expression_df_common)
expression_df_prepared.to_csv('data/gdsc/cisplatin/expression.csv')

expression_df_prepared.head(5)

Unnamed: 0,DPM1 (8813),SCYL3 (57147),C1orf112 (55732),GCLC (2729),NFYA (4800),NIPAL3 (57185),LAS1L (81887),ANKIB1 (54467),CYP51A1 (1595),KRIT1 (889),...,EEF1AKMT4 (110599564),TBCE (6905),AP000812.4 (220074),AC093512.2 (112694756),ARHGAP11B (89839),ABCF2-H2BE1 (114483834),POLR2J3 (548644),DERPC (113455421),AP001453.3 (114841035),DUS4L-BCAP29 (115253422)
ACH-000001,1.641226,-0.552143,0.572658,-0.102124,-0.638458,-0.612718,0.279548,0.15272,-0.218935,-0.212063,...,0.805509,0.44612,-0.23454,-1.986296,-0.167756,-1.052911,-0.815528,0.20738,0.83151,-0.449274
ACH-000002,-1.337799,-2.058465,-0.986219,-0.598595,-2.53091,-0.894489,-0.420378,-3.826429,-2.305144,-1.403615,...,0.733671,-0.554542,-0.373646,-0.967497,0.080618,-2.60858,-1.59407,1.008256,0.772881,-0.857331
ACH-000004,-1.748603,0.093754,0.152388,0.073865,0.891333,0.921307,-0.503105,-1.428801,-2.149849,0.131533,...,0.625247,0.220097,-0.518393,-0.363157,1.023162,-0.042969,0.878563,0.850104,-0.186497,0.792687
ACH-000006,-0.370631,0.584615,0.537108,0.037109,1.064696,-0.261253,1.045273,-0.861881,0.208629,0.818548,...,0.656845,0.490049,-0.747151,0.653187,2.363263,0.750854,0.356389,-0.51997,0.001876,0.674626
ACH-000007,-0.914525,1.218887,-0.137635,0.154078,-0.532662,0.092888,-0.951995,-0.909038,0.660057,0.091397,...,0.577694,-0.289902,-1.048244,-0.557306,0.415959,-0.107154,0.00495,-1.827315,0.888717,0.118009


И наконец делаем отдельную таблицу со всеми модальностями в формате MOFA

In [134]:
# Convert each prepared modality with the project helper, passing the view
# label as the second argument, then stack the results into one table.
# NOTE(review): the expression view is labelled 'DNA' — confirm this is the
# intended view name for expression data.
expression_mofa = transform_df_for_mofa(expression_df_prepared, 'DNA')
copy_number_mofa = transform_df_for_mofa(copy_number_prepared, 'Copy Number')
mutatations_mofa = transform_df_for_mofa(mutations_prepared_one_hot, 'Mutations')

# NOTE(review): the row index is not reset on concatenation, so index labels
# presumably repeat across views — confirm that readers of the CSV ignore it.
mofa_views = [expression_mofa, copy_number_mofa, mutatations_mofa]
combined_gdsc_mofa = pd.concat(mofa_views, axis=0)

combined_gdsc_mofa.to_csv('data/gdsc/cisplatin/gdsc_mofa.csv')

In [135]:
# Display the combined long-format table (columns: sample, feature, value, view).
combined_gdsc_mofa

Unnamed: 0,sample,feature,value,view
0,ACH-000001,DPM1 (8813),1.641226,DNA
1,ACH-000002,DPM1 (8813),-1.337799,DNA
2,ACH-000004,DPM1 (8813),-1.748603,DNA
3,ACH-000006,DPM1 (8813),-0.370631,DNA
4,ACH-000007,DPM1 (8813),-0.914525,DNA
...,...,...,...,...
9449955,ACH-002059,chrY_26587559_G_A_1|1,0.000000,Mutations
9449956,ACH-002062,chrY_26587559_G_A_1|1,0.000000,Mutations
9449957,ACH-002067,chrY_26587559_G_A_1|1,0.000000,Mutations
9449958,ACH-002077,chrY_26587559_G_A_1|1,0.000000,Mutations
