In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append("..")

from src.tcga_preprocess_utils import prepare_data, choose_common_patients, prepare_survival_data
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa
from src.constants import RANDOM_STATE

Делаем общую предобработку для всех моделей: отбираем только тех пациентов, для которых есть данные по выживаемости; убираем признаки у которых все значения - нули, признаки с нулевой дисперсией и самые высокодисперсные; применяем стандартизацию. Сохраняем каждую модальность отдельно

In [25]:
survival_data_breast = prepare_survival_data('data/breast/survival')
survival_data_breast.head()

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.ol.a66h,812.0,0.0
tcga.3c.aalk,1448.0,0.0
tcga.ar.a1ah,3807.0,0.0
tcga.ac.a5eh,511.0,0.0
tcga.ew.a2fw,672.0,0.0


In [26]:
expression_data_breast = prepare_data('data/breast/exp', survival_data_breast.index)
expression_data_breast = preprocess_data_for_mofa(expression_data_breast)

methylation_data_breast = prepare_data('data/breast/methy', survival_data_breast.index)
methylation_data_breast = preprocess_data_for_mofa(methylation_data_breast)

mirna_data_breast = prepare_data('data/breast/mirna', survival_data_breast.index)
mirna_data_breast = preprocess_data_for_mofa(mirna_data_breast)

survival_data_breast, expression_data_breast, methylation_data_breast, mirna_data_breast = choose_common_patients(
    [survival_data_breast, expression_data_breast, methylation_data_breast, mirna_data_breast])

survival_data_breast.to_csv('data/breast/survival_data_breast.csv')
expression_data_breast.to_csv('data/breast/expression_data_breast.csv')
methylation_data_breast.to_csv('data/breast/methylation_data_breast.csv')
mirna_data_breast.to_csv('data/breast/mirna_data_breast.csv')

In [34]:
expression_data_breast

Unnamed: 0,?|10357,?|10431,?|155060,?|57714,?|653553,?|8225,A1BG|1,A2LD1|87769,A4GALT|53947,AAAS|8086,...,C17orf42|79736,C17orf44|284029,C17orf48|56985,C17orf49|124944,C17orf51|339263,C17orf53|78995,C17orf56|146705,C17orf57|124989,C17orf58|284018,C17orf59|54785
tcga.3c.aaau,-1.042741,-1.469611,4.905180,-0.079635,-0.283610,-0.245277,0.016729,0.133848,-0.866425,-0.336109,...,0.140471,0.692837,-0.879200,-0.781063,0.509753,0.071321,1.684896,-0.037832,-0.274853,-0.049517
tcga.3c.aali,-0.756164,-1.134796,1.452860,-0.219104,-0.036897,0.429875,0.217103,-0.438575,-0.595285,0.739018,...,-0.545562,-0.464121,-0.896319,-0.209870,-0.369968,0.312433,3.409759,0.998973,6.653162,0.601862
tcga.3c.aalj,0.620028,0.577640,1.843803,-0.612154,0.965641,0.351226,1.141290,1.173464,0.666249,-0.002081,...,-0.698915,-0.506104,-0.891959,-0.080649,-0.148268,-0.136716,2.743670,-0.320084,0.364310,0.678228
tcga.3c.aalk,0.450196,-0.486638,1.641744,-0.529859,-0.103758,0.738735,-0.013464,-0.587613,0.461720,0.239011,...,-1.140635,-0.756857,-0.008866,-0.006844,-0.215736,-0.770094,2.210296,-0.390151,1.024533,-0.504080
tcga.4h.aaak,-0.520822,-0.699694,0.139825,-0.540581,-0.586631,0.495877,0.373725,1.050593,-0.036247,0.428913,...,0.602491,-0.315387,0.292861,0.006581,-0.099668,-0.193647,1.054093,1.165472,-0.105951,-0.451738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tcga.wt.ab44,-1.203667,1.687656,2.422749,-0.710010,1.616320,-0.213024,1.380219,-0.601459,0.008075,2.444485,...,0.424327,0.951437,2.096385,1.465707,-0.164237,-0.696548,2.131348,-0.475877,0.730159,
tcga.xx.a899,0.351083,-0.943852,0.995565,-0.279016,0.079985,0.191197,0.146670,0.637900,0.458989,-0.075941,...,-0.374396,0.186234,1.787286,-0.292468,0.218853,-0.406883,1.206433,1.368660,-0.667216,
tcga.xx.a89a,-0.847527,-0.793514,0.734753,-0.636260,0.040936,0.492178,0.305369,-0.276926,0.792356,0.801031,...,-0.657879,0.177535,0.828008,-0.322122,0.382840,-0.407048,0.361603,-0.332805,-0.744002,
tcga.z7.a8r5,-0.507257,0.103722,0.094635,-0.954967,-0.301723,1.127923,1.222374,-0.252461,0.309977,1.439708,...,0.574358,1.230961,3.552271,2.732227,-0.075169,-0.806529,1.633640,-0.620726,-0.391855,


Трансформируем в общий датафрейм в формате MOFA 

In [27]:
transformed_expression_data_breast = transform_df_for_mofa(
    expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(
    methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(
    mirna_data_breast.copy(), 'RNA')
combined_data_breast = pd.concat([transformed_expression_data_breast,
                                  transformed_methylation_breast, transformed_mirna_data_breast], axis=0)

combined_data_breast.to_csv('data/breast/mofa_data_breast.csv')
combined_data_breast

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.042741,DNA
1,tcga.3c.aali,?|10357,-0.756164,DNA
2,tcga.3c.aalj,?|10357,0.620028,DNA
3,tcga.3c.aalk,?|10357,0.450196,DNA
4,tcga.4h.aaak,?|10357,-0.520822,DNA
...,...,...,...,...
112835,tcga.wt.ab44,hsa-mir-99a,0.353295,RNA
112836,tcga.xx.a899,hsa-mir-99a,1.334040,RNA
112837,tcga.xx.a89a,hsa-mir-99a,0.603946,RNA
112838,tcga.z7.a8r5,hsa-mir-99a,0.377321,RNA


То же самое для данных по раку почек, за исключением немного дополненной очистки данных

In [28]:
survival_data_kidney = prepare_survival_data('data/kidney/survival', id_sep='-')
survival_data_kidney.head()

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.3z.a93z,385.0,0.0
tcga.6d.aa2e,362.0,0.0
tcga.a3.3306,1120.0,0.0
tcga.a3.3307,1436.0,0.0
tcga.a3.3308,16.0,0.0


In [30]:
expression_data_kidney = prepare_data('data/kidney/exp', survival_data_kidney.index)
expression_data_kidney = preprocess_data_for_mofa(expression_data_kidney)

methylation_data_kidney = prepare_data('data/kidney/methy', survival_data_kidney.index)
methylation_data_kidney = preprocess_data_for_mofa(methylation_data_kidney)

mirna_data_kidney = prepare_data('data/kidney/mirna', survival_data_kidney.index)
mirna_data_kidney = preprocess_data_for_mofa(mirna_data_kidney)

survival_data_kidney, expression_data_kidney, methylation_data_kidney, mirna_data_kidney = choose_common_patients(
    [survival_data_kidney, expression_data_kidney, methylation_data_kidney, mirna_data_kidney])

survival_data_kidney.to_csv('data/kidney/survival_data_kidney.csv')
expression_data_kidney.to_csv('data/kidney/expression_data_kidney.csv')
methylation_data_kidney.to_csv('data/kidney/methylation_data_kidney.csv')
mirna_data_kidney.to_csv('data/kidney/mirna_data_kidney.csv')

In [35]:
expression_data_kidney

Unnamed: 0,?|10431,?|155060,?|57714,?|8225,A1BG|1,A2LD1|87769,A4GALT|53947,AAAS|8086,AACS|65985,AADAT|51166,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590
tcga.3z.a93z,-0.549493,-0.391412,-0.888968,0.215268,0.302785,-0.463168,2.169632,-0.025186,-0.146019,-0.730892,...,0.002808,-0.387239,-1.292603,-1.345999,-1.614691,-1.169082,-0.374467,-1.023092,-1.140734,-0.230621
tcga.6d.aa2e,1.889997,-0.216696,-0.957456,1.379341,-0.095197,0.842429,-0.333308,-0.159761,2.840645,0.435272,...,-1.015834,-1.142749,-0.835688,-1.305215,0.069022,-0.379610,-0.705230,0.891328,1.275252,1.768646
tcga.a3.3358,-0.463598,-0.411836,0.285120,-0.315057,-0.110232,0.158915,-0.480158,0.435642,-0.157068,-0.585498,...,-0.192913,0.158295,0.544916,-0.300444,-0.249859,0.037484,-0.587609,0.065306,0.833653,-0.439251
tcga.a3.3387,0.030597,0.173887,0.253526,-0.628067,-0.080006,-0.399059,-0.748547,-0.273866,-0.212709,-0.555677,...,1.166093,0.291422,0.429694,-0.232010,1.131407,-0.256120,0.057511,0.638138,1.197214,-0.351663
tcga.a3.a6ni,-0.076982,-0.045054,-0.125749,1.705175,-0.045627,-0.210988,0.697161,0.787205,1.398619,-0.131601,...,-0.456537,-0.118492,-0.330311,-0.601206,1.154269,-0.755156,-0.041402,0.745577,-0.610319,-0.045568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tcga.mm.a563,-0.271866,1.512331,-0.878810,0.665314,0.006693,0.072381,0.268651,0.518977,-0.614947,0.160143,...,0.141469,0.492466,-0.994216,-1.046597,0.108669,-0.477513,0.509783,-0.073631,-0.497594,-0.016846
tcga.mm.a564,-0.255402,4.139697,-1.441793,0.631147,-0.063910,-0.409252,0.994533,0.690185,-1.372453,0.779027,...,-0.725363,-0.967519,-1.112824,-0.627336,-1.568025,-0.913615,-0.497450,-1.063581,-0.557021,-0.300898
tcga.mm.a84u,1.703824,0.407407,-0.937617,-0.159475,0.035775,-0.162910,1.453862,0.437693,-0.003824,-0.475029,...,-0.340248,-0.131260,-1.187160,-1.284242,-0.787219,-1.014723,0.697904,-0.082626,-0.854342,4.287010
tcga.mw.a4ec,-0.493848,0.115885,-1.179917,0.686648,0.042471,0.672391,0.538725,-0.248294,-0.131953,0.102036,...,-0.702823,-0.144664,-0.323799,-0.372489,-1.505750,-1.017401,-0.222418,-0.454334,-1.177500,-0.359585


Для задачи определения типа рака - выделяем общие признаки у breast и kidney, создаем общую таблицу для MOFA

In [36]:
common_genes = set(expression_data_breast.columns).intersection(expression_data_kidney.columns)
print('Кол-во общих генов: ', len(common_genes))
common_meth = set(methylation_data_breast.columns).intersection(methylation_data_kidney.columns)
print('Кол-во общих метил: ', len(common_meth))
common_mirna = set(mirna_data_breast.columns).intersection(mirna_data_kidney.columns)
print('Кол-во общих мрнк: ', len(common_mirna))
print('Кол-во общих пациентов: ', len(set(expression_data_breast.index).intersection(expression_data_kidney.index)))

Кол-во общих генов:  1164
Кол-во общих метил:  129
Кол-во общих мрнк:  169
Кол-во общих пациентов:  0


In [37]:
expression_data_breast,  expression_data_kidney = expression_data_breast[
    common_genes], expression_data_kidney[common_genes]
methylation_data_breast, methylation_data_kidney = methylation_data_breast[
    common_meth], methylation_data_kidney[common_meth]
mirna_data_breast, mirna_data_kidney = mirna_data_breast[
    common_mirna], mirna_data_kidney[common_mirna]

In [41]:
transformed_expression_data_breast = transform_df_for_mofa(
    expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(
    methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(
    mirna_data_breast.copy(), 'RNA')
combined_data_breast = pd.concat([transformed_expression_data_breast,
                                  transformed_methylation_breast, transformed_mirna_data_breast], axis=0)
combined_data_breast['group'] = 'group_0'

transformed_expression_data_kidney = transform_df_for_mofa(
    expression_data_kidney.copy(), 'DNA')
transformed_methylation_kidney  = transform_df_for_mofa(
    methylation_data_kidney.copy(), 'Methylation')
transformed_mirna_data_kidney  = transform_df_for_mofa(
    mirna_data_kidney.copy(), 'RNA')
combined_data_kidney = pd.concat([transformed_expression_data_kidney,
                                  transformed_methylation_kidney, transformed_mirna_data_kidney], axis=0)
combined_data_kidney['group'] = 'group_1'

combined_data = pd.concat([combined_data_breast, combined_data_kidney], axis=0)
scaler = MinMaxScaler()
combined_data['value'] = scaler.fit_transform(combined_data[['value']])

combined_data

Unnamed: 0,sample,feature,value,view,group
0,tcga.3c.aaau,ATXN10|25814,0.153021,DNA,group_0
1,tcga.3c.aali,ATXN10|25814,0.125107,DNA,group_0
2,tcga.3c.aalj,ATXN10|25814,0.119677,DNA,group_0
3,tcga.3c.aalk,ATXN10|25814,0.168294,DNA,group_0
4,tcga.4h.aaak,ATXN10|25814,0.167698,DNA,group_0
...,...,...,...,...,...
30584,tcga.mm.a563,hsa-mir-17,0.194593,RNA,group_1
30585,tcga.mm.a564,hsa-mir-17,0.274693,RNA,group_1
30586,tcga.mm.a84u,hsa-mir-17,0.142128,RNA,group_1
30587,tcga.mw.a4ec,hsa-mir-17,0.213486,RNA,group_1


In [42]:
combined_data.to_csv('data/tsga_breast_kidney_for_mofa.csv')

Сохраняем все предобработанные группы данных c общими генами

In [43]:
views = ['DNA', 'Methylation', 'RNA']
groups = ['group_0', 'group_1']

for group in groups:
    for view in views:
        df_unmelted = combined_data[(combined_data.view == view) & (combined_data.group == group)].drop(columns=['view', 'group']).pivot(index='sample', columns='feature')
        df_unmelted = df_unmelted['value'].reset_index()
        df_unmelted.columns.name = None
        group_name = 'breast' if group == 'group_0' else 'kidney'
        df_unmelted.to_csv(f'data/{group_name}/{view}_{group_name}_common_features.csv')