In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append("..")

from src.tcga_preprocess_utils import prepare_data, choose_common_patients, prepare_survival_data, prepare_clinical_data
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa
from src.constants import RANDOM_STATE, CANCER_SUBTYPE_FEATURE

Делаем общую предобработку для всех моделей: отбираем только тех пациентов, для которых есть данные по выживаемости / подтипу рака; убираем признаки у которых все значения - нули, признаки с нулевой дисперсией и самые высокодисперсные; применяем стандартизацию. Сохраняем каждую модальность отдельно

In [2]:
survival_data_breast = prepare_survival_data('data/breast/survival')
survival_data_breast

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.3c.aaau,4047.0,0.0
tcga.3c.aali,4005.0,0.0
tcga.3c.aalj,1474.0,0.0
tcga.3c.aalk,1448.0,0.0
tcga.4h.aaak,348.0,0.0
...,...,...
tcga.wt.ab44,883.0,0.0
tcga.xx.a899,467.0,0.0
tcga.xx.a89a,488.0,0.0
tcga.z7.a8r5,3287.0,0.0


In [25]:
prepare_data('data/breast/mirna')

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-3,hsa-mir-941-4,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
tcga.3c.aaau,8962.996542,17779.575039,9075.200383,24749.898857,341.298400,406.164781,1470.179650,14.716795,3627.642977,387.417272,...,0.0,0.0,5.530515,0.187475,2.062226,4.124452,119.984057,53.992826,130.201449,46548.939810
tcga.3c.aali,7739.739862,15524.941906,7713.626636,23374.640471,801.487258,513.297924,560.962427,20.922042,6557.093894,350.955461,...,0.0,0.0,8.180047,0.000000,0.629234,1.258469,60.249189,86.047798,236.434808,12644.149725
tcga.3c.aalj,8260.612670,16497.981335,8355.342958,10957.355911,635.811272,620.351816,2694.331127,39.799878,11830.760394,600.725980,...,0.0,0.0,3.618171,0.000000,0.767491,1.644623,97.252043,117.645369,191.434123,33083.456616
tcga.3c.aalk,9056.241254,18075.168478,9097.666150,26017.522731,2919.348415,334.245155,1322.434475,17.866463,6438.725384,354.957604,...,0.0,0.0,3.478426,0.000000,3.478426,1.739213,72.572624,41.583007,1046.690127,24067.232290
tcga.4h.aaak,10897.303665,21822.338727,10963.956320,22204.253575,3313.009950,350.615669,1711.886682,22.541895,8246.117280,333.425447,...,0.0,0.0,2.108235,0.000000,1.135203,0.810860,19.947145,34.380445,1081.037952,25715.275426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tcga.wt.ab44,10628.975280,21125.108661,10585.686678,23396.813364,3892.051211,367.141461,1484.663795,23.402901,10570.535667,571.680109,...,0.0,0.0,1.217492,0.000000,0.405831,1.217492,79.813361,57.627952,1100.883277,16338.471420
tcga.xx.a899,16799.785282,33603.904432,16883.338223,20731.006597,5263.331356,201.676038,2173.283559,36.888271,18227.341203,870.301142,...,0.0,0.0,5.341744,0.000000,3.124416,2.318115,16.629958,57.348159,1919.601107,14080.736733
tcga.xx.a89a,13120.807001,26337.935723,13229.425112,18796.895124,6581.549565,375.598820,2547.029500,28.505268,16838.042944,778.398745,...,0.0,0.0,1.863089,0.000000,0.558927,0.931545,41.919511,54.215901,1310.124456,17072.605898
tcga.z7.a8r5,7979.531224,16006.280243,8106.687917,20462.010937,4040.296936,295.594442,962.166120,23.885025,7625.121634,428.411748,...,0.0,0.0,2.070956,0.000000,2.209020,1.656765,55.225491,53.016472,1120.939408,18696.866174


In [3]:
expression_data_breast = prepare_data('data/breast/exp', survival_data_breast.index)
expression_data_breast = preprocess_data_for_mofa(expression_data_breast)

methylation_data_breast = prepare_data('data/breast/methy', survival_data_breast.index)
methylation_data_breast = preprocess_data_for_mofa(methylation_data_breast)

mirna_data_breast = prepare_data('data/breast/mirna', survival_data_breast.index)
mirna_data_breast = preprocess_data_for_mofa(mirna_data_breast)

survival_data_breast, expression_data_breast, methylation_data_breast, mirna_data_breast = choose_common_patients(
    [survival_data_breast, expression_data_breast, methylation_data_breast, mirna_data_breast])

survival_data_breast.to_csv('data/breast/survival_data_breast.csv')
expression_data_breast.to_csv('data/breast/expression_data_breast.csv')
methylation_data_breast.to_csv('data/breast/methylation_data_breast.csv')
mirna_data_breast.to_csv('data/breast/mirna_data_breast.csv')

Почти то же самое, но только когда таргет - гистологический тип, а так же нужно удалить те группы, размер которых меньше их количества, чтобы была возможна стратификация по таргету

In [19]:
clinical_data = prepare_clinical_data('data/clinical/breast')
clinical_data[CANCER_SUBTYPE_FEATURE].value_counts()

Infiltrating Ductal Carcinoma       784
Infiltrating Lobular Carcinoma      203
Other, specify                       46
Mixed Histology (please specify)     30
Mucinous Carcinoma                   17
Metaplastic Carcinoma                 9
Medullary Carcinoma                   6
Infiltrating Carcinoma NOS            1
Name: histological_type, dtype: int64

In [20]:
clinical_data = clinical_data.groupby(CANCER_SUBTYPE_FEATURE).filter(lambda x: len(x) >= 7)

In [21]:
expression_data_breast_for_subtype_pred = prepare_data(
    'data/breast/exp', clinical_data.index)
expression_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    expression_data_breast_for_subtype_pred)

methylation_data_breast_for_subtype_pred = prepare_data(
    'data/breast/methy', clinical_data.index)
methylation_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    methylation_data_breast_for_subtype_pred)

mirna_data_breast_for_subtype_pred = prepare_data(
    'data/breast/mirna', clinical_data.index)
mirna_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    mirna_data_breast_for_subtype_pred)

clinical_data_breast_for_subtype_pred, expression_data_breast_for_subtype_pred, methylation_data_breast_for_subtype_pred, mirna_data_breast_for_subtype_pred = choose_common_patients(
    [clinical_data, expression_data_breast_for_subtype_pred, methylation_data_breast_for_subtype_pred, mirna_data_breast_for_subtype_pred])

expression_data_breast_for_subtype_pred.to_csv(
    'data/breast/expression_data_breast_for_subtype_pred.csv')
methylation_data_breast_for_subtype_pred.to_csv(
    'data/breast/methylation_data_breast_for_subtype_pred.csv')
mirna_data_breast_for_subtype_pred.to_csv(
    'data/breast/mirna_data_breast_for_subtype_pred.csv')

clinical_data_breast_for_subtype_pred.to_csv(
    'data/breast/clinical_data_breast_for_subtype_pred.csv')

Трансформируем в общий датафрейм в формате MOFA 

In [4]:
transformed_expression_data_breast = transform_df_for_mofa(
    expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(
    methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(
    mirna_data_breast.copy(), 'RNA')
combined_data_breast = pd.concat([transformed_expression_data_breast,
                                  transformed_methylation_breast, transformed_mirna_data_breast], axis=0)

combined_data_breast.to_csv('data/breast/mofa_data_breast.csv')
combined_data_breast

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.042741,DNA
1,tcga.3c.aali,?|10357,-0.756164,DNA
2,tcga.3c.aalj,?|10357,0.620028,DNA
3,tcga.3c.aalk,?|10357,0.450196,DNA
4,tcga.4h.aaak,?|10357,-0.520822,DNA
...,...,...,...,...
112835,tcga.wt.ab44,hsa-mir-99a,0.353295,RNA
112836,tcga.xx.a899,hsa-mir-99a,1.334040,RNA
112837,tcga.xx.a89a,hsa-mir-99a,0.603946,RNA
112838,tcga.z7.a8r5,hsa-mir-99a,0.377321,RNA


То же самое для данных, где таргет - гистологический тип

In [22]:
transformed_expression_data_breast_for_subtype_pred = transform_df_for_mofa(
    expression_data_breast_for_subtype_pred.copy(), 'DNA')
transformed_methylation_breast_for_subtype_pred = transform_df_for_mofa(
    methylation_data_breast_for_subtype_pred.copy(), 'Methylation')
transformed_mirna_data_breast_for_subtype_pred = transform_df_for_mofa(
    mirna_data_breast_for_subtype_pred.copy(), 'RNA')

combined_data_breast_for_subtype_pred = pd.concat([transformed_expression_data_breast_for_subtype_pred,
                                                   transformed_methylation_breast_for_subtype_pred, transformed_mirna_data_breast_for_subtype_pred], axis=0)

combined_data_breast_for_subtype_pred.to_csv(
    'data/breast/mofa_data_breast_for_subtype_pred.csv')
combined_data_breast_for_subtype_pred

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.046201,DNA
1,tcga.3c.aali,?|10357,-0.757116,DNA
2,tcga.3c.aalj,?|10357,0.631118,DNA
3,tcga.3c.aalk,?|10357,0.459800,DNA
4,tcga.4h.aaak,?|10357,-0.519715,DNA
...,...,...,...,...
111743,tcga.wt.ab44,hsa-mir-99a,0.350831,RNA
111744,tcga.xx.a899,hsa-mir-99a,1.328710,RNA
111745,tcga.xx.a89a,hsa-mir-99a,0.600750,RNA
111746,tcga.z7.a8r5,hsa-mir-99a,0.374787,RNA


Теперь для данных по раку почек, за исключением немного дополненной очистки данных, но не полностью, так как пока эти данные не используются

In [6]:
survival_data_kidney = prepare_survival_data('data/kidney/survival', id_sep='-')
survival_data_kidney.head()

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.3z.a93z,385.0,0.0
tcga.6d.aa2e,362.0,0.0
tcga.a3.3306,1120.0,0.0
tcga.a3.3307,1436.0,0.0
tcga.a3.3308,16.0,0.0


In [6]:
expression_data_kidney = prepare_data('data/kidney/exp', survival_data_kidney.index)
expression_data_kidney = preprocess_data_for_mofa(expression_data_kidney)

methylation_data_kidney = prepare_data('data/kidney/methy', survival_data_kidney.index)
methylation_data_kidney = preprocess_data_for_mofa(methylation_data_kidney)

mirna_data_kidney = prepare_data('data/kidney/mirna', survival_data_kidney.index)
mirna_data_kidney = preprocess_data_for_mofa(mirna_data_kidney)

survival_data_kidney, expression_data_kidney, methylation_data_kidney, mirna_data_kidney = choose_common_patients(
    [survival_data_kidney, expression_data_kidney, methylation_data_kidney, mirna_data_kidney])

survival_data_kidney.to_csv('data/kidney/survival_data_kidney.csv')
expression_data_kidney.to_csv('data/kidney/expression_data_kidney.csv')
methylation_data_kidney.to_csv('data/kidney/methylation_data_kidney.csv')
mirna_data_kidney.to_csv('data/kidney/mirna_data_kidney.csv')

In [7]:
expression_data_kidney

Unnamed: 0,?|10431,?|155060,?|57714,?|8225,A1BG|1,A2LD1|87769,A4GALT|53947,AAAS|8086,AACS|65985,AADAT|51166,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590
tcga.3z.a93z,-0.549493,-0.391412,-0.888968,0.215268,0.302785,-0.463168,2.169632,-0.025186,-0.146019,-0.730892,...,0.002808,-0.387239,-1.292603,-1.345999,-1.614691,-1.169082,-0.374467,-1.023092,-1.140734,-0.230621
tcga.6d.aa2e,1.889997,-0.216696,-0.957456,1.379341,-0.095197,0.842429,-0.333308,-0.159761,2.840645,0.435272,...,-1.015834,-1.142749,-0.835688,-1.305215,0.069022,-0.379610,-0.705230,0.891328,1.275252,1.768646
tcga.a3.3358,-0.463598,-0.411836,0.285120,-0.315057,-0.110232,0.158915,-0.480158,0.435642,-0.157068,-0.585498,...,-0.192913,0.158295,0.544916,-0.300444,-0.249859,0.037484,-0.587609,0.065306,0.833653,-0.439251
tcga.a3.3387,0.030597,0.173887,0.253526,-0.628067,-0.080006,-0.399059,-0.748547,-0.273866,-0.212709,-0.555677,...,1.166093,0.291422,0.429694,-0.232010,1.131407,-0.256120,0.057511,0.638138,1.197214,-0.351663
tcga.a3.a6ni,-0.076982,-0.045054,-0.125749,1.705175,-0.045627,-0.210988,0.697161,0.787205,1.398619,-0.131601,...,-0.456537,-0.118492,-0.330311,-0.601206,1.154269,-0.755156,-0.041402,0.745577,-0.610319,-0.045568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tcga.mm.a563,-0.271866,1.512331,-0.878810,0.665314,0.006693,0.072381,0.268651,0.518977,-0.614947,0.160143,...,0.141469,0.492466,-0.994216,-1.046597,0.108669,-0.477513,0.509783,-0.073631,-0.497594,-0.016846
tcga.mm.a564,-0.255402,4.139697,-1.441793,0.631147,-0.063910,-0.409252,0.994533,0.690185,-1.372453,0.779027,...,-0.725363,-0.967519,-1.112824,-0.627336,-1.568025,-0.913615,-0.497450,-1.063581,-0.557021,-0.300898
tcga.mm.a84u,1.703824,0.407407,-0.937617,-0.159475,0.035775,-0.162910,1.453862,0.437693,-0.003824,-0.475029,...,-0.340248,-0.131260,-1.187160,-1.284242,-0.787219,-1.014723,0.697904,-0.082626,-0.854342,4.287010
tcga.mw.a4ec,-0.493848,0.115885,-1.179917,0.686648,0.042471,0.672391,0.538725,-0.248294,-0.131953,0.102036,...,-0.702823,-0.144664,-0.323799,-0.372489,-1.505750,-1.017401,-0.222418,-0.454334,-1.177500,-0.359585


Для задачи определения типа рака - выделяем общие признаки у breast и kidney, создаем общую таблицу для MOFA

In [8]:
common_genes = set(expression_data_breast.columns).intersection(expression_data_kidney.columns)
print('Кол-во общих генов: ', len(common_genes))
common_meth = set(methylation_data_breast.columns).intersection(methylation_data_kidney.columns)
print('Кол-во общих метил: ', len(common_meth))
common_mirna = set(mirna_data_breast.columns).intersection(mirna_data_kidney.columns)
print('Кол-во общих мрнк: ', len(common_mirna))
print('Кол-во общих пациентов: ', len(set(expression_data_breast.index).intersection(expression_data_kidney.index)))

Кол-во общих генов:  1164
Кол-во общих метил:  129
Кол-во общих мрнк:  169
Кол-во общих пациентов:  0


In [9]:
expression_data_breast,  expression_data_kidney = expression_data_breast[
    common_genes], expression_data_kidney[common_genes]
methylation_data_breast, methylation_data_kidney = methylation_data_breast[
    common_meth], methylation_data_kidney[common_meth]
mirna_data_breast, mirna_data_kidney = mirna_data_breast[
    common_mirna], mirna_data_kidney[common_mirna]

In [10]:
transformed_expression_data_breast = transform_df_for_mofa(
    expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(
    methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(
    mirna_data_breast.copy(), 'RNA')
combined_data_breast = pd.concat([transformed_expression_data_breast,
                                  transformed_methylation_breast, transformed_mirna_data_breast], axis=0)
combined_data_breast['group'] = 'group_0'

transformed_expression_data_kidney = transform_df_for_mofa(
    expression_data_kidney.copy(), 'DNA')
transformed_methylation_kidney  = transform_df_for_mofa(
    methylation_data_kidney.copy(), 'Methylation')
transformed_mirna_data_kidney  = transform_df_for_mofa(
    mirna_data_kidney.copy(), 'RNA')
combined_data_kidney = pd.concat([transformed_expression_data_kidney,
                                  transformed_methylation_kidney, transformed_mirna_data_kidney], axis=0)
combined_data_kidney['group'] = 'group_1'

combined_data = pd.concat([combined_data_breast, combined_data_kidney], axis=0)
scaler = MinMaxScaler()
combined_data['value'] = scaler.fit_transform(combined_data[['value']])

combined_data

Unnamed: 0,sample,feature,value,view,group
0,tcga.3c.aaau,BASP1|10409,0.128890,DNA,group_0
1,tcga.3c.aali,BASP1|10409,0.146894,DNA,group_0
2,tcga.3c.aalj,BASP1|10409,0.227582,DNA,group_0
3,tcga.3c.aalk,BASP1|10409,0.157439,DNA,group_0
4,tcga.4h.aaak,BASP1|10409,0.176092,DNA,group_0
...,...,...,...,...,...
30584,tcga.mm.a563,hsa-mir-16-1,0.148144,RNA,group_1
30585,tcga.mm.a564,hsa-mir-16-1,0.142032,RNA,group_1
30586,tcga.mm.a84u,hsa-mir-16-1,0.136779,RNA,group_1
30587,tcga.mw.a4ec,hsa-mir-16-1,0.141107,RNA,group_1


In [11]:
combined_data.to_csv('data/tsga_breast_kidney_for_mofa.csv')

Сохраняем все предобработанные группы данных c общими генами

In [12]:
views = ['DNA', 'Methylation', 'RNA']
groups = ['group_0', 'group_1']

for group in groups:
    for view in views:
        df_unmelted = combined_data[(combined_data.view == view) & (combined_data.group == group)].drop(columns=['view', 'group']).pivot(index='sample', columns='feature')
        df_unmelted = df_unmelted['value'].reset_index()
        df_unmelted.columns.name = None
        group_name = 'breast' if group == 'group_0' else 'kidney'
        df_unmelted.to_csv(f'data/{group_name}/{view}_{group_name}_common_features.csv')

Также обрабатываем и сохраняем клинические даные