In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append("..")

from src.tcga_preprocess_utils import prepare_data, choose_common_patients, prepare_survival_data, prepare_clinical_data
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa
from src.constants import RANDOM_STATE, CANCER_SUBTYPE_FEATURE

Делаем общую предобработку для всех моделей: отбираем только тех пациентов, для которых есть данные по выживаемости / подтипу рака; убираем признаки, у которых все значения — нули, признаки с нулевой дисперсией и самые высокодисперсные; применяем стандартизацию. Каждую модальность сохраняем отдельно.

In [2]:
# Load the survival table for the breast cohort. The rendered output shows it
# indexed by PatientID with `Survival` and `Death` columns.
survival_data_breast = prepare_survival_data(
    'data/breast/survival'
)

# Leave the frame as the last expression so the notebook renders it.
survival_data_breast

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.3c.aaau,4047.0,0.0
tcga.3c.aali,4005.0,0.0
tcga.3c.aalj,1474.0,0.0
tcga.3c.aalk,1448.0,0.0
tcga.4h.aaak,348.0,0.0
...,...,...
tcga.wt.ab44,883.0,0.0
tcga.xx.a899,467.0,0.0
tcga.xx.a89a,488.0,0.0
tcga.z7.a8r5,3287.0,0.0


In [5]:
# Exploratory look at the raw, space-delimited expression file before any
# preprocessing (the rendered output shows features as rows and TCGA sample
# barcodes as columns).
# NOTE(review): this cell carries execution count In [5], which is out of
# sequence and repeated further down — confirm with Restart Kernel -> Run All.
pd.read_csv('data/breast/exp', delimiter=' ')

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.4H.AAAK.01,TCGA.5L.AAT0.01,TCGA.5L.AAT1.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SD.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
?|100130426,0.0000,0.0000,0.9066,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
?|100133144,16.3644,9.2659,11.6228,12.0894,6.8468,3.9889,0.0000,1.4644,15.3396,9.5178,...,0.3992,4.3126,0.0000,5.5624,0.0000,0.0000,14.3858,22.3240,2.2638,6.8865
?|100134869,12.9316,17.3790,9.2294,11.0799,14.4298,13.6090,10.5949,8.9958,14.3935,11.3241,...,14.3720,10.8828,3.0792,14.3711,6.3091,3.2580,21.4409,27.2744,7.2933,24.7795
?|10357,52.1503,69.7553,154.2974,143.8643,84.2128,114.2572,115.9984,107.5628,116.3870,60.2630,...,135.6241,136.1288,29.9974,128.3151,53.6278,42.2643,137.7756,64.1427,85.0461,167.5511
?|10431,408.0760,563.8934,1360.8341,865.5358,766.3830,807.7431,1108.3945,1420.5021,657.2812,977.9175,...,1570.1445,2886.3965,1721.8816,697.6744,1245.2681,1877.4180,652.7559,722.7208,1140.2801,1003.5668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C17orf55|284185,13.7864,50.5710,108.7942,16.1357,5.9574,19.3577,4.8900,268.3054,1.3515,6.9473,...,65.8718,73.8061,63.4313,42.7148,69.4006,5.7015,16.9291,9.9197,24.4900,33.4058
C17orf56|146705,634.2170,942.4470,823.4180,728.1051,521.4936,445.8014,451.3366,518.9906,240.3018,212.9253,...,655.6082,337.7062,497.5166,848.5050,812.2476,713.9971,548.7165,397.7468,625.0575,459.6120
C17orf57|124989,33.0872,64.7091,24.4787,22.3417,69.7872,24.6370,10.5949,25.1046,40.5451,37.3832,...,20.7596,3.6179,9.8534,99.6678,16.3249,19.7271,75.9843,24.0907,15.3093,14.9630
C17orf58|284018,477.3525,4373.5726,836.8087,1208.1092,572.3404,702.7423,499.5925,265.1674,165.3339,689.7692,...,610.4121,284.3705,576.4239,716.1841,484.2271,1042.5575,256.6929,213.5097,411.5521,526.8378


In [3]:
# Survival task: load each omics modality for the patients listed in the
# survival table and run the shared MOFA preprocessing on it.
# NOTE(review): preprocessing runs before the cohort is cut down to the common
# patients below — if it standardizes features, the statistics come from the
# larger per-modality cohort. Confirm that this is intended.
expression_data_breast = preprocess_data_for_mofa(
    prepare_data('data/breast/exp', survival_data_breast.index))

methylation_data_breast = preprocess_data_for_mofa(
    prepare_data('data/breast/methy', survival_data_breast.index))

mirna_data_breast = preprocess_data_for_mofa(
    prepare_data('data/breast/mirna', survival_data_breast.index))

# Align all four tables on the patients they share.
(survival_data_breast,
 expression_data_breast,
 methylation_data_breast,
 mirna_data_breast) = choose_common_patients([
    survival_data_breast,
    expression_data_breast,
    methylation_data_breast,
    mirna_data_breast,
])

# Discard any feature column that still has a missing value
# (`how='any'` is the pandas default for dropna).
expression_data_breast = expression_data_breast.dropna(axis='columns')
methylation_data_breast = methylation_data_breast.dropna(axis='columns')
mirna_data_breast = mirna_data_breast.dropna(axis='columns')

# Persist every table separately so downstream models can load them one by one.
survival_data_breast.to_csv(
    'data/breast/survival_data_breast.csv')
expression_data_breast.to_csv(
    'data/breast/expression_data_breast.csv')
methylation_data_breast.to_csv(
    'data/breast/methylation_data_breast.csv')
mirna_data_breast.to_csv(
    'data/breast/mirna_data_breast.csv')

Почти то же самое, но теперь таргет — гистологический тип. Кроме того, нужно удалить малочисленные группы (ниже оставляем только гистологические типы, в которых не менее 7 пациентов), чтобы была возможна стратификация по таргету.

In [4]:
# Load the clinical table that carries the histological subtype target.
clinical_data = prepare_clinical_data(
    'data/clinical/breast'
)

# Show how many patients fall into each subtype before any filtering.
clinical_data.loc[:, CANCER_SUBTYPE_FEATURE].value_counts()

Infiltrating Ductal Carcinoma       784
Infiltrating Lobular Carcinoma      203
Other, specify                       46
Mixed Histology (please specify)     30
Mucinous Carcinoma                   17
Metaplastic Carcinoma                 9
Medullary Carcinoma                   6
Infiltrating Carcinoma NOS            1
Name: histological_type, dtype: int64

In [5]:
# Keep only subtypes represented by at least 7 patients. With the counts
# displayed above, this removes "Medullary Carcinoma" (6) and
# "Infiltrating Carcinoma NOS" (1). Rows with a missing subtype are dropped too,
# because value_counts() ignores NaN and isin() never matches it.
subtype_sizes = clinical_data[CANCER_SUBTYPE_FEATURE].value_counts()
frequent_subtypes = subtype_sizes.index[subtype_sizes >= 7]
clinical_data = clinical_data[
    clinical_data[CANCER_SUBTYPE_FEATURE].isin(frequent_subtypes)
]

In [6]:
# Subtype-prediction task: same pipeline as for survival, but the patient list
# comes from the filtered clinical table.
# NOTE(review): preprocessing runs before the cohort is cut down to the common
# patients below — if it standardizes features, the statistics come from the
# larger per-modality cohort. Confirm that this is intended.
expression_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    prepare_data('data/breast/exp', clinical_data.index))

methylation_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    prepare_data('data/breast/methy', clinical_data.index))

mirna_data_breast_for_subtype_pred = preprocess_data_for_mofa(
    prepare_data('data/breast/mirna', clinical_data.index))

# Align the clinical table and the three modalities on the patients they share.
(clinical_data_breast_for_subtype_pred,
 expression_data_breast_for_subtype_pred,
 methylation_data_breast_for_subtype_pred,
 mirna_data_breast_for_subtype_pred) = choose_common_patients([
    clinical_data,
    expression_data_breast_for_subtype_pred,
    methylation_data_breast_for_subtype_pred,
    mirna_data_breast_for_subtype_pred,
])

# Discard any feature column that still has a missing value
# (`how='any'` is the pandas default for dropna).
expression_data_breast_for_subtype_pred = (
    expression_data_breast_for_subtype_pred.dropna(axis='columns'))
methylation_data_breast_for_subtype_pred = (
    methylation_data_breast_for_subtype_pred.dropna(axis='columns'))
mirna_data_breast_for_subtype_pred = (
    mirna_data_breast_for_subtype_pred.dropna(axis='columns'))

# Persist the three modalities first, then the aligned clinical table.
expression_data_breast_for_subtype_pred.to_csv('data/breast/expression_data_breast_for_subtype_pred.csv')
methylation_data_breast_for_subtype_pred.to_csv('data/breast/methylation_data_breast_for_subtype_pred.csv')
mirna_data_breast_for_subtype_pred.to_csv('data/breast/mirna_data_breast_for_subtype_pred.csv')

clinical_data_breast_for_subtype_pred.to_csv('data/breast/clinical_data_breast_for_subtype_pred.csv')

Трансформируем данные всех модальностей в общий датафрейм в формате MOFA (столбцы sample, feature, value, view).

In [7]:
# Convert each modality to the MOFA long format, tagging it with a view name.
# Copies are passed so the wide tables above are not touched by the helper.
# NOTE(review): the expression view is labelled 'DNA' and the miRNA view 'RNA';
# the labels are kept as-is because downstream code may rely on them — confirm
# they are the intended view names.
transformed_expression_data_breast = transform_df_for_mofa(expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(mirna_data_breast.copy(), 'RNA')

# Stack the three views row-wise (row-wise is pd.concat's default axis).
combined_data_breast = pd.concat([
    transformed_expression_data_breast,
    transformed_methylation_breast,
    transformed_mirna_data_breast,
])

combined_data_breast.to_csv(
    'data/breast/mofa_data_breast.csv')

# Render the combined long-format table.
combined_data_breast

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.042741,DNA
1,tcga.3c.aali,?|10357,-0.756164,DNA
2,tcga.3c.aalj,?|10357,0.620028,DNA
3,tcga.3c.aalk,?|10357,0.450196,DNA
4,tcga.4h.aaak,?|10357,-0.520822,DNA
...,...,...,...,...
112835,tcga.wt.ab44,hsa-mir-99a,0.353295,RNA
112836,tcga.xx.a899,hsa-mir-99a,1.334040,RNA
112837,tcga.xx.a89a,hsa-mir-99a,0.603946,RNA
112838,tcga.z7.a8r5,hsa-mir-99a,0.377321,RNA


То же самое для данных, где таргет - гистологический тип

In [8]:
# Same long-format conversion for the subtype-prediction cohort.
# NOTE(review): the expression view is labelled 'DNA' and the miRNA view 'RNA';
# the labels are kept as-is because downstream code may rely on them — confirm
# they are the intended view names.
transformed_expression_data_breast_for_subtype_pred = transform_df_for_mofa(expression_data_breast_for_subtype_pred.copy(), 'DNA')
transformed_methylation_breast_for_subtype_pred = transform_df_for_mofa(methylation_data_breast_for_subtype_pred.copy(), 'Methylation')
transformed_mirna_data_breast_for_subtype_pred = transform_df_for_mofa(mirna_data_breast_for_subtype_pred.copy(), 'RNA')

# Stack the three views row-wise (row-wise is pd.concat's default axis).
combined_data_breast_for_subtype_pred = pd.concat([
    transformed_expression_data_breast_for_subtype_pred,
    transformed_methylation_breast_for_subtype_pred,
    transformed_mirna_data_breast_for_subtype_pred,
])

combined_data_breast_for_subtype_pred.to_csv('data/breast/mofa_data_breast_for_subtype_pred.csv')

# Render the combined long-format table.
combined_data_breast_for_subtype_pred

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.046201,DNA
1,tcga.3c.aali,?|10357,-0.757116,DNA
2,tcga.3c.aalj,?|10357,0.631118,DNA
3,tcga.3c.aalk,?|10357,0.459800,DNA
4,tcga.4h.aaak,?|10357,-0.519715,DNA
...,...,...,...,...
111743,tcga.wt.ab44,hsa-mir-99a,0.350831,RNA
111744,tcga.xx.a899,hsa-mir-99a,1.328710,RNA
111745,tcga.xx.a89a,hsa-mir-99a,0.600750,RNA
111746,tcga.z7.a8r5,hsa-mir-99a,0.374787,RNA
