In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append("..")

from src.tcga_preprocess_utils import prepare_data, choose_common_patients, prepare_survival_data, prepare_clinical_data
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa
from src.constants import RANDOM_STATE, CANCER_SUBTYPE_FEATURE

Делаем общую предобработку для всех моделей: отбираем только тех пациентов, для которых есть данные по выживаемости / подтипу рака; убираем признаки, у которых все значения — нули, признаки с нулевой дисперсией и самые высокодисперсные; применяем стандартизацию. Каждую модальность сохраняем отдельно.

In [2]:
# Load the survival table for the breast cohort. In the rendered output it is
# indexed by PatientID and carries the columns `Survival` and `Death`.
survival_source_dir = 'data/breast/survival'
survival_data_breast = prepare_survival_data(survival_source_dir)

# Bare last expression -> rich (truncated) DataFrame display.
survival_data_breast

Unnamed: 0_level_0,Survival,Death
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
tcga.3c.aaau,4047.0,0.0
tcga.3c.aali,4005.0,0.0
tcga.3c.aalj,1474.0,0.0
tcga.3c.aalk,1448.0,0.0
tcga.4h.aaak,348.0,0.0
...,...,...
tcga.wt.ab44,883.0,0.0
tcga.xx.a899,467.0,0.0
tcga.xx.a89a,488.0,0.0
tcga.z7.a8r5,3287.0,0.0


In [3]:
# Build the survival-target dataset: load and preprocess every omics modality
# for the patients present in the survival table, align all tables on their
# common patients, drop feature columns containing NaN, and save each table.
#
# NOTE(review): per the notebook narrative, preprocessing removes all-zero,
# zero-variance and the most variable features and standardizes the rest; the
# helper internals are not visible here, so confirm which helper does what.
# NOTE(review): this cell reassigns `survival_data_breast`, which it also reads
# as input, so re-running it without re-running the loading cell starts from
# the already aligned table.
survival_patient_ids = survival_data_breast.index
survival_modality_dirs = (
    'data/breast/exp',    # expression
    'data/breast/methy',  # methylation
    'data/breast/mirna',  # miRNA
)

# Same order as before: each modality is loaded and preprocessed in turn.
survival_omics = [
    preprocess_data_for_mofa(prepare_data(modality_dir, survival_patient_ids))
    for modality_dir in survival_modality_dirs
]

# Alignment returns the tables in the order they were passed in.
survival_data_breast, *survival_omics = choose_common_patients(
    [survival_data_breast, *survival_omics])

# Drop every feature (column) that has at least one missing value.
(expression_data_breast,
 methylation_data_breast,
 mirna_data_breast) = (
    omics_frame.dropna(axis=1, how='any') for omics_frame in survival_omics
)

# Persist each table separately.
survival_outputs = (
    ('data/breast/survival_data_breast.csv', survival_data_breast),
    ('data/breast/expression_data_breast.csv', expression_data_breast),
    ('data/breast/methylation_data_breast.csv', methylation_data_breast),
    ('data/breast/mirna_data_breast.csv', mirna_data_breast),
)
for csv_path, table in survival_outputs:
    table.to_csv(csv_path)

Почти то же самое, но таргет — гистологический тип; кроме того, нужно удалить слишком малочисленные группы (в коде ниже — группы, в которых меньше 7 пациентов), чтобы была возможна стратификация по таргету.

In [4]:
# Load the clinical table and inspect how many patients fall into each
# histological subtype (the column name comes from CANCER_SUBTYPE_FEATURE).
# NOTE(review): this path is `data/clinical/breast`, while every other input in
# the notebook lives under `data/breast/...` — confirm the layout is intended.
clinical_source_dir = 'data/clinical/breast'
clinical_data = prepare_clinical_data(clinical_source_dir)

subtype_labels = clinical_data[CANCER_SUBTYPE_FEATURE]
subtype_labels.value_counts()

Infiltrating Ductal Carcinoma       784
Infiltrating Lobular Carcinoma      203
Other, specify                       46
Mixed Histology (please specify)     30
Mucinous Carcinoma                   17
Metaplastic Carcinoma                 9
Medullary Carcinoma                   6
Infiltrating Carcinoma NOS            1
Name: histological_type, dtype: int64

In [5]:
# Keep only the histological subtypes that have at least this many patients;
# per the notebook narrative, this is so the target can be stratified later.
min_patients_per_subtype = 7


def _has_enough_patients(subtype_rows):
    """Return True when a subtype group holds at least the minimum number of rows."""
    return len(subtype_rows) >= min_patients_per_subtype


clinical_data = clinical_data.groupby(CANCER_SUBTYPE_FEATURE).filter(_has_enough_patients)

In [6]:
# Build the subtype-prediction dataset: same pipeline as for the survival
# target, but the patient set comes from the (already filtered) clinical table.
#
# NOTE(review): the internals of prepare_data / preprocess_data_for_mofa /
# choose_common_patients are not visible in this notebook; the comments below
# describe only how they are called here.
subtype_patient_ids = clinical_data.index
subtype_modality_dirs = (
    'data/breast/exp',    # expression
    'data/breast/methy',  # methylation
    'data/breast/mirna',  # miRNA
)

# Load and preprocess each modality in turn (expression, methylation, miRNA).
subtype_omics = [
    preprocess_data_for_mofa(prepare_data(modality_dir, subtype_patient_ids))
    for modality_dir in subtype_modality_dirs
]

# Align the clinical table and all modalities; results come back in input order.
clinical_data_breast_for_subtype_pred, *subtype_omics = choose_common_patients(
    [clinical_data, *subtype_omics])

# Drop every feature (column) that has at least one missing value.
(expression_data_breast_for_subtype_pred,
 methylation_data_breast_for_subtype_pred,
 mirna_data_breast_for_subtype_pred) = (
    omics_frame.dropna(axis=1, how='any') for omics_frame in subtype_omics
)

# Persist the omics tables first, then the aligned clinical table.
subtype_outputs = (
    ('data/breast/expression_data_breast_for_subtype_pred.csv',
     expression_data_breast_for_subtype_pred),
    ('data/breast/methylation_data_breast_for_subtype_pred.csv',
     methylation_data_breast_for_subtype_pred),
    ('data/breast/mirna_data_breast_for_subtype_pred.csv',
     mirna_data_breast_for_subtype_pred),
    ('data/breast/clinical_data_breast_for_subtype_pred.csv',
     clinical_data_breast_for_subtype_pred),
)
for csv_path, table in subtype_outputs:
    table.to_csv(csv_path)

Преобразуем данные всех модальностей в общий датафрейм в длинном формате MOFA (столбцы `sample`, `feature`, `value`, `view`).

In [7]:
# Convert every modality of the survival dataset to the MOFA layout (the
# rendered output shows the columns sample / feature / value / view) and stack
# them into one frame. Copies are passed so the helper cannot modify the
# per-modality tables.
# NOTE(review): the expression table is tagged with view 'DNA' and the miRNA
# table with view 'RNA' — confirm these labels are what downstream code expects.
survival_views = (
    (expression_data_breast, 'DNA'),
    (methylation_data_breast, 'Methylation'),
    (mirna_data_breast, 'RNA'),
)

(transformed_expression_data_breast,
 transformed_methylation_breast,
 transformed_mirna_data_breast) = (
    transform_df_for_mofa(modality_frame.copy(), view_label)
    for modality_frame, view_label in survival_views
)

# Row-wise concatenation: one block of rows per view.
combined_data_breast = pd.concat(
    [
        transformed_expression_data_breast,
        transformed_methylation_breast,
        transformed_mirna_data_breast,
    ],
    axis=0,
)

mofa_survival_csv = 'data/breast/mofa_data_breast.csv'
combined_data_breast.to_csv(mofa_survival_csv)
combined_data_breast

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.042741,DNA
1,tcga.3c.aali,?|10357,-0.756164,DNA
2,tcga.3c.aalj,?|10357,0.620028,DNA
3,tcga.3c.aalk,?|10357,0.450196,DNA
4,tcga.4h.aaak,?|10357,-0.520822,DNA
...,...,...,...,...
112835,tcga.wt.ab44,hsa-mir-99a,0.353295,RNA
112836,tcga.xx.a899,hsa-mir-99a,1.334040,RNA
112837,tcga.xx.a89a,hsa-mir-99a,0.603946,RNA
112838,tcga.z7.a8r5,hsa-mir-99a,0.377321,RNA


То же самое для данных, где таргет — гистологический тип.

In [8]:
# Convert every modality of the subtype-prediction dataset to the MOFA layout
# (columns sample / feature / value / view in the rendered output) and stack
# them into one frame. Copies are passed so the helper cannot modify the
# per-modality tables.
# NOTE(review): the expression table is tagged with view 'DNA' and the miRNA
# table with view 'RNA' — confirm these labels are what downstream code expects.
subtype_views = (
    (expression_data_breast_for_subtype_pred, 'DNA'),
    (methylation_data_breast_for_subtype_pred, 'Methylation'),
    (mirna_data_breast_for_subtype_pred, 'RNA'),
)

(transformed_expression_data_breast_for_subtype_pred,
 transformed_methylation_breast_for_subtype_pred,
 transformed_mirna_data_breast_for_subtype_pred) = (
    transform_df_for_mofa(modality_frame.copy(), view_label)
    for modality_frame, view_label in subtype_views
)

# Row-wise concatenation: one block of rows per view.
combined_data_breast_for_subtype_pred = pd.concat(
    [
        transformed_expression_data_breast_for_subtype_pred,
        transformed_methylation_breast_for_subtype_pred,
        transformed_mirna_data_breast_for_subtype_pred,
    ],
    axis=0,
)

mofa_subtype_csv = 'data/breast/mofa_data_breast_for_subtype_pred.csv'
combined_data_breast_for_subtype_pred.to_csv(mofa_subtype_csv)
combined_data_breast_for_subtype_pred

Unnamed: 0,sample,feature,value,view
0,tcga.3c.aaau,?|10357,-1.046201,DNA
1,tcga.3c.aali,?|10357,-0.757116,DNA
2,tcga.3c.aalj,?|10357,0.631118,DNA
3,tcga.3c.aalk,?|10357,0.459800,DNA
4,tcga.4h.aaak,?|10357,-0.519715,DNA
...,...,...,...,...
111743,tcga.wt.ab44,hsa-mir-99a,0.350831,RNA
111744,tcga.xx.a899,hsa-mir-99a,1.328710,RNA
111745,tcga.xx.a89a,hsa-mir-99a,0.600750,RNA
111746,tcga.z7.a8r5,hsa-mir-99a,0.374787,RNA
