In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append("..")

from src.tcga_preprocess_utils import prepare_data, choose_common_patients
from src.mofa_utils import transform_df_for_mofa, preprocess_data_for_mofa
from src.constants import RANDOM_STATE

Делаем общую предобработку для всех моделей

In [2]:
def _load_layer(path):
    """Run `prepare_data` on `path` and pass the result through `preprocess_data_for_mofa`."""
    return preprocess_data_for_mofa(prepare_data(path))


# Layers are loaded in the order expression -> methylation -> miRNA,
# breast cohort first, then kidney.
expression_data_breast, methylation_data_breast, mirna_data_breast = (
    _load_layer(f'data/breast/{layer}') for layer in ('exp', 'methy', 'mirna'))

expression_data_kidney, methylation_data_kidney, mirna_data_kidney = (
    _load_layer(f'data/kidney/{layer}') for layer in ('exp', 'methy', 'mirna'))

# Pass each cohort's three layers through `choose_common_patients`
# (presumably keeps only patients present in all three; confirm in
# src.tcga_preprocess_utils).
breast_layers = [expression_data_breast, methylation_data_breast, mirna_data_breast]
expression_data_breast, methylation_data_breast, mirna_data_breast = choose_common_patients(
    breast_layers)

kidney_layers = [expression_data_kidney, methylation_data_kidney, mirna_data_kidney]
expression_data_kidney, methylation_data_kidney, mirna_data_kidney = choose_common_patients(
    kidney_layers)

In [3]:
# Column labels present in both cohorts, one set per omics layer.
common_genes = set(expression_data_breast.columns) & set(expression_data_kidney.columns)
common_meth = set(methylation_data_breast.columns) & set(methylation_data_kidney.columns)
common_mirna = set(mirna_data_breast.columns) & set(mirna_data_kidney.columns)

# Index labels (patients) shared between the two expression tables; only reported.
shared_patients = set(expression_data_breast.index) & set(expression_data_kidney.index)

print('Кол-во общих генов: ', len(common_genes))
print('Кол-во общих метил: ', len(common_meth))
print('Кол-во общих мрнк: ', len(common_mirna))
print('Кол-во общих пациентов: ', len(shared_patients))

Кол-во общих генов:  1155
Кол-во общих метил:  128
Кол-во общих мрнк:  168
Кол-во общих пациентов:  0


In [4]:
def _ordered_shared_columns(frame, shared_labels):
    """Return the labels of `frame.columns` that are in `shared_labels`, in column order.

    A list is returned on purpose: pandas 2.x raises ``TypeError`` when a
    ``set`` is used as an indexer, and a set's iteration order is not stable
    between kernel runs, which would make the column order of the resulting
    frames (and of every file saved from them) non-reproducible.
    """
    return [label for label in frame.columns.unique() if label in shared_labels]


# Keep only the features measured in both cohorts. The column order is taken
# from the breast tables and reused for kidney so both cohorts line up.
gene_columns = _ordered_shared_columns(expression_data_breast, common_genes)
meth_columns = _ordered_shared_columns(methylation_data_breast, common_meth)
mirna_columns = _ordered_shared_columns(mirna_data_breast, common_mirna)

expression_data_breast, expression_data_kidney = (
    expression_data_breast[gene_columns], expression_data_kidney[gene_columns])
methylation_data_breast, methylation_data_kidney = (
    methylation_data_breast[meth_columns], methylation_data_kidney[meth_columns])
mirna_data_breast, mirna_data_kidney = (
    mirna_data_breast[mirna_columns], mirna_data_kidney[mirna_columns])

In [5]:
# --- Breast cohort -> 'group_0' ---------------------------------------------
# Each layer is copied before being handed to `transform_df_for_mofa`, and is
# tagged with the view label passed as the second argument.
transformed_expression_data_breast = transform_df_for_mofa(expression_data_breast.copy(), 'DNA')
transformed_methylation_breast = transform_df_for_mofa(methylation_data_breast.copy(), 'Methylation')
transformed_mirna_data_breast = transform_df_for_mofa(mirna_data_breast.copy(), 'RNA')

breast_views = [
    transformed_expression_data_breast,
    transformed_methylation_breast,
    transformed_mirna_data_breast,
]
combined_data_breast = pd.concat(breast_views).assign(group='group_0')

# --- Kidney cohort -> 'group_1' ---------------------------------------------
transformed_expression_data_kidney = transform_df_for_mofa(expression_data_kidney.copy(), 'DNA')
transformed_methylation_kidney = transform_df_for_mofa(methylation_data_kidney.copy(), 'Methylation')
transformed_mirna_data_kidney = transform_df_for_mofa(mirna_data_kidney.copy(), 'RNA')

kidney_views = [
    transformed_expression_data_kidney,
    transformed_methylation_kidney,
    transformed_mirna_data_kidney,
]
combined_data_kidney = pd.concat(kidney_views).assign(group='group_1')

# Stack both cohorts row-wise. The original row labels of the pieces are kept,
# so the index of `combined_data` is not unique.
combined_data = pd.concat([combined_data_breast, combined_data_kidney])

# One scaler is fitted on the single 'value' column of the stacked table, i.e.
# the min-max range is shared by all views and both groups.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(combined_data[['value']])
combined_data['value'] = scaled_values

combined_data

Unnamed: 0,sample,feature,value,view,group
0,tcga.3c.aaau,ADAMTSL1|92949,0.105534,DNA,group_0
1,tcga.3c.aali,ADAMTSL1|92949,0.120799,DNA,group_0
2,tcga.3c.aalj,ADAMTSL1|92949,0.117392,DNA,group_0
3,tcga.3c.aalk,ADAMTSL1|92949,0.114669,DNA,group_0
4,tcga.4h.aaak,ADAMTSL1|92949,0.111725,DNA,group_0
...,...,...,...,...,...
34939,tcga.mm.a563,hsa-mir-203,0.127541,RNA,group_1
34940,tcga.mm.a564,hsa-mir-203,0.123825,RNA,group_1
34941,tcga.mm.a84u,hsa-mir-203,0.104273,RNA,group_1
34942,tcga.mw.a4ec,hsa-mir-203,0.125738,RNA,group_1


Сохраняем готовый датафрейм для MOFA

In [6]:
# Write the long-format table to disk; with pandas' default settings the
# frame's index is written as the first (unnamed) CSV column.
combined_data.to_csv(path_or_buf='data/tsga_breast_kidney_for_mofa.csv')

Сохраняем все предобработанные группы данных для других моделей

In [None]:
views = ['DNA', 'Methylation', 'RNA']
groups = ['group_0', 'group_1']

# For every (group, view) pair, turn the long-format slice back into a wide
# sample x feature table and save it next to that cohort's input data.
for cohort_group in groups:
    # 'group_0' was assigned to the breast cohort, 'group_1' to kidney.
    cohort_name = 'kidney' if cohort_group != 'group_0' else 'breast'
    in_cohort = combined_data.group == cohort_group

    for view_label in views:
        long_slice = combined_data[(combined_data.view == view_label) & in_cohort]
        wide_table = (
            long_slice
            .pivot(index='sample', columns='feature', values='value')
            .rename_axis(None, axis=1)  # drop the leftover 'feature' axis name
            .reset_index()              # 'sample' becomes a regular column
        )
        wide_table.to_csv(f'data/{cohort_name}/{view_label}_{cohort_name}.csv')