### Imports

In [4]:
import pandas as pd
import numpy as np

import pickle

### Замечание: Модель должна быть обучена на нормализованном с помощью MinMaxScaler датасете. Это для того, чтобы после перевода miRNA в mRNA с уровнями экспрессий в промежутке [0, 1] модель смогла корректно работать.

### Проверяем что все mRNA из пула генов содержатся в качестве признаков в датасете c mRNA (датасет для обучения модели). Иначе отбрасываем соответсвующие строки с mRNA в пуле генов.

In [38]:
with open('Saved data/X.pickle', 'rb') as input_file:
    df = pickle.load(input_file)

genes_pool = pd.read_csv("modified_genes_pool.csv")

In [39]:
df_mRNA_s = list(df.columns)
pool_mRNA_s = list(set(genes_pool.mrna))

In [40]:
for mRNA in pool_mRNA_s:
    if mRNA not in df_mRNA_s:
        genes_pool = genes_pool.loc[genes_pool['mrna'] != mRNA]


### Берем только те miRNA из пула генов, которые содержатся в качестве признаков в датасете c miRNA (input в пайплайне). Иначе отбрасываем соответсвующие строки с miRNA в пуле генов.

In [74]:
input_path = 'input_data.csv'
input_df = pd.read_csv(input_path, index_col=0)

In [75]:
input_df

Unnamed: 0,hsa-miR-331-3p,hsa-miR-150-5p,hsa-miR-612,hsa-miR-652-3p\t,hsa-miR-1285-3p\t,hsa-miR-3922-5p\t,hsa-miR-6760-5p\t,hsa-miR-7-5p,hsa-miR-31-5p\t,hsa-miR-141-3p\t,hsa-miR-10a-5p,hsa-miR-10b-5p,hsa-miR-485-5p,hsa-miR-583,hsa-miR-15b-5p
user1,8.638829,5.382134,5.415584,5.08792,5.996549,5.352016,8.100511,5.223516,5.682623,6.598368,5.484325,5.507772,5.385737,5.400263,5.289086
user2,6.393672,5.618323,5.20352,6.113439,8.818824,6.227852,5.528075,5.654365,5.498968,6.491727,5.420149,5.458875,9.009982,9.431167,7.436167
user3,7.622139,7.877542,5.586845,5.356201,7.945687,5.272935,5.611189,5.53894,9.902776,6.230033,5.610641,7.242081,5.317966,5.767766,5.307996


In [76]:
input_df_miRNA_s = list(input_df.columns)
pool_miRNA_s = list(set(genes_pool.mirna))

In [77]:
for miRNA in pool_miRNA_s:
    if miRNA not in input_df_miRNA_s:
        genes_pool = genes_pool.loc[genes_pool['mirna'] != miRNA]

### Preprocessing датасета (input в пайплане) с miRNA

In [81]:
from sklearn.preprocessing import MinMaxScaler

def preprocess_miRNA_df(df : pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    scaler = MinMaxScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df))
    scaled_df.index = df.index
    scaled_df.columns = df.columns
    return scaled_df

In [84]:
input_df_scaled = preprocess_miRNA_df(input_df)

### Утилиты для использования пула генов

In [64]:
def Get_mRNA(miRNA: str) -> list:
    tmp = genes_pool.loc[genes_pool['mirna']  == miRNA]
    return list(tmp.mrna)

In [65]:
def Get_miRNA(mRNA: str) -> list:
    tmp = genes_pool.loc[genes_pool['mrna']  == mRNA]
    return list(tmp.mirna)

In [68]:
def Get_integration_rate(mRNA: str, miRNA: str) -> float:
    tmp = genes_pool.loc[(genes_pool['mrna']  == mRNA) & (genes_pool['mirna'] == miRNA)]
    return list(tmp.int_rate)[0]

In [90]:
pool_mRNA_s = list(set(genes_pool.mrna))

In [91]:
data = pd.DataFrame()
class_weights = []


In [100]:
for mRNA in pool_mRNA_s:
    corresponding_miRNA_s = Get_miRNA(mRNA)
    best_miRNA = corresponding_miRNA_s[0]
    for miRNA in corresponding_miRNA_s:
        if Get_integration_rate(mRNA, miRNA) > Get_integration_rate(mRNA, best_miRNA):
            best_miRNA = miRNA
    data[mRNA] = input_df_scaled[best_miRNA]
    class_weights.append(Get_integration_rate(mRNA, best_miRNA))

### miRNA to mRNA transition finished

In [101]:
data

Unnamed: 0,ERBB2,GATA4,PTEN,PIK3CA,CDC42BPA,MYB,CCND1,SERPINE1,TFRC,GATA5,MUC1,TP53,TSPYL5
user1,1.0,0.02742,0.018356,0.02742,0.018356,0.0,0.0,0.02742,0.336896,0.336896,0.018356,0.0,0.0
user2,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.09465,0.09465
user3,0.547163,1.0,0.0,1.0,0.0,0.008807,0.008807,1.0,1.0,1.0,0.0,1.0,1.0


In [105]:
for feature, weight, in zip(list(data.columns), class_weights):
    print(f'{feature} : {weight}')

ERBB2 : 0.423546126526264
GATA4 : 0.651163663841603
PTEN : 0.61526768782774
PIK3CA : 0.633551258744417
CDC42BPA : 0.572103726275568
MYB : 0.885870164464218
CCND1 : 0.896082604874785
SERPINE1 : 0.611293617254197
TFRC : 0.681634754556599
GATA5 : 0.647914067167453
MUC1 : 0.509062795459844
TP53 : 0.58270914166979
TSPYL5 : 0.471455198396602
