# Predict reporters (lasso final model)

In [1]:
import numpy as np
import pandas as pd
import joblib

from optimalcodon.projects.rnastability.dataprocessing import get_data

# Load the final predictive model from the results directory of the
# 191005-EvaluateModelLearningCurve analysis; it is used below via `.predict`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model artifacts that come from a trusted source.
lasso = joblib.load("../191005-EvaluateModelLearningCurve/results-data/final_model.joblib")

In [2]:
# Build the reporter table from reporters.csv: the `sequence` column is renamed
# to `coding`, the derived columns below are added, and the raw descriptive
# columns are removed.
reporters = (
    pd.read_csv("reporters.csv")
    .rename(columns={'sequence': 'coding'})
    .assign(
        # identifier of the form "<reporter_id>|<optimality>"
        gene_id=lambda df: df['reporter_id'] + '|' + df['optimality'],
        # set to NaN for every reporter row
        utrlenlog=np.nan,
        # natural log of the number of characters in the coding sequence
        cdslenlog=lambda df: np.log(df['coding'].str.len()),
        # temporary constant column; serves as the join key in a later cell
        key='k',
    )
    .drop(columns=['reporter_id', 'optimality', 'description'])
)
reporters.head()

Unnamed: 0,coding,gene_id,utrlenlog,cdslenlog,key
0,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-fish|optimal,,7.334982,k
1,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-fish|non-optimal,,7.334982,k
2,ATGGCGAGAAGGTGTCTTCGTTTATGGCAACGGAGGCGTAGGAGCA...,embo2016-B|non-optimal,,5.743003,k
3,ATGGCAGAAGGTGTCTTCGTTTATGGCAACGGAGGCGTAGGAGCAT...,embo2016-B|optimal,,5.752573,k
4,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-293t|optimal,,7.334982,k


In [3]:
# Load the data from the 191004-TrainAndTestSets directory; the result is
# unpacked as two (x, y) pairs, named here as the train and test splits.
# NOTE(review): of these four objects only train_x is used in the cells below.
(train_x, train_y), (test_x, test_y) = get_data('../data/191004-TrainAndTestSets/')

In [4]:
# Show the first rows of the training features; the reporter table is later
# reordered to exactly these columns before prediction.
train_x.head()

Unnamed: 0_level_0,specie,cell_type,datatype,utrlenlog,coding,cdslenlog
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000122965,human,293t,endogenous,7.286876,ATGTCGCGACTGATCGTGAAGAATCTCCCGAATGGGATGAAGGAGG...,7.966933
ENSDARG00000009350,fish,embryo mzt,aamanitin ribo,6.393591,ATGCAGAACGTGATAAACTCGGTGAAAGGCACTGCTCTGGGGGTCG...,6.861711
ENSG00000101856,human,hela,endogenous,7.100852,ATGGCTGCCGAGGATGTGGTGGCGACTGGCGCCGACCCAAGCGATC...,6.378426
ENSMUSG00000040586,mouse,mES cells,slam-seq,6.891626,ATGAGGATGGCTCAGTCCAACATGCCCCACAAGTCTGATGTGTTGA...,8.026497
ENSG00000183826,human,293t,endogenous,8.784928,ATGAGTAACAGCCACCCTCTTCGCCCCTTTACTGCAGTGGGGGAAA...,7.517521


In [5]:
# One row per distinct (specie, cell_type, datatype) combination found in the
# training features, tagged with the same constant `key` value ('k') that the
# reporter table carries.
cat_data = train_x[['specie', 'cell_type', 'datatype']].drop_duplicates().assign(key='k')

# Cross join: because every row on both sides has key == 'k', merging on `key`
# pairs each reporter with every categorical combination.
# The join column is named explicitly: without `on`, merge joins on *all*
# column names the two frames share, so an extra shared column (e.g. a
# `specie` column in reporters.csv) would silently turn this into a filtered
# join instead of a cross join.
# After the join the helper column is dropped and gene_id becomes the index
# (it is not unique: each reporter appears once per combination).
reporters = (
    cat_data
    .merge(reporters, how='outer', on='key')
    .drop(['key'], axis=1)
    .set_index('gene_id')
)
# Display a random sample of 10 rows (unseeded, so it differs between runs).
reporters.sample(10)

Unnamed: 0_level_0,specie,cell_type,datatype,coding,utrlenlog,cdslenlog
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cherry-P2A-293t|non-optimal,fish,embryo mzt,aamanitin polya,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
embo2016-A|optimal,human,hela,endogenous,ATGGCTGCAGCCACCAAGTCCGACAGCGAGGAGAGTGAACCAGAAA...,,5.713733
cherry-P2A-293t|non-optimal,fish,embryo mzt,aamanitin ribo,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
embo2016-B|non-optimal,human,293t,endogenous,ATGGCGAGAAGGTGTCTTCGTTTATGGCAACGGAGGCGTAGGAGCA...,,5.743003
cherry-P2A-293t|non-optimal,human,k562,endogenous,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
cherry-P2A-fish|non-optimal,human,293t,endogenous,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
cherry-P2A-293t|non-optimal,human,RPE,endogenous,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
cherry-P2A-293t|non-optimal,mouse,mES cells,slam-seq,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
cherry-P2A-293t|optimal,human,hela,endogenous,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,,7.334982
embo2016-A|non-optimal,human,293t,endogenous,ATGCTGCAGCCACCAAGTCCGACAGCGAGGAGAGTGAACCAGAAAC...,,5.703782


In [6]:
# Select exactly the columns of train_x, in train_x's order, so the reporter
# table has the same column layout as the training features. Any column of
# train_x missing from `reporters` makes this raise a KeyError.
reporters = reporters.loc[:, train_x.columns.values]

In [7]:
# Run the loaded model on every reporter x (specie, cell_type, datatype) row
# and store the output as a new `predicted_stability` column.
# NOTE(review): utrlenlog is NaN for all rows (it was assigned np.nan when the
# reporter table was built); presumably the saved model handles missing values
# itself -- TODO confirm.
reporters['predicted_stability'] = lasso.predict(reporters)

In [8]:
# Write the table, including the gene_id index and the predictions, to CSV.
# NOTE(review): the file name spells "species" as "sepcies". It is left
# unchanged here because other code may read this exact path; if it is
# renamed, update every consumer at the same time.
reporters.to_csv('reporters_with_predicted_stability_all_sepcies.csv')