# Predictions with confidence interval

In [1]:
import pandas as pd
import numpy as np
from joblib import load

from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability.predictuncertainty import predict_seq_with_uncertainty

***
### 1. Predict Test Data

In [2]:
# Read the train/test split written by the EDA step, then the fitted models.
# `models` is indexed by model name further down (e.g. models['median_gbm']).
eda_results_dir = "../19-04-30-PredictiveModelDecayAllSpecies/19-04-30-EDA/results_data/"
models_path = "../../data/19-08-08-PredictiveModelStability/predictivemodel.joblib"

train_split, test_split = get_data(eda_results_dir)
train_x, train_y = train_split
test_x, test_y = test_split

models = load(models_path)

In [3]:
# Columns this cell appends to test_x. They are results, never model inputs.
result_cols = ['y_observed', 'median_prediction', 'lower_ci', 'upper_ci']

# Predict from the features only. Dropping the result columns first keeps the
# observed target and the earlier predictions out of what the models receive,
# and makes the cell safe to re-run (errors='ignore' covers the first run,
# when none of these columns exist yet).
test_features = test_x.drop(columns=result_cols, errors='ignore')

test_x['y_observed'] = test_y
test_x['median_prediction'] = models['median_gbm'].predict(test_features)
test_x['lower_ci'] = models['lower_gbm'].predict(test_features)
test_x['upper_ci'] = models['upper_gbm'].predict(test_features)

In [4]:
# Write the test-set predictions to disk. The index is moved back into the
# columns, and the sequence and log-length feature columns are left out.
columns_to_omit = ['coding', 'utrlenlog', 'cdslenlog']
test_predictions = test_x.reset_index().drop(columns=columns_to_omit)
test_predictions.to_csv("results_data/prediction_interavals_test_data.csv", index=False)


***
### 2. Reporter sequences

In [5]:
# Load the reporter sequences and give them the column names used for the
# test frame ('coding', 'utrlenlog', 'cdslenlog').
reporters_raw = (
    pd.read_csv("../19-03-13-PredictReportersWithModel/reporters.csv")
    .rename(columns={'sequence': 'coding'})
)

reporters = reporters_raw.assign(
    # unique label per construct: "<reporter_id>|<optimality>"
    gene_id=lambda df: df.reporter_id + '|' + df.optimality,
    # no UTR length is set for the reporters; left missing
    utrlenlog=np.nan,
    # natural log of the coding-sequence length
    cdslenlog=lambda df: np.log(df.coding.str.len()),
    # constant helper column, used later as the merge key and then dropped
    key='k',
).drop(columns=['reporter_id', 'optimality', 'description'])

reporters.head()

Unnamed: 0,coding,gene_id,utrlenlog,cdslenlog,key
0,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-fish|optimal,,7.334982,k
1,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-fish|non-optimal,,7.334982,k
2,ATGGCGAGAAGGTGTCTTCGTTTATGGCAACGGAGGCGTAGGAGCA...,embo2016-B|non-optimal,,5.743003,k
3,ATGGCAGAAGGTGTCTTCGTTTATGGCAACGGAGGCGTAGGAGCAT...,embo2016-B|optimal,,5.752573,k
4,ATGGTTTCAAAAGGAGAAGAAGATAATATGGCGATAATTAAAGAAT...,cherry-P2A-293t|optimal,,7.334982,k


In [6]:
# Distinct (specie, cell_type, datatype) combinations found in the test set.
context_cols = ['specie', 'cell_type', 'datatype']
unique_contexts = test_x[context_cols].drop_duplicates()
dtypefeaturs = (
    unique_contexts
    .reset_index()
    .drop(columns='gene_id')
    .assign(key='k')
)

# Combine the frames so that every reporter is paired with every
# specie / datatype / cell type combination: merging on the constant 'key'
# column yields the full cross product.
reporters = (
    reporters
    .merge(dtypefeaturs, on='key')
    .drop(columns='key')
    .set_index('gene_id')
)

In [7]:
# Columns this cell appends to reporters. They are results, never model inputs.
prediction_cols = ['median_prediction', 'lower_ci', 'upper_ci']

# Predict from the features only, so that one model's output is never passed
# to the next model and the cell can be re-run without feeding earlier
# predictions back in (errors='ignore' covers the first run).
reporter_features = reporters.drop(columns=prediction_cols, errors='ignore')

reporters['median_prediction'] = models['median_gbm'].predict(reporter_features)
reporters['lower_ci'] = models['lower_gbm'].predict(reporter_features)
reporters['upper_ci'] = models['upper_gbm'].predict(reporter_features)

In [8]:
# Save the reporter predictions, with gene_id written as an ordinary column.
reporters_out = reporters.reset_index()
reporters_out.to_csv("results_data/reporters_predictions_intervals.csv", index=False)

***
### 3. Predict GFP sequences

In [14]:
ls ../../

[1m[31mREADME.md[m[m*  [1m[36mdata[m[m/       [1m[36mdoc[m[m/        [1m[36mmanuscript[m[m/ [1m[36mresults[m[m/    [1m[36msrc[m[m/


In [12]:
??predict_seq_with_uncertainty