In [3]:
import pandas as pd
import numpy as np
from joblib import load

from optimalcodon.projects.rnastability.dataprocessing import get_data, general_preprocesing_pipeline
from optimalcodon.projects.rnastability.predictdecay import predict_sequence

# Load the train/test splits. In the cells visible here only `train_x` is used
# afterwards: its columns define which features (and in which order) the
# prediction frames must contain.
splits = get_data("../data/191004-TrainAndTestSets/")
(train_x, train_y), (test_x, test_y) = splits

# TODO: replace this line with the new model (this model is overfitting)
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source.
model_path = "../191005-EvaluateModelLearningCurve/results-data/final_model.joblib"
model = load(model_path)


In [4]:
# Load the fish sequence table and shape it into the model's input layout.
fish_raw = pd.read_csv("../../data/19-01-17-Get-ORFS-UTRS-codon-composition/sequence-data/fish_seq_data_cds_3utr.csv")

fish_data = (
    fish_raw
    .assign(
        # constant annotations describing where these sequences come from
        specie="fish",
        cell_type="embryo mzt",
        datatype="aamanitin polya",
        # natural log of (sequence length + 1); a missing sequence propagates as NaN
        utrlenlog=lambda df: np.log(df['3utr'].str.len() + 1),
        cdslenlog=lambda df: np.log(df['coding'].str.len() + 1),
    )
    # rename the id column and use it as the index
    .rename(columns={'ensembl_gene_id': 'gene_id'})
    .set_index('gene_id')
    .drop(columns=['3utr'])
    # keep exactly the training feature columns, in the training order
    .loc[:, train_x.columns.values]
)

fish_data.sample(5)

Unnamed: 0_level_0,specie,cell_type,datatype,utrlenlog,coding,cdslenlog
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSDARG00000054530,fish,embryo mzt,aamanitin polya,4.75359,ATGGGGGACCCCGTTACCGAATACGCGTCACGACTTCAGCGTCAGG...,7.594381
ENSDARG00000101297,fish,embryo mzt,aamanitin polya,6.55108,ATGAGCCAGCAGACACAGGCCCCTCAGCAGACGCCGGCGGCGGCCC...,7.401842
ENSDARG00000019646,fish,embryo mzt,aamanitin polya,6.934397,ATGCGAGAGGAACAGACTTGTGGAGATTTTCCTGAAAGTGGGATCC...,6.398595
ENSDARG00000090192,fish,embryo mzt,aamanitin polya,5.549076,ATGGCAGAGGAGCGAGTAAAGGACTCTCTCTCAGAGAAACACAGTG...,8.185629
ENSDARG00000012929,fish,embryo mzt,aamanitin polya,4.553877,ATGTTTGCAAAAGCTTTTCGCGTGAAATCTAACACTGTTATCAAGG...,7.480992


In [5]:
# Load the xenopus sequence table and shape it into the model's input layout.
xen_raw = pd.read_csv("../../data/19-06-12-XenopusData/xenopus_seq_data_cds_3utr.csv")

xen_data = (
    xen_raw
    .assign(
        # constant annotations describing where these sequences come from
        specie="xenopus",
        cell_type="embryo mzt",
        datatype="aamanitin polya",
        # natural log of (sequence length + 1); a missing sequence propagates as NaN
        utrlenlog=lambda df: np.log(df['3utr'].str.len() + 1),
        cdslenlog=lambda df: np.log(df['coding'].str.len() + 1),
    )
    # rename the id column and use it as the index
    .rename(columns={'ensembl_gene_id': 'gene_id'})
    .set_index('gene_id')
    .drop(columns=['3utr'])
    # keep exactly the training feature columns, in the training order
    .loc[:, train_x.columns.values]
)
xen_data.sample(5)

Unnamed: 0_level_0,specie,cell_type,datatype,utrlenlog,coding,cdslenlog
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSXETG00000025680,xenopus,embryo mzt,aamanitin polya,5.236442,GGGGCTGCCATGCCAGAGGCTGGATTAGTCATGTTATCCCCCCAAG...,6.546785
ENSXETG00000000907,xenopus,embryo mzt,aamanitin polya,7.277939,ATGGAGTCCTTTTCTGAGCTTTCCTTGCAGTTTTCCCAGCTGTCCA...,6.75227
ENSXETG00000004054,xenopus,embryo mzt,aamanitin polya,,CAGATCATTTTCACAGTTACATTTTGGCTTTTGCTCAGACAACACC...,8.703673
ENSXETG00000002710,xenopus,embryo mzt,aamanitin polya,7.372118,ATGCCTTCTGTGATGGAGAAGTCAACCGGCTCCTCTGCTGTTATCT...,7.313887
ENSXETG00000005930,xenopus,embryo mzt,aamanitin polya,,TACACAGATCCCAGGAATGAAGATTTAGCTAAAGCCGAAGTGGCAT...,7.023759


In [6]:
# Stack both species row-wise (xenopus rows first, then fish); the two frames
# were built with the same column selection, so their columns line up.
species_frames = [xen_data, fish_data]
fish_and_xen_data = pd.concat(species_frames)
fish_and_xen_data.sample(10)

Unnamed: 0_level_0,specie,cell_type,datatype,utrlenlog,coding,cdslenlog
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSDARG00000041301,fish,embryo mzt,aamanitin polya,4.962845,ATGTCGGATCAGCAGGGAGTCCCAGATCAGCAGGTGGCTGGGAAAA...,6.660575
ENSDARG00000069627,fish,embryo mzt,aamanitin polya,4.204693,ATGGATCTTCCTTTTAAATTTAGTGGTGGGTCGGTGACCTTCGCCC...,7.939515
ENSXETG00000012355,xenopus,embryo mzt,aamanitin polya,7.424165,ATGGTGGATCTAAGAGGGGCCAAGGTGGCTTCTTTCACAGTAGATG...,7.349231
ENSXETG00000016872,xenopus,embryo mzt,aamanitin polya,,ATGGCGTCCCACCTGAAGTACATCTCGTTGGGGGTGTTAGTGTTCC...,6.895683
ENSXETG00000001650,xenopus,embryo mzt,aamanitin polya,6.716595,ATGGAGACCAAGCTTCCCGTCACCCCAACTAGTCCATCCTCCCCAG...,7.303843
ENSXETG00000015678,xenopus,embryo mzt,aamanitin polya,3.555348,GAGAAGCACTGCTCCTGTTTCAGCCAAACCTTCCTGAGGGTCCTTT...,6.09131
ENSDARG00000040884,fish,embryo mzt,aamanitin polya,7.913521,ATGGTGAGGAGCGAGAGGCAGCAGACAGAAGTGAAGGCCGAAGACT...,6.712956
ENSDARG00000017312,fish,embryo mzt,aamanitin polya,7.46164,ATGAGCGGGGCAGCGCTGGGAATAGAGATTGTGGTGGTCTTTTTCC...,7.657283
ENSDARG00000097289,fish,embryo mzt,aamanitin polya,7.380879,ATGGCTTCGACGCCCTCCGCTTCAGCCCTGTCAGCTGTTCTCCGCT...,6.555357
ENSXETG00000006760,xenopus,embryo mzt,aamanitin polya,,GAGGATTTCAACTTTGGTCCTGTGCTGGGAAAAGGATCGTATGGAA...,6.871091


In [7]:
# Predict from the model's input features only. Selecting the training columns
# explicitly keeps this cell re-runnable: after the first execution the frame
# already carries 'predicted_stability', which must not be fed back into the model.
model_inputs = fish_and_xen_data.loc[:, train_x.columns.values]
fish_and_xen_data['predicted_stability'] = model.predict(model_inputs)

In [8]:
# Write the features plus predictions to CSV; the gene_id index is written as the first column.
# NOTE(review): "stabilit" in the file name looks like a typo for "stability";
# left unchanged because other code may read this exact path.
predictions_csv = "results-data/predicted_stabilit_fish_and_xenopus_all_genes.csv"
fish_and_xen_data.to_csv(predictions_csv)