# Load the preprocessed data, add predictions to the test data, and save them as CSV file for the supplementary

In [25]:
import math
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model

import RNAutils
import sequence_logo
import utils as U
from quad_model import *

SEED = 981
print("Using seed:", SEED, file=sys.stderr)
np.random.seed(SEED)  # for reproducibility

Using seed: 981


# Load data and model

In [2]:
barcode_statistics_tr = pd.read_csv('data/barcode_statistics_train_ES7_HeLa_ABC.csv.gz')
barcode_statistics_te = pd.read_csv('data/barcode_statistics_test_ES7_HeLa_ABC.csv.gz')

In [3]:
assert(list(barcode_statistics_te.columns) == list(barcode_statistics_tr.columns))

In [4]:
def reformat_exon_data_for_export(df):
    new_column_order = ['exon', 'badly_coupled', 'contains_restriction_site', 'num_reads', 
                        'num_exon_inclusion', 'num_exon_skipping',
                        'num_intron_retention', 
                        'num_splicing_in_exon',
                        'num_bad_exon1', 'num_bad_reads', 'num_unknown_splicing', 'others', 
                        'total']

    df = df[new_column_order]
    df.rename({"num_reads":"num_DNA_reads", "others":"total_noncanonical"}, axis=1, inplace=True)
    df["num_DNA_reads"] //= 3  # Mukund accidentally summed up all the columns, including the DNA count, which is identical in all three replicates; so need to divide by 3
    assert((df.contains_restriction_site).sum() == 0)
    assert((df.badly_coupled).sum() == 0)
    df.drop(["contains_restriction_site", "badly_coupled"],axis=1, inplace=True)
    return df

In [5]:
barcode_statistics_tr = reformat_exon_data_for_export(barcode_statistics_tr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({"num_reads":"num_DNA_reads", "others":"total_noncanonical"}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_DNA_reads"] //= 3  # Mukund accidentally summed up all the columns, including the DNA count, which is identical in all three replicates; so need to divide by 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(["contains_restriction_site", "badly_coupled"],ax

In [11]:
from joblib import load
yTr = load(f'data/yTr_ES7_HeLa_ABC.pkl.gz')

In [12]:
barcode_statistics_tr["PSI"] = yTr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcode_statistics_tr["PSI"] = yTr


In [14]:
barcode_statistics_tr.to_csv("data/training_data.csv")

In [15]:
barcode_statistics_te = reformat_exon_data_for_export(barcode_statistics_te)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({"num_reads":"num_DNA_reads", "others":"total_noncanonical"}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_DNA_reads"] //= 3  # Mukund accidentally summed up all the columns, including the DNA count, which is identical in all three replicates; so need to divide by 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(["contains_restriction_site", "badly_coupled"],ax

In [16]:
yTe = load(f'data/yTe_ES7_HeLa_ABC.pkl.gz')

In [17]:
barcode_statistics_te["PSI"] = yTe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcode_statistics_te["PSI"] = yTe


Add predictions to the test set (and only the test set)

In [18]:
model_fname = f'custom_adjacency_regularizer_20210731_124_step3.h5'
model = load_model(model_fname)

In [21]:
xTe = load(f'data/xTe_ES7_HeLa_ABC.pkl.gz')

In [22]:
barcode_statistics_te["predicted_PSI"] = np.array(model(xTe)).flatten()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcode_statistics_te["predicted_PSI"] = np.array(model(xTe)).flatten()


In [24]:
barcode_statistics_te.to_csv("data/test_data.csv")