In [1]:
import sys, os
import numpy as np

# Import the rouskinhf package
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder, setup_env

# The env file has to be sourced before using the package. Either:
#   OPTION 1: source it from inside the notebook
#       %load_ext dotenv
#       %dotenv env
#   OPTION 2: source it in the terminal that launches Jupyter
# The RNAstructure executable directory is passed to setup_env explicitly.
# RNASTRUCTURE_PATH from the environment wins when it is set, so the notebook
# is not tied to one machine; otherwise the original location is used.
setup_env(
    RNASTRUCTURE_PATH=os.environ.get(
        'RNASTRUCTURE_PATH',
        '/Users/alberic/RNAstructure/exe',
    )
)

In [2]:
def upload_data(datafolder, commit_message, revision='main', private=True):
    """Create the remote repository if needed and upload the datafolder to it.

    Args:
        datafolder: object exposing ``create_repo(exist_ok=..., private=...)``
            and ``upload_folder(...)``; in this notebook, a ``DataFolder``.
        commit_message: title of the commit created by the upload.
        revision: branch name to upload to. Defaults to ``'main'``.
        private: whether the repository is created as private. Defaults to
            ``True``.

    Returns:
        None. The call blocks until the upload has completed.
    """
    # exist_ok=True makes repeated uploads to the same repository safe.
    datafolder.create_repo(
        exist_ok=True,
        private=private,
    )

    # The upload is started as a future and then awaited right below.
    future = datafolder.upload_folder(
        revision=revision,
        commit_message=commit_message,
        commit_description='',
        run_as_future=True,
    )

    # Wait for the upload to complete (blocking action). The former bare
    # `future.done()` only polled the state and discarded the answer.
    future.result()

## SARS2 dataset

In [5]:
# SARS2: create the datafolder from the processed DREEM json.
# `name` is the name of the input data by default.
datafolder = DataFolder.from_dreem_output(
    name='SARS2',
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/SARS2/processed/SARS2.json',
    path_out='data/datafolders',
    generate_npy=True,
    predict_structure=True,
    tqdm=False,
)

Over a total of 38 datapoints, there are:
    - 38 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms


## UTR dataset

In [25]:
# UTR: create the datafolder from the processed DREEM json.
# `name` is the name of the input data by default.
datafolder = DataFolder.from_dreem_output(
    name='UTR',
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/UTR/processed/UTR.json',
    path_out='data/datafolders',
    generate_npy=True,
    predict_structure=True,
    tqdm=False,
)
# Upload intentionally left disabled; uncomment to push this version:
# upload_data(datafolder, 'Two filter thresholds on high signal')

Over a total of 1400 datapoints, there are:
    - 1400 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms


In [3]:
# Reload the UTR datafolder from the local datafolders directory.
datafolder2 = DataFolder.from_local(
    name='UTR',
    path='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/RNA_data/data/datafolders',
    generate_npy=True,
)

Parsing json file: 1402it [00:00, 4388.93it/s]                         


Over a total of 1400 datapoints, there are:
    - 1234 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 166 datapoints removed because of low AUROC (<0.8)


In [40]:
import pandas as pd
from sklearn.metrics import roc_auc_score

UKN = -1000  # presumably the placeholder for positions without a DMS value — TODO confirm
MIN_AUROC = 0.8  # same cut-off as `threshold` in the filtering cell

data_processed = pd.read_json('/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/RNA_data/data/datafolders/UTR/data.json').T

# This cell used to loop over `filtered_data`, which is only created by the
# filtering cell further down (and that cell needs `data_processed`, `UKN` and
# `roc_auc_score` from here), so it failed on a fresh kernel. The cut-off is
# applied here directly instead; the resulting `aurocs` list is the same.
aurocs = []
for i, row in data_processed.iterrows():

    dms = np.array(row['dms'])

    # Start with every position labelled unpaired (1), then clear the
    # positions listed in paired_bases.
    isUnpaired = np.ones_like(dms)
    isUnpaired[np.array(row['paired_bases']).flatten()] = 0

    # Positions whose DMS value equals UKN are left out of the score.
    known = dms != UKN
    auroc = roc_auc_score(isUnpaired[known], dms[known])

    if auroc >= MIN_AUROC:
        aurocs.append(auroc)

np.mean(aurocs), np.std(aurocs)

(0.8821937148657288, 0.03588750560593514)

In [39]:
import pandas as pd
import numpy as np

def calculate_auroc(row):
    """Return the AUROC of a row's DMS values against its unpaired labels.

    ``row`` must provide ``'dms'`` and ``'paired_bases'``. Every position is
    labelled 1 (unpaired) unless its index appears in ``paired_bases``.
    Positions whose DMS value equals the notebook-level ``UKN`` are excluded
    from the score. ``UKN`` and ``roc_auc_score`` must already be defined in
    the kernel.
    """
    signal = np.array(row['dms'])
    paired_idx = np.array(row['paired_bases']).flatten()

    unpaired_label = np.ones_like(signal)
    unpaired_label[paired_idx] = 0

    keep = signal != UKN
    return roc_auc_score(unpaired_label[keep], signal[keep])

# Requires `data_processed` (the DataFrame read from data.json) to be loaded.
# Keep only the rows whose AUROC reaches the threshold.
threshold = 0.8
mask = data_processed.apply(calculate_auroc, axis=1).ge(threshold)

# Boolean row selection with the mask built above.
filtered_data = data_processed.loc[mask]

In [41]:
import plotly.express as px

# Empirical CDF of the AUROC values collected in `aurocs`, with a marginal
# histogram. Title and axis label added so the figure stands on its own.
px.ecdf(
    aurocs,
    marginal='histogram',
    title='Distribution of per-datapoint AUROC',
    labels={'value': 'AUROC'},
)

## pri-miRNA dataset

In [19]:
# pri-miRNA: create the datafolder from the processed DREEM json, then upload it.
# `name` is the name of the input data by default.
datafolder = DataFolder.from_dreem_output(
    name='pri-miRNA',
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/pri-miRNA/processed/pri-miRNA.json',
    path_out='data/datafolders',
    generate_npy=True,
    predict_structure=True,
    tqdm=False,
)
upload_data(datafolder, 'Initial commit')

Over a total of 513 datapoints, there are:
    - 513 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
