In [1]:
import sys, os
import numpy as np

# Import the rouskinhf package
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder, setup_env

# Environment setup: make sure the env settings are available before using rouskinhf.
# OPTION 1: load the env file from inside the notebook
# %load_ext dotenv
# %dotenv env
# OPTION 2: source the env file in the terminal (presumably before launching the notebook)
# Used here: pass the RNAstructure path directly to setup_env instead of an env file.
# NOTE(review): this is an absolute, machine-specific path; adjust it on other machines.
setup_env(RNASTRUCTURE_PATH='/Users/alberic/RNAstructure/exe')

In [4]:
def upload_data(datafolder, commit_message, revision='main', private=True, commit_description=''):
    """Create the remote repo for `datafolder` if needed, then upload the folder and wait.

    Parameters
    ----------
    datafolder
        Object exposing `create_repo(...)` and `upload_folder(...)`
        (the notebook passes `DataFolder` instances).
    commit_message : str
        Message attached to the upload commit.
    revision : str, optional
        Branch name to upload to. Defaults to 'main'.
    private : bool, optional
        Passed to `create_repo`. Defaults to True.
    commit_description : str, optional
        Longer description attached to the commit. Defaults to ''.

    Returns
    -------
    None
        The function blocks until the upload has finished. The upload result
        is intentionally not returned, so calling this as the last line of a
        cell does not print anything.
    """
    # exist_ok=True makes repeated runs of the same cell safe.
    datafolder.create_repo(
        exist_ok=True,
        private=private,
    )

    # The upload is started as a future and awaited right below.
    future = datafolder.upload_folder(
        revision=revision,
        commit_message=commit_message,
        commit_description=commit_description,
        run_as_future=True,
    )

    # Blocking wait for the upload to complete. The previous bare
    # `future.done()` call was dropped: its result was never used.
    future.result()

# PDB

In [5]:
# Earlier variant, kept for reference: the same dataset built from CT files.
# datafolder = DataFolder.from_ct_folder(
#     path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/PDB/CT_files_pdbee',
#     path_out='data/datafolders',
#     name='PDB',
#     generate_npy=True,
#     predict_dms=False,
#     tqdm=False,
# )
# upload_data(datafolder, 'Cleaned PDB data')

# Current variant: build the PDB datafolder from the bpseq files, then upload it.
datafolder = DataFolder.from_bpseq_folder(
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/PDB/bpSEQ_files_pdbee',
    path_out='data/datafolders',
    name='PDB',  # name of the input data by default
    generate_npy=True,
    predict_dms=False,
    tqdm=False,
)
upload_data(datafolder=datafolder, commit_message='Using bpseq files')

Over a total of 702 datapoints, there are:
    - 353 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 302 duplicate sequences with the same structure / dms
    - 47 duplicate sequences with different structure / dms


# Sarah_supermodel

In [3]:
# Build the sarah_supermodel datafolder from the DREEM output JSON, then upload it.
datafolder = DataFolder.from_dreem_output(
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/supermodels_fragments/data/fragment_RNA/new_run/processed/fragment_RNA.json',
    path_out='data/datafolders',
    name='sarah_supermodel',  # name of the input data by default
    predict_structure=True,
    generate_npy=True,
    tqdm=False,
)
upload_data(datafolder=datafolder, commit_message='Combining replicates')

Over a total of 113 datapoints, there are:
    - 107 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 6 datapoints removed because of low AUROC (<0.8)


## SARS2 dataset

In [4]:
# Build the SARS2 datafolder from the DREEM output JSON, then upload it.
datafolder = DataFolder.from_dreem_output(
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/SARS2/processed/SARS2.json',
    path_out='data/datafolders',
    name='SARS2',  # name of the input data by default
    predict_structure=True,
    generate_npy=True,
    tqdm=False,
)
upload_data(datafolder=datafolder, commit_message='Checking AUROC')

Over a total of 38 datapoints, there are:
    - 38 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 0 datapoints removed because of low AUROC (<0.8)


## UTR dataset

In [6]:
# Build the UTR datafolder from the DREEM output JSON, then upload it.
datafolder = DataFolder.from_dreem_output(
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/UTR/processed/UTR.json',
    path_out='data/datafolders',
    name='UTR',  # name of the input data by default
    predict_structure=True,
    generate_npy=True,
    tqdm=False,
)
upload_data(datafolder=datafolder, commit_message='Correcteed info.json')

Over a total of 1400 datapoints, there are:
    - 1234 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 166 datapoints removed because of low AUROC (<0.8)


## miRNA dataset

In [5]:
# Build the pri-miRNA datafolder from the DREEM output JSON, then upload it.
datafolder = DataFolder.from_dreem_output(
    path_in='/Users/alberic/Desktop/Pro/RouskinLab/projects/deep_learning/datasets/pri-miRNA/processed/pri-miRNA.json',
    path_out='data/datafolders',
    name='pri-miRNA',  # name of the input data by default
    predict_structure=True,
    generate_npy=True,
    tqdm=False,
)
upload_data(datafolder=datafolder, commit_message='Masked primer part')

Over a total of 513 datapoints, there are:
    - 503 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 10 datapoints removed because of low AUROC (<0.8)


In [9]:
from sklearn.metrics import roc_auc_score
UKN = -1000  # sentinel: positions whose DMS value equals this are excluded from the score


def calculate_auroc(row):
    """Score how well the DMS signal of one datapoint separates unpaired from paired bases.

    `row` must provide 'dms' (one value per position) and 'paired_bases'
    (indices of paired positions, flattened before use). Every position is
    labelled 1 (unpaired) unless its index appears in 'paired_bases', in which
    case it is labelled 0. Positions whose DMS value equals UKN are dropped,
    and the remaining labels and values are handed to roc_auc_score.
    """
    signal = np.array(row['dms'])
    paired_idx = np.array(row['paired_bases']).flatten()

    # Start from "all unpaired", then clear the positions listed as paired.
    unpaired_label = np.ones_like(signal)
    unpaired_label[paired_idx] = 0

    known = signal != UKN
    return roc_auc_score(unpaired_label[known], signal[known])

# Score every datapoint of `datafolder`.
# NOTE(review): `datafolder` is whatever dataset cell was run last in the kernel;
# the cell placement suggests pri-miRNA, but confirm before trusting the figure.
datapoints_df = datafolder.datapoints.to_pandas()
auroc = [calculate_auroc(row) for _, row in datapoints_df.iterrows()]

import plotly.express as px

# The title reports the mean AUROC and how many datapoints exceed 0.8.
mean_auroc = np.mean(auroc)
n_above = np.sum(np.array(auroc)>0.8)
px.histogram(auroc, nbins=100, title=f'mean AUROC: {mean_auroc:.3f} | Number of datapoints above 0.8: {n_above}')

In [13]:
from sklearn.metrics import roc_auc_score
UKN = -1000  # sentinel: positions whose DMS value equals this are excluded from the score


def calculate_auroc(row, n_masked=19):
    """AUROC of DMS signal vs. unpaired/paired labels, ignoring both sequence ends.

    Same scoring as the unmasked version defined earlier in the notebook
    (which this definition replaces once the cell is run), except that the
    first and last `n_masked` positions are set to UKN and therefore
    excluded from the score.

    Parameters
    ----------
    row : mapping
        Must provide 'dms' (one value per position) and 'paired_bases'
        (indices of paired positions, flattened before use).
    n_masked : int, optional
        Number of positions to ignore at each end. Defaults to 19, the value
        previously hard-coded here.
        NOTE(review): presumably the primer length (cf. the 'Masked primer
        part' upload); confirm.

    Returns
    -------
    Whatever roc_auc_score returns for the remaining positions.

    Raises
    ------
    ValueError
        If `n_masked` is negative.
    """
    if n_masked < 0:
        raise ValueError(f'n_masked must be >= 0, got {n_masked}')

    # np.array copies its input, so the masking below never touches row['dms'].
    dms = np.array(row['dms'])

    # Guard against n_masked == 0: `dms[-0:]` is the whole array, so an
    # unguarded assignment would mask every position instead of none.
    if n_masked > 0:
        dms[:n_masked] = UKN
        dms[-n_masked:] = UKN

    # Start from "all unpaired" (1), then clear the positions listed as paired.
    isUnpaired = np.ones_like(dms)
    isUnpaired[np.array(row['paired_bases']).flatten()] = 0

    known = dms != UKN
    return roc_auc_score(isUnpaired[known], dms[known])

# Re-score every datapoint of `datafolder` with the calculate_auroc currently in scope.
# NOTE(review): `datafolder` is whatever dataset cell was run last in the kernel;
# the cell placement suggests pri-miRNA, but confirm before trusting the figure.
datapoints_df = datafolder.datapoints.to_pandas()
auroc = [calculate_auroc(row) for _, row in datapoints_df.iterrows()]

import plotly.express as px

# The title reports the mean AUROC and how many datapoints exceed 0.8.
mean_auroc = np.mean(auroc)
n_above = np.sum(np.array(auroc)>0.8)
px.histogram(auroc, nbins=100, title=f'mean AUROC: {mean_auroc:.3f} | Number of datapoints above 0.8: {n_above}')