In [2]:
import pandas as pd
import numpy as np
import torch

In [3]:
import sys
sys.path.append('../viral_fragments')
from src import compare_frags, fragment_RNA

In [114]:
def read_fasta(path):
    """Read a three-lines-per-record FASTA-like file into a DataFrame.

    Every record must span exactly three consecutive lines: a header that
    contains ``>`` followed by the reference name, the sequence, and the
    structure string. Blank lines at the end of the file are ignored.

    :param path: Path of the file to read
    :return: DataFrame with one row per record and the columns
        ``reference``, ``sequence`` (upper-cased) and ``structure``
    :raises ValueError: If the line count is not a multiple of three or a
        header line does not contain ``>``
    """
    with open(path) as f:
        lines = [line.strip() for line in f]

    # A trailing empty line is not a record; left in place it would be parsed
    # as a header and crash on the missing '>'.
    while lines and not lines[-1]:
        lines.pop()

    if len(lines) % 3 != 0:
        raise ValueError(
            f"{path}: expected 3 lines per record (header, sequence, structure), "
            f"got {len(lines)} lines"
        )

    refs = []
    seqs = []
    structures = []
    for start in range(0, len(lines), 3):
        header, seq, structure = lines[start:start + 3]
        if '>' not in header:
            raise ValueError(f"{path}: line {start + 1} is not a '>' header: {header!r}")
        refs.append(header.split('>')[1])
        seqs.append(seq.upper())
        structures.append(structure)

    return pd.DataFrame({'reference': refs, 'sequence': seqs, 'structure': structures})

def dotbracket2Matrix(dotbracket):
    """
    Build the pairing matrix encoded by a dotbracket string.

    Only '(' and ')' are interpreted; any other character is skipped.
    An opening bracket that is never closed contributes no pair, and a
    closing bracket without a pending opening one raises IndexError.

    :param dotbracket: Dotbracket notation
    :return: Pairing matrix built by pairList2pairMatrix, with side len(dotbracket)
    """

    n_bases = len(dotbracket)
    pending_open = []
    matched = []

    for position, symbol in enumerate(dotbracket):
        if symbol == ')':
            # Close the most recently opened bracket.
            matched.append([pending_open.pop(), position])
        elif symbol == '(':
            pending_open.append(position)

    return pairList2pairMatrix(matched, n_bases)

def pairList2pairMatrix(pair_list, len_seq):
    """
    Turn a list of base pairs into a symmetric pairing matrix.

    :param pair_list: Sequence of [i, j] index pairs (may be empty)
    :param len_seq: Length of the sequence, i.e. side of the matrix
    :return: (len_seq, len_seq) torch tensor with 1.0 at [i, j] and [j, i]
        for every pair and 0.0 elsewhere
    """
    pairs = np.array(pair_list).astype(int)
    matrix = torch.zeros((len_seq, len_seq))

    # Nothing to mark: return the all-zero matrix.
    if len(pairs) == 0:
        return matrix

    left, right = pairs[:, 0], pairs[:, 1]
    matrix[left, right] = 1.0
    matrix[right, left] = 1.0
    return matrix

def compute_f1(pred_matrix, target_matrix, threshold=0.5):
    """
    F1 score between a predicted and a true pairing matrix.

    :param pred_matrix: Predicted pairing matrix probability  (L,L)
    :param target_matrix: True binary pairing matrix (L,L)
    :param threshold: Entries of pred_matrix strictly above this value count as paired
    :return: F1 score for this RNA structure (1.0 when neither matrix has any pair)
    """

    binarized = (pred_matrix > threshold).float()
    n_marked = binarized.sum() + target_matrix.sum()

    # Both matrices empty: perfect agreement by convention.
    if n_marked == 0:
        return 1.0

    n_shared = (binarized * target_matrix).sum()
    return (2 * n_shared / n_marked).item()
    
def pairList2pairMatrix(pair_list, len_seq):
    """
    Convert a list of base pairs into a symmetric pairing matrix.

    :param pair_list: Sequence of [i, j] index pairs (may be empty)
    :param len_seq: Length of the sequence, i.e. side of the matrix
    :return: (len_seq, len_seq) torch tensor holding 1.0 at both [i, j] and
        [j, i] for each pair, 0.0 everywhere else
    """
    # NOTE: this cell already defines pairList2pairMatrix with the same logic
    # above; being the later definition, this is the one that stays bound.
    index_array = np.array(pair_list).astype(int)
    contact_map = torch.zeros((len_seq, len_seq))

    if len(index_array) > 0:
        # Mark (i, j) first, then the mirrored (j, i).
        for first, second in ((0, 1), (1, 0)):
            contact_map[index_array[:, first], index_array[:, second]] = 1.0

    return contact_map

def dotbracket2Pairs(dotbracket):
    """
    Extract the base pairs encoded by a dotbracket string.

    Only '(' and ')' are interpreted; any other character is skipped.
    An opening bracket that is never closed contributes no pair, and a
    closing bracket without a pending opening one raises IndexError.

    :param dotbracket: Dotbracket notation
    :return: list of [i, j] pairs (0-based, i < j), ordered by i
    """

    pending_open = []
    matched = []

    for position, symbol in enumerate(dotbracket):
        if symbol == ')':
            # Close the most recently opened bracket.
            matched.append([pending_open.pop(), position])
        elif symbol == '(':
            pending_open.append(position)

    # Pairs are collected in closing order; report them by opening position.
    return sorted(matched, key=lambda pair: pair[0])

def pairs2dot(pairs, len_seq):
    """
    Render a list of base pairs as a dotbracket string.

    :param pairs: Iterable of pairs; element 0 of each pair is written as '('
        and element 1 as ')'
    :param len_seq: Length of the string to produce
    :return: Dotbracket string of length len_seq, '.' at every other position
    """
    symbols = ['.'] * len_seq
    for pair in pairs:
        opening, closing = pair[0], pair[1]
        symbols[opening] = '('
        symbols[closing] = ')'
    return ''.join(symbols)

## Create dataset of lncRNA

In [108]:
from sklearn.metrics import roc_auc_score
import numpy as np
def auroc_dms(dms: list, structure: str):
    """
    AUROC of a probing signal for telling unpaired from paired bases.

    Positions written as '.' in ``structure`` form the positive (unpaired)
    class. Positions whose signal is negative or NaN are left out of the score.

    :param dms: Per-base signal, same length as ``structure``
    :param structure: Dotbracket notation
    :return: ROC AUC computed by ``roc_auc_score``, or NaN when fewer than two
        classes remain after masking (including when every position is masked)
    :raises AssertionError: If ``dms`` and ``structure`` differ in length
    """

    dms = np.asarray(dms, dtype=float)
    assert len(structure) == len(dms), 'Sequence, dms and structure must have the same length'

    isNotPaired = np.array([1 if char == '.' else 0 for char in structure])
    mask = dms >= 0

    # The AUROC needs both classes among the kept bases. Zero classes (everything
    # masked) must be caught too, otherwise roc_auc_score is called on empty arrays.
    if len(np.unique(isNotPaired[mask])) < 2:
        return np.nan

    return roc_auc_score(isNotPaired[mask], dms[mask])

In [109]:
# NOTE: ground_truth_lcRNA and data_dms are created by cells that appear further
# down in this notebook (In [110] and In [112]); run those first.

# Probing signal of each reference that has a row in data_dms, None otherwise.
ground_truth_lcRNA['signal'] = [
    data_dms.loc[ref][[str(pos) for pos in range(len(ground_truth_lcRNA.loc[ref]['sequence']))]].values
    if ref in data_dms.index
    else None
    for ref in ground_truth_lcRNA.index
]

# AUROC of the signal against the reference structure (left empty without signal).
ground_truth_lcRNA['auroc'] = ground_truth_lcRNA.apply(
    lambda row: (
        auroc_dms(row['signal'], pairs2dot(row['structure'], len(row['sequence'])))
        if row['signal'] is not None
        else None
    ),
    axis=1,
)

ground_truth_lcRNA[['auroc']]

Unnamed: 0_level_0,auroc
reference,Unnamed: 1_level_1
lincRNAp21_IRAlu_Sense,0.660933
lincRNAp21_IRAlu_Antisense,0.740216
MEG3,0.777572
ROX2,
NORAD1_37C,0.444165
NORAD2_37C,0.393038
NORAD3_37C,0.357118
RepA,
PAN,0.79169


In [110]:
# Reference lncRNAs, indexed by reference name, with each structure stored as
# a list of [i, j] pairs instead of a dotbracket string.
ground_truth_lcRNA = read_fasta('data/lncRNAs.fasta').set_index('reference')
ground_truth_lcRNA['structure'] = ground_truth_lcRNA['structure'].apply(dotbracket2Pairs)

# Leave out the 23C and 55C NORAD entries.
ground_truth_lcRNA = ground_truth_lcRNA.drop(
    ['NORAD1_23C', 'NORAD1_55C', 'NORAD2_23C', 'NORAD2_55C', 'NORAD3_23C', 'NORAD3_55C']
)

# Sequences longer than 2000 are set aside to be cut; the rest are kept whole.
is_long = ground_truth_lcRNA['sequence'].map(len) > 2000
lcRNA_toCut = ground_truth_lcRNA[is_long]

ground_truth_lcRNA = ground_truth_lcRNA[~is_long]
ground_truth_lcRNA

Unnamed: 0_level_0,sequence,structure
reference,Unnamed: 1_level_1,Unnamed: 2_level_1
lincRNAp21_IRAlu_Sense,GGCUGGGCGUGGUGGCUCACGCCUGUAAUCCCACCACUUUGGGAGG...,"[[1, 46], [2, 45], [3, 44], [4, 23], [5, 22], ..."
lincRNAp21_IRAlu_Antisense,UUCUUUUUUUUUUUUUAUUGGAGAUGGAGUCUCACUCUGUUGCUCA...,"[[19, 118], [20, 117], [25, 112], [26, 111], [..."
MEG3,AGCCCCUAGCGCAGACGGCGGAGAGCAGAGAGGGAGCGCGCCUUGG...,"[[1, 195], [2, 194], [3, 193], [4, 192], [5, 1..."
ROX2,TGTTGCGGCATTCGCGGCCTGGTCACACTAAGCTAGGGCTACTTTT...,"[[113, 167], [114, 166], [115, 165], [116, 164..."
NORAD1_37C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[3, 752], [4, 751], [5, 750], [6, 749], [10, ..."
NORAD2_37C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[9, 39], [10, 38], [11, 37], [12, 36], [13, 3..."
NORAD3_37C,GAGGUCAAGAGAUCAGGACCAUCUUGGCCAACAUGGUGAAACCCCA...,"[[2, 28], [3, 27], [4, 26], [5, 25], [6, 24], ..."
RepA,CCCAUCGGGGCCACGGAUACCUGUGUGUCCUCCCCGCCAUUCCAUG...,"[[5, 35], [6, 34], [7, 33], [8, 32], [9, 31], ..."
PAN,ACUGGGACUGCCCAGUCACCUUGGCUGCCGCUUCACCUAUGGAUUU...,"[[0, 15], [1, 14], [2, 13], [3, 12], [4, 11], ..."


In [111]:
# Show the references whose sequence is longer than 2000 (set aside for fragmentation).
lcRNA_toCut

Unnamed: 0_level_0,sequence,structure
reference,Unnamed: 1_level_1,Unnamed: 2_level_1
HOTAIR,GACUCGCCUGUGCUCUGGAGCUUGAUCCGAAAGCUUCCACAGUGAG...,"[[2, 427], [3, 426], [4, 425], [5, 424], [6, 4..."
XIST,CGGCTTGCTCCAGCCATGTTTGCTCGTTTCCCGTGGATGTGCGGTT...,"[[0, 32], [1, 31], [2, 30], [6, 26], [7, 25], ..."
MALAT1,GUAAAGGACUGGGGCCCCGCAACUGGCCUCUCCUGCCCUCUUAAGC...,"[[10, 29], [11, 28], [12, 27], [13, 26], [14, ..."
CYRANO,TCGATACTGCAGCGTACGTGCGCATGCATTCGAACGAGCTCAGATC...,"[[0, 32], [1, 31], [2, 30], [3, 29], [7, 27], ..."


## Cut the longest pieces into independent fragments

In [112]:
# Probing scores: rows are looked up by reference, columns by position ('0', '1', ...).
data_dms = pd.read_csv('data/lncRNAs_probing_scores.csv').set_index('Unnamed: 0', drop=True)

# Collect the fragments of every long RNA and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all rows each time.
fragment_frames = []

for ref in lcRNA_toCut.index:
    sequence = lcRNA_toCut.loc[ref]['sequence']
    dms = data_dms.loc[ref][[str(i) for i in range(len(sequence))]].values
    # Missing scores become -999 before being handed to fragment_RNA.
    dms = np.nan_to_num(dms, nan=-999)
    structure = lcRNA_toCut.loc[ref]['structure']

    fragments = fragment_RNA(sequence, structure, dms, ref, 'SHAPE', min_length=1000, min_unpaired_length=-1, min_auroc=0.7)
    fragment_frames.append(fragments)

frag_longRNA = pd.concat(fragment_frames) if fragment_frames else pd.DataFrame()

# Keep only fragments that fit the 2000 length limit.
frag_longRNA = frag_longRNA[frag_longRNA['sequence'].apply(len) <= 2000]
frag_longRNA

Unnamed: 0,SHAPE,sequence,structure
HOTAIR_0,"[-999.0, -999.0, -999.0, -999.0, -999.0, -999....",GACUCGCCUGUGCUCUGGAGCUUGAUCCGAAAGCUUCCACAGUGAG...,..((((((.(((((((((.(((...(((....((((((((...(((...
XIST_0,"[-999.0, -999.0, -999.0, -999.0, -999.0, -999....",CGGCTTGCTCCAGCCATGTTTGCTCGTTTCCCGTGGATGTGCGGTT...,(((...((...(((.(....)))).))...))).((((.((((((....
XIST_1,"[0.570091229947, 0.413187640853, 0.32770733777...",AAAGGCTTTCTTTATATGTGCGGGGTTGCGGGATTCGCCTTGATTT...,...((((((((........(((((.........))))).....(((...
XIST_4,"[-999.0, 0.0167314653451, 0.228301856617, 1.0,...",GCCACCTTTTACTTGGGGCTTTCCTTTACAGTATGAACTGAAAATT...,(((.((........)))))..((((.(((((((.......((((((...
XIST_5,"[0.170556374861, 0.342062327847, 0.05857362182...",CCAAAGGGACAAACAATCCCTATGTGAGACTCAAGGACTGCCAGCA...,.((.(((((.......)))))...)).......................
XIST_6,"[0.215898878318, 1.0, 0.282254843593, 0.546931...",TCTTTCTTGCTTTTGTGTGTCTATTTCTTCCTTGCAGTTGTGTCTA...,.............(((((((.............((((..(((.......
XIST_7,"[0.924365953298, -999.0, -999.0, 0.13450248456...",GTTTTACAACTTCCATTTCTCTTCACATCTGCTCCACTTGAGACGG...,.......................(((((((((.................
XIST_8,"[0.0298145343031, -999.0, -999.0, -999.0, -999...",TCTTCTGCTTTGGTGAGGCTCAGTAAGTTATATTATACCAGGTAGC...,(((((((((((((((.((((.....))))......))))))..)))...
XIST_9,"[0.589973571296, 0.0773516778948, 1.0, 0.05475...",ATATTTGCCTGGTGTGCAATGACTTTGCTTTTATCCCAGGCATGCA...,......((((((.((((((.....))))....)).)))))).((((...
XIST_10,"[0.184699268968, 1.0, 1.0, 1.0, 1.0, 0.6036105...",TTTTAACTATAATGGCTGTTTGCGAAACCCAACCAAGGCCAAGATT...,.............((((....((............(((((((((.....


In [113]:
import os

from rnastructure_wrapper import RNAstructure

# Location of the RNAstructure executables. Set the RNASTRUCTURE_PATH
# environment variable to run on another machine; the previous hard-coded
# location remains the default.
rna = RNAstructure(path=os.environ.get("RNASTRUCTURE_PATH", "/Users/alberic/RNAstructure/exe/"))
results = rna.fold(frag_longRNA.sequence.tolist(), shape=frag_longRNA.SHAPE.tolist(), nproc=8)

# assign() builds a new frame, so the column is not written onto the filtered
# slice from the previous cell (which can raise SettingWithCopyWarning).
frag_longRNA = frag_longRNA.assign(structure_new=[result['dotbracket'] for result in results])

In [115]:
def compute_f1_pd(row):
    """
    Row-wise F1 between the re-folded and the reference structure.

    :param row: Row exposing ``structure_new`` and ``structure`` (dotbracket
        strings) and ``sequence``
    :return: F1 score from compute_f1, with ``structure_new`` as the prediction
        and ``structure`` as the target
    """
    predicted, reference = (
        pairList2pairMatrix(dotbracket2Pairs(dotbracket), len(row.sequence))
        for dotbracket in (row.structure_new, row.structure)
    )
    return compute_f1(predicted, reference)


# Score every fragment: F1 between its re-folded structure (structure_new) and its reference structure.
frag_longRNA.loc[:, 'F1'] = frag_longRNA.apply(compute_f1_pd, axis=1)
# Keep only the fragments whose F1 is above 0.8.
frag_longRNA = frag_longRNA[frag_longRNA.F1 > 0.8]
frag_longRNA

Unnamed: 0,SHAPE,sequence,structure,structure_new,F1
XIST_7,"[0.924365953298, -999.0, -999.0, 0.13450248456...",GTTTTACAACTTCCATTTCTCTTCACATCTGCTCCACTTGAGACGG...,.......................(((((((((.................,............((.(((.(((((((((((((....((((.........,0.800681


In [116]:
# Replace the reference structure with the re-folded one and drop the helper columns.
frag_longRNA = (
    frag_longRNA
    .drop(columns=['structure', 'SHAPE', 'F1'])
    .rename(columns={'structure_new': 'structure'})
)

# Store the structure as a list of [i, j] pairs, like ground_truth_lcRNA does.
frag_longRNA['structure'] = frag_longRNA['structure'].apply(dotbracket2Pairs)

## Convert to rouskinHF

In [55]:
# Load the variables defined in ../.env before importing rouskinhf.
# NOTE(review): presumably rouskinhf reads its configuration from the
# environment at import time, which would make this order significant (confirm).
import envbash
envbash.load.load_envbash('../.env')
from rouskinhf import convert, upload_dataset, get_dataset

## RouskinHF filtering

In [56]:
from rouskinhf import convert, dump_json

# Whole lncRNAs plus the retained fragments, keyed by reference.
lncRNA_records = pd.concat([ground_truth_lcRNA, frag_longRNA]).to_dict(orient='index')
dump_json(lncRNA_records, 'data/lncRNA.json')

# Convert the json dump into the rouskinhf dataset layout under data/.
data = convert('json', 'data/lncRNA.json', name='lncRNA', path_out='data', filter=True)

Parsing json file: 100%|██████████| 10/10 [00:00<00:00, 19812.49it/s]

Over a total of 10 datapoints, there are:
### OUTPUT
- ALL: 10 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape





## To HuggingFace

In [57]:
from rouskinhf import upload_dataset

# Upload the converted dataset file written under data/lncRNA/.
# NOTE(review): exist_ok=True presumably allows updating a dataset that
# already exists remotely (confirm against the rouskinhf documentation).
upload_dataset(
    'data/lncRNA/data.json',
    commit_message='removed NORAD duplicates and added fragments',
    exist_ok=True
)

# Second version of the dataset without filtering

In [117]:
# Reload every reference lncRNA (no entry is dropped this time), with each
# structure stored as a list of [i, j] pairs instead of a dotbracket string.
ground_truth_lcRNA = read_fasta('data/lncRNAs.fasta').set_index('reference')
ground_truth_lcRNA['structure'] = ground_truth_lcRNA['structure'].apply(dotbracket2Pairs)

# Sequences longer than 2000 are set aside to be cut; the rest are kept whole.
is_long = ground_truth_lcRNA['sequence'].map(len) > 2000
lcRNA_toCut = ground_truth_lcRNA[is_long]

ground_truth_lcRNA = ground_truth_lcRNA[~is_long]
ground_truth_lcRNA

Unnamed: 0_level_0,sequence,structure
reference,Unnamed: 1_level_1,Unnamed: 2_level_1
lincRNAp21_IRAlu_Sense,GGCUGGGCGUGGUGGCUCACGCCUGUAAUCCCACCACUUUGGGAGG...,"[[1, 46], [2, 45], [3, 44], [4, 23], [5, 22], ..."
lincRNAp21_IRAlu_Antisense,UUCUUUUUUUUUUUUUAUUGGAGAUGGAGUCUCACUCUGUUGCUCA...,"[[19, 118], [20, 117], [25, 112], [26, 111], [..."
MEG3,AGCCCCUAGCGCAGACGGCGGAGAGCAGAGAGGGAGCGCGCCUUGG...,"[[1, 195], [2, 194], [3, 193], [4, 192], [5, 1..."
ROX2,TGTTGCGGCATTCGCGGCCTGGTCACACTAAGCTAGGGCTACTTTT...,"[[113, 167], [114, 166], [115, 165], [116, 164..."
NORAD1_23C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[3, 752], [4, 751], [5, 750], [6, 749], [10, ..."
NORAD1_37C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[3, 752], [4, 751], [5, 750], [6, 749], [10, ..."
NORAD1_55C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[1, 1895], [2, 1894], [3, 1893], [4, 1892], [..."
NORAD2_23C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[0, 653], [1, 652], [2, 651], [3, 650], [4, 6..."
NORAD2_37C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[9, 39], [10, 38], [11, 37], [12, 36], [13, 3..."
NORAD2_55C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[3, 51], [4, 50], [5, 49], [6, 48], [7, 47], ..."


## Cut the longest pieces into independent fragments

In [118]:
# NOTE: data_dms is the probing-score table loaded by the earlier fragmentation cell.

# Collect the fragments of every long RNA and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all rows each time.
fragment_frames = []

for ref in lcRNA_toCut.index:
    sequence = lcRNA_toCut.loc[ref]['sequence']
    dms = data_dms.loc[ref][[str(i) for i in range(len(sequence))]].values
    # Missing scores become -999 before being handed to fragment_RNA.
    dms = np.nan_to_num(dms, nan=-999)
    structure = lcRNA_toCut.loc[ref]['structure']

    # min_auroc=0.0 here, versus 0.7 in the first version of the dataset.
    fragments = fragment_RNA(sequence, structure, dms, ref, 'SHAPE', min_length=1000, min_unpaired_length=-1, min_auroc=0.0)
    fragment_frames.append(fragments)

frag_longRNA = pd.concat(fragment_frames) if fragment_frames else pd.DataFrame()

# Keep only fragments that fit the 2000 length limit.
frag_longRNA = frag_longRNA[frag_longRNA['sequence'].apply(len) <= 2000]
frag_longRNA

Unnamed: 0,SHAPE,sequence,structure
HOTAIR_0,"[-999.0, -999.0, -999.0, -999.0, -999.0, -999....",GACUCGCCUGUGCUCUGGAGCUUGAUCCGAAAGCUUCCACAGUGAG...,..((((((.(((((((((.(((...(((....((((((((...(((...
XIST_0,"[-999.0, -999.0, -999.0, -999.0, -999.0, -999....",CGGCTTGCTCCAGCCATGTTTGCTCGTTTCCCGTGGATGTGCGGTT...,(((...((...(((.(....)))).))...))).((((.((((((....
XIST_1,"[0.570091229947, 0.413187640853, 0.32770733777...",AAAGGCTTTCTTTATATGTGCGGGGTTGCGGGATTCGCCTTGATTT...,...((((((((........(((((.........))))).....(((...
XIST_3,"[1.0, 0.0888895640316, -999.0, 1.0, 0.51282672...",ATACTGTTGGCATGCTGTCATGGGTGCTATCGCCCCAGGTCACATC...,........((((...))))..(((((....)))))((((.(((.((...
XIST_4,"[0.301212758802, -999.0, 1.0, 0.349039112766, ...",AAACCCCTTCCTCAAACTATTTATGTACATACTGGCAATTTTAGTA...,..(((((.((((....................(((.......((((...
XIST_5,"[1.0, 1.0, 0.208982242727, 0.593031496957, 0.5...",TATTTCTGAGCACTTCTCTTGTCAATATTAATCTGTACCCTTACAC...,.......(((((..((..(((((....................(((...
XIST_6,"[-999.0, 0.0167314653451, 0.228301856617, 1.0,...",GCCACCTTTTACTTGGGGCTTTCCTTTACAGTATGAACTGAAAATT...,(((.((........)))))..((((.(((((((.......((((((...
XIST_7,"[0.359103572674, 0.532689714563, 0.55434391883...",GTGTTTGATATGACATTGCTGATGAAAATAATCATCACAACAGCAG...,...................((((((......))))))............
XIST_8,"[0.128047378869, 0.687639683215, 0.34271874358...",TATCTATTTCTTCCTTGCTTTGTGTCTATTTCTTCCTTGCAGTTGT...,..............((((....................)))).......
XIST_9,"[0.191129162013, -999.0, 0.025923651447, 0.230...",AGGAACTGTTTCTACAGGACACCTGTGACTTCCAAGAGCGGGGAAC...,.............(((((...))))).......................


In [119]:
import os

from rnastructure_wrapper import RNAstructure

# Location of the RNAstructure executables. Set the RNASTRUCTURE_PATH
# environment variable to run on another machine; the previous hard-coded
# location remains the default.
rna = RNAstructure(path=os.environ.get("RNASTRUCTURE_PATH", "/Users/alberic/RNAstructure/exe/"))
results = rna.fold(frag_longRNA.sequence.tolist(), shape=frag_longRNA.SHAPE.tolist(), nproc=8)

# assign() builds a new frame, so the column is not written onto the filtered
# slice from the previous cell (which can raise SettingWithCopyWarning).
frag_longRNA = frag_longRNA.assign(structure_new=[result['dotbracket'] for result in results])

In [121]:
# Replace the reference structure with the re-folded one.
# 'F1' only exists when the F1-scoring cell has been run on this frame; no such
# cell precedes this one in the second pass, so a missing 'F1' is tolerated
# instead of raising KeyError. 'structure' and 'SHAPE' must still be present.
frag_longRNA = (
    frag_longRNA
    .drop(columns=['structure', 'SHAPE'])
    .drop(columns=['F1'], errors='ignore')
    .rename(columns={'structure_new': 'structure'})
)

# Store the structure as a list of [i, j] pairs, like ground_truth_lcRNA does.
frag_longRNA['structure'] = frag_longRNA['structure'].apply(dotbracket2Pairs)

## Convert to rouskinHF

In [122]:
# Load the variables defined in ../.env before importing rouskinhf.
# NOTE(review): presumably rouskinhf reads its configuration from the
# environment at import time, which would make this order significant (confirm).
import envbash
envbash.load.load_envbash('../.env')
from rouskinhf import convert, upload_dataset, get_dataset

## RouskinHF filtering

In [124]:
from rouskinhf import convert, dump_json

# Whole lncRNAs plus all fragments of the second pass, keyed by reference.
lncRNA_nonFiltered_records = pd.concat([ground_truth_lcRNA, frag_longRNA]).to_dict(orient='index')
dump_json(lncRNA_nonFiltered_records, 'data/lncRNA_nonFiltered.json')

# NOTE(review): this dataset is named "nonFiltered" yet convert() is still
# called with filter=True. Confirm this is intended.
data = convert(
    'json',
    'data/lncRNA_nonFiltered.json',
    name='lncRNA_nonFiltered',
    path_out='data',
    filter=True,
)

Parsing json file:   0%|          | 0/30 [00:00<?, ?it/s]

Parsing json file: 100%|██████████| 30/30 [00:00<00:00, 21586.74it/s]

Over a total of 30 datapoints, there are:
### OUTPUT
- ALL: 30 valid datapoints
- INCLUDED: 6 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape





## To HuggingFace

In [126]:
from rouskinhf import upload_dataset

# Upload the converted dataset file written under data/lncRNA_nonFiltered/.
# NOTE(review): exist_ok=True presumably allows updating a dataset that
# already exists remotely (confirm against the rouskinhf documentation).
upload_dataset(
    'data/lncRNA_nonFiltered/data.json',
    commit_message='first non filtered dataset',
    exist_ok=True
)