In [None]:
#https://github.com/eternagame/EternaFold v1.3.1

# Path to the EternaFold executable (the binary is named "contrafold").
# Later cells interpolate this variable into the shell commands they submit.
eternafold = "/home/pbohn/tools/EternaFold/src/contrafold"
# Run the binary without arguments through the IPython shell escape.
# NOTE(review): presumably a smoke test that prints the usage text - confirm.
!{eternafold}

In [None]:
import pandas as pd
# Load the sample sheet and keep its "Sample" column as an array of sample names.
samplesheet = pd.read_excel("./samplesheet.xlsx", engine="openpyxl")
samples = samplesheet["Sample"].values
# Bare expression: show the sample names as the cell output.
samples

In [None]:
# Root directory for all inputs and outputs used by the cells below.
# The value already ends with "/" and later cells append "/<subdir>", so the
# resulting paths contain a double slash.
data_folder = "/vol/projects/pbohn/AnSo_DMS_MaP/_for_publication/1_US_default_and_optimization/data/"

In [None]:
from slurmpy import Slurm

import os
os.makedirs(f"{data_folder}/eterna", exist_ok=True)

num_threads = 1
slurm_eterna = Slurm(
    "eterna",
    {
        "partition" : "cpu",
        "mem" : "2G",
        "cpus-per-task" : num_threads,
        "time" : "30",
        "mail-type" : "FAIL,INVALID_DEPEND",
        "mail-user" : "patrick.bohn@helmholtz-hiri.de",
    },
)

# Parameter file handed to EternaFold via --params.
eternafold_params = "/home/pbohn/tools/EternaFold/parameters/EternaFoldParams_PLUS_POTENTIALS.v1"

# Submit one Slurm job per isoform; each job runs every EternaFold prediction
# (all option / reactive-nucleotide combinations, full-length and 380 nt) for it.
for sample in samples:

    # Only "Rep1" entries are used; the combined-sample name is the sample
    # name without its first "_"-separated field.
    if "Rep1" not in sample:
        continue
    combined_sample = "_".join(sample.split("_")[1:])

    for isoform in os.listdir(f"{data_folder}/rfcombine/{combined_sample}"):

        commands = []
        for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]

            for reactive_nt in ["AC", "ACT"]: # ["ACGT", "AC", "ACT", "G"]

                input_dir = f"{data_folder}/rfcombine/{combined_sample}/{isoform}/{option}_{reactive_nt}"
                output_dir = f"{data_folder}/eterna/{combined_sample}/{isoform}/{option}_{reactive_nt}"

                # ""       -> predict RNA structure based on whole HIV 5' -UTR unspliced sequence (537 nt)
                # "_380nt" -> predict RNA structure based on first 380 nt sequence and data
                for suffix in ["", "_380nt"]:
                    bp_file = f"{input_dir}/{isoform}{suffix}.bp2seq"
                    if not os.path.isfile(bp_file):
                        continue
                    os.makedirs(output_dir, exist_ok=True)
                    eterna_outfile = f"{output_dir}/predict_{isoform}{suffix}.eterna"
                    command = f"{eternafold} predict {bp_file} --evidence --numdatasources 1 --params {eternafold_params} > {eterna_outfile}"
                    commands.append(command)

        # Do not submit an empty job script when no input file was found.
        if commands:
            slurm_eterna.run("\n".join(commands))
            

In [None]:
import xml.etree.ElementTree as ET
import numpy as np

def read_in_xml(xml_file, sample, with_stdev = False):
    """Read the first transcript of a reactivity XML file into a dict.

    The first child of the XML root is taken as the transcript. Its "id" and
    "length" attributes are returned unchanged (as strings). Its first,
    second and (optionally) third child elements supply the sequence, the
    comma-separated reactivities and the comma-separated standard deviations.
    Tabs and newlines are stripped from the element text before parsing.

    Parameters
    ----------
    xml_file : path or file object accepted by ``ET.parse``
    sample : value stored unchanged under the "sample" key
    with_stdev : bool, when True the result also carries a "stdev" array

    Returns
    -------
    dict with keys "sample", "transcript_id", "length", "sequence",
    "reactivity" (float ndarray) and, if requested, "stdev" (float ndarray).
    """

    def _strip_layout(text):
        # The element text is wrapped/indented with tabs and newlines.
        return text.replace("\t", "").replace("\n", "")

    def _to_float_array(text):
        return np.array(_strip_layout(text).split(",")).astype(float)

    transcript = ET.parse(xml_file).getroot()[0]

    record = {
        "sample" : sample,
        "transcript_id" : transcript.attrib["id"],
        "length" : transcript.attrib["length"],
        "sequence" : _strip_layout(transcript[0].text),
        "reactivity" : _to_float_array(transcript[1].text),
    }
    if with_stdev:
        record["stdev"] = _to_float_array(transcript[2].text)
    return record

from sklearn.metrics import roc_auc_score

def _write_scored_db(db_path, title_prefix, sequence, structure, keep, reactivity):
    """Score one structure against the reactivities and write it as a 3-line .db file."""
    # Unpaired positions (".") are the positive class; higher reactivity should
    # rank them above paired positions.
    rocauc = np.round(roc_auc_score(structure[keep] == ".", reactivity), 3)
    with open(db_path, "w") as outfile:
        outfile.write(f">{title_prefix}_ROCAUC{rocauc}\n")
        outfile.write(f"{''.join(sequence)}\n")
        outfile.write(f"{''.join(structure)}")
    return db_path


def generate_db_from_eterna(eterna_outfile, sample, outdir, bp2seq_file, mode="sample"):
    """Convert EternaFold output into dot-bracket (.db) files annotated with a ROC AUC.

    The ROC AUC measures how well the reactivities from ``bp2seq_file``
    separate unpaired (".") from paired positions of each structure.
    Positions whose reactivity equals -1 are excluded from the score.

    Parameters
    ----------
    eterna_outfile : str
        EternaFold output. In "sample" mode every line is taken as one
        structure; in "predict" mode the file must have exactly five lines and
        the last one is the structure.
    sample : str
        Name used in the title line (and, in "sample" mode, the file names).
    outdir : str
        Directory the .db files are written to.
    bp2seq_file : str
        Whitespace-separated input file whose columns are read as
        sequence / "-" / reactivity.
    mode : {"sample", "predict"}

    Returns
    -------
    list of str
        Paths of the written .db files.

    Raises
    ------
    ValueError
        If ``mode`` is not "sample" or "predict" (previously this silently
        returned None), or if the "predict" file does not have five lines.
    """
    if mode not in ("sample", "predict"):
        raise ValueError(f"Unknown mode {mode!r}; expected 'sample' or 'predict'")

    # Raw string: "\s" is an invalid escape sequence in a normal string literal.
    # A regex separator always uses the python parser, so request it explicitly
    # instead of relying on pandas' fallback (which emits a ParserWarning).
    tmp_data = pd.read_csv(bp2seq_file, sep=r"\s", engine="python", names = ["sequence", "-", "reactivity"])
    reactivity = tmp_data["reactivity"].values
    sequence = tmp_data["sequence"].values

    # -1 marks positions without usable reactivity; leave them out of the score.
    keep = reactivity != -1
    reactivity = reactivity[keep]

    db_files = []
    if mode == "sample":
        with open(eterna_outfile, "r") as infile:
            for i, line in enumerate(infile):
                structure = np.array(list(line.strip()))
                db_files.append(
                    _write_scored_db(f"{outdir}/{sample}_{i}.db", f"{sample}_{i}", sequence, structure, keep, reactivity)
                )
    else:
        filename = f"predict_{bp2seq_file.split('/')[-1].split('.')[0]}"
        with open(eterna_outfile, "r") as infile:
            # Exactly five lines are expected; only the last (the structure) is used.
            _, _, _, _, structure = infile.readlines()
        structure = np.array(list(structure.strip()))
        db_files.append(
            _write_scored_db(f"{outdir}/{filename}.db", f"{sample}_predict", sequence, structure, keep, reactivity)
        )

    return db_files
    

    
#generate varna file for fast visualization of RNA structures (includes reactivity data and colormap)
#to run varna one needs to copy/link the class file into the working directory
#note: depending on the computer's locale settings, VARNA has a bug: after opening a .varna file with VARNA and saving it again, the colormap is broken because values such as 1.000 are written as 1,000. To fix this, open the file in a text editor and replace the "," with ".".

def generate_varna(db_file, reactivity_file, sample, varna_outprefix):
    """Render one dot-bracket structure with VARNA, coloured by reactivity.

    Writes two files, ``{varna_outprefix}_radiate.varna`` and
    ``{varna_outprefix}_line.varna``, one per VARNA layout algorithm.

    Parameters
    ----------
    db_file : path of a .db file; its first three lines are read as
        title, sequence and structure.
    reactivity_file : XML file parsed with ``read_in_xml``; only its
        "reactivity" array is used.
    sample : not used by this function.
    varna_outprefix : output path prefix (without layout suffix or extension).
    """
    reactivities = read_in_xml(reactivity_file, "")["reactivity"]
    # NaN reactivities become -1, which the colour map below draws in grey.
    reactivities = np.nan_to_num(reactivities, nan=-1.0)
    with open(db_file, "r") as infile:
        title = infile.readline().strip()
        # The structure is drawn with U instead of T.
        sequence = infile.readline().strip().replace("T", "U")
        structure = infile.readline().strip()

    for algorithm in ["radiate", "line"]:
        varna_outfile = f"{varna_outprefix}_{algorithm}.varna"

        # value:colour stops: -1 grey, 0 blue, 0.5 white, 1 red
        colormap = '-1:#888888;0:#0000FF;0.5:#FFFFFF;1:#FF0000'
        # The jar is referenced by a relative path, so it must be reachable
        # from the notebook's working directory.
        command = f'java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -algorithm {algorithm} -sequenceDBN "{sequence}" -structureDBN "{structure}" -o {varna_outfile} -colorMap "{";".join(reactivities.astype(str))}" -colorMapStyle "{colormap}" -title "{title}" -flat True'
        #print(command)
        # IPython shell escape: runs the assembled command line in a shell.
        !{command}
    return

In [None]:
# Convert every available EternaFold prediction into a .db file and render it
# with VARNA (full-length and 380 nt predictions).
for sample in samples:

    if "Rep1" not in sample:
        continue
    combined_sample = "_".join(sample.split("_")[1:])

    for isoform in os.listdir(f"{data_folder}/rfcombine/{combined_sample}"):

        for option in ["q22_eq10_ndni", "q22_eq10", "default"]:
            for reactive_nt in ["ACGT", "AC", "ACT", "G"]:

                rf_folder = f"{data_folder}/rfcombine/{combined_sample}/{isoform}/{option}_{reactive_nt}"
                db_outfolder = f"{data_folder}/eterna/{combined_sample}/{isoform}/{option}_{reactive_nt}/"

                # Defined once per combination. Previously it was only assigned
                # in the full-length branch, so the 380 nt branch could pick up
                # a stale path from an earlier iteration (or hit a NameError).
                # NOTE(review): the 380 nt structures are coloured with the
                # full-length reactivity XML, as before - confirm this is intended.
                xml_file = f"{rf_folder}/{isoform}.xml"

                # "" = full-length prediction, "_380nt" = first 380 nt only
                for suffix in ["", "_380nt"]:
                    eterna_outfile = f"{db_outfolder}predict_{isoform}{suffix}.eterna"
                    if not os.path.isfile(eterna_outfile):
                        continue

                    bp2_file = f"{rf_folder}/{isoform}{suffix}.bp2seq"
                    db_files = generate_db_from_eterna(eterna_outfile, combined_sample, db_outfolder, bp2_file, mode = "predict")

                    for db_file in db_files:
                        varna_outprefix = f"{db_outfolder}/{db_file.split('/')[-1].split('.db')[0]}"
                        generate_varna(db_file, xml_file, combined_sample, varna_outprefix)

# Calculate Base-pairing sensitivity and positive predictive value of predictions

In [None]:
# Definition according to:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1370608/

#to calculate BPS and PPV we need to convert dot-bracket format to base-pairing format (i.e. we need to know which base is predicted to pair with which other base)

import numpy as np

def read_in_db(file):
    """Read a single-record dot-bracket (.db) file.

    The file must contain three non-empty lines: title, sequence and
    structure. Blank lines (e.g. trailing newlines added by an editor) are
    ignored.

    Returns
    -------
    (name, sequence, structure) : tuple of str
        All values are stripped of surrounding whitespace; ">" is removed
        from the name.

    Raises
    ------
    ValueError
        If the file does not contain exactly three non-empty lines.
    """
    with open(file, "r") as infile:
        records = [line.strip() for line in infile if line.strip()]
    if len(records) != 3:
        raise ValueError(
            f"{file}: expected 3 non-empty lines (name, sequence, structure), found {len(records)}"
        )
    name, sequence, structure = records
    return name.replace(">", "").strip(), sequence, structure


def convert_db_to_bp(db_structure, sequence):
    """Convert a dot-bracket structure into a bpseq-like table.

    Three bracket families are resolved independently: "()" for nested pairs
    and "[]" / "{}" for pseudoknots. Every other character is left unpaired.

    Parameters
    ----------
    db_structure : str
        Dot-bracket string.
    sequence : str
        Nucleotide sequence of the same length; "T" is replaced by "U".

    Returns
    -------
    numpy.ndarray of str, shape (n, 3), or None
        Columns are 1-based position, nucleotide, and 1-based position of the
        pairing partner (0 = unpaired). None is returned, after printing an
        error, when the numbers of "(" and ")" differ.

    Raises
    ------
    IndexError
        If a closing bracket has no preceding unmatched opening bracket.
    """
    positions = np.arange(len(db_structure), dtype=int) + 1
    sequence = np.array(list(sequence.replace("T", "U")))
    db_structure = np.array(list(db_structure))

    if np.count_nonzero(db_structure == "(") != np.count_nonzero(db_structure == ")"):
        print("Error: number of ( and ) is not equal")
        return

    # -1 = unpaired while indices are 0-based; shifted to 0 below.
    basepairings = np.full(len(db_structure), fill_value = -1, dtype=int)

    # Each closing bracket pairs with the nearest still-unmatched opening
    # bracket of the same family upstream of it, i.e. a stack per family.
    # (The "{}" family previously iterated over the "]" positions and updated
    # the "[" bookkeeping, so its pairs were never resolved correctly.)
    for opening, closing in (("(", ")"), ("[", "]"), ("{", "}")):
        unmatched_open = []
        for index, symbol in enumerate(db_structure):
            if symbol == opening:
                unmatched_open.append(index)
            elif symbol == closing:
                partner = unmatched_open.pop()
                basepairings[partner] = index
                basepairings[index] = partner

    #make basepairings 1-based
    basepairings += 1

    bpseq = np.array([positions, sequence, basepairings], dtype=str).T

    return bpseq #returns it back as 2D numpy array (dtype str)

def calculate_BPS_PPV(true_bpseq, predicted_bpseq):
    """Compare a predicted structure with a known one, base pair by base pair.

    Both inputs are 2D string arrays as returned by ``convert_db_to_bp``
    (columns: 1-based position, nucleotide, 1-based partner; 0 = unpaired).
    A base pair is identified by the "position_nucleotide_partner" string of
    its upstream base, so pairs only match if position, nucleotide and
    partner all agree.

    If the prediction is longer than the known structure, its extra rows are
    dropped (a message is printed whenever the lengths differ).

    Returns
    -------
    (sensitivity, ppv, shared_bp, num_shared, total_num_known, total_num_predicted)
        sensitivity = shared / known pairs, ppv = shared / predicted pairs.
        Either ratio is NaN when its denominator is zero (previously this
        raised ZeroDivisionError).
    """

    def _downstream_pairs(bpseq):
        # To not count base pairs twice, keep only rows whose partner lies downstream.
        columns = bpseq.T
        pairs_downstream = columns[2].astype(int) > columns[0].astype(int)
        return {"_".join(row) for row in bpseq[pairs_downstream]}

    true_length = len(true_bpseq)
    if len(true_bpseq) != len(predicted_bpseq):
        print("Length of known and predicted sequences does not match! Trimming predicted sequence")
    predicted_bpseq = predicted_bpseq[:true_length]

    true_basepairs = _downstream_pairs(true_bpseq)
    predicted_basepairs = _downstream_pairs(predicted_bpseq)

    shared_bp = true_basepairs.intersection(predicted_basepairs)
    num_shared = len(shared_bp)
    total_num_known = len(true_basepairs)
    total_num_predicted = len(predicted_basepairs)

    # Both ratios are undefined without any base pairs on the respective side.
    sensitivity = num_shared / total_num_known if total_num_known else float("nan")
    ppv = num_shared / total_num_predicted if total_num_predicted else float("nan")

    return sensitivity, ppv, shared_bp, num_shared, total_num_known, total_num_predicted

In [None]:
# Reference structure that the predictions are scored against.
reference_db = f"{data_folder}/references/dimer_canonical_wo_sl_at_210.db"
_, seq, known_structure = read_in_db(reference_db)

# Base-pair table (position, nucleotide, partner) of the reference.
known_bpseq = convert_db_to_bp(known_structure, seq)

### Calculate BPS and PPV

In [None]:
# One record per (sample, option, reactive_nt) for which a full-length
# prediction exists; turned into a DataFrame in the next cell.
tmp_data = []

isoform = "RT1_unspliced1"
for sample in samples:

    if "Rep1" not in sample:
        continue
    combined_sample = "_".join(sample.split("_")[1:])

    for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]
        for reactive_nt in [ "AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]
            db_outfolder = f"{data_folder}/eterna/{combined_sample}/{isoform}/{option}_{reactive_nt}/"

            db_file = f"{db_outfolder}/predict_{isoform}.db"
            if not os.path.isfile(db_file):
                continue

            # Read the file once (it used to be read a second time for the title).
            name, seq, predicted_structure = read_in_db(db_file)
            predicted_bpseq = convert_db_to_bp(predicted_structure, seq)

            sensitivity, ppv, shared_bp, num_shared, total_num_known, total_num_predicted = calculate_BPS_PPV(known_bpseq, predicted_bpseq)
            # The title line ends in "ROCAUC<value>".
            reactivity_roc_auc = float(name.split("ROCAUC")[1])
            tmp_data.append({
                "sample" : combined_sample,
                "option" : option,
                "reactive_nt" : reactive_nt,
                "sensitivity" : sensitivity,
                "ppv" : ppv,
                "shared_bp" : shared_bp,
                "num_shared" : num_shared,
                "total_num_known" : total_num_known,
                "total_num_predicted" : total_num_predicted,
                "reactivity_ROCAUC_w_predicted" : reactivity_roc_auc,
            })

In [None]:
# Tabulate the collected records and derive "conc" / "location" from the
# second and third "_"-separated fields of the sample name.
sens_ppv_df = pd.DataFrame(tmp_data)
sample_fields = sens_ppv_df["sample"].map(lambda sample_name: sample_name.split("_"))
sens_ppv_df["conc"] = sample_fields.map(lambda fields: fields[1])
sens_ppv_df["location"] = sample_fields.map(lambda fields: fields[2])

### Plot BPS and PPV

In [None]:
#plotly libraries
import plotly.express as px
import plotly.colors as pc
import plotly.graph_objects as go
import plotly.io as pio

#default renderer (svg means very small file size, visibility on github, high quality, but requires sometimes setting height and width)
pio.renderers.default = "svg"

# NOTE(review): `colors` is not referenced by any other cell visible in this notebook.
colors = ["#363b3d", "#727b76", "#31a240", "#f93939", "#f79118", "#de7b06", "#9b308f", "#dc759b"]
#additional defaults: grey first, then the D3 qualitative palette
px.defaults.color_discrete_sequence = ["rgb(100,100,100)"] + pc.qualitative.D3
px.defaults.width=1000
px.defaults.height=800

#try setting this as default for histograms
#fig.update_traces(marker_line_width=0.2)

#use "simple_white" as the default template, but switch its y-axis grid lines on
pio.templates["simple_white"].layout["yaxis"]["showgrid"] = True
pio.templates.default = "simple_white"

import numpy as np

import sklearn.metrics as metrics
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Discrete colours for the concentration series: the first Plasma colour
# followed by Plasma entries 3-7.
plasma = pc.sequential.Plasma
print(plasma)
colorscale = plasma[:1] + plasma[3:8]
colorscale

In [None]:
# Sensitivity per concentration, grouped by option;
# facet columns = location, facet rows = reactive nucleotides.
fig = px.bar(
    sens_ppv_df,
    x="conc",
    y="sensitivity",
    color="option",
    barmode="group",
    facet_col="location",
    facet_row="reactive_nt",
)
fig.update_yaxes(range=[0, 1])
fig.update_layout(height=600)

In [None]:
# PPV per concentration, grouped by option;
# facet columns = location, facet rows = reactive nucleotides.
# The figure must be assigned to `fig`: otherwise the two update calls below
# act on (and re-display) the sensitivity figure from the previous cell.
fig = px.bar(
    sens_ppv_df,
    x="conc",
    y="ppv",
    color="option",
    barmode="group",
    facet_col="location",
    facet_row="reactive_nt",
)
fig.update_yaxes(range=[0,1])
fig.update_layout(height=600)

In [None]:
# Sensitivity vs. PPV for the "cell" samples, coloured by concentration.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="sensitivity",
    y="ppv",
    color="conc",
    facet_row="option",
    facet_col="reactive_nt",
    color_discrete_sequence=colorscale,
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=600, width=800)
fig.show()

In [None]:
# Sensitivity vs. PPV for the "cell" samples, coloured by option, one row per concentration.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="sensitivity",
    y="ppv",
    color="option",
    facet_row="conc",
    facet_col="reactive_nt",
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=800, width=800)
fig.show()

In [None]:
# Reactivity ROC AUC (against the predicted structure) vs. PPV for the "cell" samples.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="reactivity_ROCAUC_w_predicted",
    y="ppv",
    color="option",
    facet_row="conc",
    facet_col="reactive_nt",
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=800, width=800)
fig.show()

In [None]:
# Reactivity ROC AUC (against the predicted structure) vs. sensitivity for the "cell" samples.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="reactivity_ROCAUC_w_predicted",
    y="sensitivity",
    color="option",
    facet_row="conc",
    facet_col="reactive_nt",
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=800, width=800)
fig.show()

# Plot prediction for first 380 nt

In [314]:
# Reference structure that the 380 nt predictions are scored against
# (same reference file as for the full-length predictions).
reference_db = f"{data_folder}/references/dimer_canonical_wo_sl_at_210.db"
_, seq, known_structure = read_in_db(reference_db)

# Base-pair table (position, nucleotide, partner) of the reference.
known_bpseq = convert_db_to_bp(known_structure, seq)

In [None]:
# One record per (sample, option, reactive_nt) for which a 380 nt prediction
# exists; turned into a DataFrame in the next cell.
tmp_data = []

isoform = "RT1_unspliced1"
for sample in samples:

    if "Rep1" not in sample:
        continue
    combined_sample = "_".join(sample.split("_")[1:])

    for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]
        for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]
            db_outfolder = f"{data_folder}/eterna/{combined_sample}/{isoform}/{option}_{reactive_nt}/"

            db_file = f"{db_outfolder}/predict_{isoform}_380nt.db"
            if not os.path.isfile(db_file):
                continue

            # Read the file once (it used to be read a second time for the title).
            name, seq, predicted_structure = read_in_db(db_file)
            predicted_bpseq = convert_db_to_bp(predicted_structure, seq)

            sensitivity, ppv, shared_bp, num_shared, total_num_known, total_num_predicted = calculate_BPS_PPV(known_bpseq, predicted_bpseq)
            # The title line ends in "ROCAUC<value>".
            reactivity_roc_auc = float(name.split("ROCAUC")[1])
            tmp_data.append({
                "sample" : combined_sample,
                "option" : option,
                "reactive_nt" : reactive_nt,
                "sensitivity" : sensitivity,
                "ppv" : ppv,
                "shared_bp" : shared_bp,
                "num_shared" : num_shared,
                "total_num_known" : total_num_known,
                "total_num_predicted" : total_num_predicted,
                "reactivity_ROCAUC_w_predicted" : reactivity_roc_auc,
            })

In [None]:
# Tabulate the 380 nt records and derive "conc" / "location" from the
# second and third "_"-separated fields of the sample name.
sens_ppv_df = pd.DataFrame(tmp_data)
sample_fields = sens_ppv_df["sample"].map(lambda sample_name: sample_name.split("_"))
sens_ppv_df["conc"] = sample_fields.map(lambda fields: fields[1])
sens_ppv_df["location"] = sample_fields.map(lambda fields: fields[2])

In [None]:
# Sensitivity per concentration (380 nt predictions), grouped by option;
# facet columns = location, facet rows = reactive nucleotides.
fig = px.bar(
    sens_ppv_df,
    x="conc",
    y="sensitivity",
    color="option",
    barmode="group",
    facet_col="location",
    facet_row="reactive_nt",
)
fig.update_yaxes(range=[0, 1])
fig.update_layout(height=600)

In [None]:
# PPV per concentration (380 nt predictions), grouped by option;
# facet columns = location, facet rows = reactive nucleotides.
fig = px.bar(
    sens_ppv_df,
    x="conc",
    y="ppv",
    color="option",
    barmode="group",
    facet_col="location",
    facet_row="reactive_nt",
)
fig.update_yaxes(range=[0, 1])
fig.update_layout(height=600)

In [None]:
# Sensitivity vs. PPV (380 nt predictions) for the "cell" samples, coloured by
# concentration; the first colour of `colorscale` is skipped here.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="sensitivity",
    y="ppv",
    color="conc",
    facet_row="option",
    facet_col="reactive_nt",
    color_discrete_sequence=colorscale[1:],
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=600, width=800)
fig.show()

In [None]:
# Sensitivity vs. PPV (380 nt predictions) for the "cell" samples, coloured by
# option, one row per concentration.
cell_samples = sens_ppv_df[sens_ppv_df["location"] == "cell"]
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    cell_samples,
    x="sensitivity",
    y="ppv",
    color="option",
    facet_row="conc",
    facet_col="reactive_nt",
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=800, width=800)
fig.show()

In [None]:
# Sensitivity vs. PPV for "cell" samples processed with the "q22_eq10_ndni"
# option, coloured by reactive nucleotides, one column per concentration.
is_cell = sens_ppv_df["location"] == "cell"
is_ndni = sens_ppv_df["option"] == "q22_eq10_ndni"
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    sens_ppv_df[is_cell & is_ndni],
    x="sensitivity",
    y="ppv",
    color="reactive_nt",
    facet_col="conc",
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=300, width=1000)
fig.show()

In [None]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs("figures", exist_ok=True)

In [None]:
# Figure written to disk: sensitivity vs. PPV for "cell" samples processed
# with the "q22_eq10_ndni" option, coloured by concentration, one column per
# reactive-nucleotide set.
is_cell = sens_ppv_df["location"] == "cell"
is_ndni = sens_ppv_df["option"] == "q22_eq10_ndni"
unit_axis = dict(range=[0, 1], dtick=0.2, mirror=True, showline=True)

fig = px.scatter(
    sens_ppv_df[is_cell & is_ndni],
    x="sensitivity",
    y="ppv",
    color="conc",
    facet_col="reactive_nt",
    color_discrete_sequence=colorscale,
    range_color=[0.5, 1],
)
fig.update_yaxes(**unit_axis)
fig.update_xaxes(showgrid=True, **unit_axis)
# Facet titles read "name=value"; keep only the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))
fig.update_layout(height=300, width=900)
fig.write_image("figures/sensitivity_ppv_q22_eq10_ndni_US_cell.svg")
fig.show()

In [None]:
import shutil
# Zip the "data/eterna/" subtree of the current working directory into
# "de_novo_eterna_folds_unspliced.zip".
shutil.make_archive(
    base_name="de_novo_eterna_folds_unspliced",
    format="zip",
    root_dir=os.getcwd(),
    base_dir="data/eterna/",
)