# Jupyter Notebook to run Illumina DMS-MaP pipeline

Note: All shell commands in this notebook are submitted to a Slurm cluster. Adapt them to your own environment if required. 

In [None]:
# folder structures:
experiment/data/
                fast5/      sample
                fastq/      sample
                bam/        sample
                rf/         sample
                perbase/    sample
                eterna/     sample
                
experiment/references/
                    .fa #(unspliced)
                    .gtf #(isoform information), created with 0_create_gtf.ipynb
                    .db #"truth" structure, required only for evaluation, e.g. ROC-AUC and BPS/PPV
                    RT-PCR_transcripts/*.fa #individual fasta file for each isoform (generated with 0_create_gtf.ipynb)
    

# Cutadapt

# Align with bowtie

In [None]:
# Locations of the bowtie and bowtie-build executables
bowtie_tool = "..."
bowtie_build_tool = "..."

# Reference FASTA and the bowtie index generated from it further below.
# Both share the same folder and base name, so they are derived from one prefix.
_reference_prefix = f"{os.getcwd()}/references/transcripts_PCR1/RT1_unspliced1"
fasta_file = _reference_prefix + ".fa"
index_file = _reference_prefix + "_bowtie.index"

# Installed samtools binary (with a conda install it is located at
# miniconda3/envs/[...]/bin/samtools)
samtools_path = "/home/pbohn/miniconda3/envs/nanodms/bin/samtools"

In [None]:
# Build bowtie index

command = f"{bowtie_build_tool} {fasta_file} {index_file}"
!{command}

In [None]:
# Align with bowtie
#
# Submits one Slurm job per sample. Each job runs the aligner on the paired-end
# FASTQ files and then post-processes the alignment with samtools
# (view -> calmd -> sort -> index). The final file is {sample}_MD_sorted.bam.
#
# Expects `data_folder`, `qscore`, `samples` and `Slurm` to be defined already.
# NOTE(review): `reference_fasta` is not assigned in this cell either; in this
# notebook it is only set in the rf-count cell further below. Confirm that it is
# defined before running and that it is the FASTA the index was built from.

num_threads = 5
slurm_align = Slurm("bowtie", {"partition" : "cpu", "mem" : "5G", "cpus-per-task" : num_threads, "time" : "60","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})


bowtie_outdir = f"{data_folder}/bam_bowtie/{qscore}"
os.makedirs(bowtie_outdir, exist_ok=True)

for sample in samples:

    R1 = f"{data_folder}/fastq/{sample}_1.fq.gz"
    R2 = f"{data_folder}/fastq/{sample}_2.fq.gz"
    bam_outdir = f"{bowtie_outdir}/{sample}/"

    # The job writes all of its output files directly into the per-sample folder,
    # so the folder has to exist before the job starts.
    os.makedirs(bam_outdir, exist_ok=True)

    command = f"""{bowtie_tool} -D 20 -R 3 -N 1 -L 15 -i S,1,0.50 -x {index_file} -1 {R1} -2 {R2} -S {bam_outdir}/{sample}.sam
                {samtools_path} view -h -t {reference_fasta}.fai {bam_outdir}/{sample}.sam > {bam_outdir}/{sample}.bam
                {samtools_path} calmd --output-fmt BAM --threads {num_threads} {bam_outdir}/{sample}.bam {reference_fasta} > {bam_outdir}/{sample}_MD.bam
                {samtools_path} sort -O bam {bam_outdir}/{sample}_MD.bam > {bam_outdir}/{sample}_MD_sorted.bam
                {samtools_path} index {bam_outdir}/{sample}_MD_sorted.bam
                """

    slurm_align.run(command)


## RNA-Framework

RNA-Framework version 2.8.0 was used for the paper. It is mostly written in Perl. To run rf-norm, the conda package "perl-xml-libxml" is required. 

In [None]:
# Path to the RNAFramework folder. The cells below call the tools as
# "{rf_dir}/rf-count", "{rf_dir}/rf-norm", "{rf_dir}/rf-combine" and "{rf_dir}/rf-correlate".
rf_dir = "..." #specify path to RNAFramework

### rf-count

In [None]:
# Create the top-level rf-count folder, relative to the current working directory.
# NOTE(review): rf_count() writes below f"{data_folder}/rfcount/" — confirm that
# data_folder resolves to "data", otherwise this folder is not the one that is used.
os.makedirs("data/rfcount", exist_ok=True)

def rf_count(rf_dir, data_folder, sample, isoform, reference_fasta, pids, num_threads = 10):
    """Submit one rf-count Slurm job for `sample` and store its job handle in `pids`.

    Besides its arguments, the function reads two notebook-level variables:
    `bowtie_outdir` (folder holding the sorted BAM files) and `primer_mask`
    (file passed to rf-count with -mf).

    Parameters:
        rf_dir: folder containing the rf-count executable.
        data_folder: base folder; results go to {data_folder}/rfcount/{sample}/.
        sample: sample name, used to locate {sample}_MD_sorted.bam.
        isoform: not used inside this function.
        reference_fasta: FASTA file passed to rf-count with -f.
        pids: dict that is updated in place; pids[sample] is set to whatever
            Slurm.run returns for the submitted job.
        num_threads: used for the Slurm "cpus-per-task" option and for rf-count -p.

    Returns:
        The same `pids` dict that was passed in.
    """
    slurm_options = {
        "partition" : "cpu",
        "mem" : "2G",
        "cpus-per-task" : num_threads,
        "time" : "60",
        "mail-type" : "FAIL,INVALID_DEPEND",
        "mail-user" : "patrick.bohn@helmholtz-hiri.de",
    }
    slurm_rfcount = Slurm("rfcount", slurm_options)

    bam_file = f"{bowtie_outdir}/{sample}/{sample}_MD_sorted.bam"
    rc_outdir = f"{data_folder}/rfcount/{sample}/"
    os.makedirs(rc_outdir, exist_ok=True)

    # One entry per rf-count setting. To run rf-count with different settings,
    # comment/uncomment entries (and adjust the output folder name after -o).
    rfcount_calls = [
        f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/q22_eq10_ndni -ow -f {reference_fasta} -m -nd -ni -q 22 -eq 10 -mm {bam_file}",
        #f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/q22_eq10 -ow -f {reference_fasta} -m -q 22 -eq 10 -mm {bam_file}",
        #f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/default -ow -f {reference_fasta} -m -mm {bam_file}",
    ]

    # All selected calls run one after another inside a single Slurm job
    pids[sample] = slurm_rfcount.run("\n".join(rfcount_calls))
    return pids

In [None]:
# Collects the values returned by Slurm.run for the jobs submitted in the cells below
pids = {}

In [None]:
# Primer mask file; rf_count() reads this notebook-level variable and passes it to rf-count (-mf)
primer_mask = f"{data_folder}/references/mask_primers_PCR1_2_3.csv"

# Submit one rf-count job per sample, all counted against the same isoform reference
isoform = "RT1_unspliced1"
for sample in samples:
    pids[sample] = {}
    reference_fasta = f"{data_folder}/references/transcripts_PCR1/{isoform}.fa"
    pids = rf_count(
        rf_dir=rf_dir,
        data_folder=data_folder,
        sample=sample,
        isoform=isoform,
        reference_fasta=reference_fasta,
        pids=pids,
    )

### rf-norm

In [None]:
# Base folder for the rf-norm results; rf_norm() below creates one sub-folder
# per sample and setting underneath it.
rfnorm_outdir = f"{data_folder}/rfnorm/"
os.makedirs(rfnorm_outdir, exist_ok=True)

def rf_norm(rf_dir, data_folder, sample, control, isoform, num_threads = 10):
    """Submit one Slurm job that runs rf-norm for `sample` against `control`.

    One rf-norm call is queued per combination of rf-count setting and set of
    reactive nucleotides. All calls of a sample run sequentially in one job.

    Parameters:
        rf_dir: folder containing the rf-norm executable.
        data_folder: base folder; reads {data_folder}/rfcount/... and writes
            to {data_folder}/rfnorm/{sample}/{option}_{reactive_nt}.
        sample: name of the treated sample (passed to rf-norm with -t).
        control: name of the untreated control sample (passed with -u).
        isoform: only needed by the optional raw-reactivity commands, which
            are commented out below.
        num_threads: used for the Slurm "cpus-per-task" option and for rf-norm -p.

    Returns:
        The value returned by Slurm.run for the submitted job, or None when
        nothing had to be run (e.g. `sample` is its own control), in which
        case no job is submitted.
    """
    commands = []

    #specify the different rf-count outdir names here
    for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]

        #running rfnorm with different reactive nt makes it easier to analyze them afterwards (e.g. for correlation analysis)
        for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]

            rc_file = f"{data_folder}/rfcount/{sample}/{option}/{sample}_MD_sorted.rc"

            # The count file of the control lives in the control's folder and is
            # named after the control's BAM file, not after the treated sample.
            control_rc_file = f"{data_folder}/rfcount/{control}/{option}/{control}_MD_sorted.rc"

            rfnorm_outdir = f"{data_folder}/rfnorm/{sample}/{option}_{reactive_nt}"

            #standard procedure is using Siegfried method, which normalizes against control sample. Control vs itself does not work here.
            if sample != control:
                os.makedirs(rfnorm_outdir, exist_ok=True)
                commands.append(f"{rf_dir}/rf-norm -p {num_threads} -t {rc_file} -u {control_rc_file} -o {rfnorm_outdir} --scoring-method 3 --norm-method 2 -rb {reactive_nt} --max-untreated-mut 0.05 --max-mutation-rate 0.2 --norm-independent -ow")

            #To get raw reactivities (e.g. for evaluation of total (DMS) mutation rates or signal-to-noise calculation) use the following.
            #The "_raw" folder sits next to the normalized one, which is where the rf-combine cell looks for it.

            #rfnorm_outdir = f"{data_folder}/rfnorm/{sample}/{option}_{reactive_nt}_raw"
            #os.makedirs(rfnorm_outdir, exist_ok=True)
            #commands.append(f"{rf_dir}/rf-norm -p {num_threads} -t {rc_file} -o {rfnorm_outdir} --scoring-method 4 -r -rb {reactive_nt} -ow")

    # Nothing queued (the sample is its own control): do not submit an empty job
    if not commands:
        return None

    slurm_rfnorm = Slurm("rfnorm", {"partition" : "cpu", "mem" : "2G", "cpus-per-task" : num_threads, "time" : "480","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})

    command = "\n".join(commands)

    pid = slurm_rfnorm.run(command)
    return pid

In [None]:
# rf-norm with the Siegfried method requires a control sample. Sample names are
# assumed to follow the pattern "{replicate}_{RT_primer}_{DMS_conc}_{localization}";
# the control is the same name with the DMS concentration replaced by "0mM".
# Adjust the list index below if DMS_conc is at a different position.
for sample in samples:

    split_sample = sample.split("_")
    split_sample[2] = "0mM"
    control = "_".join(split_sample)

    pids[f"rfnorm_{isoform}_{sample}"] = rf_norm(
        rf_dir=rf_dir,
        data_folder=data_folder,
        sample=sample,
        control=control,
        isoform=isoform,
    )

### rf-correlate and rf-combine

This step combines the data of multiple replicates. The correlation between those replicates (on A and C) is a key quality-control metric; without it, the interpretation of DMS reactivities is challenging. 

In [None]:
# Combine the rf-norm results of two replicates (rf-combine) and compute their
# correlation (rf-correlate). One Slurm job is submitted per replicate pair.
#
# NOTE: the existence of the rf-norm XML files is checked when this cell runs,
# so the rf-norm jobs have to be finished before executing it.

os.makedirs("data/rfcombine", exist_ok=True)
num_threads = 1
slurm_rfco = Slurm("rfco", {"partition" : "cpu", "mem" : "2G", "cpus-per-task" : num_threads, "time" : "10","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})


for sample in samples:

    #here we assume that our replicates are named "Rep[n]"

    if ("Rep1" in sample):
        commands = []

        # name of the combined sample = sample name without the leading replicate field
        combined_sample = "_".join(sample.split("_")[1:])
        sample1 = sample
        sample2 = sample.replace("Rep1", "Rep2")
        os.makedirs(f"{data_folder}/rfcombine/{combined_sample}/{isoform}", exist_ok=True)

        #specify options again as used in rfcount_outdir 
        for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]

            #specify reactive_nt again as in rfnorm
            for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]

                #include "_raw" if rfnorm was also run with Zubradt (4)
                for norm_option in [""]: #["", "_raw"]

                    #assumes 2 replicates currently, extend if more
                    xml1 = f"{data_folder}/rfnorm/{sample1}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"
                    xml2 = f"{data_folder}/rfnorm/{sample2}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"

                    # only queue the pair when both replicates have been normalized
                    if os.path.isfile(xml1) and os.path.isfile(xml2):

                        xml_combined = f"{data_folder}/rfcombine/{combined_sample}/{option}_{reactive_nt}{norm_option}/"
                        commands.append(f"{rf_dir}/rf-combine -d 6 -o {xml_combined} -ow {xml1} {xml2}")

                        outfile = f"{xml_combined}/correlation.csv"

                        commands.append(f"{rf_dir}/rf-correlate -o {outfile} -ow {xml1} {xml2}")

        # Skip the submission when no XML pair was found instead of running an empty job
        if commands:
            command = "\n".join(commands)
            slurm_rfco.run(command)

# Generate csv and bpseq from xml files

RNA-Framework generates an xml file containing the sequence and reactivity scores. To facilitate easier processing with downstream tools, such as Varna and Eternafold, we provide functions to read in the xml file and generate standard csv and Eterna-/Contrafold compatible bpseq (https://github.com/eternagame/EternaFold#chemical-mapping) files. 

In [None]:
import xml.etree.ElementTree as ET
import numpy as np

def read_in_xml(xml_file, sample, with_stdev = False):
    """Read the first transcript of an RNA-Framework XML file into a dict.

    Only the first child of the XML root is read. Its first and second child
    elements are taken as sequence and reactivities; with `with_stdev` the
    third child element is read as well.

    Parameters:
        xml_file: path of the XML file to parse.
        sample: label that is copied unchanged into the result.
        with_stdev: if True, add a "stdev" entry to the result.

    Returns:
        dict with the keys "sample", "transcript_id", "length" (the attribute
        value as a string), "sequence" (str), "reactivity" (float array) and,
        if requested, "stdev" (float array).
    """

    def _strip_layout(text):
        # remove the tabs and newlines surrounding the element text
        return text.replace("\t", "").replace("\n", "")

    def _to_float_array(text):
        # comma-separated values -> numpy float array
        return np.array(_strip_layout(text).split(",")).astype(float)

    transcript = ET.parse(xml_file).getroot()[0]

    record = {
        "sample" : sample,
        "transcript_id" : transcript.attrib["id"],
        "length" : transcript.attrib["length"],
        "sequence" : _strip_layout(transcript[0].text),
        "reactivity" : _to_float_array(transcript[1].text),
    }
    if with_stdev:
        record["stdev"] = _to_float_array(transcript[2].text)
    return record
    
    
def convert_xml_to_bpseq(xml_file,outfile):
    """Write the reactivities stored in `xml_file` to `outfile`, one line per position.

    Each line has the form "<position> <nucleotide> e1 <reactivity>" with
    1-based positions. "T" is written as "U" and NaN reactivities as -1.0.

    Parameters:
        xml_file: XML file that is read with read_in_xml.
        outfile: path of the text file to write (overwritten if it exists).
    """
    parsed = read_in_xml(xml_file, "")

    nucleotides = parsed["sequence"].replace("T", "U")
    # NaN marks positions without a value; they are written as -1.0
    scores = np.nan_to_num(parsed["reactivity"], nan=-1.0)

    with open(outfile, "w+") as out:
        for position, score in enumerate(scores, start=1):
            out.write(f"{position} {nucleotides[position-1]} e1 {score}\n")
            
            
#function to help predicting only part of an RNA isoform
def convert_xml_to_bpseq_trimmed(xml_file,outfile, length):
    """Same output as convert_xml_to_bpseq, restricted to the first `length` positions.

    Each line has the form "<position> <nucleotide> e1 <reactivity>" with
    1-based positions. "T" is written as "U" and NaN reactivities as -1.0.

    Parameters:
        xml_file: XML file that is read with read_in_xml.
        outfile: path of the text file to write (overwritten if it exists).
        length: number of leading positions to keep (used as slice end).
    """
    parsed = read_in_xml(xml_file, "")

    nucleotides = parsed["sequence"].replace("T", "U")[:length]
    # trim first, then replace NaN (positions without a value) by -1.0
    scores = np.nan_to_num(parsed["reactivity"][:length], nan=-1.0)

    with open(outfile, "w+") as out:
        for position, score in enumerate(scores, start=1):
            out.write(f"{position} {nucleotides[position-1]} e1 {score}\n")

In [None]:
# Makes the same naming assumptions as the rf-combine cell above.
# Generates a .bp2seq file for use in EternaFold and a .csv file for general use.

for sample in samples:
    # one combined data set per replicate pair, identified through its "Rep1" sample
    if "Rep1" not in sample:
        continue
    combined_sample = "_".join(sample.split("_")[1:])

    for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]
        for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]
            for norm_option in [""]: #["", "_raw"]

                xml_combined = f"{data_folder}/rfcombine/{combined_sample}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"
                if not os.path.isfile(xml_combined):
                    # combined XML missing for this setting: nothing to convert
                    continue

                # csv: one reactivity per line, NaN written as -1
                file = read_in_xml(xml_combined, combined_sample)
                reactivities = np.nan_to_num(file["reactivity"], nan = -1)
                reactivity_file = xml_combined.replace(".xml", ".csv")
                np.savetxt(reactivity_file, reactivities, fmt="%6f")

                # bpseq file covering the full transcript
                bp_file = xml_combined.replace(".xml", ".bp2seq")
                convert_xml_to_bpseq(xml_combined, bp_file)

                #lower part is optional, was used to predict HIV-1 unspliced 5' UTR folding
                length = 380
                bp_file = bp_file.replace(".bp2seq", f"_{length}nt.bp2seq")
                convert_xml_to_bpseq_trimmed(xml_combined,bp_file, length)