# Jupyter Notebook to run Nano-DMS-MaP pipeline

Note: All shell commands in this notebook are submitted to a Slurm cluster. Adapt them to your own environment if required.

In [None]:
# folder structures:
experiment/data/
                fast5/      sample
                fastq/      sample
                bam/        sample
                rf/         sample
                perbase/    sample
                eterna/     sample
                
experiment/references/
                    .fa #(unspliced)
                    .gtf #(isoform information), created with 0_create_gtf.ipynb
                    .db #"truth" structure, required only for evaluation, e.g. ROC-AUC and BPS/PPV
                    RT-PCR_transcripts/*.fa #individual fasta file for each isoform (generated with 0_create_gtf.ipynb)
    

## Basecalling

In [None]:
# Build the guppy basecalling command; replace the "..." placeholders before running.
fast5_dir = f"..."  # input folder, passed to guppy via -i
fastq_dir = f"..."  # output folder, passed to guppy via -s
guppy = "..." #path to the guppy executable; guppy 6.1.4 was used
command = f"{guppy} -x cuda:0 --do_read_splitting --trim_barcodes --trim_adapters --min_qscore 10 --trim_strategy dna  -c dna_r10.4_e8.1_sup.cfg --barcode_kits SQK-NBD112-96 -i {fast5_dir} -s {fastq_dir}"

#Note: for the current Kit 14 chemistry update the config file to r10.4.1 e8.2
#The shell call below is left commented out; uncomment it to run the command from the notebook.
#!{command}

# Sort Fastq files (from barcode to sample)

In [None]:
#Use an Excel (xlsx) sheet to list the samples. It must contain a column named "Sample"; sample names must not contain spaces.

import pandas as pd
samplesheet = pd.read_excel("./samplesheet.xlsx", engine="openpyxl")
samplesheet

In [None]:
# Sample names, taken from the "Sample" column of the sample sheet
samples = samplesheet["Sample"].values
samples

## Isoquant

In [None]:
# Isoquant (https://www.nature.com/articles/s41587-022-01565-y) attempts to assign each read to a specific isoform. 
# A fasta reference of the unspliced genome and a GTF file to name the transcripts is required. The GTF file is based on https://retrovirology.biomedcentral.com/articles/10.1186/s12977-020-00533-1,
# but modified (in 0_create_gtf.ipynb) to include the PCR primer workflow

In [36]:
!mkdir data/references

In [1]:
# Copy the isoform annotation (GTF) into references/ (relative to the working directory); the source path is site-specific, adjust it.
# NOTE(review): the target is references/, not the data/references folder created by the mkdir cell - confirm which one is intended.
!cp /vol/projects/pbohn/AnSo_DMS_MaP/references/HIVNL43_all_iso_PCR1_2_3.gtf references/

In [2]:
# Copy the reference fasta into references/ (relative to the working directory); the source path is site-specific, adjust it.
!cp /vol/projects/pbohn/AnSo_DMS_MaP/references/p12_HIVNL43.fa references/

In [None]:
#specify absolute path to data folder (the cells below read from and write to its subfolders, e.g. fastq/, isoquant/, bam/)
data_folder = "..."


In [None]:
#Isoquant Version 2.0
#specify Isoquant.py location from https://github.com/ablab/IsoQuant
import os  # needed here (os.getcwd) and in the following cells (os.makedirs, os.path); it is not imported in any earlier cell

isoquant = ".../IsoQuant/isoquant.py"
# Annotation and reference are addressed by absolute path, resolved against the current working directory
gtf_file = f"{os.getcwd()}/references/HIVNL43_all_iso_PCR1_2_3.gtf"
ref_file = f"{os.getcwd()}/references/p12_HIVNL43.fa"

In [None]:
from slurmpy import Slurm

# Work on the samples in sorted order
samples = sorted(samples)

job_name = "isoquant"
num_threads = 5
slurm_options = {
    "partition" : "cpu",
    "mem" : "10G",
    "cpus-per-task" : num_threads,
    "time" : "240",
    "mail-user" : "patrick.bohn@helmholtz-hiri.de",
}
s = Slurm(job_name, slurm_options)

output_folder = f"{data_folder}/isoquant/"
os.makedirs(output_folder, exist_ok=True)

# Write the list of input files handed to IsoQuant via --fastq_list:
# one "<fastq path>:<sample>" line per sample, each followed by an empty line.
fastq_list_path = f"{output_folder}fastq_list.txt"
with open(fastq_list_path, "w") as outfile:
    for sample in samples:
        #adjust fastq file name if named differently
        fastq_file = f"{data_folder}/fastq/{sample}/merged.fastq"
        print(f"{fastq_file}:{sample}", end="\n\n", file=outfile)

# Assemble the IsoQuant call from its individual arguments (joined by single spaces)
isoquant_arguments = [
    f"{isoquant}",
    "-d nanopore",
    f"-g {gtf_file}",
    f"-r {ref_file}",
    f"--fastq_list {fastq_list_path}",
    f"-l {' '.join(samples)}",
    "--stranded none",
    f"-t {num_threads}",
    f"-o {output_folder}",
    "--clean_start",
    "--fl_data",
]
command = " ".join(isoquant_arguments)
s.run(command)

### Sort per isoform

After IsoQuant has finished successfully, the following cells take the read-to-isoform assignments and sort the reads of each sample into individual FASTQ files, one per isoform.

In [None]:
# Reading in read_to_isoform assignemnts of each sample into a dataframe

import pandas as pd

# Load the read assignment table of every sample and tag each row with its sample name
per_sample_tables = []
for sample in samples:
    merged_transcript_file = f"{data_folder}/isoquant/{sample}/{sample}.read_assignments.tsv"
    per_sample_tables.append(
        pd.read_csv(merged_transcript_file, sep="\t").assign(unique_name=sample)
    )

# One table for all samples; the "#read_id" column is renamed to "read_id"
all_read_assignments_df = pd.concat(per_sample_tables).rename(columns={'#read_id':'read_id'})

#store df for loading in other notebooks
all_read_assignments_df.to_pickle(f"{data_folder}/isoquant/read_assignments.pickle")

# Keep only reads whose assignment type is "unique" or "unique_minor_difference"
is_unique_assignment = all_read_assignments_df["assignment_type"].isin(["unique", "unique_minor_difference"])
unique_mapped_reads_df = all_read_assignments_df[is_unique_assignment].copy()
unique_mapped_reads_df["unique_name"] = unique_mapped_reads_df["unique_name"].astype("category")


In [None]:
# Prints how many of the uniquely assigned reads fall on each isoform. For plotting see 2_Plot_Isoquant.ipynb
unique_mapped_reads_df["isoform_id"].value_counts()

In [None]:
import numpy as np

# Distinct isoform IDs among the uniquely assigned reads
isoform_ids = unique_mapped_reads_df["isoform_id"].values
isoforms = np.unique(isoform_ids)
print(f"Detected {len(isoforms)} isoforms")

In [None]:
#Sorting reads by isoform

from Bio import SeqIO

# Split the merged fastq of every sample into one fastq file per isoform
fastq_dir = f"{data_folder}/fastq/"

# read_id -> isoform_id lookup built from the uniquely assigned reads
read_dict = unique_mapped_reads_df[["read_id", "isoform_id"]].set_index("read_id").to_dict()["isoform_id"]

for sample in samples:
    fastq_out = f"{data_folder}/fastq/{sample}/by_isoform/"
    os.makedirs(fastq_out, exist_ok=True)

    input_fastq_file = f'{data_folder}/fastq/{sample}/merged.fastq'

    # One bucket per isoform, so that a (possibly empty) fastq file is written for each of them
    sorted_reads = {isoform: [] for isoform in isoforms}

    # Reads that have no entry in read_dict are counted and skipped.
    # An explicit lookup replaces the former bare "except:", which also hid unrelated errors.
    not_sorted = 0
    for record in SeqIO.parse(input_fastq_file, "fastq"):
        assigned_isoform = read_dict.get(record.id)
        if assigned_isoform in sorted_reads:
            sorted_reads[assigned_isoform].append(record)
        else:
            not_sorted += 1

    # Each output file is opened only while it is written and is closed even if writing fails
    for isoform in isoforms:
        with open(f'{fastq_out}/{isoform}.fastq', "w") as isoform_handle:
            SeqIO.write(sorted_reads[isoform], isoform_handle, "fastq")

    print(f"{sample}: {not_sorted} reads were not sorted into an isoform file")

## Alignment

In [None]:
last_bin_dir = ".../last-1409/bin" #specifies path of folder containing lastdb, lastal etc. (ends in "/bin")
reference_dir = f"references/transcripts_PCR1" #relative path of the folder that contains one {isoform}.fa reference per isoform (generated by 0._GTF_preprocess.ipynb)
samtools_path = ".../envs/nanodms/bin/samtools" #path to samtools binary

In [None]:
pids = {}

num_threads = 10

slurm_train = Slurm("LAST_train", {"partition" : "cpu", "mem" : "5G", "cpus-per-task" : num_threads, "time" : "10","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})
slurm_align = Slurm("LAST_align", {"partition" : "cpu", "mem" : "5G", "cpus-per-task" : num_threads, "time" : "60","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})

os.makedirs(f"{data_folder}/bam", exist_ok=True)

#index reference
#generate last db file of fasta
for isoform in isoforms:
    reference_fasta = f"{reference_dir}/{isoform}.fa"
    !samtools faidx {reference_fasta}
    
    last_reference_prefix = f"{reference_dir}/{isoform}_LAST"
    !{last_bin_dir}/lastdb {last_reference_prefix} {reference_fasta}

    for sample in samples:
        bam_outdir = f"{data_folder}/bam/{sample}/{isoform}"
        fastq_file = f"{data_folder}/fastq/{sample}/by_isoform/{isoform}.fastq"
        
        os.makedirs(bam_outdir, exist_ok=True)
        #train LAST
        command = f"""
        {last_bin_dir}/last-train -Q0 -P {num_threads} {last_reference_prefix} {fastq_file} > {bam_outdir}/LAST_train.par
        """
        #slurm
        train_pid = slurm_train.run(command)
        #align LAST
        #slurm
        command = f"""
        {last_bin_dir}/lastal -Qkeep -P {num_threads} -p {bam_outdir}/LAST_train.par -m20 {last_reference_prefix} {fastq_file} | {last_bin_dir}/last-split -m1 > {bam_outdir}/LAST.maf
        {last_bin_dir}/maf-convert -j1e6 psl {bam_outdir}/LAST.maf > {bam_outdir}/LAST.psl
        {last_bin_dir}/maf-convert sam {bam_outdir}/LAST.maf > {bam_outdir}/LAST.sam
        {samtools_path} view -h -t {reference_fasta}.fai {bam_outdir}/LAST.sam > {bam_outdir}/LAST.bam
        {samtools_path} calmd --output-fmt BAM --threads {num_threads} {bam_outdir}/LAST.bam {reference_fasta} > {bam_outdir}/LAST_MD.bam
        {samtools_path} sort -O bam {bam_outdir}/LAST_MD.bam > {bam_outdir}/LAST_MD_sorted.bam
        {samtools_path} index {bam_outdir}/LAST_MD_sorted.bam
        """
        pids[f"{isoform}_{sample}"] = slurm_align.run(command, depends_on = [train_pid])

## RNA-Framework

RNA-Framework version 2.8.0 was used for the paper. It is mostly written in Perl. To run rf-norm, the conda package "perl-xml-libxml" is required. 

In [None]:
rf_dir = ".../RNAFramework" #specify path to the RNAFramework folder (rf-count, rf-norm, rf-combine and rf-correlate are called from it below)

### rf-count

In [None]:
# NOTE(review): this creates the relative folder "data/rfcount", whereas rf_count writes to f"{data_folder}/rfcount/..." - confirm which location is intended
os.makedirs("data/rfcount", exist_ok=True)

def rf_count(rf_dir, data_folder, sample, isoform, reference_fasta, pids, num_threads = 10):
    """Submit a Slurm job that runs rf-count on the sorted LAST alignment of one sample/isoform.

    The primer mask file is read from the notebook-level variable `primer_mask`,
    which must be defined before this function is called.

    Args:
        rf_dir: folder containing the rf-count executable.
        data_folder: data folder; input is read from bam/, output is written to rfcount/.
        sample: sample name.
        isoform: isoform name.
        reference_fasta: reference fasta passed to rf-count via -f.
        pids: dict collecting job IDs; it is updated in place.
        num_threads: value used for rf-count -p and for Slurm cpus-per-task.

    Returns:
        The same `pids` dict, with the submitted job ID stored at pids[sample][isoform].
    """
    slurm_rfcount = Slurm("rfcount", {"partition" : "cpu", "mem" : "2G", "cpus-per-task" : num_threads, "time" : "60","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})

    commands = []
    
    bam_file = f"{data_folder}/bam/{sample}/{isoform}/LAST_MD_sorted.bam"
    rc_outdir = f"{data_folder}/rfcount/{sample}/{isoform}/"
    os.makedirs(rc_outdir, exist_ok=True)

    #to run rf-count with different settings comment/uncomment commands (and adjust output_dir naming)
    
    commands.append(f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/q22_eq10_ndni -ow -f {reference_fasta} -m -nd -ni -q 22 -eq 10 -mm {bam_file}")
    
    #commands.append(f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/q22_eq10 -ow -f {reference_fasta} -m -q 22 -eq 10 -mm {bam_file}")
    #commands.append(f"{rf_dir}/rf-count -p {num_threads} -mf {primer_mask} -o {rc_outdir}/default -ow -f {reference_fasta} -m -mm {bam_file}")
    
    command = "\n".join(commands)
    
    job_id = slurm_rfcount.run(command)

    # Keep one job ID per isoform. Assigning to pids[sample] directly would overwrite
    # the entry on every call, leaving only the last isoform's job ID per sample.
    sample_pids = pids.get(sample)
    if not isinstance(sample_pids, dict):
        sample_pids = pids[sample] = {}
    sample_pids[isoform] = job_id
    return pids

In [None]:
# Mask file handed to rf-count via -mf; rf_count reads this variable from the notebook namespace
primer_mask = f"{data_folder}/references/mask_primers_PCR1_2_3.csv"
# Submit one rf-count job per sample and isoform
for sample in samples:
    pids[sample] = {}
    for isoform in isoforms:
        reference_fasta = f"{data_folder}/references/transcripts_PCR1/{isoform}.fa"
        
        pids = rf_count(rf_dir, data_folder, sample, isoform, reference_fasta, pids)

### rf-norm

In [None]:
# Top-level output folder for the rf-norm results
rfnorm_outdir = f"{data_folder}/rfnorm/"
os.makedirs(rfnorm_outdir, exist_ok=True)



def rf_norm(rf_dir, data_folder, sample, control, isoform, num_threads = 10):
    """Submit a Slurm job that runs rf-norm for one sample/isoform against its control.

    One rf-norm command is generated per rf-count option and per set of reactive
    nucleotides; all commands are submitted together as a single job.

    Args:
        rf_dir: folder containing the rf-norm executable.
        data_folder: data folder; input is read from rfcount/, output is written to rfnorm/.
        sample: treated sample name (passed via -t).
        control: untreated control sample name (passed via -u).
        isoform: isoform name.
        num_threads: value used for rf-norm -p and for Slurm cpus-per-task.

    Returns:
        The value returned by Slurm.run for the submitted job, or None when there is
        nothing to run (e.g. the sample is its own control), in which case no job
        is submitted.
    """
    commands = []
    
    #specify the different rf-count outdir names here
    for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]

        # The count files only depend on the rf-count option, not on the reactive nucleotides
        rc_file = f"{data_folder}/rfcount/{sample}/{isoform}/{option}/LAST_MD_sorted.rc"
        control_rc_file = f"{data_folder}/rfcount/{control}/{isoform}/{option}/LAST_MD_sorted.rc"
        
        #running rfnorm with different reactive nt makes it easier to analyze them afterwards (e.g. for correlation analysis)
        for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]

            rfnorm_outdir = f"{data_folder}/rfnorm/{sample}/{isoform}/{option}_{reactive_nt}"
            
            #standard procedure is using Siegfried method, which normalizes against control sample. Control vs itself does not work here. 
            if sample != control:
                os.makedirs(rfnorm_outdir, exist_ok=True)
                commands.append(f"{rf_dir}/rf-norm -p {num_threads} -t {rc_file} -u {control_rc_file} -o {rfnorm_outdir} --scoring-method 3 --norm-method 2 -rb {reactive_nt} --max-untreated-mut 0.05 --max-mutation-rate 0.2 --norm-independent -ow")
            
            #To get raw reactivities (e.g. for evaluation of total (DMS) mutation rates or signal-to-noise calculation) use the following
            
            #rfnorm_outdir = f"{data_folder}/rfnorm/{sample}/{isoform}/{option}_{reactive_nt}_raw"
            #os.makedirs(rfnorm_outdir, exist_ok=True)
            #commands.append(f"{rf_dir}/rf-norm -p {num_threads} -t {rc_file} -o {rfnorm_outdir} --scoring-method 4 -r -rb {reactive_nt} -ow")

    # Do not submit an empty job script when no command was generated
    if not commands:
        return None

    slurm_rfnorm = Slurm("rfnorm", {"partition" : "cpu", "mem" : "2G", "cpus-per-task" : num_threads, "time" : "480","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})

    command = "\n".join(commands)
    
    pid = slurm_rfnorm.run(command)
    return pid

In [None]:
for isoform in isoforms:
    for sample in samples:

        # rf-norm with the Siegfried method needs a control sample. Sample names are
        # assumed to follow "{replicate}_{RT_primer}_{DMS_conc}_{localization}"; the
        # control name is obtained by replacing the DMS_conc field with "0mM".
        # Adjust the list index if DMS_conc is at a different position.
        name_fields = sample.split("_")
        name_fields[2] = "0mM"
        control = "_".join(name_fields)

        job_key = f"rfnorm_{isoform}_{sample}"
        pids[job_key] = rf_norm(rf_dir, data_folder, sample, control, isoform)

### rf-correlate and rf-combine

This step combines the data of multiple replicates. The correlation between those replicates (on A,C) is a key quality control metric, without it the interpretation of DMS reactivities is challenging. 

In [None]:
# Combine (rf-combine) and correlate (rf-correlate) the rf-norm results of replicate 1 and 2
os.makedirs(f"{data_folder}/rfcombine", exist_ok=True)
num_threads = 1
slurm_rfco = Slurm("rfco", {"partition" : "cpu", "mem" : "2G", "cpus-per-task" : num_threads, "time" : "10","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})


for isoform in isoforms:
    for sample in samples:
        
        #here we assume that our replicates are named "Rep[n]"
        # Only Rep1 samples start a job; the matching Rep2 name is derived from them
        if "Rep1" not in sample:
            continue

        commands = []

        # The combined name is the sample name without its first "_"-separated field
        combined_sample = "_".join(sample.split("_")[1:])
        sample1 = sample
        sample2 = sample.replace("Rep1", "Rep2")
        os.makedirs(f"{data_folder}/rfcombine/{combined_sample}/{isoform}", exist_ok=True)

        #specify options again as used in rfcount_outdir 
        for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]

            #specify reactive_nt again as in rfnorm
            for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]

                #include "_raw" if rfnorm was also run with Zubradt (4)
                for norm_option in [""]: #["", "_raw"]

                    #assumes 2 replicates currently, extend if more
                    xml1 = f"{data_folder}/rfnorm/{sample1}/{isoform}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"
                    xml2 = f"{data_folder}/rfnorm/{sample2}/{isoform}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"

                    # Both replicate files must exist (logical "and" instead of bitwise "&")
                    if os.path.isfile(xml1) and os.path.isfile(xml2):

                        xml_combined = f"{data_folder}/rfcombine/{combined_sample}/{isoform}/{option}_{reactive_nt}{norm_option}/"
                        commands.append(f"{rf_dir}/rf-combine -d 6 -o {xml_combined} -ow {xml1} {xml2}")

                        outfile = f"{xml_combined}/correlation.csv"

                        commands.append(f"{rf_dir}/rf-correlate -o {outfile} -ow {xml1} {xml2}")

        # Do not submit an empty job script when no replicate pair was found
        if commands:
            command = "\n".join(commands)
            slurm_rfco.run(command)

# Generate csv and bpseq from xml files

RNA-Framework generates an xml file containing the sequence and reactivity scores. To facilitate easier processing with downstream tools, such as Varna and Eternafold, we provide functions to read in the xml file and generate standard csv and Eterna-/Contrafold compatible bpseq (https://github.com/eternagame/EternaFold#chemical-mapping) files. 

In [None]:
import xml.etree.ElementTree as ET
import numpy as np

def read_in_xml(xml_file, sample, with_stdev = False):
    """Read one transcript record from an XML file into a dict.

    Only the first child of the XML root is read. Its "id" and "length" attributes
    are returned unchanged (as strings); its first child element provides the
    sequence, its second the comma-separated reactivities and, if requested, its
    third the comma-separated standard deviations. Tabs and newlines are removed
    from the element texts before parsing.

    Args:
        xml_file: path of the XML file.
        sample: value stored under the "sample" key of the result.
        with_stdev: if True, also read the third child element into "stdev".

    Returns:
        dict with the keys "sample", "transcript_id", "length", "sequence",
        "reactivity" (float array) and, if with_stdev is True, "stdev" (float array).
    """
    def _compact(element):
        # element text without the tabs and newlines used for layout
        return element.text.replace("\t", "").replace("\n", "")

    def _float_array(element):
        # comma-separated values -> float array
        return np.array(_compact(element).split(",")).astype(float)

    transcript = ET.parse(xml_file).getroot()[0]

    record = {
        "sample" : sample,
        "transcript_id" : transcript.attrib["id"],
        "length" : transcript.attrib["length"],
        "sequence" : _compact(transcript[0]),
        "reactivity" : _float_array(transcript[1]),
    }
    if with_stdev:
        record["stdev"] = _float_array(transcript[2])
    return record
    
    
def convert_xml_to_bpseq(xml_file,outfile):
    """Write the sequence and reactivities of an XML file as "<position> <base> e1 <reactivity>" lines.

    Positions are 1-based, "T" is written as "U" and NaN reactivities are written as -1.0.

    Args:
        xml_file: XML file, read with read_in_xml.
        outfile: path of the text file to write (overwritten if it exists).
    """
    parsed = read_in_xml(xml_file, "")

    bases = list(parsed["sequence"].replace("T", "U"))
    scores = np.nan_to_num(parsed["reactivity"], nan=-1.0)

    with open(outfile, "w+") as out:
        # one line per reactivity value; the base is looked up by the same position
        for position, score in enumerate(scores, start=1):
            out.write(f"{position} {bases[position-1]} e1 {score}\n")
            
            
#function to help predicting only part of an RNA isoform
def convert_xml_to_bpseq_trimmed(xml_file,outfile, length):
    """Like convert_xml_to_bpseq, but only for the first `length` positions.

    Writes "<position> <base> e1 <reactivity>" lines with 1-based positions; "T" is
    written as "U" and NaN reactivities are written as -1.0.

    Args:
        xml_file: XML file, read with read_in_xml.
        outfile: path of the text file to write (overwritten if it exists).
        length: number of leading positions to keep (used as a slice end).
    """
    parsed = read_in_xml(xml_file, "")

    # trim both the sequence and the reactivities to the requested prefix
    bases = list(parsed["sequence"].replace("T", "U"))[:length]
    scores = np.nan_to_num(parsed["reactivity"][:length], nan=-1.0)

    with open(outfile, "w+") as out:
        for position, score in enumerate(scores, start=1):
            out.write(f"{position} {bases[position-1]} e1 {score}\n")

In [None]:
# makes similar assumptions as rf-combine above
# Exports the combined reactivities of every isoform/sample as csv and as bpseq-style text files
reactivity_dir = f"{data_folder}/reactivities"
os.makedirs(reactivity_dir, exist_ok=True)

for isoform in isoforms:
    isoform_reactivity_dir = f"{reactivity_dir}/{isoform}"
    os.makedirs(isoform_reactivity_dir, exist_ok=True)
    for sample in samples:
        # only Rep1 samples are visited; the combined name is the sample name without its first "_"-separated field
        if ("Rep1" in sample):
            combined_sample = "_".join(sample.split("_")[1:])

            for option in ["q22_eq10_ndni"]: #["q22_eq10_ndni", "q22_eq10", "default"]
                for reactive_nt in ["AC", "ACT"]: #["ACGT", "AC", "ACT", "G"]
                    for norm_option in [""]: #["", "_raw"]
                        # NOTE(review): isoform_reactivity_dir already ends with the isoform, so this folder is reactivities/{isoform}/{isoform}/... - confirm the double nesting is intended
                        os.makedirs(f"{isoform_reactivity_dir}/{isoform}/{option}_{reactive_nt}{norm_option}", exist_ok=True)
                        xml_combined = f"{data_folder}/rfcombine/{combined_sample}/{isoform}/{option}_{reactive_nt}{norm_option}/{isoform}.xml"
                        # combinations without a combined xml file are skipped silently
                        if os.path.isfile(xml_combined):
                            file = read_in_xml(xml_combined, combined_sample)
                            reactivities = file["reactivity"]
                            # NaN reactivities are stored as -1
                            reactivities = np.nan_to_num(reactivities, nan = -1)
                            # the csv and bp2seq files are written next to the combined xml file
                            reactivity_file = xml_combined.replace(".xml", ".csv")
                            np.savetxt(reactivity_file, reactivities, fmt="%6f")
                            bp_file = xml_combined.replace(".xml", ".bp2seq")
                            convert_xml_to_bpseq(xml_combined, bp_file)
                            # additionally copy the csv into the per-isoform reactivities folder, named after the combined sample
                            !cp {reactivity_file} {isoform_reactivity_dir}/{isoform}/{option}_{reactive_nt}{norm_option}/{combined_sample}.csv
                            #lower part is optional, was used to predict HIV-1 unspliced 5' UTR folding
                            length = 380
                            bp_file = bp_file.replace(".bp2seq", f"_{length}nt.bp2seq")
                            convert_xml_to_bpseq_trimmed(xml_combined,bp_file, length)