In [None]:
#plotly libraries
import plotly.express as px
import plotly.colors as pc
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np

import sklearn.metrics as metrics
import pandas as pd
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

#default renderer: "svg" (very small file size, visible on github, high quality, but sometimes requires setting height and width explicitly)
pio.renderers.default = "svg"

#custom hex color palette (not referenced again in this cell)
colors = ["#363b3d", "#727b76", "#31a240", "#f93939", "#f79118", "#de7b06", "#9b308f", "#dc759b"]
#additional plotly express defaults: grey first, then the qualitative D3 palette; default figure size 1000x800
px.defaults.color_discrete_sequence = ["rgb(100,100,100)"] + pc.qualitative.D3
px.defaults.width=1000
px.defaults.height=800

#try setting this as default for histograms
#fig.update_traces(marker_line_width=0.2)

#use "simple_white" as the default template, but switch the y-axis grid lines on
#(the template object is edited in place before it is made the default)
pio.templates["simple_white"].layout["yaxis"]["showgrid"] = True
pio.templates.default = "simple_white"

#color sequence used for the per-concentration plots further down:
#print the full sequential Plasma list, then keep entry 0 plus entries 3-7 and display the result
colorscale = pc.sequential.Plasma
print(colorscale)
colorscale = [colorscale[0]] + colorscale[3:8]
colorscale

In [None]:
#data_folder is the path to the data directory, where the different types of data are stored (fastq, bam, rna framework output, perbase, etc.)
#placeholder value: replace "..." with the actual path before running the cells below
data_folder = "..."


In [None]:
import pandas as pd
#read the sample sheet from the working directory; its "sample" column provides the sample names used by the following cells
samplesheet = pd.read_excel("samplesheet.xlsx", engine="openpyxl")
#display the sample sheet as cell output
samplesheet

In [None]:
#sample names taken from the "sample" column of the sample sheet; used to build the per-sample file paths below
samples = samplesheet["sample"].values
samples

In [None]:
import os
#show the current working directory: relative paths used in this notebook (e.g. "functions/", "figures/", "references/") are resolved against it
os.getcwd()

# Generating virtual gels

In [None]:
from slurmpy import Slurm

# Input fastq files and output folder for the per-read statistics.
in_dir = f"{data_folder}/fastq/"
out_dir = f"{data_folder}/fastq_stats"
os.makedirs(out_dir, exist_ok=True)

# Resources requested for every submitted job.
num_threads = 1
slurm = Slurm(
    "stats",
    {
        "partition" : "cpu",
        "mem" : "10G",
        "cpus-per-task" : num_threads,
        "time" : "30",
        "mail-type" : "FAIL,INVALID_DEPEND",
        "mail-user" : "patrick.bohn@helmholtz-hiri.de",
    },
)

# One job per sample; the job script holds one calc_per_read.py call per requested statistic.
for sample in samples:
    fastq_infile = f"{in_dir}/{sample}.fastq.gz"
    outfile = f"{out_dir}/{sample}"

    commands = []
    for stat in ["length"]:
        command = f"python3 functions/calc_per_read.py -i {fastq_infile} -o {outfile} -s {stat}"
        commands += [command]

    slurm.run("\n".join(commands))

    

In [177]:
out_dir = f"{data_folder}/fastq_stats"

# Read lengths of every sample (one value per read), keyed by sample name.
read_lengths = {
    sample: np.genfromtxt(f"{out_dir}/{sample}_read_length.csv")
    for sample in samples
}

# Longest read found in any sample; stays 0 if no sample has a longer read.
max_read_length = max([0] + [np.max(lengths) for lengths in read_lengths.values()])

In [None]:
#read length histograms:
def plot_virtual_gel(samples, sample_length_dists, max_read_length = 6000, binsize=1, counts=False, mass = False, norm_mass = True, height = 600, width=600, norm_percentile = 100, zmin=0, zmax=1, dtick=500):
    """Draw read-length distributions as "virtual gel" heatmaps (one lane per sample).

    Parameters
    ----------
    samples : sequence of sample names; defines the lane order (x axis).
    sample_length_dists : mapping of sample name -> array of read lengths.
    max_read_length : lengths are histogrammed over the range [1, 1 + max_read_length].
    binsize : width of a length bin; the number of bins is int(max_read_length / binsize).
    counts : if True, show the raw read counts per bin ("molar ratios").
    mass : if True, show the counts weighted by length.
    norm_mass : if True, show the length-weighted counts divided, per sample, by the
        `norm_percentile`-th percentile of that sample's weighted values.
    height, width : size of every figure that is shown.
    norm_percentile : percentile used for the per-sample normalisation (100 = maximum).
    zmin, zmax : color range of the normalised plot only.
    dtick : y-axis tick spacing of the normalised plot only.

    Returns
    -------
    None. Each requested figure is displayed with ``fig.show()``.
    """
    nbins = int(max_read_length / binsize)
    ticks = np.arange(1, 1 + nbins) * binsize

    # rows = samples, columns = length bins
    matrix = np.zeros((len(samples), nbins))
    print(matrix.shape)
    for row, sample in enumerate(samples):
        # use the distributions that were passed in (previously the global `read_lengths` was read
        # here, so the `sample_length_dists` argument was silently ignored)
        matrix[row, :] = np.histogram(sample_length_dists[sample], bins=nbins, range=[1, 1 + max_read_length])[0]

    if counts:
        fig = px.imshow(matrix.T, color_continuous_scale='gray_r', origin="lower", aspect="auto", x=samples, y=ticks)
        fig.update_layout(title="Virtual gel with molar ratios", width=width, height=height)
        fig.show()

    # weight every bin by (approximately) its read length to go from read counts to mass
    # NOTE(review): for binsize > 1, `ticks` is roughly the upper edge of a bin, so adding half a bin
    # overshoots the bin centre; identical to the bin label for binsize == 1 — confirm intended.
    weights = ticks + int(binsize / 2)
    weighted_matrix = weights * matrix

    if mass:
        fig = px.imshow(weighted_matrix.T, color_continuous_scale='gray_r', origin="lower", aspect="auto", x=samples, y=ticks)
        fig.update_layout(title="Virtual gel normalized by molecular mass (i.e. weight)", width=width, height=height)
        fig.show()

    if norm_mass:
        # one normalisation value per sample (percentile over that sample's bins)
        weighted_norm_values = np.percentile(weighted_matrix, norm_percentile, axis=1)
        norm_weighted_matrix = weighted_matrix.T / weighted_norm_values

        fig = px.imshow(norm_weighted_matrix, color_continuous_scale='gray_r', origin="lower", aspect="auto", x=samples, y=ticks, zmin=zmin, zmax=zmax)
        fig.update_layout(title="Virtual gel normalized by molecular mass (i.e. weight) and between samples", width=width, height=height)
        fig.update_yaxes(dtick=dtick)
        fig.show()


In [None]:
#virtual gel of all samples: lengths 1-1500 in 1-nt bins, normalised per sample to its maximum (100th percentile); values below zmin=0.03 fall under the color range
plot_virtual_gel(samples, read_lengths, max_read_length = 1500, binsize=1, norm_mass=True, width=1200, norm_percentile = 100, dtick = 100, zmin=0.03)

# Generate per-position quality score histograms

In [None]:
from slurmpy import Slurm

# Input fastq files and output folder for the per-position statistics.
in_dir = f"{data_folder}/fastq/"
out_dir = f"{data_folder}/fastq_stats"
os.makedirs(out_dir, exist_ok=True)

# Passed to calc_per_read.py as -rl; also the width of the histogram matrix built in the next cell.
max_readlength = 6000

# Resources requested for every submitted job.
num_threads = 1
slurm = Slurm(
    "stats",
    {
        "partition" : "cpu",
        "mem" : "20G",
        "cpus-per-task" : num_threads,
        "time" : "30",
        "mail-type" : "FAIL,INVALID_DEPEND",
        "mail-user" : "patrick.bohn@helmholtz-hiri.de",
    },
)

# One job per sample; the job script holds one calc_per_read.py call per requested statistic.
for sample in samples:
    fastq_infile = f"{in_dir}/{sample}.fastq.gz"
    outfile = f"{out_dir}/{sample}"

    commands = []
    for stat in ["position_hist"]:
        command = f"python3 functions/calc_per_read.py -i {fastq_infile} -o {outfile} -s {stat} -rl {max_readlength}"
        commands += [command]

    slurm.run("\n".join(commands))

    

In [None]:
# Running total of the per-position quality histograms of all samples
# (40 rows x max_readlength columns; starts at zero on every run of this cell).
per_pos_hist = np.zeros((40, max_readlength), dtype=int)

for sample in samples:
    outfile = f"{out_dir}/{sample}_read_quality_per_position_histogram.csv"
    tmp_per_pos_hist = np.genfromtxt(outfile)
    # np.add returns a new array, so the integer start array is promoted to the loaded dtype
    per_pos_hist = np.add(per_pos_hist, tmp_per_pos_hist)


In [None]:
def calc_median_qscore_from_hist(histogram):
    """Return the index of the bin that contains the median of a 1-D count histogram.

    The result is the first bin index at which the cumulative count (including that bin)
    exceeds half of the total count. Works for any number of bins.

    NOTE(review): the returned value is a bin *index*; whether index i stands for quality
    score i or i + 1 depends on how the histogram was produced — confirm against
    functions/calc_per_read.py.

    Parameters
    ----------
    histogram : 1-D array-like of non-negative counts, one entry per quality-score bin.

    Returns
    -------
    int bin index, or None when the histogram contains no counts.
    """
    cumulative = np.cumsum(histogram)

    # no observations -> no median (the original loop also fell through and returned None)
    if cumulative.size == 0 or cumulative[-1] <= 0:
        return None

    half_n = cumulative[-1] / 2

    # first index whose cumulative count is strictly greater than half of the total; the last bin
    # is included here (the previous loop never added it, so a median in the top bin gave None)
    return int(np.searchsorted(cumulative, half_n, side="right"))

def plot_per_pos_median_mean_cov(per_pos_hist):
    """Plot coverage, mean qscore and median qscore for every read position.

    Parameters
    ----------
    per_pos_hist : 2-D array of counts, rows = quality-score bins, columns = read positions.

    Prints the overall mean error rate and the overall mean qscore, shows the figure and
    returns it. Positions without any coverage get NaN for the mean and median qscore
    (they previously made the median calculation raise).

    Returns
    -------
    The plotly figure (coverage on the secondary y axis, qscores on the primary one).
    """
    per_pos_hist = np.asarray(per_pos_hist)
    coverage_per_pos = np.sum(per_pos_hist, axis=0)

    # row i is treated as quality score i + 1 (1..40 for the usual 40-row histogram)
    # NOTE(review): calc_median_qscore_from_hist returns the row index itself, so mean and median
    # may be offset by one against each other — confirm the binning of calc_per_read.py.
    qscores = np.arange(1, per_pos_hist.shape[0] + 1)
    error_prob_per_qscore = 10**(-qscores/10)

    # expected number of errors per position and quality score, then summed per position
    error_per_pos = per_pos_hist.T * error_prob_per_qscore
    sum_error_per_pos = np.sum(error_per_pos, axis= 1)

    # zero-coverage positions yield NaN here; silence the corresponding numpy warnings
    with np.errstate(divide="ignore", invalid="ignore"):
        mean_error_per_pos = sum_error_per_pos/coverage_per_pos
        mean_qscore_per_pos = np.log10(mean_error_per_pos)*-10

        overall_error_prob = np.sum(error_per_pos) / np.sum(coverage_per_pos)
        overall_mean_qscore = np.log10(overall_error_prob)*-10

    print("Overall mean error rate:", overall_error_prob*100, "%")
    print("Overall mean qscore:", overall_mean_qscore)

    # one median per column; dtype=float turns the None of empty columns into NaN
    # (np.apply_along_axis could not store None in its integer result buffer)
    median_qscore_per_pos = np.array(
        [calc_median_qscore_from_hist(column) for column in per_pos_hist.T],
        dtype=float,
    )

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    positions = np.arange(per_pos_hist.shape[1])
    fig.add_trace(go.Scattergl(x=positions, y=coverage_per_pos, name="coverage", line_color="grey"), secondary_y=True)
    fig.add_trace(go.Scattergl(x=positions, y=mean_qscore_per_pos, name="mean_qscore", line_color="green"))
    fig.add_trace(go.Scattergl(x=positions, y=median_qscore_per_pos, name="median_qscore", line_color="blue"))

    fig.update_yaxes(rangemode="tozero", showgrid=False, secondary_y=True)
    fig.update_yaxes(range=[10,40], dtick=2, secondary_y=False)

    fig.show()
    return fig

In [None]:
#plot only the first 2000 read positions; the returned figure is assigned to `_` because the function already shows it
_ = plot_per_pos_median_mean_cov(per_pos_hist[:,:2000])

In [None]:
#Total number of reads: sum of all counts at the first read position (column 0)
#(presumably every read contributes exactly one base there — verify against calc_per_read.py)
np.sum(per_pos_hist[:,0])

In [None]:
#Total number of bases: sum of the counts over all quality-score bins and all read positions
np.sum(per_pos_hist)

# Plot read statistics

In [None]:
from slurmpy import Slurm

# Input fastq files and output folder for the per-read statistics.
in_dir = f"{data_folder}/fastq/"
out_dir = f"{data_folder}/fastq_stats"

# Passed to calc_per_read.py as -rl.
max_readlength = 6000

os.makedirs(f"{out_dir}", exist_ok=True)

num_threads = 1
slurm = Slurm("stats", {"partition" : "cpu", "mem" : "20G", "cpus-per-task" : num_threads, "time" : "30","mail-type" : "FAIL,INVALID_DEPEND", "mail-user" : "patrick.bohn@helmholtz-hiri.de"})

# One job per sample; the job script holds one calc_per_read.py call per statistic
# (per-read mean and median qscore, written as histograms).
for sample in samples:

    commands = []
    for stat in ["mean", "median"]:
        # Same input naming as the "length" and "position_hist" cells, which read
        # {in_dir}/{sample}.fastq.gz. This cell previously looked for
        # "Nanopore_HIV1_{sample}.fastq.gz", which does not match those files.
        # NOTE(review): confirm that no prefixed copies of the fastq files are expected here.
        fastq_infile = f"{in_dir}/{sample}.fastq.gz"
        outfile = f"{out_dir}/{sample}"
        command = f"python3 functions/calc_per_read.py -i {fastq_infile} -o {outfile} -s {stat} -rl {max_readlength} --histogram"
        commands.append(command)
    slurm.run("\n".join(commands))

    

In [None]:

# Sum the per-read median-qscore histograms of all samples.
# Start from 0 (the additive identity): the first addition turns it into an array with the
# shape of the loaded counts. (np.array(40, dtype=int) is the scalar 40, not 40 zeros, and
# used to add 40 to every bin.)
read_median_histogram = 0
for sample in samples:
    outfile = f"{out_dir}/{sample}_read_median_histogram.csv"
    # two rows are used below: hist[0] is accumulated, hist[1] labels the bars
    # (presumably counts and qscore bins — confirm against functions/calc_per_read.py)
    hist = np.genfromtxt(outfile, delimiter=" ", dtype=int)
    read_median_histogram = read_median_histogram + hist[0]

# Horizontal bars: bar labels from the last loaded file (assumed identical for all samples),
# bar lengths = summed counts.
fig = go.Figure()
fig.add_trace(go.Bar(y=hist[1], x=read_median_histogram, name="r1", marker_color="darkblue",
            orientation='h'))

fig.update_layout(title="Read median qscore histogram", width=300, height=500)
fig.update_yaxes(range = [10,40], dtick=1)
#fig.write_image("figures/nanopore_median_qscore_per_read_distribution.svg")
fig.show()

In [None]:
# Sum the per-read mean-qscore histograms of all samples.
# The variable keeps its historical name (`read_median_histogram_r1`) although it holds the
# *mean* qscore histogram.
# Start from 0 (the additive identity): the first addition turns it into an array with the
# shape of the loaded counts. (np.array(40, dtype=int) is the scalar 40, not 40 zeros, and
# used to add 40 to every bin.)
read_median_histogram_r1 = 0

for sample in samples:
    outfile = f"{out_dir}/{sample}_read_mean_histogram.csv"
    # two rows are used below: hist[0] is accumulated, hist[1] labels the bars
    # (presumably counts and qscore bins — confirm against functions/calc_per_read.py)
    hist = np.genfromtxt(outfile, delimiter=" ")
    read_median_histogram_r1 = read_median_histogram_r1 + hist[0]


# Horizontal bars: bar labels from the last loaded file (assumed identical for all samples),
# bar lengths = summed counts.
fig = go.Figure()
fig.add_trace(go.Bar(y=hist[1], x=read_median_histogram_r1, name="r1", marker_color="green",
            orientation='h'))

fig.update_layout(title="Read mean qscore histogram", width=300, height=500)
fig.update_yaxes(range = [10,40], dtick=1)
#fig.write_image("figures/nanopore_mean_qscore_per_read_distribution.svg")
fig.show()

## Mean mutation rates (perbase)

Calculate the empirical mean error rate from BAM files generated with LAST (Nanopore) or bowtie (Illumina).

In [None]:
#calculate for in-cell samples only: keep the samples whose name contains "cell"
#note: this overwrites `samples`, so every cell executed after this one only sees the filtered list
samples = [sample for sample in samples if "cell" in sample]

In [None]:
#specify the path to the perbase binary (we used v0.8.5): https://github.com/sstadick/perbase/releases/tag/v0.8.5
#placeholder value: replace "..." with the actual path before submitting the perbase jobs
perbase = "..."


In [None]:
#note: includes a per-position Q22 filter (a base below Q22 is counted as N instead - it still counts towards the total DEPTH)
from slurmpy import Slurm

job_name = "perbase"
num_threads = 10

# Job ids returned by slurm, stored as pids[job_name][sample].
pids = {job_name: {}}

s = Slurm(
    job_name,
    {
        "partition" : "cpu",
        "mem" : "10G",
        "cpus-per-task" : num_threads,
        "time" : "30",
        "mail-user" : "patrick.bohn@helmholtz-hiri.de",
    },
)

os.makedirs(f"{data_folder}/perbase", exist_ok=True)

reference_fasta = "references/transcripts_PCR1/RT1_unspliced1.fa"

# One perbase base-depth job per sample; the output is gzip-compressed into the sample's own folder.
for sample in samples:
    os.makedirs(f"{data_folder}/perbase/{sample}/", exist_ok=True)

    BAM_file =  f"{data_folder}/bam/{sample}/RT1_unspliced1/LAST_MD_sorted.bam"
    output_path = f"{data_folder}/perbase/{sample}/RT1_unspliced1.txt.gz"

    command = f"""
    {perbase} base-depth -Q 22 -t {num_threads} -r {reference_fasta} {BAM_file} | gzip > {output_path}
    """
    job_id = s.run(command)
    pids[job_name][sample] = job_id

In [None]:
import pandas as pd

# Read the perbase output of every sample and stack the tables, tagged with the sample name.
tmp_data = []

for sample in samples:
    output_path = f"{data_folder}/perbase/{sample}/RT1_unspliced1.txt.gz"
    try:
        test_df = pd.read_csv(output_path, sep="\t")
    except Exception as error:
        # Skip samples whose output is missing or unreadable. Without the `continue` the code
        # below re-used the previous sample's table (relabelling and duplicating it) or raised
        # a NameError if the very first file failed.
        print("could not read in", output_path, f"({error})")
        continue
    test_df["sample"] = sample
    tmp_data.append(test_df)

if not tmp_data:
    # pd.concat would raise a ValueError on an empty list as well; give a clearer message
    raise ValueError(f"no perbase output could be read from {data_folder}/perbase")

perbase_df = pd.concat(tmp_data)
print("Total number of nt read in:", perbase_df["DEPTH"].sum())

In [None]:
#remove N counts from the total number (to calculate percentages)
#the unfiltered depth is stored once in "DEPTH_WITH_N" and "DEPTH" is always derived from it,
#so that re-running this cell does not subtract the N counts a second time
if "DEPTH_WITH_N" not in perbase_df.columns:
    perbase_df["DEPTH_WITH_N"] = perbase_df["DEPTH"]
perbase_df["DEPTH"] = perbase_df["DEPTH_WITH_N"] - perbase_df["N"]

#melt the df to long format (one row per position and call type) so that we can calculate % values easily
tmp_df = pd.melt(perbase_df, id_vars=["REF", "POS", "REF_BASE", "DEPTH", "NEAR_MAX_DEPTH", "sample"], value_vars =["A", "C", "G", "T", "N", "INS", "DEL", "REF_SKIP", "FAIL"], value_name="count")
#N calls were removed from DEPTH above, so they are dropped here as well
pivot_df = tmp_df[tmp_df["variable"] != "N"].copy()
pivot_df["percent"] = 100*pivot_df["count"]/pivot_df["DEPTH"]
#positions without any non-N coverage have no defined percentage
pivot_df.loc[pivot_df["DEPTH"] <1, "percent"] = np.nan

#convert all ref bases to upper case to fix grouping
pivot_df["REF_BASE"] = pivot_df["REF_BASE"].str.upper()

#new column to easily filter out correct basecalls (called base equals the reference base)
pivot_df["match"] = pivot_df["REF_BASE"] == pivot_df["variable"]


In [None]:
#extract information from the sample names as separate columns for plotting; assumes sample names of the form "{replicate}_{RT_primer}_{DMS_conc}_{localization}"

#field 0 of the "_"-separated sample name -> replicate
pivot_df["replicate"] = pivot_df["sample"].apply(lambda x: x.split("_")[0])
#field 2 of the "_"-separated sample name -> DMS concentration (e.g. "17mM")
pivot_df["conc"] = pivot_df["sample"].apply(lambda x: x.split("_")[2])

In [None]:
#ensures samples are always plotted in the same order when passed as category_orders to plotly
#keys are column names of the plotted dataframes, values are the category order for that column
order_dict = {"conc" : ["0mM", "8mM", "17mM", "34mM", "57mM", "85mM"], 
              "replicate" : ["Rep1", "Rep2"],
             "REF_BASE" : ["A", "C", "G", "T"],
             "variable" : ["A", "C", "G", "T", "INS", "DEL", "REF_SKIP"]}

In [None]:
# Box plot of the percentage of correct base calls per concentration
# (only rows where the call equals the reference base and the depth exceeds 1000).
fig = px.box(
    pivot_df.loc[pivot_df["match"] & (pivot_df["DEPTH"] > 1000)],
    x="conc",
    y="percent",
    color="conc",
    category_orders=order_dict,
    color_discrete_sequence=colorscale,
)
fig.update_traces(marker_size=2)
fig.update_layout(height=400, width=500)
fig.update_yaxes(range=[80, 100], dtick=2)
fig.show(renderer="svg")

In [None]:
# Mismatch rates: take the rows of the correct base calls and replace their percentage
# by its complement (100 - % correct).
mismatch_df = pivot_df.loc[pivot_df["match"]].assign(
    percent=lambda matched: 100 - matched["percent"]
)

In [None]:
# Box plot of the mismatch rate per concentration (positions with depth > 1000 only);
# the figure is also written to figures/.
fig = px.box(
    mismatch_df.loc[mismatch_df["DEPTH"] > 1000],
    x="conc",
    y="percent",
    color="conc",
    category_orders=order_dict,
    color_discrete_sequence=colorscale,
)
fig.update_traces(marker_size=2)
fig.update_layout(height=400, width=500)
fig.update_yaxes(range=[0, 15], dtick=1)
fig.write_image("figures/nanopore_mut_rate_box.svg")
fig.show(renderer="svg")

In [None]:
# Same mismatch-rate box plot, split into one panel per reference base;
# the figure is also written to figures/.
fig = px.box(
    mismatch_df.loc[mismatch_df["DEPTH"] > 1000],
    x="conc",
    y="percent",
    color="conc",
    facet_col="REF_BASE",
    category_orders=order_dict,
    color_discrete_sequence=colorscale,
)
fig.update_traces(marker_size=2)
fig.update_layout(height=400, width=800)
fig.update_yaxes(range=[0, 15], dtick=1)
fig.write_image("figures/nanopore_mut_rate_per_nt_box.svg")

fig.show(renderer="svg")

In [None]:
# Box plot of all non-matching call types per reference base and concentration.
# NOTE(review): the "percent_minus_control" column is not created by any cell visible in this
# notebook (pivot_df only receives "percent", "match", "replicate" and "conc"). It presumably
# should hold "percent" minus the value of the 0mM control — confirm the definition and add
# the computation before running this cell.
fig = px.box(pivot_df[~pivot_df["match"]], facet_col="REF_BASE", color="variable", x="conc", y="percent_minus_control", category_orders = order_dict)
fig.update_traces(marker_size=1)
fig.update_yaxes(range = [-0.1,5])
# shorten the facet titles from "REF_BASE=A" to "A"
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(height=400, width=1200)

In [None]:
# Calculate the mean percentage per concentration, reference base and call type, and plot it as line charts

In [None]:
# Mean percentage per concentration, reference base and call type.
mean_df = (
    pivot_df
    .groupby(["conc", "REF_BASE", "variable"])["percent"]
    .mean()
    .reset_index()
)

# Sort the rows by increasing concentration, following the order defined in order_dict ...
mean_df["conc"] = pd.Categorical(mean_df["conc"], categories=order_dict["conc"], ordered=True)
mean_df = mean_df.sort_values(by="conc")

# ... and turn the labels into plain numbers ("17mM" -> 17).
mean_df["conc"] = mean_df["conc"].apply(lambda label: int(label.partition("mM")[0]))

In [None]:
# Mean percentage of every call type versus concentration, one panel per reference base (linear y axis).
fig = px.line(
    mean_df,
    x="conc",
    y="percent",
    color="variable",
    facet_col="REF_BASE",
    category_orders=order_dict,
)
fig.update_layout(height=400, width=800)
fig.update_xaxes(type="category")
fig.update_yaxes(range=[0, 2], dtick=0.2)
#fig.write_image(f"figures/illumina_comparison/mut_rates/{seq_platform}_mut_type_per_nt_line.svg")
fig.show()

In [None]:
# Same line chart with a logarithmic y axis; the figure is also written to figures/.
fig = px.line(
    mean_df,
    x="conc",
    y="percent",
    color="variable",
    facet_col="REF_BASE",
    category_orders=order_dict,
)
fig.update_layout(height=400, width=800)
fig.update_xaxes(type="category")
fig.update_yaxes(type="log", range=[-2.5, 0.6])
fig.write_image("figures/nanopore_mut_type_per_nt_log_line.svg")
fig.show()

In [None]:
# flag the rows where the called base equals the reference base
mean_df["match"] = mean_df["REF_BASE"] == mean_df["variable"]
# turn the numeric concentrations back into labels ("17" -> "17mM") so that they match order_dict;
# labels that already end in "mM" are left alone, so re-running this cell does not produce "17mMmM"
mean_df["conc"] = mean_df["conc"].apply(lambda x: x if str(x).endswith("mM") else str(x) + "mM")

In [None]:
# Pie charts of the composition of the non-matching call types:
# one column per reference base, one row per concentration; the figure is also written to figures/.
fig = px.pie(
    mean_df.loc[~mean_df["match"]],
    names="variable",
    values="percent",
    facet_col="REF_BASE",
    facet_row="conc",
    category_orders=order_dict,
)
fig.update_layout(height=1300, width=1000)
fig.write_image("figures/nanopore_mut_type_per_nt_pie.svg")
fig.show()