In [None]:
import plotly.express as px
import plotly.io as pio
# Use the minimal white template for every figure created from here on.
pio.templates.default = "simple_white"
# Custom colour ramp: the first Plasma colour followed by Plasma entries 3 through 7.
colorscale = [px.colors.sequential.Plasma[0], *px.colors.sequential.Plasma[3:8]]


#for no gaps between histogram bars:
#fig.update_traces(marker_line_width=0) 
#fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

#plotly libraries
import plotly.express as px
import plotly.colors as pc
import plotly.graph_objects as go
import plotly.io as pio
import os
import pandas as pd
import numpy as np


# Render figures as static SVG: very small files, visible on GitHub and high
# quality, although width and height sometimes have to be set explicitly.
# (Interactive alternative: pio.renderers.default = "jupyterlab")
pio.renderers.default = "svg"

# Hand-picked hex colour palette.
colors = [
    "#363b3d", "#727b76", "#31a240", "#f93939",
    "#f79118", "#de7b06", "#9b308f", "#dc759b",
]

# Plotly Express defaults: a grey lead colour ahead of the D3 qualitative
# palette, plus a fixed figure size.
px.defaults.color_discrete_sequence = ["rgb(100,100,100)", *pc.qualitative.D3]
px.defaults.width, px.defaults.height = 1000, 500


#try setting this as default for histograms
#fig.update_traces(marker_line_width=0.2)

#pio.templates["simple_white"].layout.template.data.Histogram["marker_line_width"] = 0.2


# Start from the "simple_white" template but switch grid lines on for both
# axes, then make that modified template the default.
for axis_key in ("yaxis", "xaxis"):
    pio.templates["simple_white"].layout[axis_key]["showgrid"] = True
pio.templates.default = "simple_white"

In [None]:
import pandas as pd
import numpy as np
import os
from Bio import SeqIO

In [None]:
# Isoform names to analyse, in processing order. Each name is used as a
# reactivity sub-folder name and as the stem of a reference FASTA file.
isoforms = [
    "RT2_Tat6_RT3_Tat2",
    "RT2_Env5_RT3_Nef3",
    "RT2_Env10_RT3_Rev7",
    "RT2_Vpr3_RT3_Vpr1",
    "RT2_Tat5_RT3_Tat1",
    "RT1_unspliced1",
    "RT2_Env11_RT3_Rev8",
    "RT2_Env13_RT3_Nef5",
    "RT2_Env3_RT3_Rev2",
    "RT2_Env9_RT3_Nef4",
    "RT2_Env1_RT3_Nef2",
    "RT2_Vif2_RT3_Vif1",
    "RT2_Env2_RT3_Rev1",
]

In [None]:
# Load the sample sheet from the working directory using the openpyxl engine.
samplesheet = pd.read_excel(io="samplesheet.xlsx", engine="openpyxl")

In [None]:
import numpy as np
from Bio import SeqIO
# Collect one record per reactivity file: sample metadata parsed from the file
# name, the loaded reactivity values, and the isoform's reference sequence.
data = []

# These three settings select which reactivity sub-folder is read per isoform.
option = "q22_eq10_ndni"
reactive_nt = "AC"
norm_option = ""

# NOTE(review): `data_folder` is not defined in this cell; it has to exist in
# the kernel namespace already — confirm it is set in an earlier cell.
for isoform in isoforms:
    reactivity_dir = f"{data_folder}/reactivities/{isoform}/{option}_{reactive_nt}{norm_option}"

    # Sorted so the record order does not depend on os.listdir's arbitrary order.
    reactivities = sorted(file for file in os.listdir(reactivity_dir) if ".csv" in file)
    if not reactivities:
        # Nothing to load for this isoform, so its reference is not needed either.
        continue

    # The reference depends only on the isoform, so parse it once here rather
    # than once per reactivity file. If the FASTA holds several records, the
    # last one is used.
    sequence = None
    for record in SeqIO.parse(f"references/transcripts_PCR1_2_3/{isoform}.fa", "fasta"):
        sequence = list(record.seq)
    if sequence is None:
        # Fail loudly instead of silently reusing the previous isoform's sequence.
        raise ValueError(f"No FASTA record found for isoform {isoform!r}")

    for reactivity_file in reactivities:
        sample_name = reactivity_file.split(".csv")[0]
        reactivity = np.loadtxt(f"{reactivity_dir}/{reactivity_file}").astype("float")

        # The first four underscore-separated fields of the sample name are
        # RT, DMS concentration, location and type, in that order.
        rt, dms_conc, location, sample_type = sample_name.split("_")[:4]

        data.append(
            {
                "sample": sample_name,
                "DMS_conc": dms_conc,
                "location": location,
                "type": sample_type,
                "RT": rt,
                "isoform": isoform,
                "reactivity": reactivity,
                # Copy so each record owns its own list, as a per-file parse would give.
                "sequence": list(sequence),
            }
        )

In [None]:
# One row per loaded reactivity file.
df = pd.DataFrame(data)
# Composite label columns built from the per-sample metadata. The earlier
# two-part assignment to "RT_isoform_location" was overwritten immediately by
# the three-part one, so only the latter is kept.
df["RT_isoform_location"] = df["RT"] + "_" + df["isoform"] + "_" + df["location"]
df["RT_isoform_treatment"] = df["RT"] + "_" + df["isoform"] + "_" + df["type"]

# Reactivity heatmap with clustering

In [None]:
from seaborn import clustermap
import matplotlib.pyplot as plt

In [None]:
# Figures are written beneath the current working directory; the folder is
# created on first use and reused on later runs.
figure_dir = f"{os.getcwd()}/figures/reactivities"
os.makedirs(name=figure_dir, exist_ok=True)

In [None]:
# For each DMS concentration: build a samples x positions reactivity matrix,
# cluster the samples by the correlation of their reactivity profiles, and
# save both the correlation clustermap and a reactivity heatmap whose rows
# follow the clustering order.
reactive_nt = "AC"
option = "q22_eq10_ndni"
norm_option = ""

# NOTE(review): every reactivity vector is assumed to hold exactly this many
# positions (the row assignment below raises otherwise) — confirm against the data.
n_positions = 289

# Reference used to label matrix columns with their nucleotide. It is the same
# for every concentration, so it is read once before the loop.
ref = SeqIO.read(f"{os.getcwd()}/references/RT1_unspliced1.fa", "fasta")
sequence = np.array(list(ref.seq))

for conc in ["8mM", "17mM", "34mM", "57mM", "85mM"]:
    sub_df = df[df["DMS_conc"] == conc]
    if len(sub_df) < 2:
        # A correlation matrix and clustering need at least two samples.
        print(f"Skipping {conc}: {len(sub_df)} sample(s) found, need at least 2")
        continue

    # Labels must come from sub_df (not df) so they line up with the matrix rows.
    isoform_names = sub_df["RT_isoform_location"].values

    # Rows are samples, columns are positions; -1 marks a missing value.
    reactivity_matrix = np.full((len(sub_df), n_positions), fill_value=-1.0)
    for i, (_, sample_row) in enumerate(sub_df.iterrows()):
        reactivity_matrix[i, :] = np.nan_to_num(sample_row["reactivity"], nan=-1)

    # Keep only the positions where no sample has a negative value
    # (NaNs were mapped to -1 above).
    nonnan_positions = np.where(~np.any(reactivity_matrix < 0, axis=0))[0]
    nonnan_matrix = reactivity_matrix[:, nonnan_positions]
    nonnan_nt = sequence[nonnan_positions]
    nt_labels = [f"{pos}\t{nt}" for pos, nt in zip(nonnan_positions, nonnan_nt)]

    # Single-linkage clustering of the sample-by-sample correlation matrix.
    tick_labels = [label.replace("\t", " ") for label in isoform_names]
    cluster_grid = clustermap(
        np.corrcoef(nonnan_matrix),
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        method="single",
        vmin=0.5,
        vmax=1,
    )
    # NOTE(review): plt.title targets the current axes, which for a clustermap
    # may not be the heatmap panel — confirm the title lands where intended.
    plt.title(conc)
    plt.savefig(f"{figure_dir}/{conc}_{reactive_nt}_correlation.svg")

    plt.show()
    # Release the matplotlib figure so figures do not pile up across iterations.
    plt.close()

    # Reorder the samples to follow the row dendrogram of the clustermap.
    row_order = cluster_grid.dendrogram_row.reordered_ind
    ordered_isoform_names = np.array(isoform_names)[row_order]
    reordered_matrix = nonnan_matrix[row_order, :]

    fig = px.imshow(
        reordered_matrix,
        y=ordered_isoform_names,
        x=nt_labels,
        zmin=0,
        aspect="auto",
        color_continuous_scale="RdBu_r",
    )
    # Size the figure from the number of rows and columns shown.
    fig.update_layout(
        title=conc,
        height=100 + 33 * len(ordered_isoform_names),
        width=14 * len(nonnan_nt),
    )
    fig.update_xaxes(tickfont_size=11)
    fig.write_image(f"{figure_dir}/{conc}_{reactive_nt}_heatmap.svg")

    fig.show(renderer="svg")