In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as clr
import logging 
import sys
from functools import reduce
from collections import namedtuple
import math
import chart_studio.plotly as py
from pybedtools import BedTool
import pybedtools as pybt
import seaborn as sb
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

In [97]:
def create_summed_matrix(working_directory,samplenames,matrix_size=13551):
    """
    Sum the intensity matrices of several samples into one matrix.

    For every entry of ``samplenames`` the file
    ``<working_directory>/<sample>/intensity_matrix/intensity_matrix.csv``
    is read (";"-separated, with a header that contains at least the columns
    ``start``, ``end`` and ``n_reads``) and ``n_reads`` is added to cell
    ``[start, end]`` of a dense square matrix.

    Parameters
    ----------
    working_directory : str
        Directory that contains one sub-directory per sample.
    samplenames : iterable of str
        Names of the sample sub-directories to sum up.
    matrix_size : int, optional
        Edge length of the square matrix. Defaults to 13551, the value that
        used to be hard-coded here.

    Returns
    -------
    summed_matrix : numpy.ndarray
        Float matrix of shape ``(matrix_size, matrix_size)`` with the summed read counts.
    summed_dbscan_matrix : numpy.ndarray
        Integer array of shape ``(n, 3)``; one ``[start, end, n_reads]`` row per
        non-zero cell in row-major order. ``n`` is 0 when nothing was summed.
    summed_df : pandas.DataFrame
        ``summed_dbscan_matrix`` as a frame with the columns ``start``, ``end``, ``n_reads``.
    """
    relative_path = "intensity_matrix/intensity_matrix.csv"
    summed_matrix = np.zeros((matrix_size,matrix_size))
    for samplename in samplenames:
        matrix_path = os.path.join(working_directory,samplename,relative_path)
        matrix_df = pd.read_csv(matrix_path,sep=";",header=0)
        matrix_df = matrix_df[["start","end","n_reads"]]
        starts = matrix_df["start"].to_numpy(dtype=np.intp)
        ends = matrix_df["end"].to_numpy(dtype=np.intp)
        # np.add.at accumulates repeated (start, end) pairs, exactly like the
        # element-wise "+=" loop it replaces (plain fancy-index "+=" would not).
        np.add.at(summed_matrix,(starts,ends),matrix_df["n_reads"].to_numpy())
    rows,cols = np.nonzero(summed_matrix)
    # column_stack always yields an (n, 3) array, so an empty result no longer
    # breaks the column assignment below.
    summed_dbscan_matrix = np.column_stack((rows,cols,summed_matrix[rows,cols].astype(np.int64)))
    summed_df = pd.DataFrame(summed_dbscan_matrix,columns=["start","end","n_reads"])
    return summed_matrix,summed_dbscan_matrix,summed_df

def plot_matrix(
    legend_names:list,dbscan_matrices: list, color_samples: list, template_df_name: str, output: str, with_lines:bool=True,
    filename: str = "intensity_matrix_summed.png"):
    """
    Plot several intensity matrices on top of each other, one colour per matrix.

    Parameters
    ----------
    legend_names : list
        Legend label of every matrix.
    dbscan_matrices : list
        Arrays whose first three columns are start, end and read count.
    color_samples : list
        Matplotlib colour of every matrix.
    template_df_name : str
        Path of a ";"-separated table with the columns ``Fragment``, ``Start``
        and ``End``; only a fixed set of fragment names is drawn.
    output : str
        Directory the figure is written to.
    with_lines : bool, optional
        Draw a dashed red box and a label for every template fragment.
    filename : str, optional
        Name of the PNG inside ``output``. The default is the name that used to
        be hard-coded, so existing calls keep writing the same file.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10,10), dpi=500)
    legend_dots = []
    template_df = pd.read_csv(template_df_name, sep=";", header=0, index_col=None)
    template_df = template_df[template_df["Fragment"].isin(["18S", "28S", "5-8S", "5ETS", "ITS1", "ITS2", "ITS3", "3ETS"])]
    for legend_name,dbscan_matrix,color_sample in zip(legend_names,dbscan_matrices,color_samples):
        dbscan_df = pd.DataFrame(dbscan_matrix)
        intensities = dbscan_df.iloc[:, 2]
        # Opacity scales with the read count relative to the strongest cell of
        # this matrix; the +0.01 keeps the weakest cells faintly visible.
        alphas = (intensities / intensities.max() + 0.01).clip(lower=0, upper=1)
        ax.scatter(
            x=dbscan_df.iloc[:, 0], y=dbscan_df.iloc[:, 1], alpha=alphas, s=0.5, c=color_sample, label=legend_name
        )
        # Proxy handle: the legend shows an opaque dot instead of a scatter point.
        legend_dots.append(mlines.Line2D([], [], color=color_sample, marker='o', linestyle='None', markersize=5, label=legend_name))
    ax.legend(handles=legend_dots,loc='lower right')
    if with_lines:
        for template, start, end in zip(
            template_df["Fragment"], template_df["Start"], template_df["End"]
        ):
            # Square spanning [start, end] on both axes.
            ax.hlines(
                y=[start, end], xmin=start, xmax=end, color="red", linewidth=1, linestyles="dashed"
            )
            ax.vlines(
                x=[start, end], ymin=start, ymax=end, color="red", linewidth=1, linestyles="dashed"
            )
            text_x_position = int(start + ((end - start) * 0.5)) - (len(template) * 50)
            text_y_position = start - 250
            ax.text(x=text_x_position, y=text_y_position, s=template, fontsize="x-small")
    ax.set_xlabel("Start sites")
    ax.set_ylabel("End sites")
    # Save through the figure object: plt.savefig() writes whatever figure is
    # "current", which is not guaranteed to be this one.
    fig.savefig(os.path.join(output, filename), format="png",dpi=500)
    fig.show()
    
    
def plot_difference_matrix(matrix_sample:np.array,matrix_reference:np.array, condition_sample:str, condition_reference:str,template_df_name:str,with_lines:bool=True):
    """
    Plot where a sample matrix and a reference matrix differ.

    Both matrices are divided by their own maximum and subtracted
    (sample - reference). Every cell with a non-zero difference becomes one
    dot at (start, end), coloured on the diverging "PiYG" colormap.

    Parameters
    ----------
    matrix_sample, matrix_reference : numpy.ndarray
        Two-dimensional intensity matrices of identical shape.
    condition_sample, condition_reference : str
        Labels used in the legend.
    template_df_name : str
        Path of a ";"-separated table with the columns ``Fragment``, ``Start``
        and ``End``; only a fixed set of fragment names is drawn.
    with_lines : bool, optional
        Draw a dashed red box and a label for every template fragment.
    """
    def _normalise(matrix):
        # An all-zero matrix stays all-zero instead of becoming NaN (0 / 0).
        peak = matrix.max()
        return matrix / peak if peak != 0 else np.zeros_like(matrix, dtype=float)

    matrix_difference = _normalise(matrix_sample) - _normalise(matrix_reference)
    start_points, end_points = np.nonzero(matrix_difference)
    differences = matrix_difference[start_points, end_points]
    # Push every difference 0.2 away from zero, then map [-1, 1] onto [0, 1]:
    # values above 0.5 mean "higher in the sample", below 0.5 "higher in the reference".
    colour_values = (differences + np.where(differences < 0, -0.2, 0.2) + 1) / 2
    colour_values = np.clip(colour_values, 0, 1)
    # Opacity follows the distance from the neutral value 0.5. Sample-enriched
    # dots keep the opacity they had before; reference-enriched dots used to
    # fade towards full transparency and are now mirrored onto the same range.
    opacities = 0.5 + np.abs(colour_values - 0.5)

    fig, ax = plt.subplots(1, 1, figsize=(10,10), dpi=500)
    template_df = pd.read_csv(template_df_name, sep=";", header=0, index_col=None)
    template_df = template_df[template_df["Fragment"].isin(["18S", "28S", "5-8S", "5ETS", "ITS1", "ITS2", "ITS3", "3ETS"])]

    # vmin/vmax pin the neutral value 0.5 to the centre of the diverging colormap.
    ax.scatter(x=start_points, y=end_points, alpha=opacities, s=1,c=colour_values,cmap="PiYG",vmin=0,vmax=1)
    # The scatter has no labelled artists, so build the legend from proxy dots
    # in the two end colours of the colormap.
    colormap = plt.get_cmap("PiYG")
    legend_dots = [
        mlines.Line2D([], [], color=colormap(1.0), marker='o', linestyle='None', markersize=5, label=f"higher in {condition_sample}"),
        mlines.Line2D([], [], color=colormap(0.0), marker='o', linestyle='None', markersize=5, label=f"higher in {condition_reference}"),
    ]
    ax.legend(handles=legend_dots,loc='lower right')
    if with_lines:
        for template, start, end in zip(
            template_df["Fragment"], template_df["Start"], template_df["End"]
        ):
            # Square spanning [start, end] on both axes.
            ax.hlines(
                y=[start, end], xmin=start, xmax=end, color="red", linewidth=1, linestyles="dashed"
            )
            ax.vlines(
                x=[start, end], ymin=start, ymax=end, color="red", linewidth=1, linestyles="dashed"
            )
            text_x_position = int(start + ((end - start) * 0.5)) - (len(template) * 50)
            text_y_position = start - 250
            ax.text(x=text_x_position, y=text_y_position, s=template, fontsize="x-small")
    ax.set_xlabel("Start sites")
    ax.set_ylabel("End sites")
    fig.show()

def plot_unique_matrix(matrix_samples:np.array,condition_samples:list,color_samples:list,template_df_name:str,with_lines:bool=True):
    """
    Plot the (start, end) cells that are populated in exactly one condition.

    Every matrix is divided by its own maximum. A cell is kept only when it is
    non-zero in a single sample; it is drawn in that sample's colour with an
    opacity equal to its normalised intensity (+0.01, capped at 1). Cells that
    are non-zero in two or more samples are not drawn.

    One figure is written per hard-coded coordinate window to
    ``./<conditions>__<window>_intensity_matrix.svg`` (PNG at 1000 dpi for the
    "45S" window), where ``<conditions>`` are the condition names with blanks
    replaced by underscores.

    Parameters
    ----------
    matrix_samples : sequence of numpy.ndarray
        Two-dimensional intensity matrices, one per condition.
    condition_samples : list
        Legend label of every matrix; also used in the file names.
    color_samples : list
        Matplotlib colour of every matrix.
    template_df_name : str
        Path of a ";"-separated table with the columns ``Fragment``, ``Start``
        and ``End``; only a fixed set of fragment names is drawn.
    with_lines : bool, optional
        NOTE(review): currently has no effect. Fragment boxes are drawn for
        every window except "45S" whatever is passed here - this is the
        behaviour the existing calls rely on. Confirm whether the argument
        should be honoured.
    """
    template_df = pd.read_csv(template_df_name, sep=";", header=0, index_col=None)
    template_df = template_df[template_df["Fragment"].isin(["18S", "28S", "5-8S", "5ETS", "ITS1", "ITS2", "ITS3", "3ETS"])]

    normalized_samples_matrices = [matrix_sample / matrix_sample.max() for matrix_sample in matrix_samples]

    # Non-zero cells of every sample: (rows, cols, values, colour, condition).
    populated = []
    for normalized_sample_matrix,color,condition in zip(normalized_samples_matrices, color_samples, condition_samples):
        rows,cols = np.nonzero(normalized_sample_matrix)
        populated.append((rows,cols,normalized_sample_matrix[rows,cols],color,condition))

    # Encode each (row, col) pair as one integer so that cells can be compared
    # across samples; a key that occurs once overall belongs to exactly one sample.
    stride = max((int(cols.max()) + 1 for _,cols,_,_,_ in populated if cols.size), default=1)
    cell_keys = [rows.astype(np.int64) * stride + cols for rows,cols,_,_,_ in populated]
    if cell_keys:
        distinct_keys,occurrences = np.unique(np.concatenate(cell_keys), return_counts=True)
        unique_keys = distinct_keys[occurrences == 1]
    else:
        unique_keys = np.empty(0, dtype=np.int64)

    column_names = ["start","end","alpha","color","condition"]
    frames = []
    for (rows,cols,values,color,condition),keys in zip(populated,cell_keys):
        keep = np.isin(keys,unique_keys)
        n_kept = int(keep.sum())
        if n_kept == 0:
            continue
        frames.append(pd.DataFrame({
            "start": rows[keep],
            "end": cols[keep],
            "alpha": values[keep],
            "color": [color] * n_kept,
            "condition": [condition] * n_kept,
        }))
    plot_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=column_names)

    # File-name prefix: every condition name followed by "_".
    condition_name = "".join(f"{name.replace(' ','_')}_" for name in condition_samples)
    #coordinates for 5'ETS,18S,ITS1,5.8S,ITS2,28S,3'ETS,small_processome,big_processome
    coordinate_names = ["5ETS","18S","ITS1+5.8S+ITS2","28S","3ETS","small_processome","big_processome","45S"]
    coordinate_tuples = [(-400,4054),(3256,5924),(5124,8325),(7525,13300),(12563,13900),(-200,7100),(6400,13900),(-400,14000)]
    for coordinate_tuple,coordinate_name in zip(coordinate_tuples,coordinate_names):
        lower,upper = coordinate_tuple
        window_df = plot_df[(plot_df["start"] >= lower) & (plot_df["end"] <= upper)]
        start_points = np.array(window_df["start"])
        end_points = np.array(window_df["end"])
        alphas = np.minimum(window_df["alpha"].to_numpy(dtype=float) + 0.01, 1)
        dot_colors = np.array(window_df["color"])
        fig, ax = plt.subplots(1, 1, figsize=(10,10), dpi=500)
        # The full-length "45S" overview is drawn without fragment boxes.
        draw_lines = coordinate_name != "45S"
        if draw_lines:
            for template, start, end in zip(
                template_df["Fragment"], template_df["Start"], template_df["End"]
            ):
                # Only fragments that lie completely inside the window.
                if start >= lower and end <= upper:
                    ax.hlines(
                        y=[start, end], xmin=start, xmax=end, color="red", linewidth=1, linestyles="dashed", alpha = 0.08
                    )
                    ax.vlines(
                        x=[start, end], ymin=start, ymax=end, color="red", linewidth=1, linestyles="dashed", alpha = 0.08
                    )
                    # Label offset is 1 % of the window width.
                    text_x_position = int(start + ((end - start) * 0.5)) - ((upper - lower) * 0.01)
                    text_y_position = start - ((upper - lower) * 0.01)
                    ax.text(x=text_x_position, y=text_y_position, s=template, fontsize="x-small")
        ax.scatter(x=start_points, y=end_points, alpha=alphas, s=0.5, c=dot_colors)
        legend_dots = [
            mlines.Line2D([], [], color=legend_color, marker='o', linestyle='None', markersize=5, label=legend_condition)
            for legend_color,legend_condition in zip(color_samples,condition_samples)
        ]
        ax.legend(handles=legend_dots,loc='lower right')

        ax.set_xlabel("Start sites")
        ax.set_ylabel("End sites")
        ax.set_xlim(coordinate_tuple)
        ax.set_ylim(coordinate_tuple)
        if coordinate_name == "45S":
            fig.savefig(f"./{condition_name}_{coordinate_name}_intensity_matrix.png",format = "png",dpi=1000)
        else:
            fig.savefig(f"./{condition_name}_{coordinate_name}_intensity_matrix.svg",format = "svg")
    fig.show()
    #legend_dots.append(mlines.Line2D([], [], color=color_sample, marker='o', linestyle='None', markersize=5, label=legend_name))
    #ax.legend(handles=legend_dots,loc='lower right')

        
        
            

In [3]:
# Path to the table of rRNA intermediates from the literature. The plotting
# functions read it as a ";"-separated file and use its "Fragment", "Start"
# and "End" columns.
template_name = "/home/stefan/wf-nanoribolyzer/references/Literature_Fragments_and_cut_sites_RNA45SN1.csv"

In [4]:
# UTP18 data set: sum the intensity matrices of the three samples of every
# IVPA/NP x Nuc/Cyt group.
UTP18_wd = "/home/stefan/Synology/Data_nano_ribolyzer/20240314_Ribolyzer_UTP18/"

UTP18_IVPA_Nuc_samples = [
    "20240314_Ribolyzer_UTP18_R10_IVPA_Nuc",
    "20240314_Ribolyzer_UTP18_R11_IVPA_Nuc",
    "20240314_Ribolyzer_UTP18_R12_IVPA_Nuc",
]
(
    UTP18_IVPA_Nuc_matrix,
    UTP18_IVPA_Nuc_dbscan_matrix,
    UTP18_IVPA_Nuc_df,
) = create_summed_matrix(UTP18_wd, UTP18_IVPA_Nuc_samples)

UTP18_IVPA_Cyt_samples = [
    "20240314_Ribolyzer_UTP18_R7_IVPA_Cyt",
    "20240314_Ribolyzer_UTP18_R8_IVPA_Cyt",
    "20240314_Ribolyzer_UTP18_R9_IVPA_Cyt",
]
(
    UTP18_IVPA_Cyt_matrix,
    UTP18_IVPA_Cyt_dbscan_matrix,
    UTP18_IVPA_Cyt_df,
) = create_summed_matrix(UTP18_wd, UTP18_IVPA_Cyt_samples)

UTP18_NP_Nuc_samples = [
    "20240314_Ribolyzer_UTP18_R4_NP_Nuc",
    "20240314_Ribolyzer_UTP18_R5_NP_Nuc",
    "20240314_Ribolyzer_UTP18_R6_NP_Nuc",
]
(
    UTP18_NP_Nuc_matrix,
    UTP18_NP_Nuc_dbscan_matrix,
    UTP18_NP_Nuc_df,
) = create_summed_matrix(UTP18_wd, UTP18_NP_Nuc_samples)

UTP18_NP_Cyt_samples = [
    "20240314_Ribolyzer_UTP18_R1_NP_Cyt",
    "20240314_Ribolyzer_UTP18_R2_NP_Cyt",
    "20240314_Ribolyzer_UTP18_R3_NP_Cyt",
]
(
    UTP18_NP_Cyt_matrix,
    UTP18_NP_Cyt_dbscan_matrix,
    UTP18_NP_Cyt_df,
) = create_summed_matrix(UTP18_wd, UTP18_NP_Cyt_samples)



In [5]:
# Las1L data set: sum the intensity matrices of the three samples of every
# IVPA/NP x Nucleus/Cytoplasm group.
Las1L_wd = "/home/stefan/Synology/Data_nano_ribolyzer/Las1L_NanoRibolyzer/"

Las1L_IVPA_Nuc_samples = [
    "IVPA_Las1L_Nucleus1",
    "IVPA_Las1L_Nucleus2",
    "IVPA_Las1L_Nucleus3",
]
(
    Las1L_IVPA_Nuc_matrix,
    Las1L_IVPA_Nuc_dbscan_matrix,
    Las1L_IVPA_Nuc_df,
) = create_summed_matrix(Las1L_wd, Las1L_IVPA_Nuc_samples)

Las1L_IVPA_Cyt_samples = [
    "IVPA_Las1L_Cytoplasm1",
    "IVPA_Las1L_Cytoplasm2",
    "IVPA_Las1L_Cytoplasm3",
]
(
    Las1L_IVPA_Cyt_matrix,
    Las1L_IVPA_Cyt_dbscan_matrix,
    Las1L_IVPA_Cyt_df,
) = create_summed_matrix(Las1L_wd, Las1L_IVPA_Cyt_samples)

Las1L_NP_Nuc_samples = [
    "NP_Las1L_Nucleus1",
    "NP_Las1L_Nucleus2",
    "NP_Las1L_Nucleus3",
]
(
    Las1L_NP_Nuc_matrix,
    Las1L_NP_Nuc_dbscan_matrix,
    Las1L_NP_Nuc_df,
) = create_summed_matrix(Las1L_wd, Las1L_NP_Nuc_samples)

Las1L_NP_Cyt_samples = [
    "NP_Las1L_Cytoplasm1",
    "NP_Las1L_Cytoplasm2",
    "NP_Las1L_Cytoplasm3",
]
(
    Las1L_NP_Cyt_matrix,
    Las1L_NP_Cyt_dbscan_matrix,
    Las1L_NP_Cyt_df,
) = create_summed_matrix(Las1L_wd, Las1L_NP_Cyt_samples)

In [6]:
# WBSCR22 data set: sum the intensity matrices of the three samples of every
# IVPA/NP x Nucleus/Cytoplasm group.
WBSCR22_wd = "/home/stefan/Synology/Data_nano_ribolyzer/WBSCR22_NanoRibolyzer/"

WBSCR22_IVPA_Nuc_samples = [
    "IVPA_WBSCR22_Nucleus1",
    "IVPA_WBSCR22_Nucleus2",
    "IVPA_WBSCR22_Nucleus3",
]
(
    WBSCR22_IVPA_Nuc_matrix,
    WBSCR22_IVPA_Nuc_dbscan_matrix,
    WBSCR22_IVPA_Nuc_df,
) = create_summed_matrix(WBSCR22_wd, WBSCR22_IVPA_Nuc_samples)

WBSCR22_IVPA_Cyt_samples = [
    "IVPA_WBSCR22_Cytoplasm1",
    "IVPA_WBSCR22_Cytoplasm2",
    "IVPA_WBSCR22_Cytoplasm3",
]
(
    WBSCR22_IVPA_Cyt_matrix,
    WBSCR22_IVPA_Cyt_dbscan_matrix,
    WBSCR22_IVPA_Cyt_df,
) = create_summed_matrix(WBSCR22_wd, WBSCR22_IVPA_Cyt_samples)

WBSCR22_NP_Nuc_samples = [
    "NP_WBSCR22_Nucleus1",
    "NP_WBSCR22_Nucleus2",
    "NP_WBSCR22_Nucleus3",
]
(
    WBSCR22_NP_Nuc_matrix,
    WBSCR22_NP_Nuc_dbscan_matrix,
    WBSCR22_NP_Nuc_df,
) = create_summed_matrix(WBSCR22_wd, WBSCR22_NP_Nuc_samples)

WBSCR22_NP_Cyt_samples = [
    "NP_WBSCR22_Cytoplasm1",
    "NP_WBSCR22_Cytoplasm2",
    "NP_WBSCR22_Cytoplasm3",
]
(
    WBSCR22_NP_Cyt_matrix,
    WBSCR22_NP_Cyt_dbscan_matrix,
    WBSCR22_NP_Cyt_df,
) = create_summed_matrix(WBSCR22_wd, WBSCR22_NP_Cyt_samples)

In [7]:
# Ctrl data set: sum the intensity matrices of the three samples of every
# IVPA/NP x Nuc/Cyt group.
Ctrl_wd = "/home/stefan/Synology/Data_nano_ribolyzer/20240314_Ribolyzer_Ctrl/"

Ctrl_IVPA_Nuc_samples = [
    "20240314_Ribolyzer_Ctrl_R10_IVPA_Nuc",
    "20240314_Ribolyzer_Ctrl_R11_IVPA_Nuc",
    "20240314_Ribolyzer_Ctrl_R12_IVPA_Nuc",
]
(
    Ctrl_IVPA_Nuc_matrix,
    Ctrl_IVPA_Nuc_dbscan_matrix,
    Ctrl_IVPA_Nuc_df,
) = create_summed_matrix(Ctrl_wd, Ctrl_IVPA_Nuc_samples)

Ctrl_IVPA_Cyt_samples = [
    "20240314_Ribolyzer_Ctrl_R7_IVPA_Cyt",
    "20240314_Ribolyzer_Ctrl_R8_IVPA_Cyt",
    "20240314_Ribolyzer_Ctrl_R9_IVPA_Cyt",
]
(
    Ctrl_IVPA_Cyt_matrix,
    Ctrl_IVPA_Cyt_dbscan_matrix,
    Ctrl_IVPA_Cyt_df,
) = create_summed_matrix(Ctrl_wd, Ctrl_IVPA_Cyt_samples)

Ctrl_NP_Nuc_samples = [
    "20240314_Ribolyzer_Ctrl_R4_NP_Nuc",
    "20240314_Ribolyzer_Ctrl_R5_NP_Nuc",
    "20240314_Ribolyzer_Ctrl_R6_NP_Nuc",
]
(
    Ctrl_NP_Nuc_matrix,
    Ctrl_NP_Nuc_dbscan_matrix,
    Ctrl_NP_Nuc_df,
) = create_summed_matrix(Ctrl_wd, Ctrl_NP_Nuc_samples)

Ctrl_NP_Cyt_samples = [
    "20240314_Ribolyzer_Ctrl_R1_NP_Cyt",
    "20240314_Ribolyzer_Ctrl_R2_NP_Cyt",
    "20240314_Ribolyzer_Ctrl_R3_NP_Cyt",
]
(
    Ctrl_NP_Cyt_matrix,
    Ctrl_NP_Cyt_dbscan_matrix,
    Ctrl_NP_Cyt_df,
) = create_summed_matrix(Ctrl_wd, Ctrl_NP_Cyt_samples)

In [8]:
# SN1 samples: summed IVPA and NP intensity matrices (three samples each).
SN1_wd = "/home/stefan/Synology/Data_nano_ribolyzer/NP_IVPA_SN1-3_R10/"

SN1_IVPA_samples = [
    "20231123_SNExp_18Bc_IVPA_SN1_R1",
    "20231123_SNExp_18Bc_IVPA_SN1_R2",
    "20231123_SNExp_18Bc_IVPA_SN1_R3",
]
(
    SN1_IVPA_matrix,
    SN1_IVPA_dbscan_matrix,
    SN1_IVPA_df,
) = create_summed_matrix(SN1_wd, SN1_IVPA_samples)

SN1_NP_samples = [
    "20231123_SNExp_18Bc_NP_SN1_R1",
    "20231123_SNExp_18Bc_NP_SN1_R2",
    "20231123_SNExp_18Bc_NP_SN1_R3",
]
(
    SN1_NP_matrix,
    SN1_NP_dbscan_matrix,
    SN1_NP_df,
) = create_summed_matrix(SN1_wd, SN1_NP_samples)



In [9]:
# SN2 samples: summed IVPA and NP intensity matrices (three samples each).
SN2_wd = "/home/stefan/Synology/Data_nano_ribolyzer/NP_IVPA_SN1-3_R10/"

SN2_IVPA_samples = [
    "20231123_SNExp_18Bc_IVPA_SN2_R1",
    "20231123_SNExp_18Bc_IVPA_SN2_R2",
    "20231123_SNExp_18Bc_IVPA_SN2_R3",
]
(
    SN2_IVPA_matrix,
    SN2_IVPA_dbscan_matrix,
    SN2_IVPA_df,
) = create_summed_matrix(SN2_wd, SN2_IVPA_samples)

SN2_NP_samples = [
    "20231123_SNExp_18Bc_NP_SN2_R1",
    "20231123_SNExp_18Bc_NP_SN2_R2",
    "20231123_SNExp_18Bc_NP_SN2_R3",
]
(
    SN2_NP_matrix,
    SN2_NP_dbscan_matrix,
    SN2_NP_df,
) = create_summed_matrix(SN2_wd, SN2_NP_samples)

In [10]:
# SN3 samples: summed IVPA and NP intensity matrices (three samples each).
SN3_wd = "/home/stefan/Synology/Data_nano_ribolyzer/NP_IVPA_SN1-3_R10/"

SN3_IVPA_samples = [
    "20231123_SNExp_18Bc_IVPA_SN3_R1",
    "20231123_SNExp_18Bc_IVPA_SN3_R2",
    "20231123_SNExp_18Bc_IVPA_SN3_R3",
]
(
    SN3_IVPA_matrix,
    SN3_IVPA_dbscan_matrix,
    SN3_IVPA_df,
) = create_summed_matrix(SN3_wd, SN3_IVPA_samples)

SN3_NP_samples = [
    "20231123_SNExp_18Bc_NP_SN3_R1",
    "20231123_SNExp_18Bc_NP_SN3_R2",
    "20231123_SNExp_18Bc_NP_SN3_R3",
]
(
    SN3_NP_matrix,
    SN3_NP_dbscan_matrix,
    SN3_NP_df,
) = create_summed_matrix(SN3_wd, SN3_NP_samples)

In [11]:
# Nucleus, cytoplasm and whole-cell samples: summed IVPA and NP intensity
# matrices (three samples each).
general_wd = "/home/stefan/Synology/Data_nano_ribolyzer/NP_IVPA_R10"

Nuc_IVPA_samples = [
    "20230920_NucExp_IVPA_9bc_Nucleus1",
    "20230920_NucExp_IVPA_9bc_Nucleus2",
    "20230920_NucExp_IVPA_9bc_Nucleus3",
]
(
    Nuc_IVPA_matrix,
    Nuc_IVPA_dbscan_matrix,
    Nuc_IVPA_df,
) = create_summed_matrix(general_wd, Nuc_IVPA_samples)

Nuc_NP_samples = [
    "20231214_NucExp_NP_R10_Nuc_R1",
    "20231214_NucExp_NP_R10_Nuc_R2",
    "20231214_NucExp_NP_R10_Nuc_R3",
]
(
    Nuc_NP_matrix,
    Nuc_NP_dbscan_matrix,
    Nuc_NP_df,
) = create_summed_matrix(general_wd, Nuc_NP_samples)

Cyt_IVPA_samples = [
    "20230920_NucExp_IVPA_9bc_Cytoplasm1",
    "20230920_NucExp_IVPA_9bc_Cytoplasm2",
    "20230920_NucExp_IVPA_9bc_Cytoplasm3",
]
(
    Cyt_IVPA_matrix,
    Cyt_IVPA_dbscan_matrix,
    Cyt_IVPA_df,
) = create_summed_matrix(general_wd, Cyt_IVPA_samples)

Cyt_NP_samples = [
    "20231214_NucExp_NP_R10_Cyt_R1",
    "20231214_NucExp_NP_R10_Cyt_R2",
    "20231214_NucExp_NP_R10_Cyt_R3",
]
(
    Cyt_NP_matrix,
    Cyt_NP_dbscan_matrix,
    Cyt_NP_df,
) = create_summed_matrix(general_wd, Cyt_NP_samples)

Cell_IVPA_samples = [
    "20230920_NucExp_IVPA_9bc_Cell1",
    "20230920_NucExp_IVPA_9bc_Cell2",
    "20230920_NucExp_IVPA_9bc_Cell3",
]
(
    Cell_IVPA_matrix,
    Cell_IVPA_dbscan_matrix,
    Cell_IVPA_df,
) = create_summed_matrix(general_wd, Cell_IVPA_samples)

Cell_NP_samples = [
    "20231214_NucExp_NP_R10_Cell_R1",
    "20231214_NucExp_NP_R10_Cell_R2",
    "20231214_NucExp_NP_R10_Cell_R3",
]
(
    Cell_NP_matrix,
    Cell_NP_dbscan_matrix,
    Cell_NP_df,
) = create_summed_matrix(general_wd, Cell_NP_samples)

In [None]:
# Overlay: UTP18 vs. Ctrl (IVPA, Nuc).
# The figure is written to ./intensity_matrix_summed.png, the same file every
# plot_matrix() cell in this notebook writes by default.
summed_matrix_list = [
    UTP18_IVPA_Nuc_dbscan_matrix,
    Ctrl_IVPA_Nuc_dbscan_matrix,
]
colors = ["blue", "red"]
legend_names = ["UTP18 IVPA Nuc", "Ctrl IVPA Nuc"]
plot_matrix(legend_names, summed_matrix_list, colors, template_name, "./")

In [None]:
# Overlay: Las1L vs. Ctrl (IVPA, Nuc).
# The figure is written to ./intensity_matrix_summed.png, the same file every
# plot_matrix() cell in this notebook writes by default.
summed_matrix_list = [
    Las1L_IVPA_Nuc_dbscan_matrix,
    Ctrl_IVPA_Nuc_dbscan_matrix,
]
colors = ["blue", "red"]
legend_names = ["Las1L IVPA Nuc", "Ctrl IVPA Nuc"]
plot_matrix(legend_names, summed_matrix_list, colors, template_name, "./")

In [None]:
# Overlay: SN1, SN2 and SN3 (NP).
# The figure is written to ./intensity_matrix_summed.png, the same file every
# plot_matrix() cell in this notebook writes by default.
summed_matrix_list = [
    SN1_NP_dbscan_matrix,
    SN2_NP_dbscan_matrix,
    SN3_NP_dbscan_matrix,
]
colors = ["blue", "red", "green"]
legend_names = ["SN1 NP", "SN2 NP", "SN3 NP"]
plot_matrix(legend_names, summed_matrix_list, colors, template_name, "./")

In [None]:
# Overlay: SN1, SN2 and SN3 (IVPA), drawn without the fragment boxes
# (with_lines is passed as False).
# The figure is written to ./intensity_matrix_summed.png, the same file every
# plot_matrix() cell in this notebook writes by default.
summed_matrix_list = [
    SN1_IVPA_dbscan_matrix,
    SN2_IVPA_dbscan_matrix,
    SN3_IVPA_dbscan_matrix,
]
colors = ["blue", "red", "green"]
legend_names = ["SN1 IVPA", "SN2 IVPA", "SN3 IVPA"]
plot_matrix(legend_names, summed_matrix_list, colors, template_name, "./", False)

In [None]:
# Condition-specific sites, IVPA nuclear matrices: Ctrl vs. all three mutants.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_IVPA_Nuc_matrix,
        UTP18_IVPA_Nuc_matrix,
        WBSCR22_IVPA_Nuc_matrix,
        Las1L_IVPA_Nuc_matrix,
    ],
    condition_samples=["Ctrl IVPA", "UTP18 IVPA", "WBSCR22 IVPA", "Las1L IVPA"],
    color_samples=["blue", "magenta", "cyan", "orange"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, IVPA nuclear matrices: Ctrl vs. UTP18.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_IVPA_Nuc_matrix,
        UTP18_IVPA_Nuc_matrix,
    ],
    condition_samples=["Ctrl IVPA", "UTP18 IVPA"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Ctrl IVPA nuclear matrix on its own: with a single sample every populated
# site counts as condition-specific.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_IVPA_Nuc_matrix,
    ],
    condition_samples=["Ctrl IVPA"],
    color_samples=["blue"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, IVPA nuclear matrices: Ctrl vs. WBSCR22.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_IVPA_Nuc_matrix,
        WBSCR22_IVPA_Nuc_matrix,
    ],
    condition_samples=["Ctrl IVPA", "WBSCR22 IVPA"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, IVPA nuclear matrices: Ctrl vs. Las1L.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_IVPA_Nuc_matrix,
        Las1L_IVPA_Nuc_matrix,
    ],
    condition_samples=["Ctrl IVPA", "Las1L IVPA"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)
# NOTE(review): stray hex colour code left at the end of this cell -
# presumably a candidate palette colour; confirm or delete.
#4285F4

In [None]:
# Condition-specific sites, NP nuclear matrices: Ctrl vs. all three mutants.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_NP_Nuc_matrix,
        UTP18_NP_Nuc_matrix,
        WBSCR22_NP_Nuc_matrix,
        Las1L_NP_Nuc_matrix,
    ],
    condition_samples=["Ctrl NP", "UTP18 NP", "WBSCR22 NP", "Las1L NP"],
    color_samples=["blue", "magenta", "cyan", "orange"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, NP nuclear matrices: Ctrl vs. UTP18.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_NP_Nuc_matrix,
        UTP18_NP_Nuc_matrix,
    ],
    condition_samples=["Ctrl NP", "UTP18 NP"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, NP nuclear matrices: Ctrl vs. WBSCR22.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_NP_Nuc_matrix,
        WBSCR22_NP_Nuc_matrix,
    ],
    condition_samples=["Ctrl NP", "WBSCR22 NP"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)

In [None]:
# Condition-specific sites, NP nuclear matrices: Ctrl vs. Las1L.
# Note: plot_unique_matrix() overrides with_lines internally, so the value
# passed here does not change the figures.
plot_unique_matrix(
    matrix_samples=[
        Ctrl_NP_Nuc_matrix,
        Las1L_NP_Nuc_matrix,
    ],
    condition_samples=["Ctrl NP", "Las1L NP"],
    color_samples=["blue", "red"],
    template_df_name=template_name,
    with_lines=False,
)

In [158]:
#Boxplot function
import pysam
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
def boxplot_bamstats(bamfile_paths:list,exp_conditions:list,reference_path:str,literature_mod_df_path:str):
    """
    Collect per-read alignment statistics at literature modification sites.

    For every non-supplementary, mapped read of every BAM file, the aligned
    pairs are scanned and each pair that falls on a modification site lying
    completely inside the read's reference span is classified as insertion,
    deletion, mismatch or match. A pair is attributed to the first matching
    site only. Base qualities are collected for every classified pair that has
    a query base. One output row is produced per (read, modification type)
    with at least one classified pair.

    The resulting table is also written to ``./bamstats.csv`` (";"-separated).

    Parameters
    ----------
    bamfile_paths : list
        Paths of the BAM files; paired element-wise with ``exp_conditions``.
    exp_conditions : list
        Label stored in the ``experimental_condition`` column for each BAM file.
    reference_path : str
        Path of the reference FASTA. It is opened and its first record is
        fetched, but the sequence is not used by the statistics.
    literature_mod_df_path : str
        Tab-separated, header-less table with seven columns; columns 2-4 are
        read as ``start``, ``end`` and ``modification``.

    Returns
    -------
    pandas.DataFrame
        Counts and frequencies of insertions, deletions, mismatches and
        matches, the mean base quality, the modification type, the read id and
        the experimental condition.

    Notes
    -----
    ``get_aligned_pairs(with_seq=True)`` is used to tell matches (upper-case
    reference base) from mismatches (lower-case); according to the pysam
    documentation this needs the MD tag on the reads.
    Reads on which a ``TypeError`` is raised (for example when
    ``query_qualities`` is ``None``) are skipped entirely.
    """
    literature_mod_df = pd.read_csv(literature_mod_df_path ,sep="\t",header=None,index_col=None) #e.g. "/home/stefan/wf-nanoribolyzer/references/rRNA_modifications_conv.bed"
    literature_mod_df.columns = ["reference","start","end","modification","A","B","C"]
    # Identical for every read, so computed once.
    modification_types = np.unique(literature_mod_df["modification"])

    # Kept so that an unreadable reference still fails early, as before; the
    # handle is now closed again.
    with pysam.FastaFile(reference_path) as fasta_file:
        fasta_file.fetch(fasta_file.references[0])

    # Sentinel for "no reference position seen since the last classified pair".
    no_ref_pos = 1000000000
    list_of_modification_ratios = []
    for bamfile_path,exp_condition in zip(bamfile_paths,exp_conditions):
        with pysam.AlignmentFile(bamfile_path, mode="rb") as bamfile:
            for read in tqdm(bamfile.fetch(until_eof=True)):
                # Unmapped reads have no reference span and can never overlap a site.
                if read.is_supplementary or read.is_unmapped:
                    continue
                # Per modification type:
                # [n classified, insertions, deletions, mismatches, matches, base qualities]
                modification_dict = {modification: [0,0,0,0,0,[]] for modification in modification_types}
                try:
                    aligned_pairs = read.get_aligned_pairs(with_seq=True)
                    # Only sites that lie completely inside the read's reference span.
                    temp_mod_df = literature_mod_df[literature_mod_df["start"] >= read.reference_start]
                    temp_mod_df = temp_mod_df[temp_mod_df["end"] <= read.reference_end]
                    candidate_sites = list(zip(temp_mod_df["start"],temp_mod_df["end"],temp_mod_df["modification"]))
                    if not candidate_sites:
                        continue
                    first_site_start = temp_mod_df["start"].min()
                    last_site_end = temp_mod_df["end"].max()
                    last_valid_ref_pos = no_ref_pos
                    for query_pos, ref_pos, base in aligned_pairs:
                        if ref_pos is not None:
                            last_valid_ref_pos = ref_pos
                            # Outside the range covered by any candidate site.
                            if ref_pos > last_site_end or ref_pos < first_site_start:
                                continue
                        for start,end,modification in candidate_sites:
                            if query_pos is None and ref_pos is None:
                                break
                            elif ref_pos is None and query_pos is not None:
                                # Insertion: located via the last reference position seen,
                                # with one base of tolerance around the site.
                                if start - 1 <= last_valid_ref_pos <= end + 1:
                                    quality = read.query_qualities[query_pos]
                                    modification_dict[modification][5].append(float(quality))
                                    modification_dict[modification][0] = modification_dict[modification][0] + 1
                                    modification_dict[modification][1] = modification_dict[modification][1] + 1
                                    last_valid_ref_pos = no_ref_pos
                                    break
                            elif query_pos is None and ref_pos is not None:
                                if start <= ref_pos <= end:
                                    # Deletion: no query base, hence no quality value.
                                    modification_dict[modification][0] = modification_dict[modification][0] + 1
                                    modification_dict[modification][2] = modification_dict[modification][2] + 1
                                    last_valid_ref_pos = no_ref_pos
                                    break
                            elif query_pos is not None and ref_pos is not None:
                                if start <= ref_pos <= end:
                                    modification_dict[modification][0] = modification_dict[modification][0] + 1
                                    quality = read.query_qualities[query_pos]
                                    modification_dict[modification][5].append(float(quality))
                                    if str(base).islower():
                                        # Mismatch
                                        modification_dict[modification][3] = modification_dict[modification][3] + 1
                                        last_valid_ref_pos = no_ref_pos
                                        break
                                    elif str(base).isupper():
                                        # Match
                                        modification_dict[modification][4] = modification_dict[modification][4] + 1
                                        last_valid_ref_pos = no_ref_pos
                                        break
                except TypeError:
                    # Skip reads with missing alignment information.
                    continue
                for modification in modification_types:
                    n_classified,n_insertion,n_deletion,n_missmatch,n_match,qualities = modification_dict[modification]
                    if n_classified != 0:
                        list_of_modification_ratios.append([
                            n_classified,
                            n_insertion,
                            n_insertion/n_classified,
                            n_deletion,
                            n_deletion/n_classified,
                            n_missmatch,
                            n_missmatch/n_classified,
                            n_match,
                            n_match/n_classified,
                            # Deletion-only sites have no qualities; NaN without the numpy warning.
                            np.mean(np.array(qualities)) if qualities else np.nan,
                            modification,
                            read.query_name,
                            exp_condition
                        ])
    modification_ratio_df = pd.DataFrame(list_of_modification_ratios,
                                         columns=
                                            [
                                             "n_modification",
                                             "insertion",
                                             "insertion_freq",
                                             "deletion",
                                             "deletion_freq",
                                             "missmatch",
                                             "missmatch_freq",
                                             "match",
                                             "match_freq",
                                             "mean_quality",
                                             "modification_type",
                                             "read_id",
                                             "experimental_condition"
                                             ]
                                         )
    modification_ratio_df.to_csv("./bamstats.csv",sep=";",header=True,index=None)
    return modification_ratio_df

def plot_boxplot(modification_df:pd.DataFrame):
    """
    Draw violin and box plots of the alignment statistics per modification type.

    The figure has one row per distinct ``modification_type`` and one column
    for each of ``mean_quality``, ``insertion_freq``, ``deletion_freq``,
    ``missmatch_freq`` and ``match_freq``; within a panel the values are
    grouped by ``experimental_condition``. The figure is saved as
    ``quality_indel_match_boxplot.svg`` in the working directory and shown.

    Parameters
    ----------
    modification_df : pandas.DataFrame
        Table with the columns named above, as returned by ``boxplot_bamstats``.
    """
    modification_types = np.unique(modification_df["modification_type"])
    metrics = ["mean_quality","insertion_freq","deletion_freq","missmatch_freq","match_freq"]
    # squeeze=False keeps `axs` two-dimensional even for a single modification
    # type; otherwise axs[row, col] raises an IndexError in that case.
    fig,axs = plt.subplots(nrows=len(modification_types),ncols=len(metrics),figsize = (40,40),squeeze=False)
    for row,modification_type in enumerate(modification_types):
        temp_modification_df = modification_df[modification_df["modification_type"] == modification_type]
        for col,metric in enumerate(metrics):
            temp_ax = axs[row,col]
            sns.violinplot(x=temp_modification_df["experimental_condition"],y=temp_modification_df[metric],hue=temp_modification_df["experimental_condition"],ax=temp_ax,alpha=0.5)
            # whis=(0, 100): whiskers span the full data range.
            sns.boxplot(x=temp_modification_df["experimental_condition"],y=temp_modification_df[metric],hue=temp_modification_df["experimental_condition"],ax=temp_ax,whis=(0, 100))
            metric_label = metric.replace("_"," ")
            temp_ax.set_xlabel("")
            if metric == "mean_quality":
                temp_ax.set_ylim((-5,55))
            else:
                temp_ax.set_ylim((-0.2,1.2))
            # The first column also carries the modification type as row title.
            if col == 0:
                temp_ax.set_ylabel(f"{modification_type}\n{metric_label}")
            else:
                temp_ax.set_ylabel(metric_label)
    fig.savefig("quality_indel_match_boxplot.svg",format="svg")
    plt.show()

In [None]:
# `modification_ratio_df` is not assigned by any cell of this notebook
# (boxplot_bamstats() is defined but never called), so on a fresh kernel this
# cell raised a NameError. Reload the table that boxplot_bamstats() writes to
# ./bamstats.csv instead.
modification_ratio_df = pd.read_csv("./bamstats.csv", sep=";", header=0)
plot_boxplot(modification_ratio_df)