In [2]:
#Specific Modification Analysis of m62A and m1acp3psu
import read_raw_current_class as rrc
from pathlib import Path
import pod5
import remora
from remora import io, refine_signal_map, util
import numpy as np
import argparse
import pysam
from tqdm import tqdm
import math
from itertools import repeat
import json
import re
import polars as pl
from multiprocessing import Process, Queue
import matplotlib.pyplot as plt
import os
import pandas as pd
import h5py



def extract_reference_coordinate_signals_for_PCA(
    pod5_dr,
    df,
    bam_fh,
    kmer_table,
    sig_map_refiner,
    coordinate,
    bases_upstream,
    bases_downstream,
    ref_fragment,
    reference_path,
    condition,
    read_ids,
    mean_signal_df,
    dwell_time_df
):
    """Collect per-read signal and dwell-time rows around one reference coordinate.

    For every row of ``df`` whose ``Fragment`` equals ``ref_fragment``, the read
    ids listed in its ``IDS`` column are intersected with ``read_ids`` and each
    remaining read is passed through ``rrc.RawCurrentReadDataset``. Reads whose
    extracted window has exactly ``1 + bases_upstream + bases_downstream``
    positions contribute one row to each output table; all other reads are
    skipped.

    Args:
        pod5_dr: pod5 reader handed to ``rrc.RawCurrentReadDataset``.
        df: table with an ``IDS`` column (string form of a list of read ids)
            and a ``Fragment`` column.
        bam_fh: indexed BAM handle handed to ``rrc.RawCurrentReadDataset``.
        kmer_table: k-mer level table handed to ``rrc.RawCurrentReadDataset``.
        sig_map_refiner: signal-map refiner handed to ``rrc.RawCurrentReadDataset``.
        coordinate: reference coordinate at the centre of the window.
        bases_upstream: window size on one side of ``coordinate``.
        bases_downstream: window size on the other side of ``coordinate``.
        ref_fragment: value of ``df["Fragment"]`` to select.
        reference_path: kept for backward compatibility; not used by the
            extraction.
        condition: label stored in the ``condition`` column of every row.
        read_ids: read ids available in ``pod5_dr``; their order defines the
            processing order.
        mean_signal_df: signal table to extend, or ``None`` to start a new one.
        dwell_time_df: dwell-time table to extend, or ``None`` to start a new one.

    Returns:
        ``(mean_signal_df, dwell_time_df)``: polars frames with columns
        ``"1" .. "<window length>"`` (Float64) plus ``"condition"`` (Utf8).
    """
    record_length = 1 + bases_upstream + bases_downstream
    columns = [f"{i+1}" for i in range(record_length)]
    columns.append("condition")
    schema = {col: pl.Float64 if col != "condition" else pl.Utf8 for col in columns}

    # Only the first 20000 candidate reads of the "IVT 18S" sample are attempted.
    ivt_condition = "IVT 18S"
    ivt_read_limit = 20000

    dwell_rows = []
    signal_rows = []
    for fragment_id_list, fragment in zip(df["IDS"], df["Fragment"]):
        if fragment != ref_fragment:
            continue
        # NOTE(review): eval() executes whatever text the CSV holds. It is kept
        # because the exact on-disk format of "IDS" is not visible here; switch
        # to ast.literal_eval once confirmed to be a plain list literal.
        fragment_ids = set(eval(fragment_id_list))
        # Keep the order of `read_ids`, restricted to ids of this fragment.
        valid_read_ids = [read_id for read_id in read_ids if read_id in fragment_ids]
        for iteration, single_id in tqdm(
            enumerate(valid_read_ids), total=len(valid_read_ids)
        ):
            if condition == ivt_condition and iteration >= ivt_read_limit:
                break
            dataset = rrc.RawCurrentReadDataset(
                id=single_id,
                pod5_dr=pod5_dr,
                bam_fh=bam_fh,
                kmer_table=kmer_table,
                sig_map_refiner=sig_map_refiner,
            )
            try:
                # The window used to be hard-coded to (10, 10), silently
                # ignoring the function arguments.
                # NOTE(review): assumes the helper takes
                # (coordinate, bases_upstream, bases_downstream) - confirm.
                (
                    ref_motifs,
                    ref_signals,
                    ref_mean_signals,
                    ref_trimmean_signals,
                    ref_dwell_signals,
                ) = dataset.extract_signal_reference_coordinates(
                    coordinate, bases_upstream, bases_downstream
                )
                if len(ref_motifs) != record_length:
                    continue
                # Build both rows before appending either one so a failure
                # cannot leave the two tables with different row counts.
                dwell_row = list(ref_dwell_signals) + [condition]
                signal_row = list(ref_trimmean_signals) + [condition]
            except (IndexError, TypeError):
                # Best effort: reads the helper cannot process are skipped.
                continue
            dwell_rows.append(dwell_row)
            signal_rows.append(signal_row)

    # orient="row" states explicitly that each inner list is one record.
    temp_dwell_time_df = pl.DataFrame(dwell_rows, schema=schema, orient="row")
    temp_signal_df = pl.DataFrame(signal_rows, schema=schema, orient="row")

    if dwell_time_df is not None:
        dwell_time_df = dwell_time_df.vstack(temp_dwell_time_df)
    else:
        dwell_time_df = temp_dwell_time_df
    if mean_signal_df is not None:
        mean_signal_df = mean_signal_df.vstack(temp_signal_df)
    else:
        mean_signal_df = temp_signal_df
    return mean_signal_df, dwell_time_df


In [None]:
# m1acp3psU: collect signal and dwell-time rows in a window around reference
# coordinate 4901 for every sample, then persist both tables as CSV.
coordinate = 4901
bases_upstream = 10
bases_downstream = 10
reference_path = "~/wf-nanoribolyzer/references/RNA45SN1.fasta"
name_modification = "m1acp3psU"
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"

ref_fragments = [ "21S", "21S-C", "18S-E"]
# Not used by the extraction itself; kept because the name was defined here before.
colors = [
    "indigo",
    "palevioletred",
    "deepskyblue"
]

# One entry per data set, in the order the rows are stacked:
# (pod5 path, fragment table path, BAM path, [(condition label, fragment), ...])
m1acp3psU_samples = [
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVPA Cytoplasm 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVPA Nucleus 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Cyt/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [("NP Cytoplasm 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [("NP Nucleus 18S", "18S")],
    ),
    (
        "~/directRNA_004/IVT_18S/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/IVT_18S/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/IVT_18S/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVT 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [(f"IVPA Nucleus {fragment}", fragment) for fragment in ref_fragments],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [(f"NP Nucleus {fragment}", fragment) for fragment in ref_fragments],
    ),
    (
        "~/directRNA_004/TSR_KO/filtered_pod5/filtered.pod5",
        "~/directRNA_004/TSR_KO/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/TSR_KO/filtered_pod5/filtered_pod5_rebasecalled.bam",
        [("TSR KO 18S", "18S")],
    ),
]

# Starting from None makes the first call create the tables; later calls append.
mean_signal_df = None
dwell_time_df = None
for pod5_path, fragment_csv_path, bam_path, targets in m1acp3psU_samples:
    pod5_dr = pod5.DatasetReader(pod5_path)
    read_ids = list(pod5_dr.read_ids)
    df = pl.read_csv(fragment_csv_path, separator=";")
    bam_fh = io.ReadIndexedBam(bam_path)
    sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table, do_rough_rescale=True, scale_iters=1, do_fix_guage=True)
    for condition, ref_fragment in targets:
        mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
            pod5_dr,
            df,
            bam_fh,
            kmer_table,
            sig_map_refiner,
            coordinate,
            bases_upstream,
            bases_downstream,
            ref_fragment,
            reference_path,
            condition,
            read_ids,
            mean_signal_df,
            dwell_time_df,
        )

m1acp3psU_mean_signal_df = mean_signal_df
m1acp3psU_dwell_time_df = dwell_time_df
# The dwell-time export previously referenced the undefined name
# `m7F_dwell_time_df`, which raised NameError before any CSV was written.
m1acp3psU_dwell_time_df.write_csv("./m1acp3psu_PCA_dwell_time.csv",separator = ";", include_header=True)
m1acp3psU_mean_signal_df.write_csv("./m1acp3psu_PCA_mean_signal.csv",separator = ";", include_header=True)

In [30]:
# Reload the persisted m1acp3psU tables so the cells below do not depend on
# re-running the extraction.
m1acp3psU_mean_signal_df = pl.read_csv("./m1acp3psu_PCA_mean_signal.csv",separator = ";", has_header=True)
m1acp3psU_dwell_time_df = pl.read_csv("./m1acp3psu_PCA_dwell_time.csv",separator = ";", has_header=True)

# Reads per condition. The helper `count` column lives only inside this chain:
# storing it back on m1acp3psU_mean_signal_df would hand the PCA plots an extra
# constant feature, because they keep every column except "condition".
grouped_m1acp3psU_mean_signal_df = (
    m1acp3psU_mean_signal_df
    .with_columns(count=pl.lit(1, dtype=pl.Int64))
    .group_by("condition")
    .agg(pl.col("count").sum())
    .sort("condition")  # group_by order is not guaranteed; sort for a stable CSV
)
grouped_m1acp3psU_mean_signal_df.write_csv("./grouped_m1acp3psU_counts.csv",separator = ";", include_header=True)
grouped_m1acp3psU_mean_signal_df

In [5]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
import matplotlib.lines as mlines



def _pc_axis_label(pca, component):
    """Return the axis label ``PC<k>:<p>%`` for one component of a fitted PCA."""
    # Scale to percent before rounding: int(round(r, 2) * 100) truncates values
    # such as 0.29 * 100 == 28.999999999999996 down to 28.
    percent = int(round(float(pca.explained_variance_ratio_[component]) * 100))
    return f'PC{component + 1}:{percent}%'


def _project_held_out_reads(signal_df, colors, alphas, conditions):
    """Fit impute -> scale -> PCA(20) -> scale on a 90 % split and project the rest.

    Rows of ``signal_df`` whose ``condition`` is in ``conditions`` are split
    90/10 with a fixed seed. The transformers are fitted on the larger part and
    applied to the held-out part, of which only rows whose condition has a
    non-zero entry in ``alphas`` are returned.

    Returns:
        ``(scores, labels, color_dict, pca)``: the standardised PCA scores of
        the shown held-out rows, their condition labels, the condition -> colour
        mapping and the fitted PCA object.
    """
    selected_df = signal_df.filter(pl.col("condition").is_in(conditions))
    labels = selected_df["condition"]
    features = selected_df.select(pl.exclude("condition")).to_numpy()
    X_train, X_test, _, y_test = train_test_split(features, labels, test_size=0.1, random_state=12)

    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    pca = PCA(n_components=20)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Second standardisation, applied to the component scores themselves.
    score_scaler = StandardScaler()
    score_scaler.fit(X_train)
    X_test = score_scaler.transform(X_test)

    # zip() stops at the shortest of the three lists, as before.
    color_dict = {}
    alpha_dict = {}
    for color, alpha, condition in zip(colors, alphas, conditions):
        color_dict[condition] = color
        alpha_dict[condition] = float(alpha)

    held_out_labels = list(y_test)
    # A boolean mask keeps the score matrix 2-D even when no row is shown.
    shown = np.array([alpha_dict[label] != 0 for label in held_out_labels], dtype=bool)
    return X_test[shown], np.array(held_out_labels)[shown], color_dict, pca


def plot_pca_raw_currents(signal_df:pl.DataFrame,colors:list,alphas:list,conditions:list):
    """Draw a KDE joint plot of PC1 vs PC2 for held-out reads.

    Args:
        signal_df: one row per read, feature columns plus a ``condition`` column.
        colors: colour per entry of ``conditions``.
        alphas: per-condition switch; a condition with value 0 still takes part
            in fitting but is not drawn.
        conditions: condition labels to include in the fit.

    Returns:
        The seaborn joint grid returned by ``sns.jointplot``.
    """
    X_test, y_test, color_dict, pca = _project_held_out_reads(signal_df, colors, alphas, conditions)
    sns.set_style("white")

    fig = sns.jointplot(x=X_test[:,0],y=X_test[:,1],hue=y_test,palette=color_dict,fill=True, alpha = 0.4 ,kind="kde",marginal_ticks=False,levels=5,thresh=0.3,marginal_kws=dict(fill=False))
    fig.plot_joint(sns.kdeplot,color="b",zorder=0,levels=5,thresh=0.1)
    fig.plot_marginals(sns.kdeplot, color="b",zorder=0)
    fig.set_axis_labels(_pc_axis_label(pca, 0), _pc_axis_label(pca, 1))
    fig.ax_marg_x.set_ylim(0,1)
    fig.ax_marg_y.set_xlim(0,1)
    fig.ax_joint.set_xlim(-4.2,4.2)
    fig.ax_joint.set_ylim(-4.2,4.2)

    return fig


def plot_pca_raw_currents_scatter(signal_df:pl.DataFrame,colors:list,alphas:list,conditions:list):
    """Draw a scatter joint plot of PC1 vs PC2 for held-out reads.

    Same inputs and preprocessing as ``plot_pca_raw_currents``; the joint axes
    show individual reads and span -10..10.

    Returns:
        The seaborn joint grid returned by ``sns.jointplot``.
    """
    X_test, y_test, color_dict, pca = _project_held_out_reads(signal_df, colors, alphas, conditions)
    sns.set_style("white")

    fig = sns.jointplot(x=X_test[:,0],y=X_test[:,1],hue=y_test,palette=color_dict, alpha = 0.4)
    fig.plot_marginals(sns.kdeplot, color="b",zorder=0)
    fig.set_axis_labels(_pc_axis_label(pca, 0), _pc_axis_label(pca, 1))
    fig.ax_marg_x.set_ylim(0,1)
    fig.ax_marg_y.set_xlim(0,1)
    fig.ax_joint.set_xlim(-10,10)
    fig.ax_joint.set_ylim(-10,10)

    return fig

In [None]:
modification = "m1acp3psU"

# Colour and condition order shared by every m1acp3psU PCA figure. The third
# argument of each call is a 0/1 mask aligned with m1acp3psU_plot_conditions
# that selects which conditions are drawn.
m1acp3psU_plot_colors = ["darkred","navy","indianred","royalblue","indigo","palevioletred","deepskyblue","gold","black"]
m1acp3psU_plot_conditions = ["IVPA Cytoplasm 18S","IVPA Nucleus 18S","NP Cytoplasm 18S","NP Nucleus 18S","IVPA Nucleus 21S","IVPA Nucleus 21S-C","IVPA Nucleus 18S-E","IVT 18S","TSR KO 18S"]

# NP Cytoplasm / NP Nucleus / IVT / TSR KO
fig1 = plot_pca_raw_currents(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [0,0,1,1,0,0,0,1,1], m1acp3psU_plot_conditions)
fig2 = plot_pca_raw_currents(m1acp3psU_dwell_time_df, m1acp3psU_plot_colors, [0,0,1,1,0,0,0,1,1], m1acp3psU_plot_conditions)

# IVPA Cytoplasm / IVPA Nucleus / IVT
fig3 = plot_pca_raw_currents(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [1,1,0,0,0,0,0,1,0], m1acp3psU_plot_conditions)
fig4 = plot_pca_raw_currents(m1acp3psU_dwell_time_df, m1acp3psU_plot_colors, [1,1,0,0,0,0,0,1,0], m1acp3psU_plot_conditions)

# IVPA Cytoplasm / IVT (these two figures are not written to disk below)
fig5 = plot_pca_raw_currents(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [1,0,0,0,0,0,0,1,0], m1acp3psU_plot_conditions)
fig6 = plot_pca_raw_currents(m1acp3psU_dwell_time_df, m1acp3psU_plot_colors, [1,0,0,0,0,0,0,1,0], m1acp3psU_plot_conditions)

# IVPA Nucleus 21S / 21S-C / 18S-E / IVT
fig7 = plot_pca_raw_currents(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [0,0,0,0,1,1,1,1,0], m1acp3psU_plot_conditions)
fig8 = plot_pca_raw_currents(m1acp3psU_dwell_time_df, m1acp3psU_plot_colors, [0,0,0,0,1,1,1,1,0], m1acp3psU_plot_conditions)

# Scatter variants
fig9 = plot_pca_raw_currents_scatter(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [0,0,1,1,0,0,0,1,1], m1acp3psU_plot_conditions)
fig10 = plot_pca_raw_currents_scatter(m1acp3psU_mean_signal_df, m1acp3psU_plot_colors, [1,1,0,0,0,0,0,1,1], m1acp3psU_plot_conditions)

# NOTE(review): the fig9 stem spells "scattter", and the fig10 stem says
# "IVPA_Nuc_NP_Cyt" although its mask shows IVPA Cytoplasm, IVPA Nucleus, IVT
# and TSR KO - confirm the intended file names before renaming.
for figure, stem in [
    (fig1, "TSR_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig2, "TSR_KO_NP_Nuc_NP_Cyt_IVT_18S_dwell_time_PCA"),
    (fig3, "IVPA_Nuc_IVPA_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig4, "IVPA_Nuc_IVPA_Cyt_IVT_18S_dwell_time_PCA"),
    (fig7, "IVPA_intermediates_Nuc_18S_IVT_18S_norm_current_signal_PCA"),
    (fig8, "IVPA_intermediates_Nuc_18S_IVT_18S_dwell_time_PCA"),
    (fig9, "scattter_TSR_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig10, "scatter_IVPA_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
]:
    figure.savefig(f"~/Figures_Tamer/{modification}_{stem}.svg",format="svg")

In [None]:
# m7G: collect signal and dwell-time rows in a window around reference
# coordinate 5292 for every sample, then persist both tables as CSV.
coordinate = 5292
name_modification = "m7G"
bases_upstream = 10
bases_downstream = 10
reference_path = "~/wf-nanoribolyzer/references/RNA45SN1.fasta"
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"

ref_fragments = [ "21S", "21S-C", "18S-E"]
# Not used by the extraction itself; kept because the name was defined here before.
colors = [
    "indigo",
    "palevioletred",
    "deepskyblue"
]

# One entry per data set, in the order the rows are stacked:
# (pod5 path, fragment table path, BAM path, [(condition label, fragment), ...])
m7G_samples = [
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVPA Cytoplasm 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVPA Nucleus 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Cyt/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [("NP Cytoplasm 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [("NP Nucleus 18S", "18S")],
    ),
    (
        "~/directRNA_004/IVT_18S/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/IVT_18S/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/IVT_18S/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [("IVT 18S", "18S")],
    ),
    (
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam",
        [(f"IVPA Nucleus {fragment}", fragment) for fragment in ref_fragments],
    ),
    (
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam",
        [(f"NP Nucleus {fragment}", fragment) for fragment in ref_fragments],
    ),
    (
        "~/directRNA_004/WBSCR_KO/filtered_pod5/filtered.pod5",
        "~/directRNA_004/WBSCR_KO/template_based_analysis/template_fragment_df.csv",
        "~/directRNA_004/WBSCR_KO/filtered_pod5/filtered_pod5_rebasecalled.bam",
        [("WBSCR KO 18S", "18S")],
    ),
]

# Starting from None makes the first call create the tables; later calls append.
mean_signal_df = None
dwell_time_df = None
for pod5_path, fragment_csv_path, bam_path, targets in m7G_samples:
    pod5_dr = pod5.DatasetReader(pod5_path)
    read_ids = list(pod5_dr.read_ids)
    df = pl.read_csv(fragment_csv_path, separator=";")
    bam_fh = io.ReadIndexedBam(bam_path)
    sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)
    for condition, ref_fragment in targets:
        mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
            pod5_dr,
            df,
            bam_fh,
            kmer_table,
            sig_map_refiner,
            coordinate,
            bases_upstream,
            bases_downstream,
            ref_fragment,
            reference_path,
            condition,
            read_ids,
            mean_signal_df,
            dwell_time_df,
        )

m7G_mean_signal_df = mean_signal_df
m7G_dwell_time_df = dwell_time_df
m7G_dwell_time_df.write_csv("./m7G_PCA_dwell_time.csv",separator = ";", include_header=True)
m7G_mean_signal_df.write_csv("./m7G_PCA_mean_signal.csv",separator = ";", include_header=True)

In [29]:
# Reload the persisted m7G tables so the cells below do not depend on
# re-running the extraction.
m7G_mean_signal_df = pl.read_csv("./m7G_PCA_mean_signal.csv",separator = ";", has_header=True)
m7G_dwell_time_df = pl.read_csv("./m7G_PCA_dwell_time.csv",separator = ";", has_header=True)

# Reads per condition. The helper `count` column lives only inside this chain:
# storing it back on m7G_mean_signal_df would hand the PCA plots an extra
# constant feature, because they keep every column except "condition".
grouped_m7G_mean_signal_df = (
    m7G_mean_signal_df
    .with_columns(count=pl.lit(1, dtype=pl.Int64))
    .group_by("condition")
    .agg(pl.col("count").sum())
    .sort("condition")  # group_by order is not guaranteed; sort for a stable CSV
)
grouped_m7G_mean_signal_df.write_csv("./grouped_m7G_counts.csv",separator = ";", include_header=True)
grouped_m7G_mean_signal_df.head(40)

In [None]:

# Colours and condition labels shared by every m7G PCA figure. Each call gets
# its own list copy so the plotting helpers never share a mutable argument.
m7G_pca_colors = ["darkred", "navy", "indianred", "royalblue", "indigo", "palevioletred", "deepskyblue", "gold", "black"]
m7G_pca_conditions = [
    "IVPA Cytoplasm 18S", "IVPA Nucleus 18S", "NP Cytoplasm 18S", "NP Nucleus 18S",
    "IVPA Nucleus 21S", "IVPA Nucleus 21S-C", "IVPA Nucleus 18S-E", "IVT 18S", "WBSCR KO 18S",
]
# The 0/1 lists are passed through exactly as before; presumably one flag per
# condition label -- TODO confirm against plot_pca_raw_currents.

# NP nucleus / NP cytoplasm / IVT / WBSCR KO
fig1 = plot_pca_raw_currents(m7G_mean_signal_df, list(m7G_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m7G_pca_conditions))
fig2 = plot_pca_raw_currents(m7G_dwell_time_df, list(m7G_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m7G_pca_conditions))

# IVPA cytoplasm / IVPA nucleus / IVT
fig3 = plot_pca_raw_currents(m7G_mean_signal_df, list(m7G_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 0], list(m7G_pca_conditions))
fig4 = plot_pca_raw_currents(m7G_dwell_time_df, list(m7G_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 0], list(m7G_pca_conditions))

# IVPA cytoplasm / IVT. As in the original calls, these two receive nine colours
# but only eight flags and eight labels (the KO label is left out); not saved below.
fig5 = plot_pca_raw_currents(m7G_mean_signal_df, list(m7G_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], m7G_pca_conditions[:-1])
fig6 = plot_pca_raw_currents(m7G_dwell_time_df, list(m7G_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], m7G_pca_conditions[:-1])

# IVPA nucleus intermediates (21S, 21S-C, 18S-E) / IVT
fig7 = plot_pca_raw_currents(m7G_mean_signal_df, list(m7G_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1, 0], list(m7G_pca_conditions))
fig8 = plot_pca_raw_currents(m7G_dwell_time_df, list(m7G_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1, 0], list(m7G_pca_conditions))

# Scatter variants on the mean-signal table
fig9 = plot_pca_raw_currents_scatter(m7G_mean_signal_df, list(m7G_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m7G_pca_conditions))
fig10 = plot_pca_raw_currents_scatter(m7G_mean_signal_df, list(m7G_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 1], list(m7G_pca_conditions))

modification = "m7G"
# matplotlib opens the target path literally, so "~" has to be expanded here;
# the directory is created on demand so a fresh checkout can run the cell.
m7G_figure_dir = os.path.expanduser("~/Figures_Tamer")
os.makedirs(m7G_figure_dir, exist_ok=True)
# NOTE(review): fig10's flags select the IVPA 18S conditions while its file
# name says "NP_Cyt" -- file names are kept unchanged; confirm the intended label.
for pca_fig, file_stem in (
    (fig1, "WBSCR22_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig2, "WBSCR22_KO_NP_Nuc_NP_Cyt_IVT_18S_dwell_time_PCA"),
    (fig3, "IVPA_Nuc_IVPA_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig4, "IVPA_Nuc_IVPA_Cyt_IVT_18S_dwell_time_PCA"),
    (fig7, "IVPA_intermediates_Nuc_18S_IVT_18S_norm_current_signal_PCA"),
    (fig8, "IVPA_intermediates_Nuc_18S_IVT_18S_dwell_time_PCA"),
    (fig9, "scatter_WBSCR22_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig10, "scatter_IVPA_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
):
    pca_fig.savefig(os.path.join(m7G_figure_dir, f"{modification}_{file_stem}.svg"), format="svg")


In [None]:
# Extract per-read mean signal and dwell time around the m62A site for every
# dataset and cache the combined tables as CSV.
ref_fragment = "18S"
coordinate = 5504
name_modification = "m62A"
bases_upstream = 10
bases_downstream = 10
# "~" is expanded once here: these paths are handed to several readers as plain
# strings, and not all of them are guaranteed to expand it themselves.
reference_path = os.path.expanduser("~/wf-nanoribolyzer/references/RNA45SN1.fasta")
kmer_table = os.path.expanduser("~/kmer_models/rna004/9mer_levels_v1.txt")
data_root = os.path.expanduser("~/directRNA_004")

ref_fragments = ["21S", "21S-C", "18S-E"]
# Not used by the extraction itself; kept defined as in the original cell.
colors = ["indigo", "palevioletred", "deepskyblue"]

# One entry per extraction step, in the order rows are appended to the tables:
# (condition prefix, run directory, pod5 file, BAM file, fragments to extract).
# The condition label passed on is "<prefix> <fragment>".
m62A_inputs = [
    ("IVPA Cytoplasm", "20231114_RNA004_IVPA_Cyt", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("IVPA Nucleus", "20231114_RNA004_IVPA_Nuc", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("NP Cytoplasm", "20231114_RNA004_NP_Cyt", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ["18S"]),
    ("NP Nucleus", "20231114_RNA004_NP_Nuc", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ["18S"]),
    ("IVT", "IVT_18S", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("IVPA Nucleus", "20231114_RNA004_IVPA_Nuc", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ref_fragments),
    ("NP Nucleus", "20231114_RNA004_NP_Nuc", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ref_fragments),
    # The original cell ran this DIMT1L KO step twice, which appended every KO
    # read to both tables two times; it is listed exactly once here.
    ("DIMT1L KO", "DIMT1L_KO", "filtered.pod5", "filtered_pod5_rebasecalled.bam", ["18S"]),
]

# The first extraction call receives None for both accumulators, as before.
mean_signal_df, dwell_time_df = None, None
for sample_name, run_dir, pod5_name, bam_name, fragments in m62A_inputs:
    run_path = os.path.join(data_root, run_dir)
    pod5_dr = pod5.DatasetReader(os.path.join(run_path, "filtered_pod5", pod5_name))
    read_ids = list(pod5_dr.read_ids)
    df = pl.read_csv(
        os.path.join(run_path, "template_based_analysis", "template_fragment_df.csv"),
        separator=";",
    )
    bam_fh = io.ReadIndexedBam(os.path.join(run_path, "filtered_pod5", bam_name))
    sig_map_refiner = refine_signal_map.SigMapRefiner(
        kmer_model_filename=kmer_table,
        do_rough_rescale=True,
        scale_iters=1,
        do_fix_guage=True,
    )
    for ref_fragment in fragments:
        condition = f"{sample_name} {ref_fragment}"
        mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
            pod5_dr, df, bam_fh, kmer_table, sig_map_refiner,
            coordinate, bases_upstream, bases_downstream,
            ref_fragment, reference_path, condition, read_ids,
            mean_signal_df, dwell_time_df,
        )

# Keep the accumulated tables under m62A-specific names and cache them as
# ';'-separated CSV files so the plotting cells can reload them.
m62A_mean_signal_df = mean_signal_df
m62A_dwell_time_df = dwell_time_df
m62A_dwell_time_df.write_csv("./m62A_PCA_dwell_time.csv", separator=";", include_header=True)
m62A_mean_signal_df.write_csv("./m62A_PCA_mean_signal.csv", separator=";", include_header=True)

In [3]:
# Reload the cached m62A PCA tables (';'-separated, with a header row).
m62A_mean_signal_df = pl.read_csv(
    "./m62A_PCA_mean_signal.csv",
    separator=";",
    has_header=True,
)
m62A_dwell_time_df = pl.read_csv(
    "./m62A_PCA_dwell_time.csv",
    separator=";",
    has_header=True,
)

# Tag every row with a constant 1; summing that column per condition gives
# the number of rows recorded for each condition.
m62A_mean_signal_df = m62A_mean_signal_df.with_columns(
    count=np.array([1] * m62A_mean_signal_df.shape[0])
)
grouped_m62A_mean_signal_df = (
    m62A_mean_signal_df
    .group_by("condition")
    .agg(pl.col("count").sum())
)
# CSV export of the counts is currently disabled:
#grouped_m62A_mean_signal_df.write_csv("./grouped_m62A_counts.csv",separator = ";", include_header=True)

In [None]:

# Colours and condition labels shared by every m62A PCA figure. Each call gets
# its own list copy so the plotting helpers never share a mutable argument.
m62A_pca_colors = ["darkred", "navy", "indianred", "royalblue", "indigo", "palevioletred", "deepskyblue", "gold", "black"]
m62A_pca_conditions = [
    "IVPA Cytoplasm 18S", "IVPA Nucleus 18S", "NP Cytoplasm 18S", "NP Nucleus 18S",
    "IVPA Nucleus 21S", "IVPA Nucleus 21S-C", "IVPA Nucleus 18S-E", "IVT 18S", "DIMT1L KO 18S",
]
# The 0/1 lists are passed through exactly as before; presumably one flag per
# condition label -- TODO confirm against plot_pca_raw_currents.

# NP nucleus / NP cytoplasm / IVT / DIMT1L KO
fig1 = plot_pca_raw_currents(m62A_mean_signal_df, list(m62A_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m62A_pca_conditions))
# Same selection without IVT
fig1_DIMT1L = plot_pca_raw_currents(m62A_mean_signal_df, list(m62A_pca_colors), [0, 0, 1, 1, 0, 0, 0, 0, 1], list(m62A_pca_conditions))
fig2 = plot_pca_raw_currents(m62A_dwell_time_df, list(m62A_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m62A_pca_conditions))

# IVPA cytoplasm / IVPA nucleus / IVT
fig3 = plot_pca_raw_currents(m62A_mean_signal_df, list(m62A_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 0], list(m62A_pca_conditions))
fig4 = plot_pca_raw_currents(m62A_dwell_time_df, list(m62A_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 0], list(m62A_pca_conditions))

# IVPA cytoplasm / IVT. As in the original calls, these two receive nine colours
# but only eight flags and eight labels (the KO label is left out); not saved below.
fig5 = plot_pca_raw_currents(m62A_mean_signal_df, list(m62A_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], m62A_pca_conditions[:-1])
fig6 = plot_pca_raw_currents(m62A_dwell_time_df, list(m62A_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], m62A_pca_conditions[:-1])

# IVPA nucleus intermediates (21S, 21S-C, 18S-E) / IVT
fig7 = plot_pca_raw_currents(m62A_mean_signal_df, list(m62A_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1, 0], list(m62A_pca_conditions))
fig8 = plot_pca_raw_currents(m62A_dwell_time_df, list(m62A_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1, 0], list(m62A_pca_conditions))

# Scatter variants on the mean-signal table
fig9 = plot_pca_raw_currents_scatter(m62A_mean_signal_df, list(m62A_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1, 1], list(m62A_pca_conditions))
fig9_DIMT1L = plot_pca_raw_currents_scatter(m62A_mean_signal_df, list(m62A_pca_colors), [0, 0, 0, 0, 0, 0, 0, 0, 1], list(m62A_pca_conditions))
fig10 = plot_pca_raw_currents_scatter(m62A_mean_signal_df, list(m62A_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1, 1], list(m62A_pca_conditions))

modification = "m62A"
# matplotlib opens the target path literally, so "~" has to be expanded here;
# the directory is created on demand so a fresh checkout can run the cell.
m62A_figure_dir = os.path.expanduser("~/Figures_Tamer")
os.makedirs(m62A_figure_dir, exist_ok=True)
# NOTE(review): some file names do not match the flags used above (e.g. fig10
# selects the IVPA 18S conditions but is named "NP_Cyt") -- names are kept
# unchanged; confirm the intended labels.
for pca_fig, file_stem in (
    (fig1, "DIMT1L_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig1_DIMT1L, "only_DIMT1L_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig2, "DIMT1L_KO_NP_Nuc_NP_Cyt_IVT_18S_dwell_time_PCA"),
    (fig3, "IVPA_Nuc_IVPA_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig4, "IVPA_Nuc_IVPA_Cyt_IVT_18S_dwell_time_PCA"),
    (fig7, "IVPA_intermediates_Nuc_18S_IVT_18S_norm_current_signal_PCA"),
    (fig8, "IVPA_intermediates_Nuc_18S_IVT_18S_dwell_time_PCA"),
    (fig9, "scatter_DIMT1L_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig9_DIMT1L, "scatter_only_DIMT1L_KO_NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig10, "scatter_IVPA_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
):
    pca_fig.savefig(os.path.join(m62A_figure_dir, f"{modification}_{file_stem}.svg"), format="svg")


In [None]:
# Extract per-read mean signal and dwell time around negative-control position
# 5209 for every dataset and cache the combined tables as CSV.
ref_fragment = "18S"
coordinate = 5209
name_modification = "negative Ctrl1 5209"
bases_upstream = 10
bases_downstream = 10
# "~" is expanded once here: these paths are handed to several readers as plain
# strings, and not all of them are guaranteed to expand it themselves.
reference_path = os.path.expanduser("~/wf-nanoribolyzer/references/RNA45SN1.fasta")
kmer_table = os.path.expanduser("~/kmer_models/rna004/9mer_levels_v1.txt")
data_root = os.path.expanduser("~/directRNA_004")

ref_fragments = ["21S", "21S-C", "18S-E"]
# Not used by the extraction itself; kept defined as in the original cell.
colors = ["indigo", "palevioletred", "deepskyblue"]

# One entry per extraction step, in the order rows are appended to the tables:
# (condition prefix, run directory, pod5 file, BAM file, fragments to extract).
# The condition label passed on is "<prefix> <fragment>".
neg_Ctrl_5209_inputs = [
    ("IVPA Cytoplasm", "20231114_RNA004_IVPA_Cyt", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("IVPA Nucleus", "20231114_RNA004_IVPA_Nuc", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("NP Cytoplasm", "20231114_RNA004_NP_Cyt", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ["18S"]),
    ("NP Nucleus", "20231114_RNA004_NP_Nuc", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ["18S"]),
    ("IVT", "IVT_18S", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ["18S"]),
    ("IVPA Nucleus", "20231114_RNA004_IVPA_Nuc", "filtered_1M.pod5", "filtered_rebasecalled_aligned_1M.bam", ref_fragments),
    ("NP Nucleus", "20231114_RNA004_NP_Nuc", "filtered.pod5", "filtered_rebasecalled_aligned.bam", ref_fragments),
]

# The first extraction call receives None for both accumulators, as before.
mean_signal_df, dwell_time_df = None, None
for sample_name, run_dir, pod5_name, bam_name, fragments in neg_Ctrl_5209_inputs:
    run_path = os.path.join(data_root, run_dir)
    pod5_dr = pod5.DatasetReader(os.path.join(run_path, "filtered_pod5", pod5_name))
    read_ids = list(pod5_dr.read_ids)
    df = pl.read_csv(
        os.path.join(run_path, "template_based_analysis", "template_fragment_df.csv"),
        separator=";",
    )
    bam_fh = io.ReadIndexedBam(os.path.join(run_path, "filtered_pod5", bam_name))
    sig_map_refiner = refine_signal_map.SigMapRefiner(
        kmer_model_filename=kmer_table,
        do_rough_rescale=True,
        scale_iters=1,
        do_fix_guage=True,
    )
    for ref_fragment in fragments:
        condition = f"{sample_name} {ref_fragment}"
        mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
            pod5_dr, df, bam_fh, kmer_table, sig_map_refiner,
            coordinate, bases_upstream, bases_downstream,
            ref_fragment, reference_path, condition, read_ids,
            mean_signal_df, dwell_time_df,
        )

# Keep the accumulated tables under control-specific names and cache them as
# ';'-separated CSV files.
neg_Ctrl_5209_mean_signal_df = mean_signal_df
neg_Ctrl_5209_dwell_time_df = dwell_time_df
neg_Ctrl_5209_dwell_time_df.write_csv("./neg_Ctrl_5209_PCA_dwell_time.csv", separator=";", include_header=True)
neg_Ctrl_5209_mean_signal_df.write_csv("./neg_Ctrl_5209_PCA_mean_signal.csv", separator=";", include_header=True)

In [None]:
# Colours and condition labels shared by every negative-control (5209) PCA
# figure. As in the original calls, nine colours accompany eight flags and
# eight labels. Each call gets its own list copy so the plotting helper never
# shares a mutable argument.
neg_Ctrl_5209_pca_colors = ["darkred", "navy", "indianred", "royalblue", "indigo", "palevioletred", "deepskyblue", "gold", "black"]
neg_Ctrl_5209_pca_conditions = [
    "IVPA Cytoplasm 18S", "IVPA Nucleus 18S", "NP Cytoplasm 18S", "NP Nucleus 18S",
    "IVPA Nucleus 21S", "IVPA Nucleus 21S-C", "IVPA Nucleus 18S-E", "IVT 18S",
]
# The 0/1 lists are passed through exactly as before; presumably one flag per
# condition label -- TODO confirm against plot_pca_raw_currents.

# NP nucleus / NP cytoplasm / IVT
fig1 = plot_pca_raw_currents(neg_Ctrl_5209_mean_signal_df, list(neg_Ctrl_5209_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))
fig2 = plot_pca_raw_currents(neg_Ctrl_5209_dwell_time_df, list(neg_Ctrl_5209_pca_colors), [0, 0, 1, 1, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))

# IVPA cytoplasm / IVPA nucleus / IVT
fig3 = plot_pca_raw_currents(neg_Ctrl_5209_mean_signal_df, list(neg_Ctrl_5209_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))
fig4 = plot_pca_raw_currents(neg_Ctrl_5209_dwell_time_df, list(neg_Ctrl_5209_pca_colors), [1, 1, 0, 0, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))

# IVPA cytoplasm / IVT. These two previously plotted the m62A tables (a
# copy-paste slip); they now use the negative-control tables like every other
# figure in this cell. Not saved below.
fig5 = plot_pca_raw_currents(neg_Ctrl_5209_mean_signal_df, list(neg_Ctrl_5209_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))
fig6 = plot_pca_raw_currents(neg_Ctrl_5209_dwell_time_df, list(neg_Ctrl_5209_pca_colors), [1, 0, 0, 0, 0, 0, 0, 1], list(neg_Ctrl_5209_pca_conditions))

# IVPA nucleus intermediates (21S, 21S-C, 18S-E) / IVT
fig7 = plot_pca_raw_currents(neg_Ctrl_5209_mean_signal_df, list(neg_Ctrl_5209_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1], list(neg_Ctrl_5209_pca_conditions))
fig8 = plot_pca_raw_currents(neg_Ctrl_5209_dwell_time_df, list(neg_Ctrl_5209_pca_colors), [0, 0, 0, 0, 1, 1, 1, 1], list(neg_Ctrl_5209_pca_conditions))

modification = "neg_Ctrl_5209"
# matplotlib opens the target path literally, so "~" has to be expanded here;
# the directory is created on demand so a fresh checkout can run the cell.
neg_Ctrl_5209_figure_dir = os.path.expanduser("~/Figures_Tamer")
os.makedirs(neg_Ctrl_5209_figure_dir, exist_ok=True)
for pca_fig, file_stem in (
    (fig1, "NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig2, "NP_Nuc_NP_Cyt_IVT_18S_dwell_time_PCA"),
    (fig3, "IVPA_Nuc_IVPA_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig4, "IVPA_Nuc_IVPA_Cyt_IVT_18S_dwell_time_PCA"),
    (fig7, "IVPA_intermediates_Nuc_18S_IVT_18S_norm_current_signal_PCA"),
    (fig8, "IVPA_intermediates_Nuc_18S_IVT_18S_dwell_time_PCA"),
):
    pca_fig.savefig(os.path.join(neg_Ctrl_5209_figure_dir, f"{modification}_{file_stem}.svg"), format="svg")

In [None]:
# Negative control 2 (position 3896): extraction settings and the first two
# datasets. "~" is expanded explicitly because the paths are handed to several
# readers as plain strings, and not all of them are guaranteed to expand it.
ref_fragment = "18S"
coordinate = 3896
name_modification = "negative Ctrl2 3896"
bases_upstream = 10
bases_downstream = 10
reference_path = os.path.expanduser("~/wf-nanoribolyzer/references/RNA45SN1.fasta")

# IVPA cytoplasm, 18S. This first call starts the tables (None, None accumulators).
condition = "IVPA Cytoplasm 18S"
pod5_dr = pod5.DatasetReader(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_1M.pod5")
)
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Cyt/template_based_analysis/template_fragment_df.csv"),
    separator=";",
)
bam_fh = io.ReadIndexedBam(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Cyt/filtered_pod5/filtered_rebasecalled_aligned_1M.bam")
)
kmer_table = os.path.expanduser("~/kmer_models/rna004/9mer_levels_v1.txt")
sig_map_refiner = refine_signal_map.SigMapRefiner(
    kmer_model_filename=kmer_table,
    do_rough_rescale=True,
    scale_iters=1,
    do_fix_guage=True,
)
mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
    pod5_dr, df, bam_fh, kmer_table, sig_map_refiner,
    coordinate, bases_upstream, bases_downstream,
    ref_fragment, reference_path, condition, read_ids,
    None, None,
)


# IVPA nucleus, 18S. Appended to the tables returned by the previous call.
condition = "IVPA Nucleus 18S"
pod5_dr = pod5.DatasetReader(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5")
)
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv"),
    separator=";",
)
bam_fh = io.ReadIndexedBam(
    os.path.expanduser("~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam")
)
kmer_table = os.path.expanduser("~/kmer_models/rna004/9mer_levels_v1.txt")
sig_map_refiner = refine_signal_map.SigMapRefiner(
    kmer_model_filename=kmer_table,
    do_rough_rescale=True,
    scale_iters=1,
    do_fix_guage=True,
)
mean_signal_df, dwell_time_df = extract_reference_coordinate_signals_for_PCA(
    pod5_dr, df, bam_fh, kmer_table, sig_map_refiner,
    coordinate, bases_upstream, bases_downstream,
    ref_fragment, reference_path, condition, read_ids,
    mean_signal_df, dwell_time_df,
)

condition = "NP Cytoplasm 18S"
pod5_dr = pod5.DatasetReader("~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered.pod5")
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv("~/directRNA_004/20231114_RNA004_NP_Cyt/template_based_analysis/template_fragment_df.csv", separator=";")
bam_fh = io.ReadIndexedBam("~/directRNA_004/20231114_RNA004_NP_Cyt/filtered_pod5/filtered_rebasecalled_aligned.bam")
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"
sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)
mean_signal_df,dwell_time_df = extract_reference_coordinate_signals_for_PCA(pod5_dr,df,bam_fh,kmer_table,sig_map_refiner,coordinate,bases_upstream,bases_downstream, ref_fragment,reference_path,condition,read_ids,mean_signal_df,dwell_time_df)
# #"royalblue"


condition = "NP Nucleus 18S"
pod5_dr = pod5.DatasetReader("~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5")
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv("~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv", separator=";")
bam_fh = io.ReadIndexedBam("~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam")
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"
sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)
mean_signal_df,dwell_time_df = extract_reference_coordinate_signals_for_PCA(pod5_dr,df,bam_fh,kmer_table,sig_map_refiner,coordinate,bases_upstream,bases_downstream, ref_fragment,reference_path,condition,read_ids,mean_signal_df,dwell_time_df)
# #"indianred",

condition = "IVT 18S"
pod5_dr = pod5.DatasetReader("~/directRNA_004/IVT_18S/filtered_pod5/filtered_1M.pod5")
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv("~/directRNA_004/IVT_18S/template_based_analysis/template_fragment_df.csv", separator=";")
bam_fh = io.ReadIndexedBam("~/directRNA_004/IVT_18S/filtered_pod5/filtered_rebasecalled_aligned_1M.bam")
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"
sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)
mean_signal_df,dwell_time_df =  extract_reference_coordinate_signals_for_PCA(pod5_dr,df,bam_fh,kmer_table,sig_map_refiner,coordinate,bases_upstream,bases_downstream, ref_fragment,reference_path,condition,read_ids,mean_signal_df,dwell_time_df)
# #"green"

ref_fragments = [ "21S", "21S-C", "18S-E"]
colors = [
    "indigo",
    "palevioletred",
    #"seagreen",
    "deepskyblue"
]

condition = "IVPA Nucleus"

pod5_dr = pod5.DatasetReader(
    "~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_1M.pod5"
)
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv("~/directRNA_004/20231114_RNA004_IVPA_Nuc/template_based_analysis/template_fragment_df.csv", separator=";")
reference_path = "~/wf-nanoribolyzer/references/RNA45SN1.fasta"
bam_fh = io.ReadIndexedBam("~/directRNA_004/20231114_RNA004_IVPA_Nuc/filtered_pod5/filtered_rebasecalled_aligned_1M.bam")
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"
sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)

for ref_fragment, color in zip(ref_fragments, colors):
    mean_signal_df,dwell_time_df = extract_reference_coordinate_signals_for_PCA(pod5_dr,df,bam_fh,kmer_table,sig_map_refiner,coordinate,bases_upstream,bases_downstream, ref_fragment,reference_path,f"{condition} {ref_fragment}",read_ids,mean_signal_df,dwell_time_df)

ref_fragments = [ "21S", "21S-C", "18S-E"]
colors = [
    "indigo",
    "palevioletred",
    #"seagreen",
    "deepskyblue"
]

condition = "NP Nucleus"


pod5_dr = pod5.DatasetReader(
    "~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered.pod5"
)
read_ids = list(pod5_dr.read_ids)
df = pl.read_csv("~/directRNA_004/20231114_RNA004_NP_Nuc/template_based_analysis/template_fragment_df.csv", separator=";")
reference_path = "~/wf-nanoribolyzer/references/RNA45SN1.fasta"
bam_fh = io.ReadIndexedBam("~/directRNA_004/20231114_RNA004_NP_Nuc/filtered_pod5/filtered_rebasecalled_aligned.bam")
kmer_table = "~/kmer_models/rna004/9mer_levels_v1.txt"
sig_map_refiner = refine_signal_map.SigMapRefiner(kmer_model_filename=kmer_table,do_rough_rescale=True,scale_iters=1,do_fix_guage=True)


for ref_fragment, color in zip(ref_fragments, colors):
    mean_signal_df,dwell_time_df = extract_reference_coordinate_signals_for_PCA(pod5_dr,df,bam_fh,kmer_table,sig_map_refiner,coordinate,bases_upstream,bases_downstream, ref_fragment,reference_path,f"{condition} {ref_fragment}",read_ids,mean_signal_df,dwell_time_df)



neg_Ctrl_3896_mean_signal_df = mean_signal_df
neg_Ctrl_3896_dwell_time_df = dwell_time_df
neg_Ctrl_3896_dwell_time_df.write_csv("./neg_Ctrl_3896_PCA_dwell_time.csv",separator = ";", include_header=True)
neg_Ctrl_3896_mean_signal_df.write_csv("./neg_Ctrl_3896_PCA_mean_signal.csv",separator = ";", include_header=True)

In [None]:
# PCA panels fig1-fig8 for the neg_Ctrl_3896 tables: each mask is plotted once
# for the mean-signal table (odd figure numbers) and once for the dwell-time
# table (even figure numbers).
#
# NOTE(review): the colour tuple has 9 entries while masks/labels have 8;
# reproduced unchanged -- confirm against plot_pca_raw_currents.
pca_colors = ("darkred", "navy", "indianred", "royalblue", "indigo", "palevioletred", "deepskyblue", "gold", "black")
pca_labels = (
    "IVPA Cytoplasm 18S",
    "IVPA Nucleus 18S",
    "NP Cytoplasm 18S",
    "NP Nucleus 18S",
    "IVPA Nucleus 21S",
    "IVPA Nucleus 21S-C",
    "IVPA Nucleus 18S-E",
    "IVT 18S",
)
# One flag per entry of pca_labels; presumably 1 selects that condition for the
# panel -- confirm against plot_pca_raw_currents.
np_vs_ivt = (0, 0, 1, 1, 0, 0, 0, 1)
ivpa_vs_ivt = (1, 1, 0, 0, 0, 0, 0, 1)
ivpa_cyt_vs_ivt = (1, 0, 0, 0, 0, 0, 0, 1)
intermediates_vs_ivt = (0, 0, 0, 0, 1, 1, 1, 1)

# Every call gets freshly built lists, just like the former inline literals.
fig1 = plot_pca_raw_currents(neg_Ctrl_3896_mean_signal_df, list(pca_colors), list(np_vs_ivt), list(pca_labels))
fig1

fig2 = plot_pca_raw_currents(neg_Ctrl_3896_dwell_time_df, list(pca_colors), list(np_vs_ivt), list(pca_labels))
fig2

fig3 = plot_pca_raw_currents(neg_Ctrl_3896_mean_signal_df, list(pca_colors), list(ivpa_vs_ivt), list(pca_labels))
fig3

fig4 = plot_pca_raw_currents(neg_Ctrl_3896_dwell_time_df, list(pca_colors), list(ivpa_vs_ivt), list(pca_labels))
fig4

fig5 = plot_pca_raw_currents(neg_Ctrl_3896_mean_signal_df, list(pca_colors), list(ivpa_cyt_vs_ivt), list(pca_labels))
fig5

fig6 = plot_pca_raw_currents(neg_Ctrl_3896_dwell_time_df, list(pca_colors), list(ivpa_cyt_vs_ivt), list(pca_labels))
fig6

fig7 = plot_pca_raw_currents(neg_Ctrl_3896_mean_signal_df, list(pca_colors), list(intermediates_vs_ivt), list(pca_labels))
fig7

fig8 = plot_pca_raw_currents(neg_Ctrl_3896_dwell_time_df, list(pca_colors), list(intermediates_vs_ivt), list(pca_labels))
fig8


# Save the neg_Ctrl_3896 PCA figures as SVG.
#
# FIX: the target directory starts with "~", which is not expanded when the
# string is handed to savefig as-is; expand it explicitly and make sure the
# directory exists before writing.
modification = "neg_Ctrl_3896"
figure_dir = os.path.expanduser("~/Figures_Tamer")
os.makedirs(figure_dir, exist_ok=True)

# (figure, file-name stem) pairs; fig5/fig6 are intentionally not saved,
# matching the previous behaviour of this cell.
figures_to_save = (
    (fig1, "NP_Nuc_NP_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig2, "NP_Nuc_NP_Cyt_IVT_18S_dwell_time_PCA"),
    (fig3, "IVPA_Nuc_IVPA_Cyt_IVT_18S_norm_current_signal_PCA"),
    (fig4, "IVPA_Nuc_IVPA_Cyt_IVT_18S_dwell_time_PCA"),
    (fig7, "IVPA_intermediates_Nuc_18S_IVT_18S_norm_current_signal_PCA"),
    (fig8, "IVPA_intermediates_Nuc_18S_IVT_18S_dwell_time_PCA"),
)
for figure, stem in figures_to_save:
    figure.savefig(os.path.join(figure_dir, f"{modification}_{stem}.svg"), format="svg")