In [None]:
#plotly libraries
import plotly.express as px
import plotly.colors as pc
import plotly.graph_objects as go
import plotly.io as pio


# Default renderer: SVG output is small, renders on GitHub and stays sharp at any zoom,
# but figures sometimes need an explicit width and height.
pio.renderers.default = "svg"

# Custom hex palette (not referenced again in the visible part of this notebook).
colors = ["#363b3d", "#727b76", "#31a240", "#f93939", "#f79118", "#de7b06", "#9b308f", "#dc759b"]
# Additional plotly.express defaults: a mid grey first, then the qualitative D3 palette,
# and a default figure size of 1000 x 800 px.
px.defaults.color_discrete_sequence = ["rgb(100,100,100)"] + pc.qualitative.D3
px.defaults.width=1000
px.defaults.height=800


# Idea (not enabled): make a thin marker outline the default for histograms, e.g.
#fig.update_traces(marker_line_width=0.2)

#pio.templates["simple_white"].layout.template.data.Histogram["marker_line_width"] = 0.2


# Use "simple_white" as the default template. The first line edits the registered
# template itself so that y-axis grid lines are switched on.
pio.templates["simple_white"].layout["yaxis"]["showgrid"] = True
pio.templates.default = "simple_white"

import numpy as np

import sklearn.metrics as metrics
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Start from the full sequential Plasma palette and print it for reference.
colorscale = pc.sequential.Plasma
print(colorscale)
# Keep six of its entries: the first one followed by entries 3 through 7.
colorscale = [colorscale[0]] + colorscale[3:8]
# Show the reduced palette.
colorscale

# Plot isoforms per sample

In [None]:
import os
# Output folder for the isoform expression figures; created if it does not exist yet.
figure_dir = "figures/isoform_expression"
os.makedirs(figure_dir, exist_ok=True)

In [None]:

# Isoform identifiers in a fixed display order; the list is stored as the "isoform"
# entry of order_dict. Blank lines separate groups of names (unspliced/Vif, Vpr,
# Env with Nef or Rev, Tat, Rev12, RT3-only Nef, sORF).
isoforms = [
    # names containing "unspliced" or "Vif"
    'RT1_unspliced1', 
    'RT2_unspliced2', 
    'RT2_Vif2_RT3_Vif1',
    'RT2_Vif3_RT3_Vif6', 
    'RT2_Vif4_RT3_Vif9', 
    'RT2_Vif5_RT3_Vif8',
    'RT2_Vif7', 
    
    # names containing "Vpr"
    'RT2_Vpr3_RT3_Vpr1', 
    'RT2_Vpr4_RT3_Vpr2',
       
    # names combining "Env" with "Nef" or "Rev"
    'RT2_Env1_RT3_Nef2', 
    'RT2_Env9_RT3_Nef4', 
    'RT2_Env2_RT3_Rev1',
    'RT2_Env3_RT3_Rev2', 
    'RT2_Env17_RT3_Nef9', 
    'RT2_Env5_RT3_Nef3',
    'RT2_Env13_RT3_Nef5', 
    'RT2_Env18_RT3_Rev13', 
    'RT2_Env10_RT3_Rev7',
    'RT2_Env19_RT3_Nef12', 
    'RT2_Env11_RT3_Rev8', 
    'RT2_Env14_RT3_Rev10',
    'RT2_Env15_RT3_Rev11', 
    'RT2_Env12_RT3_Rev9', 
    'RT2_Env6_RT3_Rev4',
    'RT2_Env7_RT3_Rev5', 
    'RT2_Env4_RT3_Rev3', 
    'RT2_Env8_RT3_Rev6',
    'RT2_Env20_RT3_Nef11', 
    
    # names containing "Tat"
    'RT2_Tat5_RT3_Tat1', 
    'RT2_Tat6_RT3_Tat2',
    'RT2_Tat7_RT3_Tat3', 
    'RT2_Tat8_RT3_Tat4', 
    'RT2_Tat12_RT3_Tat9',
    'RT3_Tat10', 
    'RT3_Tat11', 
    'RT3_Tat13', 
    'RT3_Tat15', 
    'RT3_Tat16',
    
    'RT3_Rev12', 
    
    # names carrying only an RT3 "Nef" part
    'RT3_Nef16', 
    'RT3_Nef15', 
    'RT3_Nef19', 
    'RT3_Nef20',
    'RT3_Nef22', 
    'RT3_Nef23', 
    
    # names containing "sORF"
    'RT3_sORF1', 
    'RT3_sORF3']


# Colour per isoform id for the figures saved with the "color_RT2" suffix.
# Key order matters: list(isoform_colors_RT2.keys()) is used as the category order of
# the stacked bars, so reordering entries reorders the stacks and the legend.
# Names that carry only an RT3 part for Nef, Rev and Tat are all drawn in grey here.
# NOTE(review): several Env entries share an identical colour (e.g. Env5/Env17,
# Env6/Env18, Env15/Env20) and cannot be told apart in a figure — confirm this is intended.
isoform_colors_RT2 = {
    # extra keys that are not isoform ids from the `isoforms` list
    " "  : "white",
    "unspliced" : "grey",
 'RT1_unspliced1': "black",
 'RT2_unspliced2': "black",

 # Env-named isoforms: shades of blue
 'RT2_Env1_RT3_Nef2': "rgb(26,82,118)",
 'RT2_Env2_RT3_Rev1': "rgb(31,97,141)",
 'RT2_Env3_RT3_Rev2': "rgb(36,113,163)",
 'RT2_Env4_RT3_Rev3': "rgb(41,128,185)", 
 'RT2_Env5_RT3_Nef3': "rgb(84,153,199)",
 'RT2_Env6_RT3_Rev4': "rgb(127,179,213)",
 'RT2_Env7_RT3_Rev5': "rgb(169,204,227)", 
 'RT2_Env8_RT3_Rev6': "rgb(27,79,114)",
 'RT2_Env9_RT3_Nef4': "rgb(33,97,140)",
 'RT2_Env10_RT3_Rev7': "rgb(40,116,166)",
 'RT2_Env11_RT3_Rev8': "rgb(46,134,163)",
 'RT2_Env12_RT3_Rev9': "rgb(52,152,219)", 
 'RT2_Env13_RT3_Nef5': "rgb(93,173,226)",
 'RT2_Env14_RT3_Rev10': "rgb(133,193,233)",
 'RT2_Env15_RT3_Rev11': "rgb(174,214,241)",
 'RT2_Env17_RT3_Nef9': "rgb(84,153,199)",
 'RT2_Env18_RT3_Rev13': "rgb(127,179,213)",
 'RT2_Env19_RT3_Nef12': "rgb(169,204,224)",
 'RT2_Env20_RT3_Nef11': "rgb(174,214,241)", 
    
 # RT3-only Nef and Rev names: grey
 'RT3_Nef16': "grey",
    'RT3_Nef15': "grey", 
    'RT3_Nef19': "grey", 
    'RT3_Nef20': "grey",
    'RT3_Nef22': "grey", 
    'RT3_Nef23': "grey", 

    'RT3_Rev12':"grey", 

 # Tat-named isoforms: shades of pink
 'RT2_Tat5_RT3_Tat1': "rgb(207,87,161)",
 'RT2_Tat6_RT3_Tat2': "rgb(217,144,188)",
 'RT2_Tat7_RT3_Tat3': "rgb(237,206,228)",
 'RT2_Tat8_RT3_Tat4': "rgb(221,168,200)",
 'RT2_Tat12_RT3_Tat9': "rgb(229,186,216)",

    # RT3-only Tat names: grey
    'RT3_Tat10': "grey", 
    'RT3_Tat11': "grey", 
    'RT3_Tat13': "grey", 
    'RT3_Tat15': "grey", 
    'RT3_Tat16': "grey",
    
 # Vif-named isoforms: shades of purple
 'RT2_Vif2_RT3_Vif1': "rgb(96,22,135)",
 'RT2_Vif3_RT3_Vif6': "rgb(141,74,176)",
 'RT2_Vif4_RT3_Vif9': "rgb(170,89,212)",
 'RT2_Vif5_RT3_Vif8': "rgb(204,108,255)",
    'RT2_Vif7':"rgb(142,68,173)",
    
 # Vpr-named isoforms: shades of red
 'RT2_Vpr3_RT3_Vpr1': "rgb(168,30,45)",
 'RT2_Vpr4_RT3_Vpr2': "rgb(192,98,106)",

 # sORF-named isoforms: shades of brown
 'RT3_sORF1': "rgb(177,153,109)",
 'RT3_sORF3': "rgb(157,138,109)",
}


# Colour per isoform id for the figures saved with the "color_RT3" suffix.
# Key order matters: list(isoform_colors_RT3.keys()) is used as the category order of
# the stacked bars. Unlike isoform_colors_RT2, names carrying an RT3 Nef/Rev/Tat part
# get their own shade here, while 'RT2_Vif7' (no RT3 part in its name) is grey.
isoform_colors_RT3 = {
 'RT1_unspliced1': "black",
 'RT2_unspliced2': "black",

    # extra key that is not an isoform id from the `isoforms` list
    "Nef" : "green",
 # Nef-named isoforms: shades of green
 'RT2_Env1_RT3_Nef2': "rgb(17,120,100)",
 'RT2_Env5_RT3_Nef3': "rgb(20,143,119)",
 'RT2_Env9_RT3_Nef4': "rgb(23,165,137)",
 'RT2_Env13_RT3_Nef5': "rgb(26,188,156)",
 'RT2_Env17_RT3_Nef9': "rgb(72,201,176)",
 'RT2_Env20_RT3_Nef11': "rgb(118,215,196)", 
 'RT2_Env19_RT3_Nef12': "rgb(163,228,215)",
 'RT3_Nef15': "rgb(29,131,72)", 
 'RT3_Nef16': "rgb(35,155,86)",
 'RT3_Nef19': "rgb(40,180,99)", 
 'RT3_Nef20': "rgb(46,204,113)",
 'RT3_Nef22': "rgb(88,214,141)", 
 'RT3_Nef23': "rgb(130,224,170)", 

 # Rev-named isoforms: shades of orange
 'RT2_Env2_RT3_Rev1': "rgb(147,81,22)",
 'RT2_Env3_RT3_Rev2': "rgb(175,96,26)",
 'RT2_Env4_RT3_Rev3': "rgb(202,111,30)",
 'RT2_Env6_RT3_Rev4': "rgb(230,126,34)",
 'RT2_Env7_RT3_Rev5': "rgb(235,152,78)",
 'RT2_Env8_RT3_Rev6': "rgb(240,178,122)",
 'RT2_Env10_RT3_Rev7': "rgb(185,119,14)",
 'RT2_Env11_RT3_Rev8': "rgb(214,137,16)",
 'RT2_Env12_RT3_Rev9': "rgb(243,156,18)",
 'RT2_Env14_RT3_Rev10': "rgb(245,176,65)",
 'RT2_Env15_RT3_Rev11': "rgb(248,180,113)",
 'RT3_Rev12': "rgb(246,221,204)", 
 'RT2_Env18_RT3_Rev13': "rgb(250,215,160)",
 
    
 # Tat-named isoforms: shades of pink
 'RT2_Tat5_RT3_Tat1': "rgb(207,87,161)",
 'RT2_Tat6_RT3_Tat2': "rgb(217,144,188)",
 'RT2_Tat7_RT3_Tat3': "rgb(237,206,228)",
 'RT2_Tat8_RT3_Tat4': "rgb(221,168,200)",
 'RT2_Tat12_RT3_Tat9': "rgb(229,186,216)",

 'RT3_Tat10': "rgb(240,98,146)", 
 'RT3_Tat11': "rgb(244,143,177)", 
 'RT3_Tat13': "rgb(248,187,208)", 
 'RT3_Tat15': "rgb(252,200,215)", 
 'RT3_Tat16': "rgb(252,228,236)",
    
 # Vif-named isoforms: shades of purple (Vif7 grey)
 'RT2_Vif2_RT3_Vif1': "rgb(96,22,135)",
 'RT2_Vif3_RT3_Vif6': "rgb(141,74,176)",
 'RT2_Vif4_RT3_Vif9': "rgb(170,89,212)",
 'RT2_Vif5_RT3_Vif8': "rgb(204,108,255)",
 'RT2_Vif7' : "grey",
 # Vpr-named isoforms: shades of red
 'RT2_Vpr3_RT3_Vpr1': "rgb(168,30,45)",
 'RT2_Vpr4_RT3_Vpr2': "rgb(192,98,106)",

 # sORF-named isoforms: shades of brown
 'RT3_sORF1': "rgb(177,153,109)",
 'RT3_sORF3': "rgb(157,138,109)",
}

In [None]:
# Explicit category orders handed to plotly.express through `category_orders`.
order_dict = {
    # DMS concentration labels in ascending order ("0mM" ... "85mM")
    "DMS_conc" : [f"{i}mM" for i in [0,8,17,34,57,85]],
    # isoform order as listed in `isoforms`
    "isoform" : isoforms
}

# Read in isoquant output

In [None]:
# Root folders of the replicate 1 and replicate 2 outputs; both currently point to the
# parent of the working directory.
data_folder_rep1 = ".."
data_folder_rep2 = ".."

In [None]:
import pandas as pd
# Sample sheet; its "samples" column lists the sample names processed below.
samplesheet = pd.read_excel("./samplesheet.xlsx", engine="openpyxl")
# NOTE(review): this bare expression is not the last statement of the cell, so it
# displays nothing.
samplesheet
samples = samplesheet["samples"].values

In [None]:
def split_unique_name(df, unique_name_column = "unique_name"):
    """Add the sample annotations encoded in an underscore-separated sample name.

    Every name is split on "_" and its first five fields are read as
    ``<replicate>_<RT>_<DMS_conc>_<location>_<treatment>`` (for example
    ``Rep1_RT1_0mM_cell_native``); any further fields are ignored.

    This notebook defines the function twice. Both definitions now add the same
    columns (including ``location_treatment``), so the result no longer depends
    on which cell was executed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one sample name per row. It is modified in place.
    unique_name_column : str
        Name of the column holding the sample names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``replicate``, ``RT``,
        ``DMS_conc`` (categorical), ``location``, ``treatment``,
        ``location_treatment``, ``RT_and_location`` and ``rep_and_DMS`` added.

    Raises
    ------
    IndexError
        If a name has fewer than five underscore-separated fields.
    """
    # Split each name once instead of once per extracted field.
    name_fields = df[unique_name_column].apply(lambda name: name.split("_"))
    for position, column in enumerate(["replicate", "RT", "DMS_conc", "location", "treatment"]):
        df[column] = name_fields.apply(lambda fields, position=position: fields[position])

    # Combined labels used for faceting and grouping in the plots.
    df["location_treatment"] = df["location"] + "_" + df["treatment"]
    df["RT_and_location"] = df["RT"].astype(str) + "_" + df["location"]
    df["rep_and_DMS"] = df["replicate"] + "_" + df["DMS_conc"]
    # Converted last: the string concatenation above needs plain strings.
    df["DMS_conc"] = df["DMS_conc"].astype("category")

    return df

In [None]:
# Collect the per-read assignment table of every replicate 1 sample.
rep1_read_assignments = []

for sample in samples:
    # Only samples whose name contains "Rep1" belong to this replicate.
    if "Rep1" not in sample:
        continue
    merged_transcript_file = f'{data_folder_rep1}/isoquant_out_allow_single_bc/{sample}/{sample}.read_assignments.tsv'
    read_df = pd.read_csv(merged_transcript_file, sep="\t")
    # Tag every read with the sample it came from before the tables are stacked.
    read_df["unique_name"] = sample
    rep1_read_assignments.append(read_df)

# Stack all samples, drop the leading "#" of the read id header, then derive the
# per-sample annotation columns from the sample name.
rep1_read_assignments = pd.concat(rep1_read_assignments).rename(columns={'#read_id':'read_id'})
rep1_read_assignments = split_unique_name(rep1_read_assignments, unique_name_column="unique_name")

In [None]:
# Collect the per-read assignment table of every replicate 2 sample.
rep2_read_assignments = []

for sample in samples:
    # Only samples whose name contains "Rep2" belong to this replicate.
    if "Rep2" not in sample:
        continue
    merged_transcript_file = f'{data_folder_rep2}/isoquant_out_allow_single_bc/{sample}/{sample}.read_assignments.tsv'
    read_df = pd.read_csv(merged_transcript_file, sep="\t")
    # Tag every read with the sample it came from before the tables are stacked.
    read_df["unique_name"] = sample
    rep2_read_assignments.append(read_df)

# Stack all samples, drop the leading "#" of the read id header, then derive the
# per-sample annotation columns from the sample name.
rep2_read_assignments = pd.concat(rep2_read_assignments).rename(columns={'#read_id':'read_id'})
rep2_read_assignments = split_unique_name(rep2_read_assignments, unique_name_column="unique_name")

In [None]:
# Store the combined per-read tables on disk so they can be reloaded without parsing
# the TSV files again.
# NOTE(review): the tables are read from "isoquant_out_allow_single_bc" but written to
# "isoquant_out" — confirm that this folder exists and that the mismatch is intended.
rep1_read_assignments.to_pickle(f"{data_folder_rep1}/isoquant_out/read_assignments.pickle")
rep2_read_assignments.to_pickle(f"{data_folder_rep2}/isoquant_out/read_assignments.pickle")

# Take per read information to calculate assignment types

In [None]:
# Category orders for the assignment-type figures (passed as `category_orders`).
order = {
    "assignment_type": ["unique", "unique_minor_difference", "ambiguous", "inconsistent", "noninformative"],
    "treatment": ["native", "deprot"],
    "DMS_conc": [f"{conc}mM" for conc in (0, 8, 17, 34, 57, 85)],
}

def split_unique_name(df, unique_name_column = "unique_name"):
    """Derive sample annotation columns from an underscore-separated sample name.

    The first five "_"-separated fields of each name are stored as ``replicate``,
    ``RT``, ``DMS_conc``, ``location`` and ``treatment``. Three combined labels
    (``location_treatment``, ``RT_and_location``, ``rep_and_DMS``) are added as
    well, and ``DMS_conc`` is converted to a categorical column at the end.

    ``df`` is modified in place and also returned. A name with fewer than five
    fields raises ``IndexError``.
    """
    name_fields = df[unique_name_column].apply(lambda name: name.split("_"))
    for position, column in enumerate(("replicate", "RT", "DMS_conc", "location", "treatment")):
        df[column] = name_fields.apply(lambda fields, position=position: fields[position])

    df["location_treatment"] = df["location"] + "_" + df["treatment"]
    df["RT_and_location"] = df["RT"].astype(str) + "_" + df["location"]
    df["rep_and_DMS"] = df["replicate"] + "_" + df["DMS_conc"]
    # Converted last so the concatenation above still works on plain strings.
    df["DMS_conc"] = df["DMS_conc"].astype("category")
    return df

In [None]:
# Load the per-read tables from disk if they were stored previously.
rep1_read_assignments = pd.read_pickle(f"{data_folder_rep1}/isoquant_out/read_assignments.pickle")
rep2_read_assignments = pd.read_pickle(f"{data_folder_rep2}/isoquant_out/read_assignments.pickle")

### Plot read assignments of rep 1

In [None]:
# Keep the first row of every read_id so each read is counted once.
unique_reads_rep1 = rep1_read_assignments.drop_duplicates(subset="read_id")

# Number of reads per sample and assignment type (long format), annotated with the
# fields encoded in the sample name.
value_counts_rep1 = unique_reads_rep1.groupby("unique_name")["assignment_type"].value_counts().reset_index(name="count")
value_counts_rep1 = split_unique_name(value_counts_rep1)

# Total number of counted reads per sample: the denominator of the percentages.
count_dict = value_counts_rep1.groupby("unique_name")["count"].sum().to_dict()

# Share of each assignment type within its sample, computed column-wise instead of
# with a Python-level apply over every row.
value_counts_rep1["percent"] = 100 * value_counts_rep1["count"] / value_counts_rep1["unique_name"].map(count_dict)

In [None]:
# Stacked read counts per assignment type for replicate 1: one panel per
# RT_and_location (columns) and treatment (rows); category order comes from `order`.
fig = px.bar(value_counts_rep1, x="DMS_conc", y="count", facet_col="RT_and_location", facet_row="treatment", color="assignment_type", barmode="stack", category_orders=order)
fig.update_yaxes(dtick=50000, showgrid=True)
# Bars at 0.8 width with a thin black outline around every segment.
fig.update_traces(width=0.8, marker_line_width=1, marker_line_color="black")
fig.write_image("figures/isoform_expression/isoquant_classification_absolute_rep1.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

In [None]:
# Same layout as the absolute plot, but showing the share of each assignment type
# within its sample (percent) for replicate 1.
fig = px.bar(value_counts_rep1, x="DMS_conc", y="percent", facet_col="RT_and_location", facet_row="treatment", color="assignment_type", barmode="stack", category_orders=order)
fig.update_yaxes(dtick=20, showgrid=True)
fig.update_traces(width=0.8, marker_line_width=1, marker_line_color="black")
fig.write_image("figures/isoform_expression/isoquant_classification_percent_rep1.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

### Plot read assignments of rep 2

In [None]:
# Keep the first row of every read_id so each read is counted once.
unique_reads_rep2 = rep2_read_assignments.drop_duplicates(subset="read_id")

# Number of reads per sample and assignment type (long format), annotated with the
# fields encoded in the sample name.
value_counts_rep2 = unique_reads_rep2.groupby("unique_name")["assignment_type"].value_counts().reset_index(name="count")
value_counts_rep2 = split_unique_name(value_counts_rep2)
# Total number of counted reads per sample: the denominator of the percentages.
count_dict = value_counts_rep2.groupby("unique_name")["count"].sum().to_dict()
# Share of each assignment type within its sample, computed column-wise instead of
# with a Python-level apply over every row.
value_counts_rep2["percent"] = 100 * value_counts_rep2["count"] / value_counts_rep2["unique_name"].map(count_dict)

In [None]:
# Stacked read counts per assignment type for replicate 2: one panel per
# RT_and_location (columns) and treatment (rows); category order comes from `order`.
fig = px.bar(value_counts_rep2, x="DMS_conc", y="count", facet_col="RT_and_location", facet_row="treatment", color="assignment_type", barmode="stack", category_orders=order)
fig.update_yaxes(dtick=50000, showgrid=True)
# Bars at 0.8 width with a thin black outline around every segment.
fig.update_traces(width=0.8, marker_line_width=1, marker_line_color="black")
fig.write_image("figures/isoform_expression/isoquant_classification_absolute_rep2.svg", width=700,height=400)
fig.show(renderer="svg", width=700,height=400)

In [None]:
# Same layout as the absolute plot, but showing the share of each assignment type
# within its sample (percent) for replicate 2.
fig = px.bar(value_counts_rep2, x="DMS_conc", y="percent", facet_col="RT_and_location", facet_row="treatment", color="assignment_type", barmode="stack", category_orders=order)
fig.update_yaxes(dtick=20, showgrid=True)
fig.update_traces(width=0.8, marker_line_width=1, marker_line_color="black")
fig.write_image("figures/isoform_expression/isoquant_classification_percent_rep2.svg", width=700,height=400)
fig.show(renderer="svg", width=700,height=400)

# Isoform expression correlation between replicates

In [None]:
import os
import pandas as pd
import scipy.stats as stats
import numpy as np
# Show the working directory; the relative paths used in this notebook resolve against it.
os.getcwd()

In [None]:
# Short aliases for the per-sample tables. These are references, not copies: columns
# added to rep1/rep2 also appear on value_counts_rep1/value_counts_rep2.
rep1 = value_counts_rep1
rep2 = value_counts_rep2

In [None]:
# Sample name without the replicate prefix, so rows of both replicates can be matched.
rep1["name_wo_rep"] = rep1["unique_name"].apply(lambda x: x.split("Rep1_")[1])
rep2["name_wo_rep"] = rep2["unique_name"].apply(lambda x: x.split("Rep2_")[1])
# Merge key: replicate-free sample name plus isoform id.
# NOTE(review): value_counts_rep1/value_counts_rep2 are grouped by assignment_type and
# have no "isoform_id" column, so the two lines below raise KeyError on a fresh kernel.
# rep1/rep2 presumably have to be per-isoform count tables — TODO confirm and restore
# the step that builds them.
rep1["name_wo_rep_w_isoform"] = rep1["name_wo_rep"].astype(str) + "_" +rep1["isoform_id"].astype(str)
rep2["name_wo_rep_w_isoform"] = rep2["name_wo_rep"].astype(str) +"_" +rep2["isoform_id"].astype(str)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Re-apply the default template and rebuild the six-colour Plasma subset (first entry
# plus entries 3 through 7).
pio.templates.default = "simple_white"
colorscale = px.colors.sequential.Plasma
colorscale = [colorscale[0]] + colorscale[3:8]

In [None]:
def get_actual_protein(row):
    """Return the row's ``protein_<RT>_rep1`` value, where ``<RT>`` is its ``RT_rep1`` value.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must support ``row[key]`` lookups.

    Returns
    -------
    object
        The value stored under ``protein_<RT>_rep1``, or the string ``"none"``
        when ``RT_rep1`` or the derived key cannot be looked up.
    """
    try:
        return row[f"protein_{row['RT_rep1']}_rep1"]
    # Only failed lookups fall back to "none"; KeyboardInterrupt and SystemExit are
    # no longer swallowed as they were by the bare `except:`.
    except (KeyError, IndexError, TypeError):
        return "none"

In [None]:
# Replicate 1, relative abundance (percent) per isoform, RT2 colour scheme.
# Stack and legend order follow the key order of isoform_colors_RT2.
order_dict["isoform_id"] = list(isoform_colors_RT2.keys())
fig = px.bar(rep1, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_rep1_color_RT2.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

In [None]:
# Replicate 1, absolute read counts per isoform, RT2 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT2.keys())
fig = px.bar(rep1, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="count", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
# Fixed y range and tick spacing, identical in all absolute-count isoform plots.
fig.update_yaxes(showgrid=True, dtick=25000, range=[0,230000])
fig.write_image(f"{figure_dir}/absolute_expresion_rep1_color_RT2.svg", width=1000,height=600)
fig.show(renderer="svg", width=1000,height=600)

In [None]:
# Replicate 2, relative abundance (percent) per isoform, RT2 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT2.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_rep2_color_RT2.svg", width=750,height=400)
fig.show(renderer="svg", width=750,height=400)

In [None]:
# Replicate 2, absolute read counts per isoform, RT2 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT2.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="count", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
# Fixed y range and tick spacing, identical in all absolute-count isoform plots.
fig.update_yaxes(showgrid=True, dtick=25000, range=[0,230000])
fig.write_image(f"{figure_dir}/absolute_expresion_rep2_color_RT2.svg", width=750,height=600)
fig.show(renderer="svg", width=750,height=600)

In [None]:
# Same figure as the replicate 2 percent plot (RT2 colours), exported tall and narrow
# (400 x 3000 px) — presumably so the complete legend can be used as a colour key;
# TODO confirm.
order_dict["isoform_id"] = list(isoform_colors_RT2.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_colorbar_color_RT2.svg", width=400,height=3000)
fig.show(renderer="svg", width=400,height=3000)

In [None]:
# Replicate 1, relative abundance (percent) per isoform, RT3 colour scheme.
# Stack and legend order follow the key order of isoform_colors_RT3.
order_dict["isoform_id"] = list(isoform_colors_RT3.keys())
fig = px.bar(rep1, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_rep1_color_RT3.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

In [None]:
# Replicate 1, absolute read counts per isoform, RT3 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT3.keys())
fig = px.bar(rep1, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="count", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
# Fixed y range and tick spacing, identical in all absolute-count isoform plots.
fig.update_yaxes(showgrid=True, dtick=25000, range=[0,230000])
fig.write_image(f"{figure_dir}/absolute_expresion_rep1_color_RT3.svg", width=1000,height=600)
fig.show(renderer="svg", width=1000,height=600)

In [None]:
# Replicate 2, relative abundance (percent) per isoform, RT3 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT3.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_rep2_color_RT3.svg", width=750,height=400)
fig.show(renderer="svg", width=750,height=400)

In [None]:
# Replicate 2, absolute read counts per isoform, RT3 colour scheme.
order_dict["isoform_id"] = list(isoform_colors_RT3.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="count", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
# Fixed y range and tick spacing, identical in all absolute-count isoform plots.
fig.update_yaxes(showgrid=True, dtick=25000, range=[0,230000])
fig.write_image(f"{figure_dir}/absolute_expresion_rep2_color_RT3.svg", width=750,height=600)
fig.show(renderer="svg", width=750,height=600)

In [None]:
# Same figure as the replicate 2 percent plot (RT3 colours), exported tall and narrow
# (400 x 3000 px) — presumably so the complete legend can be used as a colour key;
# TODO confirm.
order_dict["isoform_id"] = list(isoform_colors_RT3.keys())
fig = px.bar(rep2, x="DMS_conc", facet_col ="RT_and_location", facet_row ="treatment", y="percent", color="isoform_id", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_colorbar_color_RT3.svg", width=400,height=3000)
fig.show(renderer="svg", width=400,height=3000)

# Plot mean data from both replicates

In [None]:
# Pair every rep1 row with the rep2 row sharing the same sample/isoform key; columns
# present in both tables get the suffixes _rep1 / _rep2.
merged = rep1.merge(rep2, on="name_wo_rep_w_isoform", how="left",  suffixes=["_rep1", "_rep2"])
# Drop every row containing a missing value, which includes rep1 rows that found no
# rep2 partner in the left join.
merged.dropna(inplace=True)

In [None]:
# Row-wise lookup of the protein column that matches each row's RT value.
merged["actual_protein"] = merged.apply(get_actual_protein, axis=1)

In [None]:
# Pearson correlation matrix of the relative abundances in both replicates; the
# off-diagonal entry is the correlation coefficient.
counts1 = merged["percent_rep1"].values
counts2 = merged["percent_rep2"].values
np.corrcoef(counts1, counts2)

In [None]:
# Mean relative abundance of both replicates, computed column-wise rather than with a
# Python-level apply over every row.
merged["mean_percent"] = (merged["percent_rep1"] + merged["percent_rep2"]) / 2

In [None]:
# Pearson correlation of exactly the values that are plotted, so the number in the
# title cannot drift away from the data (it used to be hard-coded).
pearson_r = np.corrcoef(merged["percent_rep1"].values, merged["percent_rep2"].values)[0, 1]
# Both axes are logarithmic and cannot display 0, so the identity line starts at the
# smallest plotted value; a line starting at 0 would lose its first point.
line_start = float(min(merged["percent_rep1"].min(), merged["percent_rep2"].min()))

fig = px.scatter(merged, x="percent_rep1", y="percent_rep2", color="actual_protein", hover_data=["unique_name_rep1", "isoform_rep1"])
# Identity line: points on it have the same relative abundance in both replicates.
fig.add_trace(go.Scatter(x=[line_start,100], y=[line_start,100], mode="lines", line_color="black", line_width=2))
# NOTE(review): "abundace" in the title and file name is a typo for "abundance"; it is
# left unchanged here to keep existing output names stable.
fig.update_layout(title = f"Relative abundace of each isoform per sample for both replicates<br>R={pearson_r:.3f}")
fig.update_traces(marker=dict(size=5, opacity=0.8,line=dict(width=1)))
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")
fig.write_image("figures/allow_single_bc_relative_abundace_correlation.svg", width=500, height=400)
fig.show(width=900, height=800)

In [None]:
# Save the merged table in its current state; it is reloaded from this file for the
# filtered correlation further down.
merged.to_pickle("merged_relative_abundances.pickle")

In [None]:
# Mean relative abundance of both replicates per isoform, RT2 colour scheme.
# The merged table carries suffixed column names, so the category orders are
# registered under those names as well.
order_dict["isoform_rep1"] = list(isoform_colors_RT2.keys())
order_dict["DMS_conc_rep1"] = order_dict["DMS_conc"]
# `order_dict` has no "treatment" entry (reading it raised KeyError); the treatment
# order is defined in `order`.
order_dict["treatment_rep1"] = order["treatment"]
fig = px.bar(merged, x="DMS_conc_rep1", facet_col ="RT_and_location_rep1", facet_row ="treatment_rep1", y="mean_percent", color="isoform_rep1", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT2)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_mean_color_RT2.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

In [None]:
# Mean relative abundance of both replicates per isoform, RT3 colour scheme.
# The merged table carries suffixed column names, so the category orders are
# registered under those names as well.
order_dict["isoform_rep1"] = list(isoform_colors_RT3.keys())
order_dict["DMS_conc_rep1"] = order_dict["DMS_conc"]
# `order_dict` has no "treatment" entry (reading it raised KeyError); the treatment
# order is defined in `order`.
order_dict["treatment_rep1"] = order["treatment"]
fig = px.bar(merged, x="DMS_conc_rep1", facet_col ="RT_and_location_rep1", facet_row ="treatment_rep1", y="mean_percent", color="isoform_rep1", barmode="stack", category_orders=order_dict, color_discrete_map = isoform_colors_RT3)
fig.update_traces(width=0.8, marker=dict(line_width=1, line_color="black"))
fig.write_image(f"{figure_dir}/relative_expresion_mean_color_RT3.svg", width=1000,height=400)
fig.show(renderer="svg", width=1000,height=400)

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT1_0mM_cell_native":
# inner ring = protein_RT1_rep1, outer ring = isoform_rep1, coloured by protein.
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT1_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["protein_RT1_rep1", "isoform_rep1"], title="uniquely assigned reads per sample", color="protein_RT1_rep1",color_discrete_map = isoform_colors_RT2)

fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_mean_RT1.svg")
fig.show()

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT2_0mM_cell_native":
# inner ring = protein_RT2_rep1, outer ring = isoform_RT2_rep1.
# Colours are looked up via isoform_rep1 (not the isoform_RT2_rep1 ring column).
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT2_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["protein_RT2_rep1", "isoform_RT2_rep1"], title="uniquely assigned reads per sample", color="isoform_rep1",color_discrete_map = isoform_colors_RT2)

fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_mean_RT2.svg")
fig.show()

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT3_0mM_cell_native":
# inner ring = protein_RT3_rep1, outer ring = isoform_RT3_rep1.
# Colours are looked up via isoform_rep1 in the RT3 colour scheme.
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT3_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["protein_RT3_rep1", "isoform_RT3_rep1"], title="uniquely assigned reads per sample", color="isoform_rep1",color_discrete_map = isoform_colors_RT3)
fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_mean_RT3.svg")
fig.show()

In [None]:
# Load the pickled annotation table and rename its isoform column to
# "isoform_id_rep1", the key used when joining it onto `merged`.
gtf_df = pd.read_pickle("references/gtf_dataframe_PCR1_2_3.pickle").rename(columns={"isoform_name" : "isoform_id_rep1"})

In [None]:
# Left join: keep every row of `merged` and attach the columns of gtf_df that match on
# isoform_id_rep1.
merged = merged.merge(gtf_df, on="isoform_id_rep1", how="left")

In [None]:
def get_first_acceptor(splice_sites):
    """Return "A" plus the text after the first "A" in the first splice-site entry.

    Only ``splice_sites[0]`` is inspected: it is split on "A" and the piece that
    follows the first "A" is returned with the "A" prefixed again.

    Parameters
    ----------
    splice_sites : sequence of str
        Splice-site labels of one isoform.

    Returns
    -------
    str
        The acceptor label, or ``"none"`` when ``splice_sites`` is missing, empty,
        not indexable, or its first entry contains no "A".
    """
    try:
        return "A" + splice_sites[0].split("A")[1]
    # Only the expected lookup/parsing failures map to "none"; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare `except:`.
    except (TypeError, IndexError, KeyError, AttributeError):
        return "none"
# First acceptor label of every row, derived from its actual_splice_sites entry.
merged["first_acceptor"] = merged["actual_splice_sites"].apply(get_first_acceptor)

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT1_0mM_cell_native":
# inner ring = first_acceptor, outer ring = isoform_rep1, coloured by protein_RT1_rep1.
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT1_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["first_acceptor", "isoform_rep1"], title="uniquely assigned reads per sample", color="protein_RT1_rep1",color_discrete_map = isoform_colors_RT2)

fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_by_first_acceptor_mean_RT1.svg")
fig.show()

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT2_0mM_cell_native":
# inner ring = first_acceptor, outer ring = isoform_RT2_rep1, coloured via isoform_rep1.
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT2_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["first_acceptor", "isoform_RT2_rep1"], title="uniquely assigned reads per sample", color="isoform_rep1",color_discrete_map = isoform_colors_RT2)

fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_by_first_acceptor_mean_RT2.svg")
fig.show()

In [None]:
# Sunburst of mean_percent for the sample "Rep1_RT3_0mM_cell_native":
# inner ring = first_acceptor, outer ring = isoform_RT3_rep1, coloured via isoform_rep1
# in the RT3 colour scheme.
this_df = merged[merged["unique_name_rep1"] == "Rep1_RT3_0mM_cell_native"]
fig = px.sunburst(this_df, values="mean_percent", path=["first_acceptor", "isoform_RT3_rep1"], title="uniquely assigned reads per sample", color="isoform_rep1",color_discrete_map = isoform_colors_RT3)
fig.write_image(f"{figure_dir}/sunburst_isoform_ratios_by_first_acceptor_mean_RT3.svg")
fig.show()

In [None]:
# One colour per protein label. The key order doubles as the legend order where the
# keys are passed as the category order of "actual_protein".
protein_color_dict = {
    "unspliced" : "black",
    "Env" : "rgb(58,108,207)",
    "Nef" : "rgb(23,188,115)",
    "Vpr" : "rgb(207,37,55)",
    "Vif" : "rgb(161,77,196)",
    "Tat" : "rgb(221,168,200)",
    "Rev" : "rgb(230,126,34)",
    "sORF" : "rgb(177,153,109)",
    # placeholder label for rows without a protein name
    "-" : "grey"
}

In [None]:
# Relabel the blank protein name (" ") as "-", the key used for it in protein_color_dict.
merged.loc[(merged["actual_protein"] == " "), "actual_protein"] = "-"

In [None]:
# Number of rows per protein label.
merged["actual_protein"].value_counts()

In [None]:
# Pearson correlation matrix of the relative abundances in both replicates.
np.corrcoef(merged["percent_rep1"].values, merged["percent_rep2"].values)

In [None]:
# Pearson correlation of exactly the values that are plotted, so the number in the
# title cannot drift away from the data (it used to be hard-coded).
pearson_r = np.corrcoef(merged["percent_rep1"].values, merged["percent_rep2"].values)[0, 1]
# Both axes are logarithmic and cannot display 0, so the identity line starts at the
# smallest plotted value; a line starting at 0 would lose its first point.
line_start = float(min(merged["percent_rep1"].min(), merged["percent_rep2"].min()))

# Replicate 1 vs. replicate 2 relative abundance, coloured by protein.
fig = px.scatter(merged, render_mode="svg", x="percent_rep1", y="percent_rep2", color="actual_protein", category_orders={"actual_protein": list(protein_color_dict.keys())}, color_discrete_map = protein_color_dict, hover_data=["unique_name_rep1", "isoform_rep1"])
# Identity line: points on it have the same relative abundance in both replicates.
fig.add_trace(go.Scatter(x=[line_start,100], y=[line_start,100], mode="lines", line_color="black", line_width=2))
# NOTE(review): "abundace" in the title and file name is a typo for "abundance"; it is
# left unchanged here to keep existing output names stable.
fig.update_layout(title = f"Relative abundace of each isoform per sample for both replicates<br>R={pearson_r:.3f}")
fig.update_traces(marker=dict(size=5,line=dict(width=1)))
fig.update_xaxes(type="log", showgrid=True)
fig.update_yaxes(type="log", showgrid=True)
fig.write_image(f"{figure_dir}/rep1_vs2_relative_abundace_correlation.svg", width=500, height=400)
fig.show(renderer="svg", width=900, height=800)

In [None]:
# Pearson correlation of exactly the values that are plotted, so the number in the
# title cannot drift away from the data (it used to be hard-coded).
pearson_r = np.corrcoef(merged["percent_rep1"].values, merged["percent_rep2"].values)[0, 1]
# Both axes are logarithmic and cannot display 0, so the identity line starts at the
# smallest plotted value; a line starting at 0 would lose its first point.
line_start = float(min(merged["percent_rep1"].min(), merged["percent_rep2"].min()))

# Replicate 1 vs. replicate 2 relative abundance, coloured by DMS concentration.
# The legend is ordered by concentration. The protein category order and protein
# colour map that were passed before do not apply to DMS labels and had no effect.
# NOTE(review): `colorscale` holds six Plasma shades, matching the six DMS levels —
# consider color_discrete_sequence=colorscale if that was the intent.
fig = px.scatter(merged, render_mode="svg", x="percent_rep1", y="percent_rep2", color="DMS_conc_rep1", category_orders={"DMS_conc_rep1": order_dict["DMS_conc"]}, hover_data=["unique_name_rep1", "isoform_rep1"])
# Identity line: points on it have the same relative abundance in both replicates.
fig.add_trace(go.Scatter(x=[line_start,100], y=[line_start,100], mode="lines", line_color="black", line_width=2))
fig.update_layout(title = f"Relative abundace of each isoform per sample for both replicates<br>R={pearson_r:.3f}")
fig.update_traces(marker=dict(size=5,line=dict(width=1)))
fig.update_xaxes(type="log", showgrid=True)
fig.update_yaxes(type="log", showgrid=True)
fig.write_image(f"{figure_dir}/rep1_vs2_relative_abundace_correlation_color_DMS.svg", width=500, height=400)
fig.show(renderer="svg", width=900, height=800)

# Correlation for isoforms with at least 1 % in both replicates

In [None]:
# Reload the table as it was saved to "merged_relative_abundances.pickle".
# NOTE(review): that file was written before the annotation merge, so columns added to
# `merged` afterwards (e.g. first_acceptor) are discarded here — confirm this is intended.
merged = pd.read_pickle("merged_relative_abundances.pickle")

In [None]:
# Keep only rows whose relative abundance is above 1 % in both replicates.
merged_filtered = merged[(merged["percent_rep1"] > 1) & (merged["percent_rep2"] > 1) ]

In [None]:
# Pearson correlation matrix of the filtered relative abundances.
counts1 = merged_filtered["percent_rep1"].values
counts2 = merged_filtered["percent_rep2"].values
import numpy as np
np.corrcoef(counts1, counts2)

In [None]:
# Linear regression of replicate 2 on replicate 1 for the filtered values.
stats.linregress(counts1, counts2)

In [None]:
# Pearson correlation of exactly the values that are plotted, so the number in the
# title cannot drift away from the data (it used to be hard-coded).
pearson_r = np.corrcoef(merged_filtered["percent_rep1"].values, merged_filtered["percent_rep2"].values)[0, 1]

# Replicate 1 vs. replicate 2 relative abundance for the filtered rows, on linear axes.
fig = px.scatter(merged_filtered, x="percent_rep1", y="percent_rep2", color="protein_RT2_rep1", hover_data=["unique_name_rep1", "isoform_rep1"])
# Identity line: points on it have the same relative abundance in both replicates.
fig.add_trace(go.Scatter(x=[0,100], y=[0,100], mode="lines", line_color="black", line_width=2))
fig.update_layout(title = f"Relative abundace of each isoform with at least 1% per sample for both replicates<br>R={pearson_r:.3f}")
fig.write_image("relative_abundace_min1_correlation.svg", width=900, height=800)
fig.show(width=900, height=800)

In [None]:
# One row per (replicate-free sample name, isoform) pair with the smallest count_rep1
# and count_rep2 of that group; rows with missing values are dropped.
reformat_df = merged_filtered.groupby(["name_wo_rep_rep1", "isoform_rep1"])[["count_rep1", "count_rep2"]].min().dropna().reset_index()
reformat_df

In [None]:
# Store both count columns as integers.
reformat_df["count_rep1"] = reformat_df["count_rep1"].astype(int)
reformat_df["count_rep2"] = reformat_df["count_rep2"].astype(int)

In [None]:
# Coverage guaranteed in both replicates: the smaller of the two counts per row.
reformat_df["min_coverage"] = reformat_df[["count_rep1", "count_rep2"]].min(axis=1)

In [None]:
# Axis labels: every isoform and sample present in reformat_df, sorted alphabetically.
isoforms = sorted(np.unique(reformat_df["isoform_rep1"].values))
samples = sorted(np.unique(reformat_df["name_wo_rep_rep1"].values))
native_samples = [sample for sample in samples if "native" in sample]

# (sample, isoform) -> minimum coverage, so each matrix cell is a single dictionary
# lookup instead of a filtered Series being assigned to one array element.
coverage_by_pair = dict(zip(zip(reformat_df["name_wo_rep_rep1"], reformat_df["isoform_rep1"]), reformat_df["min_coverage"]))

# Rows follow native_samples, columns follow isoforms; pairs that are absent from
# reformat_df keep the value 0.
data = np.full((len(native_samples), len(isoforms)), 0)
for row, barcode in enumerate(native_samples):
    for col, isoform in enumerate(isoforms):
        data[row, col] = coverage_by_pair.get((barcode, isoform), 0)


In [None]:
import plotly.express as px

# Heat map of the minimum coverage for the native samples (rows) by isoform (columns).
# zmax caps the colour range at 4000. The colour stops are fractions of 4000 and
# correspond to the values 0, 200, 500, 2000, 3000 and 4000 — this assumes the colour
# range starts at 0 (zmin is not set explicitly).
fig = px.imshow(data, x=isoforms, y=native_samples, zmax = 4000, color_continuous_scale=[(0.00, "black"), (200/4000, "darkred"), (500/4000, "yellow"),(2000/4000, "yellow"),  (3000/4000, "lightgreen"), (1, "green")], aspect="auto", height=1000)
# Colour bar ticks are placed at fixed coverage values.
fig.update_layout(title="Minimum coverage per isoform of both repeats<br> for all native samples", coloraxis_colorbar=dict(tickvals = [0,200,500,1000,2000,4000], ticktext = ["0", "200", "500", "1000", "2000", "4000"]))
fig.show()

In [None]:
# Axis labels: every isoform and sample present in reformat_df, sorted alphabetically.
isoforms = sorted(np.unique(reformat_df["isoform_rep1"].values))
samples = sorted(np.unique(reformat_df["name_wo_rep_rep1"].values))
deprot_samples = [sample for sample in samples if "deprot" in sample]

# (sample, isoform) -> minimum coverage, so each matrix cell is a single dictionary
# lookup instead of a filtered Series being assigned to one array element.
coverage_by_pair = dict(zip(zip(reformat_df["name_wo_rep_rep1"], reformat_df["isoform_rep1"]), reformat_df["min_coverage"]))

# Rows follow deprot_samples, columns follow isoforms; pairs that are absent from
# reformat_df keep the value 0.
data = np.full((len(deprot_samples), len(isoforms)), 0)
for row, barcode in enumerate(deprot_samples):
    for col, isoform in enumerate(isoforms):
        data[row, col] = coverage_by_pair.get((barcode, isoform), 0)

In [None]:
# Heat map of the minimum coverage for the deproteinated samples (rows) by isoform
# (columns). zmax caps the colour range at 4000. The colour stops are fractions of 4000
# and correspond to 0, 200, 500, 2000, 3000 and 4000 — this assumes the colour range
# starts at 0 (zmin is not set explicitly).
fig = px.imshow(data, x=isoforms, y=deprot_samples, zmax = 4000, color_continuous_scale=[(0.00, "black"), (200/4000, "darkred"), (500/4000, "yellow"),(2000/4000, "yellow"),  (3000/4000, "lightgreen"), (1, "green")], aspect="auto", height=1000)
# Colour bar ticks are placed at fixed coverage values.
fig.update_layout(title="Minimum coverage per isoform of both repeats<br> for all deproteinated samples", coloraxis_colorbar=dict(tickvals = [0,200,500,1000,2000,4000], ticktext = ["0", "200", "500", "1000", "2000", "4000"]))
fig.show()

In [None]:
# Axis labels: every isoform and sample present in reformat_df, sorted alphabetically.
isoforms = sorted(np.unique(reformat_df["isoform_rep1"].values))
samples = sorted(np.unique(reformat_df["name_wo_rep_rep1"].values))
cell_samples = [sample for sample in samples if "cell" in sample]

# (sample, isoform) -> minimum coverage, so each matrix cell is a single dictionary
# lookup instead of a filtered Series being assigned to one array element.
coverage_by_pair = dict(zip(zip(reformat_df["name_wo_rep_rep1"], reformat_df["isoform_rep1"]), reformat_df["min_coverage"]))

# Rows follow cell_samples, columns follow isoforms; pairs that are absent from
# reformat_df keep the value 0.
data = np.full((len(cell_samples), len(isoforms)), 0)
for row, barcode in enumerate(cell_samples):
    for col, isoform in enumerate(isoforms):
        data[row, col] = coverage_by_pair.get((barcode, isoform), 0)

In [None]:
# Heat map of the minimum coverage for the cell samples (rows) by isoform (columns).
# zmax caps the colour range at 4000. The colour stops are fractions of 4000 and
# correspond to 0, 200, 500, 2000, 3000 and 4000 — this assumes the colour range
# starts at 0 (zmin is not set explicitly).
fig = px.imshow(data, x=isoforms, y=cell_samples, zmax = 4000, color_continuous_scale=[(0.00, "black"), (200/4000, "darkred"), (500/4000, "yellow"),(2000/4000, "yellow"),  (3000/4000, "lightgreen"), (1, "green")], aspect="auto", height=1000)
# Colour bar ticks are placed at fixed coverage values.
fig.update_layout(title="Minimum coverage per isoform of both repeats<br> for all cell samples", coloraxis_colorbar=dict(tickvals = [0,200,500,1000,2000,4000], ticktext = ["0", "200", "500", "1000", "2000", "4000"]))
fig.show()

In [None]:
# Axis labels: every isoform and sample present in reformat_df, sorted alphabetically.
isoforms = sorted(np.unique(reformat_df["isoform_rep1"].values))
samples = sorted(np.unique(reformat_df["name_wo_rep_rep1"].values))
virion_samples = [sample for sample in samples if "virion" in sample]

# (sample, isoform) -> minimum coverage, so each matrix cell is a single dictionary
# lookup instead of a filtered Series being assigned to one array element.
coverage_by_pair = dict(zip(zip(reformat_df["name_wo_rep_rep1"], reformat_df["isoform_rep1"]), reformat_df["min_coverage"]))

# Rows follow virion_samples, columns follow isoforms; pairs that are absent from
# reformat_df keep the value 0.
data = np.full((len(virion_samples), len(isoforms)), 0)
for row, barcode in enumerate(virion_samples):
    for col, isoform in enumerate(isoforms):
        data[row, col] = coverage_by_pair.get((barcode, isoform), 0)

In [None]:
# Heat map of the minimum coverage for the virion samples (rows) by isoform (columns).
# zmax caps the colour range at 4000. The colour stops are fractions of 4000 and
# correspond to 0, 200, 500, 2000, 3000 and 4000 — this assumes the colour range
# starts at 0 (zmin is not set explicitly).
fig = px.imshow(data, x=isoforms, y=virion_samples, zmax = 4000, color_continuous_scale=[(0.00, "black"), (200/4000, "darkred"), (500/4000, "yellow"),(2000/4000, "yellow"),  (3000/4000, "lightgreen"), (1, "green")], aspect="auto", height=1000)
# Colour bar ticks are placed at fixed coverage values.
fig.update_layout(title="Minimum coverage per isoform of both repeats<br> for all virion samples", coloraxis_colorbar=dict(tickvals = [0,200,500,1000,2000,4000], ticktext = ["0", "200", "500", "1000", "2000", "4000"]))
fig.show()