In [1]:
import numpy as np
import pandas as pd
import os
import joblib
import pickle
import math
import ast
from scipy.stats import median_abs_deviation, hypergeom, mannwhitneyu
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors

In [2]:
#import seaborn as sns
import dictys
from utils_custom import *
from episode_plots import *

In [6]:
# Define file paths 
output_folder = '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/figures'
data_folder = '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output'
input_folder = '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files'

# Cell state distributions per fate

In [None]:
# load cell_count_per_window_df from csv file
cell_count_per_window_df = pd.read_csv(os.path.join(input_folder, 'cell_count_per_window_df.csv'), header=0, index_col=0)
day_count_per_window_df = pd.read_csv(os.path.join(input_folder, 'day_count_per_window_df.csv'), header=0, index_col=0)
pseudotime_values_of_windows = pd.read_csv(os.path.join(input_folder, 'pseudotime_values_of_windows.csv'), header=None)[0].tolist()
cell_count_per_window_df.columns = cell_count_per_window_df.columns.astype(int)
day_count_per_window_df.columns = day_count_per_window_df.columns.astype(int)

In [None]:
# assign window indices to bifurcation branches
PB_fate_window_indices = [1] + list(range(97, 3, -1)) + [0] + list(range(98, 147, 1)) + [2]
GC_fate_window_indices = [1] + list(range(97, 3, -1)) + [0] + list(range(147, 193, 1)) + [3]
PB_post_bifurcation_window_indices = [0] + list(range(98, 147, 1)) + [2]
GC_post_bifurcation_window_indices = [0] + list(range(147, 193, 1)) + [3]

# Define distinct colors for better visibility
colors_cell_count = {
    'ActB-1': '#87CEFA',     # lightskyblue
    'ActB-2': '#1E90FF',     # dodgerblue
    'ActB-3': '#00008B',     # darkblue
    'ActB-4': '#9370DB',     # mediumorchid
    'GC-1': '#7BDE7B',       # custom light green
    'GC-2': '#008000',       # green
    'PB-2': '#BB3636',       # custom red
    'earlyActB': '#008080',   # teal
    'earlyPB': '#F08080'   # lightcoral
}
colors_day_count = {
    'day0_2': '#87CEFA',     # lightskyblue
    'day3_4': '#1E90FF',     # dodgerblue
    'day5_6': '#00008B'
}

In [None]:
# Create a new dataframe with only the columns in PB_fate_window_indices
df_plot = cell_count_per_window_df[PB_post_bifurcation_window_indices]
#remove ActB-1 from the dataframe rows
df_plot = df_plot.drop(index=['ActB-1', 'earlyActB'])


In [None]:
import numpy as np

# Create time bins
n_bins = 8  # Adjust this number as needed
x = [pseudotime_values_of_windows[i] for i in PB_post_bifurcation_window_indices]
x_min, x_max = min(x), max(x)

# Create bin edges
bin_edges = np.linspace(x_min, x_max, n_bins + 1)
bin_centers = [(bin_edges[i] + bin_edges[i+1]) / 2 for i in range(n_bins)]

# Initialize binned data and count time points per bin
binned_data = {state: [0] * n_bins for state in df_plot.index}
bin_counts = [0] * n_bins  # Track how many time points are in each bin

# Aggregate data into bins
for i, time_point in enumerate(x):
    # Find which bin this time point belongs to
    bin_idx = np.digitize(time_point, bin_edges) - 1
    bin_idx = max(0, min(bin_idx, n_bins - 1))  # Ensure within bounds
    
    # Add counts for each cell state to the appropriate bin
    for state in df_plot.index:
        binned_data[state][bin_idx] += df_plot.loc[state].values[i]
    
    bin_counts[bin_idx] += 1

# Average the counts within each bin (so each bin represents average composition)
for state in df_plot.index:
    for bin_idx in range(n_bins):
        if bin_counts[bin_idx] > 0:
            binned_data[state][bin_idx] = binned_data[state][bin_idx] / bin_counts[bin_idx]

# Plot
plt.figure(figsize=(6, 4))
plt.style.use('default')
plt.grid(False)

# Initialize bottom array for stacking
bottom = [0] * n_bins

# Plot each cell state as a layer in the stacked bar
for state in df_plot.index:
    y = binned_data[state]
    plt.bar(range(n_bins), y,
            label=state,
            color=colors_cell_count[state],
            bottom=bottom,
            alpha=0.8)
    
    # Update bottom for next stack layer
    bottom = [bottom[i] + y[i] for i in range(n_bins)]

plt.xlabel('Binned windows (PB branch)', fontsize=14, fontweight='bold', labelpad=15)
plt.ylabel('Average Cell Count', fontsize=14, fontweight='bold')
plt.xticks(range(n_bins), [f"Bin {i+1}" for i in range(n_bins)], fontsize=12, fontweight='bold')
plt.yticks(fontsize=12, fontweight='bold')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Print bin info for verification
print("Time points per bin:", bin_counts)
print("Total cells per bin:", [sum(binned_data[state][i] for state in df_plot.index) for i in range(n_bins)])

In [None]:
import matplotlib.pyplot as plt

# Extract the day5_6 data for PB and GC windows
pb_indices = PB_post_bifurcation_window_indices
gc_indices = GC_post_bifurcation_window_indices

pb_pseudotime = [pseudotime_values_of_windows[i] for i in pb_indices]
gc_pseudotime = [pseudotime_values_of_windows[i] for i in gc_indices]

pb_day5_6 = day_count_per_window_df.loc['day5_6', pb_indices]
gc_day5_6 = day_count_per_window_df.loc['day5_6', gc_indices]

plt.figure(figsize=(12, 6))

# Plot PB branch
plt.plot(pb_pseudotime, pb_day5_6, label='PB branch cells', color='#BB3636', linewidth=2)
plt.fill_between(pb_pseudotime, pb_day5_6, color='#BB3636', alpha=0.25)

# Plot GC branch
plt.plot(gc_pseudotime, gc_day5_6, label='GC branch cells', color='#008000', linewidth=2)
plt.fill_between(gc_pseudotime, gc_day5_6, color='#008000', alpha=0.25)

plt.xlabel('Pseudotime')
plt.ylabel('Day 5 to 6 Cell Counts')
plt.legend()
plt.tight_layout()
plt.show()

# Episodic enrichment plots

In [3]:
### Multiome LFs
# PB
pb_ep1 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep1_pb.csv"
pb_ep2 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep2_pb.csv"
pb_ep3 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep3_pb.csv"
pb_ep4 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep4_pb.csv"
# GC
gc_ep1 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep1_gc.csv"
gc_ep2 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep2_gc.csv"
gc_ep3 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep3_gc.csv"
gc_ep4 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/direct_effect_enrichment/enrichment_ep4_gc.csv"

### TF perturbation LFs
# irf4 model 
# GC
gc_irf4_ep1 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_1.csv"
gc_irf4_ep2 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_2.csv"
gc_irf4_ep3 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_3.csv"
gc_irf4_ep4 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_4.csv"
gc_irf4_ep5 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_5.csv"
gc_irf4_ep6 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_6.csv"
gc_irf4_ep7 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_7.csv"
gc_irf4_ep8 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/irf4_ko/gc_98/enrichment_episode_8.csv"

# prdm1 model 
# GC
gc_blimp1_ep1 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_1.csv"
gc_blimp1_ep2 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_2.csv"
gc_blimp1_ep3 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_3.csv"
gc_blimp1_ep4 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_4.csv"
gc_blimp1_ep5 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_5.csv"
gc_blimp1_ep6 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_6.csv"
gc_blimp1_ep7 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_7.csv"
gc_blimp1_ep8 = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output/intermediate_tmp_files/prdm1_ko/gc_98/enrichment_episode_8.csv"


In [None]:
df_ep1 = pd.read_csv(gc_blimp1_ep1)
df_ep2 = pd.read_csv(gc_blimp1_ep2)
df_ep3 = pd.read_csv(gc_blimp1_ep3)
df_ep4 = pd.read_csv(gc_blimp1_ep4)
df_ep5 = pd.read_csv(gc_blimp1_ep5)
df_ep6 = pd.read_csv(gc_blimp1_ep6)
df_ep7 = pd.read_csv(gc_blimp1_ep7)
df_ep8 = pd.read_csv(gc_blimp1_ep8)

fig, plot_data, plotted_tfs = plot_tf_episodic_enrichment_dotplot(
    dfs=[df_ep1, df_ep2, df_ep3, df_ep4, df_ep5, df_ep6, df_ep7, df_ep8],
    episode_labels=['ActB1/2', '.', '..', 'ActB3/4', '...', '....', '.....', 'late GC'],
    figsize=(4.5, 4),
    p_value_threshold=0.05,
    min_significance_threshold=0.05, 
    min_targets_in_lf=2,
    min_targets_dwnstrm=2,
    cmap_name="Greens",
    sort_by_gene_similarity=True,
    show_dendrogram=False,
    figure_title="Blimp1 KO",
    log_scale=True
)
# save the plot
#fig.savefig(os.path.join(output_folder, "prdm1_ko_episodic_enrichment_GC.pdf"), dpi=300)
#0.0035 for PB, 0.009 for GC in the gc_pb lf

No TFs meet the criteria: >= 2 LF genes AND >= 2 downstream genes


ValueError: not enough values to unpack (expected 3, got 2)