In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import dictys
from dictys.net import stat
import joblib
import pickle
from scipy.stats import median_abs_deviation, hypergeom
import math

In [2]:
from utils_custom import *

In [3]:
# Define file paths
# Output directory of the dictys run; the only place the run location is spelled out.
data_folder = '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output'
# `intermediate_tmp_files` sub-directory of data_folder. Derived rather than
# repeated so the two paths cannot drift apart when the run directory changes.
output_folder = f'{data_folder}/intermediate_tmp_files'

In [4]:
# Load the dynamic network saved as `dynamic.h5` inside data_folder.
dictys_dynamic_object = dictys.net.dynamic_network.from_file(
    os.path.join(data_folder, 'dynamic.h5'),
)

## TF expression + TF degree dynamics

In [25]:
# Curves computed in "regulation" mode with start=0, stop=2; the `_pb` suffix
# marks everything derived from this start/stop pair in the cells below.
lcurve_tf_pb, dtime_pb = compute_expression_regulation_curves(
    dictys_dynamic_object,
    start=0,
    stop=2,
    num=20,
    dist=0.001,
    mode="regulation",
)

In [26]:
# Preview the first rows, then the overall (rows, columns) shape.
display(lcurve_tf_pb.head(), lcurve_tf_pb.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
AHR,10.197217,10.192293,10.192293,10.184875,10.164907,10.147205,10.11244,10.087463,10.034799,9.964341,9.865733,9.744834,9.584963,9.398744,9.209453,9.027906,8.915879,8.794416,8.689998,8.629357
ARID3A,4.807355,4.906891,4.857981,4.807355,4.70044,4.70044,4.584963,4.643856,4.523562,4.459432,4.247928,4.0,3.70044,3.70044,3.70044,3.584963,3.459432,3.459432,3.584963,3.584963
ARID3B,7.577429,7.60733,7.622052,7.60733,7.577429,7.569856,7.577429,7.491853,7.507795,7.507795,7.442943,7.33985,7.189825,7.149747,7.044394,7.011227,6.918863,6.894818,6.882643,6.84549
ARID5A,6.882643,6.857981,6.84549,6.820179,6.78136,6.70044,6.599913,6.584963,6.459432,6.285402,6.0,5.754888,5.459432,5.209453,5.129283,5.129283,5.129283,5.0,5.087463,5.087463
ARID5B,4.169925,3.906891,3.807355,3.807355,3.70044,3.459432,3.169925,3.0,3.169925,3.321928,4.0,3.906891,4.169925,4.0,3.807355,4.087463,4.0,4.169925,4.247928,4.459432


(551, 20)

#### Compute curve characteristics

In [27]:
# Summarise each curve; the displayed table has the columns
# 'Terminal logFC', 'Transient logFC' and 'Switching time'.
dchar_tf_pb = compute_curve_characteristics(lcurve_tf_pb, dtime_pb)
display(dchar_tf_pb.head(), dchar_tf_pb.shape)

Unnamed: 0,Terminal logFC,Transient logFC,Switching time
AHR,-1.56786,0.0,0.663595
ARID3A,-1.222392,-0.005311,0.50831
ARID3B,-0.731939,0.005496,0.64615
ARID5A,-1.79518,-0.004603,0.505593
ARID5B,0.289507,-0.38968,0.959504


(551, 3)

In [28]:
from scipy import stats

def classify_tf_activity(df, terminal_col, transient_col):
    """
    Label every row of ``df`` with an activity class and attach z-scores and ranks.

    Both logFC columns are z-scored with ``stats.zscore``. A row whose absolute
    terminal z-score is at least as large as its absolute transient z-score is
    labelled from the sign of the terminal z-score ('Cumulative' when positive,
    'Reductive' otherwise). Every other row is labelled from the sign of the
    transient z-score ('Bell wave' when positive, 'U-shaped' otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two logFC columns. It is modified in place: the
        columns 'tf_class', 'terminal_z', 'transient_z', 'terminal_rank' and
        'transient_rank' are added (or overwritten).
    terminal_col : label of the terminal logFC column.
    transient_col : label of the transient logFC column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    z_terminal = stats.zscore(df[terminal_col])
    z_transient = stats.zscore(df[transient_col])

    # Plain arrays so the comparison below is positional, like the row order.
    terminal_vals = np.asarray(z_terminal)
    transient_vals = np.asarray(z_transient)

    # Ties in magnitude go to the terminal effect (>=).
    terminal_dominates = np.abs(terminal_vals) >= np.abs(transient_vals)
    labels = np.where(
        terminal_dominates,
        np.where(terminal_vals > 0, 'Cumulative', 'Reductive'),
        np.where(transient_vals > 0, 'Bell wave', 'U-shaped'),
    )

    def dense_rank_by_magnitude(z_column):
        # Rank 1 = largest |z|; equal magnitudes share a rank.
        return z_column.abs().rank(method='dense', ascending=False).astype(int)

    df['tf_class'] = labels.tolist()
    df['terminal_z'] = z_terminal
    df['transient_z'] = z_transient
    df['terminal_rank'] = dense_rank_by_magnitude(df['terminal_z'])
    df['transient_rank'] = dense_rank_by_magnitude(df['transient_z'])
    return df

def get_top_k_tfs_by_class(df, k=20):
    """
    Return the top ``k`` TFs of every class, one column per class.

    For each row the "relevant" rank is ``terminal_rank`` when the absolute
    terminal z-score is at least the absolute transient z-score, and
    ``transient_rank`` otherwise. Within each ``tf_class`` the ``k`` rows with
    the smallest relevant rank are kept, and their index labels are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tf_class', 'terminal_z', 'transient_z',
        'terminal_rank' and 'transient_rank'; the index labels are used as TF
        names. The frame is not modified.
    k : int, default 20
        Maximum number of TFs reported per class.

    Returns
    -------
    pandas.DataFrame
        One column per class, in order of first appearance in ``df``. Classes
        with fewer than the longest column's entries are padded with ``None``.
        An empty ``df`` yields an empty DataFrame.
    """
    # Vectorized choice of the rank that matches the dominating effect;
    # ties in magnitude go to the terminal rank (>=).
    terminal_dominates = df['terminal_z'].abs() >= df['transient_z'].abs()
    relevant_rank = df['terminal_rank'].where(terminal_dominates, df['transient_rank'])

    # Attach the helper column to a copy so the caller's frame stays untouched.
    ranked = df.assign(relevant_rank=relevant_rank)

    top_tfs_dict = {}
    for class_name in ranked['tf_class'].unique():
        class_rows = ranked[ranked['tf_class'] == class_name]
        top_tfs_dict[class_name] = class_rows.nsmallest(k, 'relevant_rank').index.tolist()

    # Columns of a DataFrame need equal length; default=0 covers the case of
    # no classes at all (empty input).
    max_len = max((len(names) for names in top_tfs_dict.values()), default=0)
    padded = {
        class_name: names + [None] * (max_len - len(names))
        for class_name, names in top_tfs_dict.items()
    }
    return pd.DataFrame(padded)

In [29]:
# classify_tf_activity adds its columns to the frame it receives and returns
# that same frame, so dclass_tf_pb and dchar_tf_pb refer to one object.
dclass_tf_pb = classify_tf_activity(
    dchar_tf_pb,
    terminal_col='Terminal logFC',
    transient_col='Transient logFC',
)
display(dclass_tf_pb.head(), dclass_tf_pb.shape)

Unnamed: 0,Terminal logFC,Transient logFC,Switching time,tf_class,terminal_z,transient_z,terminal_rank,transient_rank
AHR,-1.56786,0.0,0.663595,Reductive,-1.015492,-0.125767,133,322
ARID3A,-1.222392,-0.005311,0.50831,Reductive,-0.766908,-0.148879,184,289
ARID3B,-0.731939,0.005496,0.64615,Reductive,-0.413998,-0.101847,286,352
ARID5A,-1.79518,-0.004603,0.505593,Reductive,-1.179063,-0.145801,104,294
ARID5B,0.289507,-0.38968,0.959504,U-shaped,0.320993,-1.821718,318,25


(551, 8)

In [30]:
# Up to 20 TF names per class, one column per class.
top_k_df = get_top_k_tfs_by_class(df=dclass_tf_pb, k=20)
display(top_k_df)

Unnamed: 0,Reductive,U-shaped,Cumulative,Bell wave
0,ETV5,MYC,ZNF441,MYPOP
1,TBX21,LEF1,TP73,SNAI3
2,EGR2,ZNF540,ZNF566,ZNF605
3,PATZ1,HSF5,ZNF222,GLI1
4,FOSL2,ARID5B,ST18,ZNF16
5,ZNF783,DDIT3,ZNF415,ZNF692
6,ZNF257,ZNF354C,ZNF610,ZNF266
7,LYL1,ZNF18,ZNF296,ZNF595
8,ZBTB6,ZNF770,ZNF548,ZNF823
9,JUNB,ZNF350,BHLHE22,MTF1


## Regulatory network

In [None]:
# Expression-mode curves with start=0, stop=2 (the `_pb` variables).
# NOTE(review): this rebinds dtime_pb, which the regulation-mode call earlier
# in the notebook also assigns with the same start/stop/num/dist — confirm
# both calls return the same time axis.
lcpm_dcurve_pb, dtime_pb = compute_expression_regulation_curves(
    dictys_dynamic_object,
    start=0,
    stop=2,
    num=20,
    dist=0.001,
    mode="expression",
)
# Expression-mode curves with start=0, stop=3 (the `_gc` variables).
lcpm_dcurve_gc, dtime_gc = compute_expression_regulation_curves(
    dictys_dynamic_object,
    start=0,
    stop=3,
    num=20,
    dist=0.001,
    mode="expression",
)
# Preview only. TODO: the intended slicing of the curves by gene name (gene
# names are the DataFrame index) is not done in this cell.
display(lcpm_dcurve_gc.head())

In [None]:
# Sample points and smoothing function, using the same (0, 3, 20, 0.001)
# arguments as the `_gc` expression curves.
pts, fsmooth = dictys_dynamic_object.linspace(0, 3, 20, 0.001)
# Smoothed network statistic.
# NOTE(review): the original note says varname=w_in loads the total effect
# network; no varname is passed here, so the default applies — confirm which
# network is intended.
stat1_net = fsmooth(stat.net(dictys_dynamic_object))
# Binarized version of the network at sparsity=0.01.
stat1_netbin = stat.fbinarize(stat1_net, sparsity=0.01)
# Pseudotime statistic evaluated at the sample points; the first element of
# the computed result is kept as a Series.
stat1_x = stat.pseudotime(dictys_dynamic_object, pts)
dtime = pd.Series(stat1_x.compute(pts)[0])