In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import dictys
from dictys.net import stat
import joblib

In [2]:
from utils_custom import *

In [3]:
# Define file paths
# Folder holding the dictys outputs read by the cells below
# (dynamic.h5 and ranked_tfs_pb.csv).
# Set the DICTYS_OUTPUT_FOLDER environment variable to point the notebook at a
# different run without editing this cell; when it is unset, the original
# cluster path is used unchanged.
output_folder = os.environ.get(
    'DICTYS_OUTPUT_FOLDER',
    '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/actb1_added_v2/output',
)

In [4]:
# Load data: read the dynamic network saved as dynamic.h5 in the output folder
dynamic_network_path = os.path.join(output_folder, 'dynamic.h5')
dictys_dynamic_object = dictys.net.dynamic_network.from_file(dynamic_network_path)

# Rank TFs in an unbiased way, based on the characteristics of their expression and regulation curves

# Get the top genes of each TF subnetwork based on the characteristics of the beta curves

In [5]:
# Read the pre-computed TF ranking table and keep its first 15 rows
ranked_tfs_path = os.path.join(output_folder, 'ranked_tfs_pb.csv')
ranked_tfs_pb = pd.read_csv(ranked_tfs_path)
pb_top_tfs = ranked_tfs_pb.iloc[:15]
# Plain Python list with the names of the selected TFs
pb_top_tfs_list = pb_top_tfs['TF_name'].tolist()
# Look up the network indices of those TFs; the two extra return values of
# get_tf_indices are discarded
tf_indices_top_pb_tfs, _, _ = get_tf_indices(dictys_dynamic_object, pb_top_tfs_list)
display(tf_indices_top_pb_tfs)

[225, 305, 25, 150, 243, 251, 299, 303, 68, 36, 48, 133, 46, 118, 134]

In [7]:
# Set up the smoothed and binarized network statistics used to pick non-sparse genes.
# linspace(...) returns the evaluation points `pts` and a smoothing wrapper `fsmooth`.
# NOTE(review): the arguments are presumably (start, stop, number of points,
# smoothing distance); the computed array further down has 100 entries on its
# last axis, which matches the third argument — confirm against the dictys docs.
pts, fsmooth = dictys_dynamic_object.linspace(0,2,100,0.0005)
# Network statistic of the dynamic object, wrapped by the smoothing function
stat1_net = fsmooth(stat.net(dictys_dynamic_object))
# Binarized version of the smoothed network statistic.
# NOTE(review): sparsity=0.01 is presumably the fraction of edges kept — confirm.
stat1_netbin = stat.fbinarize(stat1_net,sparsity=0.01)

In [10]:
# Evaluate the binarized network statistic at the points in `pts`.
# The resulting array is used below to decide which genes to keep based on sparsity.
dnetbin = stat1_netbin.compute(pts)

In [11]:
# Shape check. In the cells below, axis 0 is indexed with TF indices, axis 1 is
# filtered by gene, and axis 2 is treated as the time axis.
display(dnetbin.shape)

(551, 11907, 100)

In [12]:
# Select the rows (axis 0) belonging to the top TFs and keep every gene (axis 1)
# and every time point (axis 2).
# Indexing only the first axis returns the same array as the previous
# np.ix_(tf_indices, range(n_genes), range(n_points)) selection, but avoids
# fancy-indexing the two axes that are kept whole.
# dtype=np.intp keeps an empty TF list usable as an index array.
top_tf_rows = np.asarray(tf_indices_top_pb_tfs, dtype=np.intp)
subnetworks = dnetbin[top_tf_rows, :, :]
display(subnetworks.shape)

(15, 11907, 100)

In [27]:
# Fraction of non-zero entries per gene, pooled over the selected TFs (axis 0)
# and the time points (axis 2); the result has one value per gene (axis 1)
gene_density = (subnetworks != 0).mean(axis=(0, 2))
# Keep genes that are non-zero in more than 5% of the (TF, time point) entries.
# The threshold is named so that the comment and the cutoff cannot drift apart.
min_gene_density = 0.05
genes_to_keep = gene_density > min_gene_density
# Positions along the gene axis of the retained genes
genes_to_keep_indices = np.flatnonzero(genes_to_keep)
# Create reverse mapping of ndict: index -> gene_name
ndict = dictys_dynamic_object.ndict
index_to_gene = {idx: name for name, idx in ndict.items()}
# Gene names of the retained indices, in index order
kept_gene_names = [index_to_gene[idx] for idx in genes_to_keep_indices]
print(len(genes_to_keep_indices))

647


In [28]:
# Restrict the gene axis (axis 1) to the retained, non-sparse genes
filtered_subnetworks = np.take(subnetworks, genes_to_keep_indices, axis=1)
print(filtered_subnetworks.shape)

(15, 647, 100)


In [None]:
import pandas as pd
import numpy as np
from multiprocessing import Pool
import math
from tqdm import tqdm

def process_chunk(args):
    """Compute curve characteristics for every row of one chunk.

    Parameters
    ----------
    args : tuple
        ``(chunk_df, dtime)``, packed into one argument so the function can be
        used with ``Pool.imap``. ``chunk_df`` holds one curve per row and
        ``dtime`` is forwarded unchanged to ``compute_curve_characteristics``.

    Returns
    -------
    pd.DataFrame
        One row per index label of ``chunk_df``, holding the first row returned
        by ``compute_curve_characteristics`` for that curve.
    """
    chunk_df, dtime = args

    def _characterise(label):
        # The helper is given a one-row frame, so the curve is re-wrapped and transposed
        single_curve = pd.DataFrame(chunk_df.loc[label]).T
        return compute_curve_characteristics(single_curve, dtime).iloc[0]

    per_curve = {label: _characterise(label) for label in chunk_df.index}
    return pd.DataFrame.from_dict(per_curve, orient='index')

# Main processing code
# NOTE(review): betas_dcurve is not defined in the cells shown here; it is
# presumably a DataFrame with one curve per row and one column per time point,
# created in an earlier cell — confirm before running on a fresh kernel.
# Time axis handed to every worker: evenly spaced on [0, 1], one value per column
dtime = pd.Series(np.linspace(0, 1, betas_dcurve.shape[1]))

# Split the rows into at most n_cores consecutive chunks
n_cores = 8  # Adjust based on your CPU
chunk_size = math.ceil(len(betas_dcurve) / n_cores)
chunks = [
    (betas_dcurve.iloc[start:start + chunk_size], dtime)
    for start in range(0, len(betas_dcurve), chunk_size)
]

# Process the chunks in a worker pool; imap preserves chunk order and lets
# tqdm advance once per finished chunk
with Pool(n_cores) as pool:
    chunk_iterator = pool.imap(process_chunk, chunks)
    results = list(tqdm(chunk_iterator, total=len(chunks), desc="Processing curves"))

# Stack the per-chunk frames and order the rows by index
final_dchar = pd.concat(results).sort_index()
# final_dchar.to_csv(os.path.join(output_folder, 'betas_chars_ntfs_ngenes.csv'))


In [None]:
def get_top_curves_by_category(dchar, ntops=(20,20,20,30)):
    """
    Pick the highest-ranked (TF, target) pairs for four curve patterns.

    Selection is purely by rank on one column; no sign filter is applied.

    Parameters:
    -----------
    dchar : pd.DataFrame
        Curve characteristics indexed by (TF, Target), with the columns
        'Terminal logFC', 'Transient logFC' and 'Switching time'.
    ntops : tuple
        Number of curves to keep for (activating, inactivating,
        transient_up, transient_down), in that order.

    Returns:
    --------
    dict
        Maps 'activating', 'inactivating', 'transient_up' and 'transient_down'
        to a list of (TF, target) tuples built from the first two entries of
        each index label. The activating and inactivating lists are ordered by
        ascending 'Switching time'; the transient lists keep their ranking order.
    """
    # (category, ranking column, ascending ranking?, re-order by switching time?)
    selection_plan = (
        ('activating', 'Terminal logFC', False, True),
        ('inactivating', 'Terminal logFC', True, True),
        ('transient_up', 'Transient logFC', False, False),
        ('transient_down', 'Transient logFC', True, False),
    )

    pairs_by_category = {}
    for position, (label, column, ascending, by_switching_time) in enumerate(selection_plan):
        selected = dchar.sort_values(column, ascending=ascending).head(ntops[position])
        if by_switching_time:
            selected = selected.sort_values('Switching time')
        pairs_by_category[label] = [(key[0], key[1]) for key in selected.index]

    return pairs_by_category

# Example usage: top 20 activating / 20 inactivating / 30 transient-up / 30 transient-down pairs
sorted_categories = get_top_curves_by_category(final_dchar, ntops=(20,20,30,30))

# Print one header per category followed by its TF-target pairs
for category in sorted_categories:
    print(f"\nTop pairs for {category}:")
    for tf, target in sorted_categories[category]:
        print(f"TF: {tf}, Target: {target}")