In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

In [2]:
tasks_df = pd.read_csv('data/All_Metabolic_Tasks.csv')

moran_df = pd.read_csv('data/All_MoranI_combined.csv')
pathway_df = pd.read_csv('Task_Info_with_CRC_binary.csv')
moran_df = moran_df.merge(pathway_df[['Task', 'System' ,'Subsystem']], on='Task', how='left')
moran_df.rename(columns={'System':'Top-level Pathway','Subsystem': 'Detailed Pathway'}, inplace=True)

coloc_df = pd.read_csv('data/All_Colocalization_Scores.csv')

  tasks_df = pd.read_csv('data/All_Metabolic_Tasks.csv')


In [5]:
tasks_df

Unnamed: 0,(R)-3-Hydroxybutanoate synthesis,ATP generation from glucose (hypoxic conditions) - glycolysis,ATP regeneration from glucose (normoxic conditions) - glycolysis + krebs cycle,Acetoacetate synthesis,Alanine degradation,Alanine synthesis,Arachidonate degradation,Arachidonate synthesis,Arginine degradation,Arginine synthesis,...,beta-Alanine synthesis,cis-vaccenic acid degradation,cis-vaccenic acid synthesis,gamma-Linolenate degradation,Subject_ID,Treatment_Status,Cell_ID,Cell_type,x,y
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1182412B,Untreated,TMA2_1_303,Malignant,-481975.375698,8092.453408
1,3.596589,3.596589,1.593463,3.596589,2.363587,3.596589,1.388632,1.666016,2.688572,3.596589,...,3.596589,1.506556,0.828416,1.506556,1182412B,Untreated,TMA2_3_303,Malignant,-481865.375698,7851.453408
2,1.262172,1.262172,0.683991,1.262172,2.010341,1.262172,0.736895,0.736804,0.653205,1.262172,...,1.262172,3.839008,0.379129,3.839008,1182412B,Untreated,TMA2_6_303,Malignant,-480421.375698,6738.453408
3,0.941260,0.941260,0.617541,0.941260,0.652265,0.941260,0.764453,0.736154,0.465586,0.941260,...,0.941260,1.021286,0.348070,1.021286,1182412B,Untreated,TMA2_8_303,Malignant,-480420.375698,6614.453408
4,18.998492,18.998492,5.560580,18.998492,5.922492,18.998492,1.621913,5.685172,7.343501,18.998492,...,18.998492,2.281059,2.585627,2.281059,1182412B,Untreated,TMA2_9_303,Malignant,-480494.375698,6607.453408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2139956,0.000000,0.000000,0.000000,0.000000,0.065008,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.252118,0.000000,0.252118,1185092B,Untreated,TMA2_223_302,Plasma_IgG,-470201.968263,-72967.018989
2139957,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,5.942360,0.544111,5.942360,1185092B,Untreated,TMA2_231_302,Plasma_IgG,-470619.968263,-73036.018989
2139958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.051084,0.000000,0.000000,...,0.000000,0.375187,0.054611,0.375187,1185092B,Untreated,TMA2_236_302,Plasma_IgG,-470193.968263,-73111.018989
2139959,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1185092B,Untreated,TMA2_238_302,Fib_ECM,-469892.968263,-73112.018989


In [3]:
############################################
# 1. Identify metadata + task columns
############################################

meta_cols = ["Subject_ID", "Treatment_Status", "Cell_ID", "Cell_type", "x", "y"]
task_cols = [c for c in tasks_df.columns if c not in meta_cols]

############################################
# 2. Melt tasks from wide → long format
############################################

tasks_long = tasks_df.melt(
    id_vars=meta_cols,
    value_vars=task_cols,
    var_name="Task",
    value_name="Task_score"
)

############################################
# 3. Define T-cell proximity using cKDTree
############################################

def add_T_proximity(meta_df, radius=50.0, q=0.7):
    df = meta_df.copy()
    chunks = []

    for (sid, trt), g in df.groupby(["Subject_ID", "Treatment_Status"]):
        g = g.copy()
        coords = g[["x","y"]].values

        is_T = (
            g["Cell_type"].str.contains("T", case=False) |
            g["Cell_type"].str.contains("NK", case=False)
        )

        if is_T.sum() == 0:
            g["T_neighbors"] = 0
            g["T_proximal"] = False
        else:
            tcoords = g.loc[is_T, ["x","y"]].values
            tree = cKDTree(tcoords)
            counts = tree.query_ball_point(coords, r=radius)
            g["T_neighbors"] = [len(c) for c in counts]
            thr = g["T_neighbors"].quantile(q)
            g["T_proximal"] = g["T_neighbors"] >= thr

        chunks.append(g)

    return pd.concat(chunks, ignore_index=True)


tasks_meta = tasks_df[meta_cols].drop_duplicates("Cell_ID")
tasks_meta = add_T_proximity(tasks_meta, radius=50.0, q=0.7)

tasks_long = tasks_long.merge(
    tasks_meta[["Cell_ID", "T_neighbors", "T_proximal"]],
    on="Cell_ID",
    how="left"
)

############################################
# 4. Filter for Mph_*, Fib_*, Malignant
############################################

mask_eff = (
    tasks_long["Cell_type"].str.startswith("Mph_") |
    tasks_long["Cell_type"].str.startswith("Fib_") |
    (tasks_long["Cell_type"] == "Malignant")
)

eff_df = tasks_long[mask_eff].copy()

############################################
# 5. Compute proximal vs distal + log fold-change
############################################

prox = (
    eff_df
    .groupby(["Subject_ID", "Treatment_Status", "Cell_type", "Task", "T_proximal"])
    ["Task_score"]
    .mean()
    .reset_index()
    .pivot(
        index=["Subject_ID","Treatment_Status","Cell_type","Task"],
        columns="T_proximal",
        values="Task_score"
    )
)

prox = prox.rename(columns={False:"distal", True:"proximal"}).reset_index()
prox[["proximal","distal"]] = prox[["proximal","distal"]].fillna(0.0)

# small epsilon to avoid division by zero
eps = 1e-6
prox["logFC_prox_vs_dist"] = np.log2((prox["proximal"] + eps) / (prox["distal"] + eps))

############################################
# 6. Merge with Moran I
############################################

moran_cols = [
    "Task","Subject_ID","Treatment_Status",
    "I_z","pval_norm_fdr_bh","Top-level Pathway","Detailed Pathway"
]

full_df = prox.merge(
    moran_df[moran_cols],
    on=["Task","Subject_ID","Treatment_Status"],
    how="left"
)

############################################
# 7. Add colocalization per task
############################################

coloc_long = pd.concat([
    coloc_df.rename(columns={"Task_1":"Task"})[
        ["Subject_ID","Treatment_Status","Task","Colocalization_Score"]
    ],
    coloc_df.rename(columns={"Task_2":"Task"})[
        ["Subject_ID","Treatment_Status","Task","Colocalization_Score"]
    ],
], ignore_index=True)

coloc_task = (
    coloc_long
    .groupby(["Subject_ID","Treatment_Status","Task"])
    ["Colocalization_Score"]
    .mean()
    .reset_index()
    .rename(columns={"Colocalization_Score":"Coloc_mean"})
)

full_df = full_df.merge(
    coloc_task,
    on=["Subject_ID","Treatment_Status","Task"],
    how="left"
)

############################################
# 8. Summaries: log fold-change Treated vs Untreated
############################################

summary = (
    full_df
    .groupby(["Treatment_Status","Cell_type","Task"])
    [["logFC_prox_vs_dist","I_z","Coloc_mean"]]
    .mean()
    .reset_index()
)

treated = summary[summary["Treatment_Status"]=="Treated"]
untreated = summary[summary["Treatment_Status"]=="Untreated"]

merged = treated.merge(
    untreated,
    on=["Cell_type","Task"],
    suffixes=("_Treated","_Untreated")
)

# log2 fold-change of Treated vs Untreated
merged["logFC_Treated_vs_Untreated"] = np.log2(
    (merged["logFC_prox_vs_dist_Treated"] + eps) /
    (merged["logFC_prox_vs_dist_Untreated"] + eps)
)

############################################
# Outputs
############################################

print("full_df:", full_df.shape)
print("summary:", summary.shape)
print("merged (logFC):", merged.shape)

# full_df.to_csv("full_df_logFC.csv", index=False)
# merged.to_csv("treated_vs_untreated_logFC.csv", index=False)


full_df: (45000, 12)
summary: (2800, 6)
merged (logFC): (1400, 11)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
full_df.head()

Unnamed: 0,Subject_ID,Treatment_Status,Cell_type,Task,distal,proximal,logFC_prox_vs_dist,I_z,pval_norm_fdr_bh,Top-level Pathway,Detailed Pathway,Coloc_mean
0,89397,Treated,Fib_ACTA2,(R)-3-Hydroxybutanoate synthesis,0.0,2.45749,21.228754,,,,,
1,89397,Treated,Fib_ACTA2,ATP generation from glucose (hypoxic condition...,0.0,2.45749,21.228754,,,,,
2,89397,Treated,Fib_ACTA2,ATP regeneration from glucose (normoxic condit...,0.0,0.881156,19.749039,,,,,
3,89397,Treated,Fib_ACTA2,Acetoacetate synthesis,0.0,2.45749,21.228754,,,,,
4,89397,Treated,Fib_ACTA2,Alanine degradation,0.0,1.199259,20.193714,,,,,
