In [None]:
import warnings
# Suppress all warnings (every category, every module) for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import scanpy as sc
import numpy as np

import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, balanced_accuracy_score, roc_auc_score, average_precision_score, matthews_corrcoef, cohen_kappa_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [None]:
# Absolute path of the current working directory; input files below are
# addressed relative to its parent directory.
base_dir = Path().resolve()
parent_dir = base_dir.parent
parent_dir

## Read flux and TMA preprocessed Data

### Read Metabolic Task data

In [None]:
# Load the metabolic-task table and relabel the 'Unnamed: 0' column as "cell".
metabolic_task_path = parent_dir / 'data/scCellFie/scCellFie_metabolic_tasks_with_name.csv'
metabolic_task_df = pd.read_csv(metabolic_task_path, low_memory=False)
metabolic_task_df = metabolic_task_df.rename(columns={'Unnamed: 0': 'cell'})
metabolic_task_df.head()

In [None]:
# Count infinite entries in every column except the first one.
metabolic_task_values = metabolic_task_df.iloc[:, 1:].to_numpy()
print("count of +inf:", np.isposinf(metabolic_task_values).sum())
print("count of -inf:", np.isneginf(metabolic_task_values).sum())

In [None]:
# Replace negative infinity with 0 across the frame; positive infinity is left unchanged.
metabolic_task_df = metabolic_task_df.replace(-np.inf, 0)

### Read Reactions data

In [None]:
# Load the reactions table and relabel the 'Unnamed: 0' column as "cell".
reactions_path = parent_dir / 'data/scCellFie/scCellFie_reactions_with_name.csv'
reactions_df = pd.read_csv(reactions_path, low_memory=False)
reactions_df = reactions_df.rename(columns={'Unnamed: 0': 'cell'})
reactions_df.head()

In [None]:
# Count infinite entries in every column except the first one.
reactions_values = reactions_df.iloc[:, 1:].to_numpy()
print("count of +inf:", np.isposinf(reactions_values).sum())
print("count of -inf:", np.isneginf(reactions_values).sum())

In [None]:
# Replace negative infinity with 0 across the frame; positive infinity is left unchanged.
reactions_df = reactions_df.replace(-np.inf, 0)

### Read selected gene expression data

In [None]:
# Load the gene-expression table and relabel the 'Unnamed: 0' column as "cell".
gene_path = parent_dir / 'data/scCellFie/scCellFie_genes_with_name.csv'
gene_df = pd.read_csv(gene_path, low_memory=False)
gene_df = gene_df.rename(columns={'Unnamed: 0': 'cell'})
gene_df.head()

In [None]:
# Count infinite entries in every column except the first one.
gene_values = gene_df.iloc[:, 1:].to_numpy()
print("count of +inf:", np.isposinf(gene_values).sum())
print("count of -inf:", np.isneginf(gene_values).sum())

### Read adata data

In [None]:
# Load the AnnData object; its obs table and obsm arrays ('X_umap', 'spatial') are used below.
adata = sc.read(parent_dir / 'data/h5ad/merged_TMA_processed_compatible_n_gene_200.h5ad')
adata

In [None]:
# Build one frame of per-cell coordinates (UMAP embedding + spatial position),
# keyed by a "cell" column so it can be merged with the other per-cell tables.

# Extract UMAP
umap_df = pd.DataFrame(
    adata.obsm['X_umap'],
    index=adata.obs.index,
    columns=['UMAP1', 'UMAP2']
)

# Extract spatial
spatial_df = pd.DataFrame(
    adata.obsm['spatial'],
    index=adata.obs.index,
    columns=['X_spatial', 'Y_spatial']
)

# Concatenate along columns
coords_df = pd.concat([umap_df, spatial_df], axis=1)
# Name the index "cell" before resetting it. reset_index() names the new column
# after the index (falling back to "index" only when the index is unnamed), so
# renaming an "index" column afterwards would silently do nothing for a named index.
coords_df = coords_df.rename_axis("cell").reset_index()
coords_df

In [None]:
# Per-cell annotations needed downstream, with the cell ID exposed as a "cell" column.
cols = ["sample", "leiden", "cell_type", "Treatment_Status", "Subject_ID"]
# Column selection already returns a new frame, so no separate .copy() is needed.
# rename_axis makes the key column "cell" whether or not the obs index is named.
obs_df = adata.obs[cols].rename_axis("cell").reset_index()
obs_df

In [None]:
# Attach coordinates to the per-cell annotations; only cells present in both frames are kept.
obs_coords_df = pd.merge(obs_df, coords_df, on="cell", how="inner")
obs_coords_df

### Merge datasets

In [None]:
# Inner-join every per-cell table on "cell", starting from the metabolic tasks.
dataset = metabolic_task_df
for per_cell_table in (obs_coords_df, reactions_df, gene_df):
    dataset = dataset.merge(per_cell_table, on="cell", how="inner")

# Alternative feature sets, kept for reference:
# dataset = gene_df.merge(obs_coords_df, on="cell", how="inner") \
#                .merge(reactions_df, on="cell", how="inner")
# dataset = metabolic_task_df.merge(obs_coords_df, on="cell", how="inner")

# Binary target: Untreated -> 0, Treated -> 1.
treatment_codes = {"Untreated": 0, "Treated": 1}
dataset["treatment_encoded"] = dataset["Treatment_Status"].map(treatment_codes)
dataset

### Distribution of Cell count and status per Subject ID

In [None]:
# Rows per (Subject_ID, Treatment_Status) pair, pivoted so each treatment status
# becomes a column; missing combinations are filled with 0.
status_counts = dataset.groupby(["Subject_ID", "Treatment_Status"]).size().unstack(fill_value=0)
status_counts

In [None]:
# Annotated heatmap of the per-subject treatment-status counts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(status_counts, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_title("Treatment Status Counts per Subject")
ax.set_ylabel("Subject ID")
ax.set_xlabel("Treatment Status")
plt.show()


In [None]:
# Number of distinct cell IDs per subject.
cell_counts = dataset.groupby("Subject_ID")["cell"].nunique()
cell_counts

In [None]:
# Bar chart of unique cells per subject, with the count printed on each bar.
ax = cell_counts.plot.bar(figsize=(8, 5), color="skyblue")

for bars in ax.containers:
    ax.bar_label(bars)

ax.set_title("Unique Cell Counts per Subject")
ax.set_xlabel("Subject ID")
ax.set_ylabel("Number of Unique Cells")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.figure.tight_layout()
plt.show()


In [None]:
# Per-subject status counts with the unique-cell count appended as "num_cells".
summary = status_counts.join(cell_counts.rename("num_cells"))
summary

In [None]:
# Annotated heatmap of the combined per-subject summary table.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(summary, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_title("Treatment Status and Cell Counts per Subject")
ax.set_ylabel("Subject ID")
ax.set_xlabel("Treatment Status")
plt.show()

### Class Balance Analysis

In [None]:
# Number of cells (rows) per treatment status.
dataset["Treatment_Status"].value_counts()

In [None]:
# Bar chart of the cell-level class balance.
fig, ax = plt.subplots(figsize=(5, 4))
label_counts = dataset["Treatment_Status"].value_counts()
label_counts.plot(kind="bar", color=["skyblue", "salmon"], ax=ax)
ax.set_title("Label Distribution (Treated vs UnTreated)")
ax.set_ylabel("Count")
ax.set_xlabel("Class")
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

# Classification Problem (Treated vs Untreated Cells)

## Cell Based Classification

### Dataset Definition

In [None]:
# One-hot encode cell_type. This rebinds `dataset`, so the original "cell_type"
# column no longer exists after this cell has run.
dataset = pd.get_dummies(dataset, columns=["cell_type"])
dataset

In [None]:
# Candidate columns to exclude from the features:
#   "Treatment_Status", "Subject_ID", "treatment_encoded", "sample", "cell",
#   "leiden", "X_spatial", "Y_spatial", "UMAP1", "UMAP2", "cell_type"
# Only the five columns listed in the drop below are removed; every other column
# of `dataset` is kept as a model feature.
X = dataset.drop(columns=["Treatment_Status","Subject_ID","treatment_encoded","sample","cell"])
y = dataset["treatment_encoded"]
feature_names = X.columns.tolist()

In [None]:
# Split at the patient level so that no subject contributes cells to both sets.
#
# Patients and their stratification labels are taken from the SAME grouped
# Series so they are aligned element by element. unique() returns subjects in
# order of appearance, whereas groupby() returns them sorted (and, for
# categorical keys, may add unobserved levels), so mixing the two would pair
# subjects with the wrong labels or produce arrays of different length.
subject_status = dataset.groupby("Subject_ID", observed=True)["Treatment_Status"].first()
patients = subject_status.index.to_numpy()

train_patients, test_patients = train_test_split(
    patients,
    test_size=0.2,
    random_state=42,
    stratify=subject_status.to_numpy()
)

In [None]:
# Row masks selecting the cells of the train / test patients.
train_mask = dataset["Subject_ID"].isin(train_patients)
test_mask = dataset["Subject_ID"].isin(test_patients)

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

### Feature Scaling

In [None]:
# Standardization Dataset
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_names,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_names,
    index=X_test.index
)

### Model Training

In [None]:
# Train Randomforest Classifier
rf = RandomForestClassifier(
   n_estimators=300, 
   random_state=42, 
   max_depth=8, 
   min_samples_split=20, 
   min_samples_leaf=10,
   class_weight="balanced",
   n_jobs=-1
)

rf.fit(X_train_scaled, y_train)

y_pred_rf = rf.predict(X_test_scaled)

train_acc = rf.score(X_train_scaled, y_train)
test_acc = rf.score(X_test_scaled, y_test)
print(f"Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}")


In [None]:
# Train LightGBM Classifier
# Weight the positive class by the negative/positive ratio of the training labels.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

clf = lgb.LGBMClassifier(
    n_estimators=500,
    max_depth=-1,
    learning_rate=0.05,
    subsample=0.8,
    # LightGBM applies row subsampling only when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above would be silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)

train_acc = clf.score(X_train_scaled, y_train)
test_acc = clf.score(X_test_scaled, y_test)
print(f"Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}")

### Evaluation

In [None]:
# Evaluate the hard-label predictions in y_pred against the test labels.
# When the notebook is run top to bottom, y_pred holds the LightGBM predictions;
# the random-forest predictions are stored in y_pred_rf and are not scored here.
print(classification_report(y_pred=y_pred, y_true=y_test))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap="Blues")
plt.show()

print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

In [None]:
# ROC-AUC and average precision are ranking metrics: they must be computed from
# continuous scores. Feeding them hard 0/1 predictions collapses the curve to a
# single operating point and misreports both values.
y_score = clf.predict_proba(X_test_scaled)[:, 1]  # probability of class 1 (Treated)
print(f"ROC-AUC: {roc_auc_score(y_test, y_score):.3f}")
print(f"PR-AUC (Average Precision): {average_precision_score(y_test, y_score):.3f}")
# MCC and Cohen's kappa are defined on hard labels.
print(f"Matthews Correlation Coefficient (MCC): {matthews_corrcoef(y_test, y_pred):.3f}")
print(f"Cohen’s Kappa: {cohen_kappa_score(y_test, y_pred):.3f}")

## Patient Based Classification

### Dataset Definition

In [None]:
# Prepare `dataset` for the patient-level analysis.
# cell_type may already have been one-hot encoded by an earlier cell; encoding it
# again would raise a KeyError because the column no longer exists.
if "cell_type" in dataset.columns:
    dataset = pd.get_dummies(dataset, columns=["cell_type"])
# Index rows by cell ID; guarded (and not in place) so the cell can be re-run.
if "cell" in dataset.columns:
    dataset = dataset.set_index("cell")
dataset

In [None]:
# One treatment label per subject. observed=True keeps categorical Subject_ID
# levels that have no cells out of the result; they would otherwise appear with
# a NaN label and break the stratified split.
df_subjects = adata.obs.groupby("Subject_ID", observed=True)["Treatment_Status"].first().reset_index()

train_subj, test_subj = train_test_split(df_subjects,test_size=0.2,random_state=42,stratify=df_subjects["Treatment_Status"])

print("Train subjects:", train_subj["Treatment_Status"].value_counts().to_dict())
print("Test subjects:", test_subj["Treatment_Status"].value_counts().to_dict())

# Keep only cells that are present in `dataset` (assumed to be indexed by cell ID
# at this point). Cells dropped by the inner merges would otherwise raise a
# KeyError when `dataset` is selected by label with these indices.
in_dataset = adata.obs.index.isin(dataset.index)
train_mask = adata.obs["Subject_ID"].isin(train_subj["Subject_ID"]).to_numpy() & in_dataset
test_mask = adata.obs["Subject_ID"].isin(test_subj["Subject_ID"]).to_numpy() & in_dataset
train_idx = adata.obs.index[train_mask]
test_idx = adata.obs.index[test_mask]

print("Train cells:", adata.obs.loc[train_idx, "Treatment_Status"].value_counts().to_dict())
print("Test cells:", adata.obs.loc[test_idx, "Treatment_Status"].value_counts().to_dict())

In [None]:
# Labels and features for the patient-level split.
# The label/identifier columns are dropped into a NEW frame instead of in place,
# so `dataset` keeps its columns and this cell gives the same result when re-run.
label_columns = ["Treatment_Status", "Subject_ID", "treatment_encoded", "sample"]
y_train = dataset.loc[train_idx, "treatment_encoded"]
y_test = dataset.loc[test_idx, "treatment_encoded"]
features = dataset.drop(columns=label_columns)
X_train, X_test = features.loc[train_idx], features.loc[test_idx]
feature_names = X_train.columns.tolist()
print("Train subjects:", train_subj.shape[0], "Test subjects:", test_subj.shape[0])
print("Train cells:", X_train.shape[0], "Test cells:", X_test.shape[0])

### Feature Scaling

In [None]:
# Standardization Dataset
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_names,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_names,
    index=X_test.index
)

### Model Training

In [None]:
# Balanced class weights computed from the training labels, keyed by class label.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights_dict = {label: weight for label, weight in zip(classes, class_weights)}
print(class_weights_dict)

# Random forest trained on the scaled training cells with those explicit weights.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight=class_weights_dict,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

y_pred = rf.predict(X_test_scaled)

### Evaluation

In [None]:
# Per-cell table for the test set: owning subject, true label, predicted label.
# "Subject_ID" and "true" are Series selected with test_idx; "pred" is the plain
# array returned by rf.predict, so it is matched to the rows by position.
test_subject_ids = adata.obs.loc[test_idx, "Subject_ID"]
cell_results = pd.DataFrame({
    "Subject_ID": test_subject_ids,
    "true": y_test,
    "pred": y_pred
})
cell_results

In [None]:
# Majority vote per subject: count the predicted labels within each subject, then
# keep the label with the largest count. idxmax returns a (Subject_ID, pred)
# tuple for each subject, hence the x[1].
counts = cell_results.groupby("Subject_ID")["pred"].value_counts()
patient_preds = counts.groupby(level=0).idxmax().map(lambda x: x[1])
patient_preds

In [None]:
# True label per subject (first value within each subject); subjects without a label are dropped.
patient_true = cell_results.groupby("Subject_ID")["true"].first().dropna()

In [None]:
# Join true and predicted patient labels on Subject_ID (inner join: only subjects
# present in both are kept), then give the two columns fixed names.
final_evaluation_df = pd.merge(patient_true,patient_preds,on='Subject_ID',how='inner')
final_evaluation_df.columns=['true','pred']
final_evaluation_df

In [None]:
# Patient-level evaluation of the majority-vote predictions.
patient_labels = final_evaluation_df['true']
patient_votes = final_evaluation_df['pred']

print("\nClassification report:\n", classification_report(patient_labels, patient_votes))

cm = confusion_matrix(patient_labels, patient_votes)
print("Confusion Matrix:\n", cm)
# Display
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap="Blues")
plt.show()


print("Balanced Accuracy:", balanced_accuracy_score(patient_labels, patient_votes))

## SHAP Analysis

In [None]:
# Explain the random forest `rf`: when the notebook is run top to bottom it is the
# model fitted on the same scaled patient-level split that produced X_test_scaled,
# and the cells below index SHAP values per class as shap_values[:, :, c].
# (`clf` is the LightGBM model from the cell-level section, fitted with a
# different split and scaler.)
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test_scaled)
# Some SHAP releases return one (samples, features) array per class as a list;
# stack them so shap_values is always a (samples, features, classes) array.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)

In [None]:
# Treated Class
# The beeswarm summary needs a 2-D (samples, features) matrix. If the explainer
# returned per-class values (a list of arrays or a 3-D array), plot the slice
# for class 1; a 2-D result is plotted unchanged.
if isinstance(shap_values, list):
    treated_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    treated_shap = shap_values[:, :, 1]
else:
    treated_shap = shap_values
shap.summary_plot(treated_shap, X_test_scaled, feature_names=feature_names)

In [None]:
# Plot SHAP for Class 1 of Random Forest
# Indexing with [:, :, 1] requires shap_values to be a 3-D array whose last axis
# enumerates the classes.
sv_class1 = shap_values[:, :, 1]
shap.summary_plot(sv_class1, features=X_test_scaled, feature_names=feature_names)

In [None]:
# Plot one SHAP summary per class, side by side.
n_classes = shap_values.shape[2]
fig, axes = plt.subplots(1, n_classes, figsize=(12, 6))
axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when there is one panel

for c in range(n_classes):
    # Select the target panel BEFORE drawing: summary_plot draws on the current
    # axes, so activating the panel afterwards puts each plot on the wrong
    # panel and pairs it with the wrong title.
    plt.sca(axes[c])
    shap.summary_plot(
        shap_values[:, :, c],
        features=X_test_scaled,
        feature_names=feature_names,
        show=False,
        plot_size=None
    )
    axes[c].set_title(f"Class {c}")

plt.tight_layout()
plt.show()

In [None]:
# shap diff plot
# Element-wise difference between the class-1 and class-0 SHAP values
# (requires the 3-D per-class layout of shap_values).
shap_diff = shap_values[:, :, 1] - shap_values[:, :, 0]
shap.summary_plot(
    shap_diff, 
    features=X_test_scaled, 
    feature_names=feature_names
)

In [None]:
# Feature importance via mean(|SHAP|)
# Average the absolute class-1 SHAP values over samples (axis 0), one value per
# feature, then rank the features from most to least important.
mean_abs_shap = np.abs(shap_values[:,:,1]).mean(axis=0)
fi = pd.Series(mean_abs_shap, index=feature_names).sort_values(ascending=False)
print("\nTop features by mean(|SHAP|):\n", fi.head(10))

In [None]:
# Bar chart of the ten most important features for each class.
n_classes = shap_values.shape[2]
# Titles follow the label encoding used for the target (Untreated -> 0,
# Treated -> 1). Previously every panel was titled "Treated Class", including
# the plot for class 0.
class_titles = {0: "Untreated Class", 1: "Treated Class"}

for c in range(n_classes):
    mean_abs_shap = np.abs(shap_values[:, :, c]).mean(axis=0)
    fi = pd.Series(mean_abs_shap, index=feature_names).sort_values(ascending=False)

    plt.figure(figsize=(8, 6))
    fi.head(10).plot(kind='barh')
    plt.xlabel("Mean(|SHAP value|)")
    plt.ylabel("Features")
    plt.title(f"Top 10 Features by SHAP ({class_titles.get(c, f'Class {c}')})")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.show()

# Modern Pipeline

### Preprocessing

In [None]:
# Columns routed to the numeric branch of the preprocessor: every column of X.
# The commented lines are alternative selections that are currently disabled.
# num_features = [col for col in X.columns if col.startswith("M")]
num_features = X.columns
# cat_features = ["Subject_ID"]

In [None]:
# Numeric branch: standardize each feature.
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encode, ignoring categories unseen during fit.
# It is defined here but not used by the ColumnTransformer below (its entry is commented out).
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Only the numeric branch is active; it is applied to the num_features columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        # ("cat", categorical_transformer, cat_features)
    ])

### Define Classifier Pipeline with Preprocessor

In [None]:
# End-to-end pipeline: preprocessing followed by a random-forest classifier.
# This rebinds `clf`, which earlier in the notebook referred to the LightGBM model.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=42, max_depth=8, min_samples_split=20, min_samples_leaf=10))
    # ("classifier", LogisticRegression(max_iter=1000))
])

In [None]:
# Fit the preprocessing and classifier steps on the unscaled training features.
clf.fit(X_train, y_train)

### Processed X data

In [None]:
# Materialize the preprocessed training features as a labelled frame.
Xt = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
# Xt_df = pd.DataFrame(Xt.toarray(), columns=list(feature_names))
# Keep X_train's index: without it the frame gets a fresh 0..n-1 index that
# shares no labels with y_train, so index-aligned operations such as
# Xt_df.corrwith(y_train) would return NaN for every feature.
Xt_df = pd.DataFrame(Xt, columns=list(feature_names), index=X_train.index)
Xt_df

In [None]:
# for col in [c for c in Xt_df.columns if c.startswith("num")]:
#     sns.boxplot(x=y_train, y=col, data=Xt_df)
#     plt.title(f"{col} by Treatment")
#     plt.show()

### Feature Importance

In [None]:
# Mutual Information (non-linear relationships)
mi = mutual_info_classif(Xt_df, y_train, discrete_features='auto')
mi_series = pd.Series(mi, index=Xt_df.columns).sort_values(ascending=False)
mi_series

In [None]:
# Feature importance from a model
importances = pd.Series(clf['classifier'].feature_importances_, index=Xt_df.columns).sort_values(ascending=False)
importances

In [None]:
# Pearson / Point-biserial correlation (continuous features vs binary target)
corr = Xt_df.corrwith(y_train)
corr.sort_values()

### Model Evaluation

In [None]:
# Predict on the held-out cells with the full pipeline and show the report.
y_pred = clf.predict(X_test)
# classification_report returns a multi-line string; print it so it renders as a
# table instead of a quoted repr with escaped newlines.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the pipeline predictions, printed and plotted.
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:\n", cm)

# Display
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap="Blues")
plt.show()