In [None]:
# Python 3.10+, scanpy>=1.9, anndata>=0.9
import pandas as pd
import numpy as np
import scanpy as sc
from scipy import io
from scipy import sparse
from pathlib import Path

# --- CONFIG ---
# Output: the combined AnnData file written at the end of this script.
SAVE_H5AD = Path("/path/to/combined_xenium_pheno.h5ad")

# Column of PHENO_CSV holding the Xenium-matching cell IDs; change it to the
# exact header name used in your CSV.
PHENO_ID_COL = "cell_id"

# Inputs: the Phenocycler CSV and the Xenium matrix directory
# (the directory contains barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz).
PHENO_CSV = Path("/path/to/phenocycler_cells.csv")
XENIUM_DIR = Path("/path/to/cell_feature_matrix")

# --- Load Xenium counts ---
# Reads the three MEX files from XENIUM_DIR and assembles a cells x features AnnData.
mtx = io.mmread(XENIUM_DIR / "matrix.mtx.gz")                  # rows=features, cols=cells (10x convention)
features = pd.read_csv(
    XENIUM_DIR / "features.tsv.gz",
    sep="\t",
    header=None,
    names=["feature_id", "feature_name", "feature_type"],
)
barcodes = pd.read_csv(XENIUM_DIR / "barcodes.tsv.gz", sep="\t", header=None, names=["barcode"])

# Transpose to cells x features (AnnData convention is obs=cells, var=features).
# Transpose BEFORE converting: `csr_matrix(mtx).T` returns a CSC matrix, whereas
# the cells are subset by row further down, which is what CSR is efficient at.
X = sparse.csr_matrix(mtx.T)

# Explicit check rather than `assert`, which is stripped when Python runs with -O.
expected_shape = (barcodes.shape[0], features.shape[0])
if X.shape != expected_shape:
    raise ValueError(
        f"Shape mismatch after transpose: matrix is {X.shape}, "
        f"but barcodes/features imply {expected_shape}"
    )

adata = sc.AnnData(
    X=X,
    obs=pd.DataFrame(index=barcodes["barcode"].astype(str).values),
    var=features.set_index("feature_id"),
)
# Optionally keep a human-readable name for genes/features:
if "feature_name" in adata.var.columns:
    adata.var["symbol"] = adata.var["feature_name"].astype(str)

# --- Load Phenocycler intensities ---
# Use low_memory=False to avoid dtype warnings; index by the cell ID column
pheno = pd.read_csv(PHENO_CSV, low_memory=False)
if PHENO_ID_COL not in pheno.columns:
    raise KeyError(f"Column '{PHENO_ID_COL}' not found in {PHENO_CSV.name}. "
                   f"Available columns: {list(pheno.columns)[:10]}...")

# Rows without an ID can never be matched to a barcode. Left in place they
# would become the literal string "nan" and, for integer IDs, force the whole
# column to float (see below), so drop them up front.
missing_id = pheno[PHENO_ID_COL].isna()
if missing_id.any():
    print(f"Dropping {int(missing_id.sum())} rows of {PHENO_CSV.name} with a missing '{PHENO_ID_COL}'.")
    pheno = pheno.loc[~missing_id].copy()

# Integer IDs are parsed as float when the column contains blanks (or is written
# as "12.0"); str() would then give "12.0", which never equals the barcode "12".
# Whole-number floats are therefore cast back to int before stringifying.
cell_ids = pheno[PHENO_ID_COL]
if pd.api.types.is_float_dtype(cell_ids) and (cell_ids % 1 == 0).all():
    cell_ids = cell_ids.astype("int64")
pheno[PHENO_ID_COL] = cell_ids.astype(str)
pheno = pheno.set_index(PHENO_ID_COL)

# Optional: drop obviously non-intensity columns to keep memory lean
# (Adjust this filter to your schema.)
non_intensity_like = {"x", "y", "z", "X", "Y", "Z", "fov", "FOV", "tile", "Tile", "region", "Region"}
candidate_numeric = pheno.select_dtypes(include=[np.number])
# Keep numeric columns that are neither in the exclusion set nor named like an
# identifier. The case-insensitive "id" suffix test also covers "_id", and it
# excludes names such as "centroid" as well.
intensity_cols = [
    c
    for c in candidate_numeric.columns
    if c not in non_intensity_like and not c.lower().endswith("id")
]

# --- Align on shared cells ---
# Cell IDs present both in the Xenium barcodes and in the Phenocycler index.
shared = adata.obs_names.intersection(pheno.index)
if shared.empty:
    raise ValueError("No shared cell IDs between Xenium barcodes and Phenocycler CSV. "
                     "Verify that the ID systems match (e.g., exact strings, prefixes, FOV/cell formats).")

# A cell ID that occurs more than once in the Phenocycler table would make
# `pheno.loc[shared]` (and the join below) return extra rows, so the result no
# longer lines up one-to-one with the AnnData cells. Fail early with a message
# that names the offending IDs; only IDs that are actually shared matter here.
duplicated_shared = pheno.index.isin(shared) & pheno.index.duplicated(keep=False)
if duplicated_shared.any():
    examples = pheno.index[duplicated_shared].unique()[:5].tolist()
    raise ValueError(
        f"Phenocycler CSV has duplicate cell IDs among the shared cells (e.g. {examples}). "
        "De-duplicate or aggregate those rows before joining."
    )

# Subset AnnData to shared cells (keeps counts sparse)
adata = adata[shared, :].copy()

# Join intensities into .obs (left join on obs index)
adata.obs = adata.obs.join(pheno.loc[shared, intensity_cols], how="left")

# (Optional) preserve full Phenocycler row in obsm if you want everything:
# adata.obsm["phenocycler_all"] = pheno.loc[shared].copy()

# --- Basic QC checks ---
# NaN fraction per intensity column first, then the average over those columns.
per_column_missing = adata.obs[intensity_cols].isna().mean()
missing_frac = per_column_missing.mean()
print(f"Joined {len(shared)} cells. Mean fraction of missing intensity values: {missing_frac:.4f}")

# --- Save ---
# Write the combined object to SAVE_H5AD with lzf compression, then report the path.
adata.write_h5ad(SAVE_H5AD, compression="lzf")
print(f"Saved: {SAVE_H5AD}")
