In [None]:
import pandas as pd
import re
from pathlib import Path
import spatialdata as sd

# Open the Visium SpatialData store and take the first table it contains.
sdata = sd.read_zarr("../data/mouse_brain_visium_hd.sdata.zarr")
first_table_key = list(sdata.tables.keys())[0]
adata = sdata.tables[first_table_key]
# A set gives fast membership tests when filtering marker genes later.
visium_genes = set(adata.var_names)

# Read the CellMarker2 mouse brain marker table from CSV.
cm_path = Path("../data/cell_markers/callmarkers2_mouse_brain.csv")
cm = pd.read_csv(cm_path)

# Normalize column names
def norm_cols(cols):
    """Return a list of cleaned-up column labels.

    Each label is converted to ``str``, has any U+FEFF (byte-order-mark)
    characters removed, is stripped of leading and trailing whitespace, and
    has every internal run of whitespace collapsed to a single space.
    """
    trimmed = (str(label).replace("\ufeff", "").strip() for label in cols)
    return [re.sub(r"\s+", " ", text) for text in trimmed]

# Tidy the raw header labels before matching on them.
cm.columns = norm_cols(cm.columns)

# Source header -> column name used in the rest of the project.
cm_rename = {
    "Cell name": "cell_type",
    "Cell marker": "marker_gene",
    "Species": "species",
    "Tissue Class": "tissue_class",
    "Tissue Type": "tissue_type",
    "Cancer": "cancer",
    "Source": "source_raw",
}
# Only headers that actually occur in this CSV are passed to rename().
cm = cm.rename(
    columns={
        old_name: new_name
        for old_name, new_name in cm_rename.items()
        if old_name in cm.columns
    }
)


# Keep only marker rows whose gene is present in the Visium gene set.
cm["marker_gene"] = cm["marker_gene"].astype(str).str.strip()
cm = cm.loc[cm["marker_gene"].isin(visium_genes)].copy()

# Constant metadata columns wanted downstream.
cm = cm.assign(database="CellMarker2", organ="brain")

# Keep only columns relevant to the project.
# The explicit .copy() makes cm_markers an independent DataFrame; without it
# the column assignments below write to a slice of `cm` and pandas emits
# SettingWithCopyWarning.
cm_markers = cm[[
    "cell_type",
    "marker_gene",
    "species",
    "organ",
    "tissue_class",
    "tissue_type",
    "cancer",
    "database",
]].copy()

# Clean strings: cast to str and trim surrounding whitespace.
# NOTE(review): astype(str) turns missing values into the literal text "nan";
# confirm whether rows with a missing cell_type should be dropped instead.
for col in ["cell_type", "marker_gene"]:
    cm_markers[col] = cm_markers[col].astype(str).str.strip()

# Drop repeated (cell_type, marker_gene, database) combinations, keeping the
# first occurrence of each.
cm_markers = cm_markers.drop_duplicates(
    subset=["cell_type", "marker_gene", "database"]
)

# Write the normalized marker table to CSV, creating the folder if needed.
out_path = Path("../data/cell_markers/normalized_cellmarkers2.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
cm_markers.to_csv(out_path, index=False)

# Short summary of what was written.
print("\nCellMarker2 marker file ready")
print("  Rows:", cm_markers.shape[0])
print("  Unique cell types:", cm_markers["cell_type"].nunique())
print("  Unique genes:", cm_markers["marker_gene"].nunique())



CellMarker2 marker file ready
  Rows: 3113
  Unique cell types: 40
  Unique genes: 2725


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cm_markers[col] = cm_markers[col].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cm_markers[col] = cm_markers[col].astype(str).str.strip()
