In [None]:
import pandas as pd
from pathlib import Path
import re

# Folder with all the individual CellMarker CSVs you downloaded
folder = Path("../metadata/raw_markers")
print("Searching in:", folder.resolve())

# Collect the CSVs in one pass, comparing the extension case-insensitively.
# Globbing "*.csv" and "*.CSV" separately lists every file twice where glob
# matching ignores case (e.g. Windows paths) and misses mixed-case extensions
# such as ".Csv" elsewhere. Sorting makes the processing order -- and so which
# row survives the later drop_duplicates -- reproducible between runs.
# A missing folder yields an empty list so the friendlier error below fires.
files = sorted(
    p
    for p in (folder.iterdir() if folder.is_dir() else ())
    if p.is_file() and p.suffix.lower() == ".csv"
)
if not files:
    raise FileNotFoundError("No CSV files found in metadata/raw_markers. Move your downloads there or check the path.")

# CellMarker heading -> unified schema name. Several headings can map to the
# same unified name; when a file contains more than one of them, the first one
# listed here wins so the frame never ends up with duplicate column labels.
column_aliases = {
    "Cell marker": "marker_gene",
    "Cell name": "cell_name",
    "Species": "species",
    "Tissue Class": "tissue_class",
    "Tissue Clas": "tissue_class",  # truncated spelling of the heading
    "Tissue Type": "tissue_type",
    "Source": "source",
    "Reference": "reference",
    "Organ": "tissue",            # sometimes present
    "Gene Symbol": "marker_gene"  # rare variant
}


def norm_cols(cols):
    """Return header labels with BOMs removed, ends trimmed, and whitespace runs collapsed."""
    return [re.sub(r"\s+", " ", str(c).replace("\ufeff", "").strip()) for c in cols]


# One DataFrame per usable input file, each already renamed to the unified
# schema and tagged with a "cell_type" column.
dfs = []
for f in files:
    # Read with BOM-safe encoding and fall back to latin1 on decode errors.
    # A zero-byte file makes read_csv raise EmptyDataError; treat it like any
    # other empty file instead of aborting the whole run.
    try:
        try:
            df = pd.read_csv(f, encoding="utf-8-sig")
        except UnicodeDecodeError:
            df = pd.read_csv(f, encoding="latin1")
    except pd.errors.EmptyDataError:
        print(f"Skipping empty file: {f.name}")
        continue

    # Header-only files parse fine but carry no rows.
    if df.empty:
        print(f"Skipping empty file: {f.name}")
        continue

    # normalize headers (trim, collapse spaces, strip BOMs)
    df.columns = norm_cols(df.columns)

    # Build the rename for this file: only headings that are present, and
    # never onto a unified name that already exists or was already claimed.
    claimed = set(df.columns)
    rename_map = {}
    for heading, unified in column_aliases.items():
        if heading in df.columns and unified not in claimed:
            rename_map[heading] = unified
            claimed.add(unified)
    df = df.rename(columns=rename_map)

    # add cell_type from filename
    df["cell_type"] = f.stem.replace("_", " ")

    dfs.append(df)

# Combine the per-file frames, keep the unified-schema columns, tidy the text,
# de-duplicate, and write the result to ../metadata/cell_type_markers_raw.csv.
if not dfs:
    raise ValueError("All files were empty or unreadable; nothing to combine.")

combined = pd.concat(dfs, ignore_index=True)

# keep only the columns that actually exist
wanted = ["cell_type","cell_name","marker_gene","species","tissue_class","tissue_type","source","reference","tissue"]
cols_present = [c for c in wanted if c in combined.columns]
combined = combined[cols_present].copy()

# basic cleanup: cast to text and trim, but keep missing cells missing.
# A plain astype(str) would turn NaN into the literal string "nan", which
# would then be written to the CSV as if it were real data.
for c in cols_present:
    missing = combined[c].isna()
    combined[c] = combined[c].astype(str).str.strip().mask(missing)

# drop duplicate cell_type+marker pairs (the first occurrence is kept)
if {"cell_type", "marker_gene"} <= set(combined.columns):
    combined = combined.drop_duplicates(subset=["cell_type","marker_gene"])

out_path = Path("../metadata/cell_type_markers_raw.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(out_path, index=False)

print("Saved to:", out_path.resolve())
print("Preview:")
print(combined.head())


Searching in: C:\Users\mahsa\OneDrive\Documents\Fall2025\BMCS Project\istarc\metadata\raw_markers


AttributeError: 'NoneType' object has no attribute 'resolve'