In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from scipy import io, sparse

from pathlib import Path

In [5]:
# Absolute path of the current working directory; later paths are built from it.
base_dir = Path().resolve()
base_dir

PosixPath('/mnt/ldata/Sadegh/my_codes/CosMX_Nanostring_Analysis')

# Read the Converted R Data and Build AnnData Objects for All Tissue Microarrays (TMAs)

In [None]:
# Folder whose sub-directories are scanned for per-sample files by the loading cell.
data_dir = base_dir / 'converted_files_to_python_readable'
data_dir

In [None]:
def _load_sample(sample_dir):
    """Build one AnnData object from a single converted sample folder.

    Files read from ``sample_dir``:

    * ``counts.mtx`` (required) - Matrix Market file; transposed on load and
      stored as CSR, which this notebook treats as cells x genes.
    * ``genes.csv`` (optional) - its ``x`` column is renamed to ``gene``.
    * ``meta_data.csv`` (optional) - per-cell metadata, first column is the index.
    * ``spatial_coords.csv`` (optional) - needs ``cell``, ``x`` and ``y`` columns.

    Returns
    -------
    The AnnData object, or ``None`` (after printing a notice) when
    ``counts.mtx`` is missing.

    Raises
    ------
    KeyError
        If spatial coordinates are present but do not cover every cell.
    """
    mtx_file = sample_dir / "counts.mtx"
    if not mtx_file.exists():
        print(f"Skipping {sample_dir}, counts.mtx not found")
        return None

    print(f"Reading {mtx_file}")
    # Read MTX and convert to CSR (cells x genes)
    X = io.mmread(mtx_file).T.tocsr()

    # Gene table. The gene names are also used as the index so that var_names
    # are real gene identifiers: sc.concat aligns variables by var_names, and
    # a default 0..n-1 index would align samples by position instead of by gene.
    var = None
    genes_file = sample_dir / "genes.csv"
    if genes_file.exists():
        var = pd.read_csv(genes_file).rename(columns={"x": "gene"})
        if "gene" in var.columns:
            # Unnamed index, so it never clashes with the retained "gene" column.
            var.index = pd.Index(var["gene"].astype(str).to_numpy())

    # Cell metadata
    obs = None
    obs_file = sample_dir / "meta_data.csv"
    if obs_file.exists():
        obs = pd.read_csv(obs_file, index_col=0, low_memory=False)

    # Spatial coordinates: one (x, y) row per cell, averaging rows that share a cell ID.
    spatial = None
    spatial_file = sample_dir / "spatial_coords.csv"
    if spatial_file.exists():
        spatial = pd.read_csv(spatial_file, index_col=0)
        spatial = spatial.groupby(spatial["cell"].astype(str))[["x", "y"]].mean()

    adata = sc.AnnData(X=X, obs=obs, var=var)
    # No-op for unique names; keeps the later outer join valid if a gene repeats.
    adata.var_names_make_unique()

    if spatial is not None:
        missing = adata.obs_names.difference(spatial.index)
        if len(missing) > 0:
            raise KeyError(
                f"{spatial_file}: no coordinates for {len(missing)} cell(s), "
                f"e.g. {missing[0]!r}"
            )
        # Reorder coordinates to match the cell order of the expression matrix.
        adata.obsm["spatial"] = spatial.loc[adata.obs_names].to_numpy()

    return adata


# One AnnData per sample folder; `keys` holds the matching folder names.
adatas = []
keys = []
for subdir in sorted(data_dir.iterdir()):
    if not subdir.is_dir():
        continue
    sample_adata = _load_sample(subdir)
    if sample_adata is None:
        continue
    adatas.append(sample_adata)
    keys.append(subdir.name)
        

In [None]:
# Display the list of per-sample AnnData objects.
adatas

In [None]:
# Stack the per-sample objects into a single AnnData. Each cell is labelled
# with its source key in obs["sample"], and index_unique="_" is used to keep
# cell IDs distinct across samples.
all_TMA = sc.concat(
    adatas,
    join="outer",
    merge="first",
    label="sample",
    keys=keys,
    index_unique="_",
)
all_TMA

In [None]:
# Report the merged dimensions, then save the merged object (relative to the working directory).
print(f"Merged shape: {all_TMA.shape}")
all_TMA.write("merged_TMA.h5ad")

# Read the Merged TMA Dataset and Process It

In [4]:
# Record the scanpy version used for this run.
sc.__version__

'1.11.4'

In [6]:
# Load the merged dataset from merged_TMA.h5ad (relative to the working directory).
adata = sc.read("merged_TMA.h5ad")
adata

AnnData object with n_obs × n_vars = 2873892 × 1011
    obs: 'nCount_Nanostring', 'nFeature_Nanostring', 'cell_ID', 'fov', 'Area', 'AspectRatio', 'Width', 'Height', 'Mean.PanCK', 'Max.PanCK', 'Mean.CD68', 'Max.CD68', 'Mean.CD298_B2M', 'Max.CD298_B2M', 'Mean.CD45', 'Max.CD45', 'Mean.DAPI', 'Max.DAPI', 'id', 'TMA', 'Subject_ID', 'Treatment_Status', 'Run_Tissue_name', 'log10totalcounts', 'tissue', 'sample'
    var: 'gene'
    obsm: 'spatial'

In [10]:
# Export the var table to CSV without a header row and without the index.
adata.var.to_csv('gene_name.csv', header=False, index=False)

In [None]:
# Measure how sparse the expression matrix is.
# NOTE: .nnz requires adata.X to be a scipy sparse matrix.
n_rows, n_cols = adata.shape
total_elements = n_rows * n_cols
nonzero = adata.X.nnz
zero = total_elements - nonzero
sparsity = zero / total_elements * 100

report_lines = [
    f"🔢 Total elements     : {total_elements:,}",
    f"✅ Non-zero elements : {nonzero:,}",
    f"⭕️ Zero elements     : {zero:,}",
    f"📉 Sparsity          : {sparsity:.2f}% zeros",
]
print("\n".join(report_lines))


### QC on anndata

In [None]:
# Compute QC metrics and store them on `adata` itself (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)

### Filtering on cells and genes

In [None]:
# Violin plot of n_genes_by_counts across cells; presumably used to choose the filtering cut-off.
sc.pl.violin(adata, ['n_genes_by_counts'], jitter=0.4)


In [None]:
# Drop cells with fewer than 200 detected genes, then genes seen in fewer than
# 3 cells, printing the shape before and after. Both calls modify `adata`.
# NOTE(review): the merged object has 1011 genes, so min_genes=200 may remove
# a large share of cells - confirm the threshold against the violin plot.
print("Before filtering:", adata.shape)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
print("After filtering:", adata.shape)

### Normalization and Log Transformation

In [None]:
# Normalize every cell to a total of 10,000 counts, then apply log1p.
# NOTE(review): adata.X is overwritten; no copy of the raw counts is kept in
# this notebook - confirm that is intended.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Finding Highly Variable Genes

In [None]:
# sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)
# adata = adata[:, adata.var["highly_variable"]]
# adata

### Data Standardization (Scaling)

In [None]:
# Scale the data with values capped at max_value=10.
# NOTE(review): scaling presumably produces a dense matrix - confirm there is
# enough memory for the full cells x genes array.
sc.pp.scale(adata, max_value=10)

### Apply PCA - Dimensional Reduction

In [None]:
# PCA using the ARPACK solver; the number of components is left at scanpy's default.
sc.tl.pca(adata, svd_solver="arpack")

### Neighborhood graph 

In [None]:
# Neighbourhood graph with 10 neighbours per cell, built from 40 principal components.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

### Apply UMAP - Dimensional Reduction

In [None]:
# Compute the UMAP embedding with default settings.
sc.tl.umap(adata)

### Clustering by Leiden method

In [None]:
# Leiden clustering at resolution 0.5; later cells read the labels from adata.obs["leiden"].
sc.tl.leiden(adata, resolution=0.5)

In [None]:
# Inspect the annotated object after processing.
adata

In [None]:
# adata.obsm['X_umap']

### Save Processed Dataset

In [None]:
# Persist the processed object. The next section reloads this exact file, so
# leaving the write disabled breaks a top-to-bottom run on a clean checkout.
adata.write("merged_TMA_processed.h5ad")

# Read the Processed TMA Dataset, Export It as CSV, and Visualize

In [None]:
# Load the processed dataset; expects merged_TMA_processed.h5ad in the working directory.
adata = sc.read('merged_TMA_processed.h5ad')

In [None]:
# Cells x genes table of the values stored in adata.X, with gene names as columns.
# NOTE(review): presumably these are processed (normalized/scaled) values, not
# raw counts, despite the variable name - confirm.
expression = adata.X
gene_names = adata.var.gene
if sparse.issparse(expression):
    # pd.DataFrame() cannot wrap a scipy sparse matrix; keep it sparse-backed.
    counts_df = pd.DataFrame.sparse.from_spmatrix(
        expression,
        index=adata.obs_names,
        columns=gene_names,
    )
else:
    counts_df = pd.DataFrame(
        expression,
        index=adata.obs_names,
        columns=gene_names,
    )

# Preview genes as rows for the first few cells only, instead of transposing
# the whole table.
counts_df.head().T

In [None]:
# Display the cells x genes table.
counts_df

In [None]:
# counts_df.to_csv('merged_TMA_preprocessed.csv')

### PCA and UMAP

In [None]:
# PCA scatter coloured by Leiden cluster, with cluster labels drawn on the plot.
# The save suffix previously said "_umap_leiden.png" (copied from the UMAP
# cell), which mislabelled this PCA figure on disk.
sc.pl.pca(
    adata,
    color=["leiden"],
    legend_loc="on data",
    frameon=False,
    size=2.0,
    wspace=0.3,
    save="_leiden.png"
)

In [None]:
# UMAP scatter coloured by Leiden cluster, with cluster labels drawn on the plot.
# NOTE(review): scanpy appears to append the `save` suffix to the plot name, so
# the file is likely written as "umap_umap_leiden.png" - confirm.
sc.pl.umap(
    adata,
    color=["leiden"],
    legend_loc="on data",
    frameon=False,
    size=2.0,
    wspace=0.3,
    save="_umap_leiden.png"  
)

### Plot Spatial coords based on leiden cluster

In [None]:
# sc.pl.spatial(
#     adata,
#     color="leiden",
#     spot_size=100,     
#     frameon=False,
#     alpha_img=0.0,
#     save="_spatial_leiden.png" 
# )

### QC and Cell Composition

In [None]:
# Per-sample violin plots of three QC metrics, drawn as separate panels.
# NOTE(review): assumes these QC columns were saved with the processed file - confirm.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_in_top_50_genes"],
    groupby="sample",
    jitter=0.4,
    multi_panel=True
)

In [None]:
# Bar chart of the number of cells in each Leiden cluster.
adata.obs["leiden"].value_counts().plot(kind="bar", figsize=(6,3))

### UMAP Visualization

In [None]:
# UMAP coloured by Leiden cluster, labels placed on the data.
sc.pl.umap(adata, color="leiden", legend_loc="on data", frameon=False)

In [None]:
# UMAP coloured by sample and by treatment status (one colouring per panel).
sc.pl.umap(adata, color=["sample", "Treatment_Status"], frameon=False)

### Spatial Visualization

In [None]:
# Plot all cells at their stored spatial coordinates (obsm["spatial"]), coloured by Leiden cluster.
# NOTE(review): every sample is drawn on one axis; coordinates from different
# samples may overlap - see the per-sample plots for separate views.
sc.pl.embedding(
    adata,
    basis="spatial",
    color="leiden",
    frameon=False,
    size=3
)


#### per Tissue Sample

In [None]:
# One spatial plot per tissue sample, coloured by Leiden cluster.
sample_labels = adata.obs["sample"]
for sample_name in sample_labels.unique():
    sample_view = adata[sample_labels == sample_name]
    sc.pl.embedding(
        sample_view,
        basis="spatial",
        color="leiden",
        title=f"Spatial - {sample_name}",
        size=3,
        frameon=False,
    )

### PCA Visualization

In [None]:
# PCA scatter coloured by Leiden cluster.
sc.pl.pca(adata, color="leiden")

In [None]:
# Variance ratio per principal component, on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

### Cluster Relationship 

In [None]:
# Compute, then draw, a dendrogram over the Leiden clusters.
sc.tl.dendrogram(adata, groupby="leiden")
sc.pl.dendrogram(adata, groupby="leiden")

In [None]:
# Run PAGA on the Leiden clusters and plot the resulting graph with threshold=0.03.
sc.tl.paga(adata, groups="leiden")
sc.pl.paga(adata, threshold=0.03, show=True)