In [20]:
import scanpy as sc
import numpy as np
import pandas as pd

# === Load AnnData objects ===
# All integration outputs are read from one directory. Keeping that prefix in a
# single constant avoids repeating the absolute path on every line; the
# resulting path strings are identical to the previously hard-coded ones.
H5AD_DIR = "/root/Desktop/my_pan/workspace/Data/h5ad_output"

adata_harmony = sc.read_h5ad(f"{H5AD_DIR}/harmony_corrected_bonemarrow.h5ad")
adata_scvi = sc.read_h5ad(f"{H5AD_DIR}/bonemarrow_merged_scvi.h5ad")
# adata_seurat = sc.read_h5ad(f"{H5AD_DIR}/merged_bone_marrow_batches_seurat_pca.h5ad")
adata_scatlasvae = sc.read_h5ad(f"{H5AD_DIR}/new_merged_bonemarrow_batches_scatlasvae_full.h5ad")

# === Use Harmony clustering (e.g., Leiden) as pseudo celltype ===
# The Harmony object's own "leiden" column is reused as the reference label.
# Fail with an explicit message if that column is absent.
if "leiden" not in adata_harmony.obs.columns:
    raise KeyError(
        "adata_harmony.obs has no 'leiden' column; it is required to derive "
        "the pseudo 'celltype' labels."
    )
adata_harmony.obs["celltype"] = adata_harmony.obs["leiden"]


In [21]:

# === Copy celltype labels to other datasets by matching cell names ===
# Every scAtlasVAE cell name must exist in the Harmony object. Check this up
# front so a mismatch produces a short, readable error that shows a few of the
# offending names.
missing_cells = adata_scatlasvae.obs_names.difference(adata_harmony.obs_names)
if len(missing_cells) > 0:
    raise KeyError(
        f"{len(missing_cells)} cells in adata_scatlasvae are absent from "
        f"adata_harmony (e.g. {list(missing_cells[:5])}); cannot copy 'celltype'."
    )

# adata_scvi.obs["celltype"] = adata_harmony.obs.loc[adata_scvi.obs_names, "celltype"]
adata_scatlasvae.obs["celltype"] = adata_harmony.obs.loc[adata_scatlasvae.obs_names, "celltype"]
# adata_seurat.obs["celltype"] = adata_harmony.obs.loc[adata_seurat.obs_names, "celltype"]

# === Check if celltype labels are copied correctly ===
# A missing value here means a cell ended up without a label; stop before
# anything is written to disk.
n_unlabelled = int(adata_scatlasvae.obs["celltype"].isna().sum())
if n_unlabelled > 0:
    raise ValueError(
        f"{n_unlabelled} cells in adata_scatlasvae have no 'celltype' after the copy."
    )

# === Save updated .h5ad files ===
# adata_harmony.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/harmony_corrected_bonemarrow_with_celltype.h5ad")
# adata_scvi.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/bonemarrow_merged_scvi_with_celltype.h5ad")
# adata_seurat.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/merged_bone_marrow_batches_seurat_pca_with_celltype.h5ad")
adata_scatlasvae.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/new_merged_bonemarrow_batches_scatlasvae_full_with_celltype.h5ad")

# Report success only after the check has passed and the file has been written.
print("✅ celltype labels copied and saved into new .h5ad files.")


✅ celltype labels copied and saved into new .h5ad files.


In [14]:
# Derive a per-cell batch label from the cell name: split each name on "."
# and keep the first piece.
adata_scatlasvae.obs["batch"] = (
    adata_scatlasvae.obs_names
    .str.split(".")
    .str.get(0)
)
# The printed message reads: "batch extracted from the cell-name prefix".
print("✅ 已从细胞名前缀提取 batch")

✅ 已从细胞名前缀提取 batch


In [19]:
# Write the scAtlasVAE object to the "_with_celltype" file. This is the same
# path the label-transfer cell writes to, so when the notebook is run top to
# bottom this call overwrites that file with the current state of the object,
# including the obs["batch"] column added in the cell above.
adata_scatlasvae.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/new_merged_bonemarrow_batches_scatlasvae_full_with_celltype.h5ad")

In [None]:
# Write the same scAtlasVAE object under a second file name
# ("bonemarrow_merged_scatlasvae_with_celltype.h5ad"). This cell has no
# execution count in the saved notebook.
# NOTE(review): no visible cell reads this file — confirm it is still needed.
adata_scatlasvae.write("/root/Desktop/my_pan/workspace/Data/h5ad_output/bonemarrow_merged_scatlasvae_with_celltype.h5ad")

In [6]:
# Benchmark integration results using pseudo celltype labels (e.g., Harmony-derived clustering)

import scanpy as sc
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score

# Load preprocessed .h5ad files (with celltype from Harmony transferred)
# Maps a method name to the .h5ad file that is benchmarked for it. The three
# dicts below share the same method names as keys and are looked up together.
# NOTE(review): the visible cells do not write either of these files (the
# Harmony write is commented out and no "_and_leiden" scVI file is produced) —
# presumably they come from an earlier run or another notebook; confirm.
data_paths = {
    "Harmony": "/root/Desktop/my_pan/workspace/Data/h5ad_output/harmony_corrected_bonemarrow_with_celltype.h5ad",
    "scVI": "/root/Desktop/my_pan/workspace/Data/h5ad_output/bonemarrow_merged_scvi_with_celltype_and_leiden.h5ad",
    # "Seurat": "/root/Desktop/my_pan/workspace/Data/h5ad_output/merged_bone_marrow_batches_seurat_pca_with_celltype.h5ad"
}

# Define key mappings
# Key in `adata.obsm` holding the embedding that is scored for each method.
# NOTE(review): confirm that "X_pca" in the Harmony file holds the
# Harmony-corrected coordinates; scanpy's harmony_integrate stores its result
# under "X_pca_harmony" by default.
embedding_keys = {
    "Harmony": "X_pca",
    "scVI": "X_scVI",
    # "Seurat": "X_pca"
}
# Column in `adata.obs` holding each method's own clustering.
# NOTE(review): if the Harmony file's "celltype" was copied from its "leiden"
# column, comparing the two for Harmony is circular — confirm this is intended.
cluster_keys = {
    "Harmony": "leiden",
    "scVI": "leiden",
    # "Seurat": "leiden"
}

# Benchmark function

def benchmark_with_celltype(embedding, celltype_labels, cluster_labels,
                            sample_size=None, random_state=0):
    """Score one integration result against reference cell-type labels.

    Parameters
    ----------
    embedding : array-like
        Per-cell coordinates passed to ``silhouette_score`` as ``X``.
    celltype_labels : sequence
        Reference label for each cell.
    cluster_labels : sequence
        Cluster assignment for each cell, compared against ``celltype_labels``.
    sample_size : int or None, optional
        Forwarded to ``silhouette_score``. ``None`` (default) scores every
        cell, exactly as before; an integer scores a random subset, which
        keeps the pairwise-distance computation tractable on large datasets.
    random_state : int, optional
        Seed forwarded to ``silhouette_score``; only used when ``sample_size``
        is not ``None``.

    Returns
    -------
    dict
        Keys ``"Silhouette_celltype"``, ``"ARI"`` and ``"NMI"``. A metric is
        ``np.nan`` when the labels make it undefined or uninformative.
    """
    # Count distinct labels once instead of rebuilding the sets per metric.
    n_cells = len(celltype_labels)
    n_celltypes = len(set(celltype_labels))
    n_clusters = len(set(cluster_labels))

    results = {}

    # silhouette_score requires 2 <= n_labels <= n_samples - 1. The lower
    # bound alone is not enough: one distinct label per cell would raise
    # instead of yielding NaN.
    if 2 <= n_celltypes <= n_cells - 1:
        results["Silhouette_celltype"] = silhouette_score(
            embedding,
            celltype_labels,
            sample_size=sample_size,
            random_state=random_state,
        )
    else:
        results["Silhouette_celltype"] = np.nan

    # With a single cluster the agreement scores are reported as NaN.
    if n_clusters > 1:
        results["ARI"] = adjusted_rand_score(celltype_labels, cluster_labels)
        results["NMI"] = normalized_mutual_info_score(celltype_labels, cluster_labels)
    else:
        results["ARI"] = np.nan
        results["NMI"] = np.nan

    return results

# Score each configured method in turn; one AnnData file is read per method.
all_results = dict()
for method in data_paths:
    path = data_paths[method]
    adata = sc.read_h5ad(path)

    # Pull the embedding, the reference labels and the method's own clusters.
    embedding = adata.obsm[embedding_keys[method]]
    celltypes = adata.obs["celltype"]
    clusters = adata.obs[cluster_keys[method]]

    all_results[method] = benchmark_with_celltype(
        embedding=embedding,
        celltype_labels=celltypes,
        cluster_labels=clusters,
    )

# Tabulate with one row per method and one column per metric, then export.
results_df = pd.DataFrame(all_results).transpose()
results_df.to_csv(
    "/root/Desktop/my_pan/workspace/Data/benchmark_out/integration_celltype_benchmark.csv"
)
print("✅ Celltype-based benchmark results saved.")


✅ Celltype-based benchmark results saved.
