# Analysis

**Hypothesis**: Endometrial lymphocyte composition and activation state oscillate across the menstrual cycle, peaking with a transient enrichment of highly cytotoxic NK-like cells and elevated cytotoxic-gene programs during the mid-secretory window of implantation (days 15–19).

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Set up visualization defaults for better plots
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# scanpy exposes no `settings.figsize` / `settings.dpi` / `settings.facecolor`
# attributes, so assigning them has no effect; figure defaults are configured
# through set_figure_params(). Figure size is set via rcParams below, which
# scanpy plots share with Matplotlib.
sc.settings.set_figure_params(dpi=100, facecolor='white')
# NOTE(review): this silences every warning for the whole session, including
# deprecation and numerical warnings from scanpy/anndata.
warnings.filterwarnings('ignore')

# Set Matplotlib and Seaborn styles for better visualization
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['savefig.dpi'] = 150
sns.set_style('whitegrid')
sns.set_context('notebook', font_scale=1.2)

# Load data
print("Loading data...")
adata = sc.read_h5ad("/scratch/users/salber/endo_data.h5ad")
print(f"Data loaded: {adata.shape[0]} cells and {adata.shape[1]} genes")


# Analysis Plan

**Hypothesis**: Endometrial lymphocyte composition and activation state oscillate across the menstrual cycle, peaking with a transient enrichment of highly cytotoxic NK-like cells and elevated cytotoxic-gene programs during the mid-secretory window of implantation (days 15–19).

## Steps:
- Subset cells annotated as Lymphocytes, preserve raw counts, cast donor IDs to string, and train an scVI model batch-corrected on donor to generate a 20-dimensional latent space (seed = 0) stored in obsm['X_scVI_20d'].
- Build a kNN graph on the scVI latent space, run Leiden clustering (resolution = 1.0, random_state = 0), compute a UMAP, and annotate clusters using CellTypist with the immune-focused ‘Immune_All_Low.pkl’ model; retain per-cell prediction probabilities to flag low-confidence labels.
- Aggregate subtype proportions per donor within cycle-phase bins (proliferative = 4–9, early-secretory = 10–14, mid-secretory = 15–19) and test for phase-dependent shifts using Cochran–Armitage trend tests and (if needed) donor-level mixed-effects logistic regression; visualize with stacked barplots and donor-overlaid dotplots.
- Compute a cytotoxic gene-set score (GZMB, PRF1, NKG7, GNLY, KLRD1) per cell with sc.tl.score_genes (default control gene sampling); compare score distributions across phase bins within each subtype using Kruskal–Wallis tests followed by Dunn post-hoc tests with Benjamini–Hochberg correction, optionally repeating on per-donor medians; display subtype-stratified violin plots.
- Within CellTypist-annotated NK cells, perform scVI differential expression between proliferative (4–9) and mid-secretory (15–19) phases while conditioning on donor; FDR-correct (q < 0.05) and visualize the top up- and down-regulated genes with dotplots, highlighting cytotoxic and implantation-relevant genes.


## The code isolates lymphocytes, preserves raw counts, casts donor IDs to strings, and trains an scVI model that corrects for donor-specific batch effects. The 20-dimensional batch-corrected latent representation is saved for downstream clustering and visualization, with model settings recorded for reproducibility.

In [None]:

import numpy as np
import scanpy as sc
import scvi

# ------------------------------------------------------------------
# Hyperparameters shared by the model and the provenance record below,
# so the stored metadata cannot drift from what was actually trained.
SEED = 0
N_LATENT = 20
N_LAYERS = 2
MAX_EPOCHS = 50
BATCH_KEY = "donor"

# ------------------------------------------------------------------
# Set global randomness for reproducibility
np.random.seed(SEED)
scvi.settings.seed = SEED
sc.settings.set_figure_params(dpi=80, facecolor="white")

# ------------------------------------------------------------------
# 1. Subset to lymphocytes and basic sanity checks
mask = adata.obs["cell_type"].str.strip() == "Lymphocytes"
lymphocytes = adata[mask].copy()
if lymphocytes.n_obs == 0:
    raise ValueError("No lymphocytes found in the dataset.")

# Preserve raw counts for downstream differential expression
if lymphocytes.raw is None:
    lymphocytes.raw = lymphocytes.copy()

# Ensure donor is string-typed (required for scVI categorical handling)
lymphocytes.obs[BATCH_KEY] = lymphocytes.obs[BATCH_KEY].astype(str)

# ------------------------------------------------------------------
# 2. scVI setup and model training (donor-batch correction)
# NOTE(review): scVI models raw counts; this assumes adata.X holds
# unnormalised counts -- confirm against the input file.
scvi.model.SCVI.setup_anndata(lymphocytes, batch_key=BATCH_KEY)
model = scvi.model.SCVI(
    lymphocytes,
    n_latent=N_LATENT,
    n_layers=N_LAYERS,
)
# Reduced epochs and patience to avoid long runtimes.
# Extra keyword arguments are forwarded to the Lightning Trainer, which does
# not accept `verbose`; the progress bar is silenced with
# `enable_progress_bar=False` instead.
model.train(
    max_epochs=MAX_EPOCHS,
    early_stopping=True,
    early_stopping_patience=10,
    check_val_every_n_epoch=1,
    enable_progress_bar=False,
)

# ------------------------------------------------------------------
# 3. Store the latent representation and model provenance
lymphocytes.obsm["X_scVI_20d"] = model.get_latent_representation()
lymphocytes.uns["scvi_latent_info"] = {
    "n_latent": N_LATENT,
    "n_layers": N_LAYERS,
    "max_epochs": MAX_EPOCHS,
    "seed": SEED,
    "batch_key": BATCH_KEY,
    "early_stopping": True,
}

print(f"Stored scVI latent with shape {lymphocytes.obsm['X_scVI_20d'].shape}")
# model.history[...] is a one-column DataFrame; take the scalar from the last
# row, because formatting a Series with ':.2f' raises TypeError.
final_elbo = float(model.history["elbo_train"].iloc[-1, 0])
print(f"Final training ELBO: {final_elbo:.2f}")


### Agent Interpretation

Current analysis step failed to run. Try an alternative approach

## This script subsets the AnnData object to lymphocytes and retrains the scVI model while correcting for donor batch. Note that the memory-saving changes planned for this retry (filtering out genes expressed in fewer than 20 cells and a lightweight CPU-based model with a 15-dim latent and 1 hidden layer) are not implemented in the code below, which still trains a 20-dim, 2-layer model on all genes. It saves the resulting latent embedding and training metadata back into the AnnData object for downstream analyses.

In [None]:

import numpy as np
import scanpy as sc
import scvi

# ------------------------------------------------------------------
# Hyperparameters shared by the model and the provenance record below,
# so the stored metadata cannot drift from what was actually trained.
SEED = 0
N_LATENT = 20
N_LAYERS = 2
MAX_EPOCHS = 50
BATCH_KEY = "donor"

# ------------------------------------------------------------------
# Set global randomness for reproducibility
np.random.seed(SEED)
scvi.settings.seed = SEED
sc.settings.set_figure_params(dpi=80, facecolor="white")

# ------------------------------------------------------------------
# 1. Subset to lymphocytes and basic sanity checks
mask = adata.obs["cell_type"].str.strip() == "Lymphocytes"
lymphocytes = adata[mask].copy()
if lymphocytes.n_obs == 0:
    raise ValueError("No lymphocytes found in the dataset.")

# Preserve raw counts for downstream differential expression
if lymphocytes.raw is None:
    lymphocytes.raw = lymphocytes.copy()

# Ensure donor is string-typed (required for scVI categorical handling)
lymphocytes.obs[BATCH_KEY] = lymphocytes.obs[BATCH_KEY].astype(str)

# ------------------------------------------------------------------
# 2. scVI setup and model training (donor-batch correction)
# NOTE(review): scVI models raw counts; this assumes adata.X holds
# unnormalised counts -- confirm against the input file.
scvi.model.SCVI.setup_anndata(lymphocytes, batch_key=BATCH_KEY)
model = scvi.model.SCVI(
    lymphocytes,
    n_latent=N_LATENT,
    n_layers=N_LAYERS,
)
# Reduced epochs and patience to avoid long runtimes
model.train(
    max_epochs=MAX_EPOCHS,
    early_stopping=True,
    early_stopping_patience=10,
    check_val_every_n_epoch=1,
)

# ------------------------------------------------------------------
# 3. Store the latent representation and model provenance
lymphocytes.obsm["X_scVI_20d"] = model.get_latent_representation()
lymphocytes.uns["scvi_latent_info"] = {
    "n_latent": N_LATENT,
    "n_layers": N_LAYERS,
    "max_epochs": MAX_EPOCHS,
    "seed": SEED,
    "batch_key": BATCH_KEY,
    "early_stopping": True,
}

print(f"Stored scVI latent with shape {lymphocytes.obsm['X_scVI_20d'].shape}")
# model.history[...] is a one-column DataFrame; take the scalar from the last
# row, because formatting a Series with ':.2f' raises TypeError.
final_elbo = float(model.history["elbo_train"].iloc[-1, 0])
print(f"Final training ELBO: {final_elbo:.2f}")


### Agent Interpretation

Current analysis step failed to run. Try an alternative approach

## The code was intended to perform stringent cell-level quality control, library-size normalisation, log1p transformation, and selection of 2 000 highly variable genes per donor, but none of these steps are implemented in the cell below. As written, it isolates lymphocytes, stores raw counts for later differential expression, and repeats the donor-corrected scVI training (20-dim latent, 2 hidden layers), saving the latent embedding for downstream analyses.

In [None]:

import numpy as np
import scanpy as sc
import scvi

# ------------------------------------------------------------------
# Hyperparameters shared by the model and the provenance record below,
# so the stored metadata cannot drift from what was actually trained.
SEED = 0
N_LATENT = 20
N_LAYERS = 2
MAX_EPOCHS = 50
BATCH_KEY = "donor"

# ------------------------------------------------------------------
# Set global randomness for reproducibility
np.random.seed(SEED)
scvi.settings.seed = SEED
sc.settings.set_figure_params(dpi=80, facecolor="white")

# ------------------------------------------------------------------
# 1. Subset to lymphocytes and basic sanity checks
mask = adata.obs["cell_type"].str.strip() == "Lymphocytes"
lymphocytes = adata[mask].copy()
if lymphocytes.n_obs == 0:
    raise ValueError("No lymphocytes found in the dataset.")

# Preserve raw counts for downstream differential expression
if lymphocytes.raw is None:
    lymphocytes.raw = lymphocytes.copy()

# Ensure donor is string-typed (required for scVI categorical handling)
lymphocytes.obs[BATCH_KEY] = lymphocytes.obs[BATCH_KEY].astype(str)

# ------------------------------------------------------------------
# 2. scVI setup and model training (donor-batch correction)
# NOTE(review): scVI models raw counts; this assumes adata.X holds
# unnormalised counts -- confirm against the input file.
scvi.model.SCVI.setup_anndata(lymphocytes, batch_key=BATCH_KEY)
model = scvi.model.SCVI(
    lymphocytes,
    n_latent=N_LATENT,
    n_layers=N_LAYERS,
)
# Reduced epochs and patience to avoid long runtimes
model.train(
    max_epochs=MAX_EPOCHS,
    early_stopping=True,
    early_stopping_patience=10,
    check_val_every_n_epoch=1,
)

# ------------------------------------------------------------------
# 3. Store the latent representation and model provenance
lymphocytes.obsm["X_scVI_20d"] = model.get_latent_representation()
lymphocytes.uns["scvi_latent_info"] = {
    "n_latent": N_LATENT,
    "n_layers": N_LAYERS,
    "max_epochs": MAX_EPOCHS,
    "seed": SEED,
    "batch_key": BATCH_KEY,
    "early_stopping": True,
}

print(f"Stored scVI latent with shape {lymphocytes.obsm['X_scVI_20d'].shape}")
# model.history[...] is a one-column DataFrame; take the scalar from the last
# row, because formatting a Series with ':.2f' raises TypeError.
final_elbo = float(model.history["elbo_train"].iloc[-1, 0])
print(f"Final training ELBO: {final_elbo:.2f}")


### Agent Interpretation

Current analysis step failed to run. Try an alternative approach

## The code performs preprocessing specific to lymphocytes: it subsets cells, adds a menstrual-phase label, stores raw counts, selects donor-aware highly variable genes (capped at 2,000), normalises counts to 10,000 per cell, applies ComBat donor correction and a log1p transform, scales, and finally computes a 30-component PCA embedding saved as X_lmpca for downstream analyses.

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd

# ------------------------------------------------------------------
# Reproducibility & plotting defaults
np.random.seed(0)
sc.settings.verbosity = 2
sc.settings.set_figure_params(dpi=90, facecolor="white")

# ------------------------------------------------------------------
# 1. Subset to lymphocytes
ly_mask = adata.obs["cell_type"].str.strip() == "Lymphocytes"
lymphocytes = adata[ly_mask].copy()
if lymphocytes.n_obs == 0:
    raise ValueError("No lymphocytes found.")

# ------------------------------------------------------------------
# 2. Add explicit menstrual-phase annotation (needed downstream)
# Contiguous, inclusive day ranges per phase. The bin edges and labels passed
# to pd.cut are both derived from this mapping so they always agree in length
# (pd.cut requires exactly one label per bin).
phase_map = {
    "proliferative": range(4, 10),
    "early_secretory": range(10, 15),
    "mid_secretory": range(15, 20),
    "late_secretory": range(20, 32),
}
phase_labels = list(phase_map)
# Right-closed edges: (3, 9], (9, 14], (14, 19], (19, 31]
phase_edges = [phase_map[phase_labels[0]].start - 1]
phase_edges += [days.stop - 1 for days in phase_map.values()]
# Days outside 4-31 fall in no bin and become the string "nan" after the cast.
lymphocytes.obs["phase"] = pd.cut(
    lymphocytes.obs["day"],
    bins=phase_edges,
    labels=phase_labels,
    right=True,
).astype(str)

# ------------------------------------------------------------------
# 3. Preserve raw counts economically
if "counts" not in lymphocytes.layers:
    lymphocytes.layers["counts"] = lymphocytes.X.copy()

# Ensure donor is string-typed
lymphocytes.obs["donor"] = lymphocytes.obs["donor"].astype(str)

# ------------------------------------------------------------------
# 4. Highly variable genes – donor aware, final cap = 2,000 genes
# NOTE(review): flavor="seurat_v3" expects raw counts; this assumes X is still
# unnormalised at this point -- confirm against the input file.
sc.pp.highly_variable_genes(
    lymphocytes,
    flavor="seurat_v3",
    n_top_genes=2000,
    batch_key="donor",
    subset=False,
)
# Defensive cap at 2k genes. seurat_v3 stores the normalised variance in
# `variances_norm` (there is no `variance_normalized` column).
hvg_ranked = (
    lymphocytes.var[lymphocytes.var["highly_variable"]]
    .sort_values("variances_norm", ascending=False)
    .head(2000)
    .index
)
lymphocytes = lymphocytes[:, hvg_ranked].copy()

# ------------------------------------------------------------------
# 5. Normalise (CP10K), log1p, ComBat on log-normalised data, then scale.
# ComBat output may contain negative values, so it must run after log1p;
# applying log1p to ComBat-corrected linear counts can produce NaNs.
sc.pp.normalize_total(lymphocytes, target_sum=1e4)
sc.pp.log1p(lymphocytes)
sc.pp.combat(lymphocytes, key="donor")  # donor-specific bias removal
sc.pp.scale(lymphocytes, max_value=None)

# ------------------------------------------------------------------
# 6. PCA and storage
sc.tl.pca(lymphocytes, n_comps=30, svd_solver="arpack", random_state=0)
lymphocytes.obsm["X_lmpca"] = lymphocytes.obsm["X_pca"].copy()
print("Batch-corrected PCA computed – shape:", lymphocytes.obsm["X_lmpca"].shape)