# Age Distribution Analysis

This notebook analyzes the age distribution of donors and cells in the `combined_40k_test.h5ad` dataset. It replicates the functionality of `donor_plot.R`, but loads the data in Python (scanpy) and passes it to R (ggplot2, via rpy2) for plotting.

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# Scanpy settings
sc.settings.verbosity = 3

# Load magic for R
%load_ext rpy2.ipython

In [None]:
%%R 
# R setup
library(ggplot2)
library(dplyr)
library(readr)
library(stringr)


## Load Data

In [None]:
# Location of the combined AnnData file. The cluster path below remains the
# default; set the COMBINED_H5AD_PATH environment variable to run this notebook
# against a copy stored elsewhere without editing the cell.
file_path = os.environ.get(
    'COMBINED_H5AD_PATH',
    '/home/rajd2/rds/rds-cam-psych-transc-Pb9UGUlrwWc/Cam_snRNAseq/combined/combined_40k_test.h5ad',
)

# Fail early with an actionable message rather than a low-level HDF5 error
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"AnnData file not found: {file_path!r}. "
        "Set COMBINED_H5AD_PATH to the location of combined_40k_test.h5ad."
    )

# backed='r' opens the file read-only in backed mode instead of loading it
# fully into memory; only the per-cell metadata (.obs) is needed below
adata = sc.read_h5ad(file_path, backed='r')

In [None]:
# Take an independent (deep) copy of the per-cell metadata table so that the
# cleaning steps below never touch the AnnData object itself
obs = adata.obs.copy(deep=True)

## Clean Data and Prepare for R

In [None]:
# Coerce age_years to a numeric dtype; entries that cannot be parsed become NaN
obs['age_years'] = pd.to_numeric(obs['age_years'], errors='coerce')

# Build the frame handed to R, starting from the numeric age column
obs_export = obs[['age_years']].copy()

# Label columns are cast to plain strings so that they reach R with an explicit
# type; a column that is absent from obs is only reported, not treated as fatal
for label_col in ('source', 'individual'):
    if label_col not in obs.columns:
        print(f"WARNING: '{label_col}' column missing in obs!")
        continue
    obs_export[label_col] = obs[label_col].astype(str)

# Quick look at what will be exported
print("Export DataFrame Columns:", obs_export.columns.tolist())
print(obs_export.head())

In [None]:
# Transfer data to R: `-i` pushes the Python variable `obs_export` into the R
# session under the same name, where the following %%R cells read it
%R -i obs_export

In [None]:
%%R
# Sanity check on the R side: list the columns of the transferred data frame
# and show its first rows
print(names(obs_export))
print(head(obs_export, n = 6))

## Plot 1: Donor Age Distribution

In [None]:
%%R -w 180 -h 120 -u mm -r 300
# Donor-level age distribution: one bar per rounded age, stacked by source.
# The height (-h) is given explicitly: with -u mm and no height, R's png()
# device applies its default height of 480 in millimetres, which yields a
# 180 x 480 mm figure.

# Count unique donors per (rounded age, source); rows without an age are dropped
plot_data <- obs_export %>%
  mutate(age_round = round(age_years)) %>%
  filter(!is.na(age_round)) %>%
  group_by(age_round, source) %>%
  summarise(n_donors = n_distinct(individual), .groups = 'drop')

# With no usable ages, min()/max() return Inf/-Inf and seq() fails with an
# unhelpful message, so stop early with a clear one
if (nrow(plot_data) == 0) {
  stop("No rows with a non-missing age_years; nothing to plot.")
}

# Stacked bar chart with an x-axis tick every 5 years, starting at the minimum age
ggplot(plot_data, aes(x = age_round, y = n_donors, fill = source)) +
  geom_col() +
  scale_x_continuous(breaks = seq(min(plot_data$age_round), max(plot_data$age_round), by = 5)) +
  labs(
    title = "Donor Age Distribution by Source",
    x = "Age (Years)",
    y = "Number of Unique Donors",
    fill = "Source"
  ) +
  theme_classic() +
  theme(legend.position = "bottom")

## Plot 2: Cell Age Distribution

In [None]:
%%R -w 180 -h 120 -u mm -r 300
# Cell-level age distribution: one histogram bin per rounded age, stacked by source.
# The height (-h) is given explicitly: with -u mm and no height, R's png()
# device applies its default height of 480 in millimetres, which yields a
# 180 x 480 mm figure.

# Round the age once and drop cells without an age explicitly, instead of
# letting ggplot discard them with a warning
cell_data <- obs_export %>%
  mutate(age_round = round(age_years)) %>%
  filter(!is.na(age_round))

# With no usable ages, min()/max() return Inf/-Inf and seq() fails with an
# unhelpful message, so stop early with a clear one
if (nrow(cell_data) == 0) {
  stop("No rows with a non-missing age_years; nothing to plot.")
}

# Stacked histogram (1-year bins) with an x-axis tick every 5 years
ggplot(cell_data, aes(x = age_round, fill = source)) +
  geom_histogram(binwidth = 1, position = "stack") +
  scale_x_continuous(breaks = seq(min(cell_data$age_round), max(cell_data$age_round), by = 5)) +
  labs(
    title = "Cell Age Distribution by Source (Stacked Histogram)",
    x = "Age (Years, rounded)",
    y = "Number of Cells",
    fill = "Source"
  ) +
  theme_classic() +
  theme(legend.position = "bottom")