In [None]:
# ==============================================================================
# Script:           process_methylation.py
# Purpose:          Entry-point to download and preprocess DNA methylation data
#                   for a specified project
# Author:           Sophia Li
# Affiliation:      CCG Lab, Princess Margaret Cancer Center, UHN, UofT
# Date:             11/18/2025
#
# Configurations:   methylation_preproc.yaml
#
# Notes:            Decides whether data needs to be downloaded simply by
#                   checking whether the raw data folder is empty; partial
#                   downloads are not supported
# ==============================================================================

In [2]:
import pandas as pd
from pathlib import Path
import argparse
import os

from MethylCDM.utils.utils import init_environment, load_config, resolve_path
from MethylCDM.data.load_methylation import (
    download_methylation, 
    clean_methylation_data
)
from MethylCDM.preprocessing.process_methylation import process_methylation

from MethylCDM.constants import (
    RAW_METHYLATION_DIR, 
    PROCESSED_METHYLATION_DIR,
    METADATA_METHYLATION_DIR
)

%load_ext autoreload
%autoreload 2

In [5]:
# Notebook stand-in for the command-line arguments of the entry-point script:
# the project identifier, the two configuration file names, and the
# verbosity flag.
args = dict(
    project = "TCGA-COAD",
    config_pipeline = "pipeline.yaml",
    config_preproc = "methylation_preproc.yaml",
    verbose = True,
)

In [6]:
# Read the pipeline-wide and the preprocessing configuration files named in
# `args` (pipeline first, then preprocessing)
pipeline_cfg, preproc_cfg = (
    load_config(args[cfg_key])
    for cfg_key in ('config_pipeline', 'config_preproc')
)

# Set up the environment from the pipeline configuration for reproducible
# analysis
init_environment(pipeline_cfg)

In [7]:
# Resolve the raw-data root from the `download` section of the preprocessing
# configuration (RAW_METHYLATION_DIR is handed to resolve_path alongside it)
download_section = preproc_cfg.get('download', {})
raw_data_dir = resolve_path(
    download_section.get('raw_data_dir', ''), RAW_METHYLATION_DIR
)

# Make sure the project-specific raw-data folder exists before it is used
project_raw_dir = os.path.join(raw_data_dir, args['project'])
Path(project_raw_dir).mkdir(parents = True, exist_ok = True)

In [None]:
# Download the project's raw methylation data only when the project's raw
# data folder is empty, as described in the header notes. Partial downloads
# are not detected: a folder holding any entry at all (including hidden
# files) is treated as already downloaded.
if not any(Path(project_raw_dir).iterdir()):
    download_methylation(args['project'], preproc_cfg, args['verbose'])
elif args['verbose']:
    # Make the skip visible so a non-empty folder is never silently trusted
    print(f"Raw data folder {project_raw_dir} is not empty; skipping download")

In [None]:
# Run the cleaning step on the project-specific raw data folder, passing the
# verbosity flag through.
# NOTE(review): presumably tidies the downloaded files in place — confirm
# against MethylCDM.data.load_methylation.clean_methylation_data
clean_methylation_data(project_raw_dir, args['verbose'])

In [15]:
# -----| Data Preprocessing |-----
# Locate and read the project's metadata table, stored as
#   <metadata_dir>/<project>/<project>_metadata.csv
project_id = args['project']
metadata_root = preproc_cfg.get('download', {}).get('metadata_dir', '')
metadata_dir = resolve_path(metadata_root, METADATA_METHYLATION_DIR)
project_metadata = os.path.join(
    metadata_dir, project_id, f"{project_id}_metadata.csv"
)
metadata = pd.read_csv(project_metadata)

In [16]:
# Run the methylation preprocessing for the project; the result is the
# gene-level matrix (AnnData) that is saved further below
project_id = args['project']
verbose = args['verbose']
gene_matrix = process_methylation(project_id, metadata, preproc_cfg, verbose)

Beginning methylation preprocessing of project TCGA-COAD
Successfully performed probe quality control
Successfully performed sample quality control
Successfully imputed missing values
Successfully aggregated gene-level beta values
Successfully clipped gene-level beta values
Completed processing data for TCGA project TCGA-COAD


In [17]:
import anndata

# Resolve the processed-data folder from the `preprocess` section of the
# preprocessing configuration and create it if it does not exist yet
preprocess_section = preproc_cfg.get('preprocess', {})
proc_data_dir = resolve_path(
    preprocess_section.get('processed_data_dir', ''),
    PROCESSED_METHYLATION_DIR
)
Path(proc_data_dir).mkdir(parents = True, exist_ok = True)

# Output file: <processed_data_dir>/<project>_gene_matrix.h5ad
proc_file = os.path.join(proc_data_dir, f"{args['project']}_gene_matrix.h5ad")

# Enable anndata's `allow_write_nullable_strings` setting before writing,
# then save the gene-level matrix with gzip compression
anndata.settings.allow_write_nullable_strings = True
gene_matrix.write_h5ad(proc_file, compression = "gzip")

In [18]:
import anndata as ad

# Read the saved gene-level matrix back in as a round-trip check. The path is
# taken from `proc_file` (built from the configuration when the matrix was
# written) rather than a hard-coded absolute path, so the cell works on any
# machine and for any project.
ad.read_h5ad(proc_file)

AnnData object with n_obs × n_vars = 347 × 19240
    obs: 'id', 'data_format', 'file_id', 'data_type', 'data_category', 'state', 'experimental_strategy', 'platform', 'case_id', 'submitter_id', 'race', 'gender', 'ethnicity', 'days_to_last_follow_up', 'age_at_diagnosis', 'primary_diagnosis', 'morphology', 'tumor_grade', 'sample_id', 'sample_type'

In [19]:
from pathlib import Path

# Count the entries in the project's raw data folder. The folder comes from
# `project_raw_dir` (resolved from the configuration) instead of a hard-coded
# absolute path, so the check follows whichever project and data root are
# configured.
dir_path = Path(project_raw_dir)
sum(1 for _ in dir_path.iterdir())

555

In [20]:
# Resolve the processed-data folder from the preprocessing configuration and
# list every .h5ad file directly inside it
processed_root = (preproc_cfg.get('preprocess', {})
                             .get('processed_data_dir', ''))
data_dir = resolve_path(processed_root, PROCESSED_METHYLATION_DIR)

adata_files = [*Path(data_dir).glob("*.h5ad")]


'TCGA-UVM'

In [3]:
# Read the pipeline-wide configuration and the betaVAE model configuration
# (in that order)
pipeline_cfg, model_cfg = map(load_config, ("pipeline.yaml", "betaVAE.yaml"))

# Set up the environment from the pipeline configuration for reproducible
# analysis
init_environment(pipeline_cfg)

In [4]:
import os
import anndata as ad
from pathlib import Path
import scipy.sparse as sp
from sklearn.model_selection import train_test_split

In [5]:
# Resolve the folder holding the per-project processed matrices from the
# model configuration
data_dir = model_cfg.get('project_data_dir', '')
data_dir = resolve_path(data_dir, PROCESSED_METHYLATION_DIR)

# Collect each project's AnnData file. Directory listing order depends on the
# filesystem, so the paths are sorted: this fixes the order in which projects
# are concatenated later and keeps the seeded cohort split reproducible
# across machines and runs.
adata_files = sorted(Path(data_dir).glob("*.h5ad"))

In [6]:
def _read_project_adata(path):
    """Read one project's .h5ad file and tag it with its project name.

    The data matrix `X` is converted to CSR format when it is not already
    sparse, and the part of the file stem before the first underscore is
    stored in the `tcga_project` column of `obs`.
    """
    project_adata = ad.read_h5ad(path)
    if not sp.issparse(project_adata.X):
        project_adata.X = sp.csr_matrix(project_adata.X)

    project_adata.obs['tcga_project'] = path.stem.split('_')[0]
    return project_adata


# One AnnData object per project file, in the same order as `adata_files`
adatas = [_read_project_adata(path) for path in adata_files]


In [7]:
# Concatenate along cells (obs), taking the gene intersection
cohort_adata = ad.concat(
    adatas, join = "inner", label = "batch", 
    keys = [f.stem.split('_')[0] for f in adata_files]
)

In [8]:
# Display the concatenated cohort's summary (dimensions and obs columns)
cohort_adata

AnnData object with n_obs × n_vars = 1331 × 18990
    obs: 'id', 'data_format', 'file_id', 'data_type', 'data_category', 'state', 'experimental_strategy', 'platform', 'case_id', 'submitter_id', 'race', 'gender', 'ethnicity', 'age_at_diagnosis', 'primary_diagnosis', 'morphology', 'days_to_last_follow_up', 'tumor_grade', 'sample_id', 'sample_type', 'tcga_project', 'batch'

In [9]:
import os
import argparse
import anndata

from MethylCDM.utils.utils import init_environment, load_config, resolve_path
from MethylCDM.preprocessing.reconcile_methylation import (
    reconcile_methylation,
    split_cohort
)
from MethylCDM.constants import (
    PROCESSED_METHYLATION_DIR,
    TRAINING_METHYLATION_DIR
)

In [12]:
# Save the full cohort AnnData Object
train_data_dir = model_cfg.get('training_data_dir', '')
train_data_dir = resolve_path(train_data_dir, TRAINING_METHYLATION_DIR)
Path(train_data_dir).mkdir(parents = True, exist_ok = True)
cohort_path = os.path.join(train_data_dir, "tcga_cohort_gene_matrix.h5ad")
anndata.settings.allow_write_nullable_strings = True
cohort_adata.write_h5ad(cohort_path, compression = "gzip")

In [13]:
# -----| Train-Validation-Test Split |-----
train_adata, val_adata, test_adata = split_cohort(
    cohort_adata, pipeline_cfg.get('seed', 42)
)

In [17]:
# Save each split individually
train_path = os.path.join(train_data_dir, "tcga_train_gene_matrix.h5ad")
val_path = os.path.join(train_data_dir, "tcga_val_gene_matrix.h5ad")
test_path = os.path.join(train_data_dir, "tcga_test_gene_matrix.h5ad")
train_adata.write_h5ad(train_path, compression = "gzip")
val_adata.write_h5ad(val_path, compression = "gzip")
test_adata.write_h5ad(test_path, compression = "gzip")