In [None]:
# ==============================================================================
# Script:           process_methylation.py
# Purpose:          Entry-point to download and preprocess DNA methylation data
#                   for a specified project
# Author:           Sophia Li
# Affiliation:      CCG Lab, Princess Margaret Cancer Center, UHN, UofT
# Date:             11/18/2025
# Configurations:   pipeline.yaml, methylation_preproc.yaml
#
# Notes:            Determines whether data must be downloaded solely by
#                   checking whether the raw data folder is empty; partial
#                   downloads are not supported
# ==============================================================================

In [1]:
import pandas as pd
from pathlib import Path
import argparse
import os

from MethylCDM.utils.utils import init_environment, load_config, resolve_path
from MethylCDM.data.load_methylation import (
    download_methylation, 
    clean_methylation_data
)
from MethylCDM.preprocessing.process_methylation import process_methylation

from MethylCDM.constants import (
    RAW_METHYLATION_DIR, 
    PROCESSED_METHYLATION_DIR,
    METADATA_METHYLATION_DIR
)

%load_ext autoreload
%autoreload 2

In [2]:
# Run parameters for this notebook session
# (presumably mirrors the command-line arguments of the script; confirm)
args = dict(
    project = "TCGA-GBM",
    config_pipeline = "pipeline.yaml",
    config_preproc = "methylation_preproc.yaml",
    verbose = True,
)

In [3]:
# Read the pipeline-level and preprocessing-level configuration files
# named in `args` (pipeline first, then preprocessing)
pipeline_cfg, preproc_cfg = (
    load_config(args[cfg_key])
    for cfg_key in ('config_pipeline', 'config_preproc')
)

# Set up the environment from the pipeline configuration so the analysis
# is reproducible
init_environment(pipeline_cfg)

In [4]:
# Resolve the project's raw methylation directory from the `download`
# section of the preprocessing configuration and make sure it exists
download_cfg = preproc_cfg.get('download', {})
raw_data_dir = resolve_path(
    download_cfg.get('raw_data_dir', ''), RAW_METHYLATION_DIR
)
project_raw_dir = os.path.join(raw_data_dir, args['project'])
os.makedirs(project_raw_dir, exist_ok = True)

In [None]:
# Download the DNA methylation data for the selected project, driven by the
# preprocessing configuration; the last argument toggles verbose output.
# NOTE(review): presumably writes into project_raw_dir; confirm against
# download_methylation.
download_methylation(args['project'], preproc_cfg, args['verbose'])

In [None]:
# Clean the contents of the project's raw data directory; the last argument
# toggles verbose output.
# NOTE(review): the exact clean-up performed is defined in
# MethylCDM.data.load_methylation; confirm there before relying on it.
clean_methylation_data(project_raw_dir, args['verbose'])

In [20]:
# -----| Data Preprocessing |-----
# Resolve the metadata directory from the `download` section of the
# preprocessing configuration
metadata_dir = resolve_path(
    preproc_cfg.get('download', {}).get('metadata_dir', ''),
    METADATA_METHYLATION_DIR
)

# The metadata table lives at <metadata_dir>/<project>/<project>_metadata.csv
metadata_filename = f"{args['project']}_metadata.csv"
project_metadata = os.path.join(
    metadata_dir, args['project'], metadata_filename
)
metadata = pd.read_csv(project_metadata)

In [6]:
# Preprocess the CpG matrix, outputting a gene-level matrix (AnnData).
# This is the one-call form of the pipeline; the cells that follow repeat
# the same stages step by step and reassign `gene_matrix`.
gene_matrix = process_methylation(
    args['project'], metadata, preproc_cfg, args['verbose']
)

Beginning methylation preprocessing of project TCGA-GBM
Successfully performed probe quality control
Successfully performed sample quality control
Successfully imputed missing values
Successfully aggregated gene-level beta values
Successfully clipped gene-level beta values
Completed processing data for TCGA project TCGA-GBM


In [7]:
# Resolve the project's raw data directory
raw_data_dir = preproc_cfg.get('download', {}).get('raw_data_dir', '')
raw_data_dir = resolve_path(raw_data_dir, RAW_METHYLATION_DIR)
project_data_dir = os.path.join(raw_data_dir, args['project'])

# Collect the level-3 beta value parquet files located directly inside the
# project directory (the glob is not recursive). Glob order depends on the
# filesystem, so sort the paths to keep the sample order reproducible.
beta_files = sorted(Path(project_data_dir).glob("*.level3betas.parquet"))


In [21]:
# Index the metadata by file name. The guard makes the cell safe to re-run:
# once `file_name` has become the index it is no longer a column, and an
# unconditional set_index would raise a KeyError.
if metadata.index.name != 'file_name':
    metadata = metadata.set_index('file_name')

# Keep the first occurrence of duplicates in the metadata (redundant)
metadata = metadata[~metadata.index.duplicated(keep = 'first')]


In [9]:
# Pick the annotation for the highest coverage array type among the
# platforms listed in the metadata
from MethylCDM.utils.utils import load_annotation

manifests = pd.unique(metadata['platform'])
annotation, array_type = load_annotation(manifests)

In [22]:
# Strip the ".txt" extension from the metadata file names so they can be
# compared against the stems of the beta value files
metadata.index = metadata.index.str.removesuffix(".txt")

# File stems of the samples measured on the selected array type
on_selected_array = metadata['platform'] == array_type
valid_stems = set(metadata.loc[on_selected_array].index)

# Retain only the beta value files belonging to those samples
beta_files = list(
    filter(lambda beta_path: beta_path.stem in valid_stems, beta_files)
)

In [11]:
# Load the selected beta value files into a single CpG-level matrix; the
# gene-level aggregation happens in the next cell.
# NOTE(review): the matrix orientation (probes x samples) is assumed from
# the later use of `gene_matrix.columns` as sample identifiers; confirm.
from MethylCDM.utils.utils import load_cpg_matrix

cpg_matrix = load_cpg_matrix(beta_files)

In [12]:
from MethylCDM.preprocessing.process_methylation import process_array_methylation

# Run the array-level preprocessing on the CpG matrix and order the
# resulting gene-level matrix by its row index
gene_matrix = process_array_methylation(
    cpg_matrix, annotation, preproc_cfg
).sort_index()

Successfully performed probe quality control
Successfully performed sample quality control
Successfully imputed missing values
Successfully aggregated gene-level beta values
Successfully clipped gene-level beta values


In [None]:
import anndata as ad

# Put the surviving samples into one explicit, sorted order and apply it to
# BOTH the matrix and the metadata. Sorting the metadata on its own is not
# enough: assigning `adata.obs` replaces the observation names without
# reordering the rows of X, so any difference in order would silently attach
# the wrong annotations to the samples.
sample_order = gene_matrix.columns.sort_values()
metadata = metadata.loc[sample_order]
sample_matrix = gene_matrix.loc[:, sample_order].T

# Fail loudly if the data rows and their annotations ever disagree
assert sample_matrix.index.equals(metadata.index), \
    "Sample order differs between the gene matrix and the metadata"

# Initialize the gene matrix as an AnnData object (samples x genes)
adata = ad.AnnData(X = sample_matrix)
adata.obs = metadata

# From here on `gene_matrix` refers to the AnnData object
gene_matrix = adata

In [None]:
import anndata

# Resolve the output directory from the `preprocess` section of the
# preprocessing configuration and create it if necessary
preprocess_section = preproc_cfg.get('preprocess', {})
proc_data_dir = resolve_path(
    preprocess_section.get('processed_data_dir', ''),
    PROCESSED_METHYLATION_DIR
)
os.makedirs(proc_data_dir, exist_ok = True)

# One gzip-compressed .h5ad file per project
proc_file = os.path.join(
    proc_data_dir, f"{args['project']}_gene_matrix.h5ad"
)

# Enable writing of nullable string columns before saving
anndata.settings.allow_write_nullable_strings = True
gene_matrix.write_h5ad(proc_file, compression = "gzip")

In [52]:
import anndata as ad

# Sanity check: read back the file written by the previous cell. Using
# `proc_file` instead of a hard-coded absolute path keeps the notebook
# runnable on other machines and for other projects.
ad.read_h5ad(proc_file)

AnnData object with n_obs × n_vars = 153 × 19197
    obs: 'id', 'data_format', 'file_id', 'data_type', 'data_category', 'state', 'experimental_strategy', 'platform', 'case_id', 'submitter_id', 'race', 'gender', 'ethnicity', 'days_to_last_follow_up', 'age_at_diagnosis', 'primary_diagnosis', 'morphology', 'tumor_grade', 'sample_id', 'sample_type'

In [53]:
from pathlib import Path

# Count the entries in the project's raw data directory. The directory comes
# from the configuration-derived `project_raw_dir` rather than a hard-coded
# absolute path, so the check follows the configured project and location.
dir_path = Path(project_raw_dir)
sum(1 for _ in dir_path.iterdir())

450