In [None]:
# ==============================================================================
# Script:           process_methylation.py
# Purpose:          Entry-point to download and preprocess DNA methylation data
#                   for a specified project
# Author:           Sophia Li
# Affiliation:      CCG Lab, Princess Margaret Cancer Center, UHN, UofT
# Date:             11/18/2025
#
# Configurations:   methylation_preproc.yaml
#
# Notes:            Decides whether data needs to be downloaded simply by
#                   checking whether the raw data folder is empty; partial
#                   downloads are not supported
# ==============================================================================

### Environment Initialization

In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs (edits to the package take effect without
# restarting the kernel)
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import argparse
import os

from MethylCDM.utils.utils import init_environment, load_config, resolve_path
from MethylCDM.data.load_methylation import download_methylation, merge_cohort
from MethylCDM.preprocessing.process_methylation import process_methylation
from MethylCDM.constants import (
    RAW_METHYLATION_DIR, 
    PROCESSED_METHYLATION_DIR,
    METADATA_METHYLATION_DIR
)

In [3]:
# Stand-in for the command-line arguments the entry-point script would parse
args = dict(
    project = "TCGA-BRCA",
    config_pipeline = "pipeline.yaml",
    config_preproc = "methylation_preproc.yaml",
    verbose = True,
)

In [35]:
# Read the pipeline and preprocessing configuration files named in `args`,
# in that order
pipeline_cfg, preproc_cfg = (
    load_config(args[cfg_key])
    for cfg_key in ("config_pipeline", "config_preproc")
)

In [36]:
# Initialize the environment for reproducible analysis, driven by the
# pipeline-level configuration
# NOTE(review): what exactly is initialized (seeds, paths, ...) is defined in
# MethylCDM.utils.utils.init_environment — confirm there
init_environment(pipeline_cfg)

### Data Downloading and Loading

The cells below step through `download_methylation(args['project'], preproc_cfg, args['verbose'])`.

In [6]:
from pathlib import Path
import pandas as pd
from functools import reduce
import subprocess
import requests
import json
import os

from MethylCDM.utils.utils import resolve_path
from MethylCDM.constants import (
    GDC_CLIENT_PATH, 
    RAW_METHYLATION_DIR,
    METADATA_METHYLATION_DIR,
    METADATA_METHYLATION
)

In [37]:
# Stand-ins for the function parameters used by the cells that follow
project, config = args['project'], preproc_cfg

In [38]:
# Pull the download settings out of the configuration (empty dict if absent)
download_cfg = config.get('download', {})

# Resolve the configured directories (the original note says: relative to the
# project root), falling back to '' when a key is missing from the config
raw_data_dir = resolve_path(
    download_cfg.get('raw_data_dir', ''), RAW_METHYLATION_DIR
)
metadata_dir = resolve_path(
    download_cfg.get('metadata_dir', ''), METADATA_METHYLATION_DIR
)

# Per-project sub-directories for the raw data and its metadata
project_data_dir = os.path.join(raw_data_dir, project)
project_metadata_dir = os.path.join(metadata_dir, project)

# Create each project directory only when its parent resolved to a truthy path
for parent_dir, project_dir in (
    (raw_data_dir, project_data_dir),
    (metadata_dir, project_metadata_dir),
):
    if parent_dir:
        os.makedirs(project_dir, exist_ok = True)

In [None]:
# Query the GDC API for DNA methylation beta values from the project
query_url = "https://api.gdc.cancer.gov/files"

# All clauses are AND-ed together. Each uses the `in` operator, whose `value`
# is given as a list of accepted values (scalars are wrapped so every clause
# has the same shape). The project comes from `project`, the same name the
# manifest and metadata cells use.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": "cases.project.project_id",
                                    "value": [project]}},
        {"op": "in", "content": {"field": "files.experimental_strategy",
                                    "value": ["Methylation Array"]}},
        {"op": "in", "content": {"field": "files.data_category",
                                    "value": ["dna methylation"]}},
        {"op": "in", "content": {"field": "files.data_type",
                                    "value": ["Methylation Beta Value"]}},
        {"op": "in", "content": {"field": "files.access",
                                    "value": ["open"]}},
    ]
}

# Only the file id and name are requested; they are all the manifest needs.
# `size` caps the number of hits returned — no further pages are fetched.
data_query = {
    "filters": json.dumps(filters),
    "fields": "file_id,file_name",
    "format": "JSON",
    "size": 10000
}

In [None]:
# Query the GDC API for methylation files, filtered by configurations.
# The timeout keeps an unresponsive server from hanging the notebook, and
# `raise_for_status` surfaces HTTP errors here rather than as a confusing
# KeyError / JSON decoding error on the lines below.
response = requests.post(query_url, json = data_query, timeout = 120)
response.raise_for_status()
data = response.json()
files = data['data']['hits']

In [None]:
# -----| Generate DNA Methylation Data Manifest |-----
# Tab-separated manifest: a header row, then one `id<TAB>filename` row per hit
manifest_file = os.path.join(project_metadata_dir, f"{project}_manifest.txt")

with open(manifest_file, "w") as manifest:
    manifest.write("id\tfilename\n")
    manifest.writelines(
        f"{hit['file_id']}\t{hit['file_name']}\n" for hit in files
    )

In [None]:
# The download depends on the external `gdc-client` binary being present
if not GDC_CLIENT_PATH.exists():
    raise FileNotFoundError("`gdc-client` was not found. Please download"
                            " and place it in the `tools/` directory.")

# Hand the manifest to `gdc-client` in a sub-process; `check = True` raises
# CalledProcessError when the tool exits with a non-zero status
download_cmd = [
    GDC_CLIENT_PATH, "download",
    "-m", manifest_file,
    "-d", project_data_dir,
]
subprocess.run(download_cmd, check = True)

##### Clean Methylation Data

In [None]:
import shutil

# Locate the project's raw data directory from the preprocessing configuration
raw_data_dir = resolve_path(
    preproc_cfg.get('download', {}).get('raw_data_dir', ''),
    RAW_METHYLATION_DIR
)
project_raw_dir = os.path.join(raw_data_dir, args['project'])

# Directory whose downloaded files are converted and tidied by the next cells
dir_path = project_raw_dir

In [None]:
# Collect every beta value *.txt file at any depth below the project directory
txt_files = [*Path(dir_path).rglob("*.level3betas.txt")]

# Receives the path of each parquet file written during conversion
parquet_paths = []

In [None]:
for txt_path in txt_files:

    # Name the parquet after the source file, swapping only the extension;
    # it is written to the top of the project directory, not next to the .txt
    parquet_name = f"{txt_path.stem}.parquet"
    parquet_path = os.path.join(dir_path, parquet_name)

    # Re-encode the tab-separated table as parquet with float32 beta values
    beta_table = pd.read_csv(
        txt_path,
        sep = "\t",
        dtype = {"beta_value": "float32"}
    )
    beta_table.to_parquet(parquet_path, index = False)
    parquet_paths.append(parquet_path)

    # Remove the source .txt once its parquet copy has been written
    txt_path.unlink()

In [None]:
# Remove every sub-directory (and its contents) left in the project folder;
# files sitting directly in the folder are kept
nested_dirs = [entry for entry in Path(dir_path).iterdir() if entry.is_dir()]
for nested_dir in nested_dirs:
    shutil.rmtree(nested_dir)

##### Download Metadata

In [None]:
# Query for the metadata fields listed in `METADATA_METHYLATION`, reusing the
# same file filters as the data query
metadata_query = {
    "filters": json.dumps(filters),
    "fields": ",".join(METADATA_METHYLATION),
    "format": "JSON",
    "size": 10000
}

# Bound the wait with a timeout and surface HTTP errors before parsing, so a
# failed request does not show up later as a KeyError / JSON decoding error
response = requests.post(query_url, json = metadata_query, timeout = 120)
response.raise_for_status()
data = response.json()

# One row per hit, with nested dictionaries flattened into dotted columns
metadata = pd.json_normalize(data['data']['hits'])

In [None]:
# -----| Normalize Metadata |-----
def _expand_nested(frame, columns):
    """
    Explode each list-valued column in `columns` (in order) into one row per
    element, flatten the resulting records with `pd.json_normalize`, and
    return the remaining columns followed by the flattened ones, all on a
    fresh 0..n-1 index.
    """
    exploded = frame
    for column in columns:
        exploded = exploded.explode(column)
    exploded = exploded.reset_index(drop = True)

    pieces = [exploded.drop(columns = list(columns))]
    pieces.extend(
        pd.json_normalize(exploded[column]).reset_index(drop = True)
        for column in columns
    )
    return pd.concat(pieces, axis = 1)

# Expand the cases first, then the diagnoses and samples nested inside them
metadata = _expand_nested(metadata, ['cases'])
metadata = _expand_nested(metadata, ['diagnoses', 'samples'])

# Keep only the last dotted component of any remaining prefixed column names
metadata.columns = [name.split('.')[-1] for name in metadata.columns]

In [None]:
# Write the flattened metadata into the project's metadata folder as
# `<project>_metadata.csv`, without the row index
metadata_file = os.path.join(project_metadata_dir, f"{project}_metadata.csv")
metadata.to_csv(path_or_buf = metadata_file, index = False)

### Merging Cohort

The cells below step through `cpg_matrix = merge_cohort(args['project'], preproc_cfg)`.

In [None]:
from pathlib import Path
import pandas as pd
from functools import reduce
import subprocess
import requests
import json
import os

from MethylCDM.utils.utils import resolve_path
from MethylCDM.constants import (
    GDC_CLIENT_PATH, 
    RAW_METHYLATION_DIR,
    METADATA_METHYLATION_DIR,
    METADATA_METHYLATION
)

ImportError: cannot import name 'load_beta_file' from 'MethylCDM.data.load_methylation' (/Volumes/FBI_Drive/MethylCDM-project/src/MethylCDM/data/load_methylation.py)

In [24]:
# Stand-ins for the function parameters used by the cells that follow
project, config = args['project'], preproc_cfg

In [None]:
# Locate the project's raw data directory from the configuration
raw_data_dir = resolve_path(
    config.get('download', {}).get('raw_data_dir', ''),
    RAW_METHYLATION_DIR
)
project_data_dir = os.path.join(raw_data_dir, str(project))

# Fail early when the directory is missing, or exists but holds nothing
if not os.path.isdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was not found at "
                            f"{project_data_dir}.")
elif not os.listdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was empty at "
                            f"{project_data_dir}.")

In [None]:
# `load_beta_file` is provided by `MethylCDM.utils.utils`; import it here so
# this cell runs on a fresh kernel instead of relying on a later cell
from MethylCDM.utils.utils import load_beta_file

# Identify the beta value .parquet files directly inside the project directory
# (the glob is not recursive) and load each one
beta_files = list(Path(project_data_dir).glob("*.level3betas.parquet"))
beta_values = [load_beta_file(f) for f in beta_files]

In [None]:
# Nothing can be intersected when no beta value file was loaded; say so
# explicitly instead of failing with a bare IndexError on `beta_values[0]`
if not beta_values:
    raise ValueError("No beta value files were loaded; cannot identify "
                     "common probes.")

# Identify the probes shared by every loaded table (index intersection)
common_probes = beta_values[0].index
for df in beta_values[1:]:
    common_probes = common_probes.intersection(df.index)

# Restrict each table to the shared probes and store the values as float32
beta_values = [df.loc[common_probes].astype("float32")
                for df in beta_values]

In [None]:
# Join the per-sample tables side by side into a single matrix; from here on
# `beta_values` is a DataFrame rather than a list
beta_values = pd.concat(objs = beta_values, axis = "columns")

In [None]:
# Order the rows (CpGs) by their index labels
beta_values = beta_values.sort_index(axis = "index")

In [None]:
# `INTERMEDIATE_METHYLATION_DIR` is not imported by any earlier cell, so bring
# it into scope here to avoid a NameError
# NOTE(review): assumes the constant is defined in `MethylCDM.constants`
# alongside the other *_METHYLATION_DIR constants — confirm
from MethylCDM.constants import INTERMEDIATE_METHYLATION_DIR

# Resolve the intermediate data directory from the `preprocess` section of the
# configuration and make sure the project's sub-directory exists
inter_data_dir = (preproc_cfg.get('preprocess', {})
                             .get('intermediate_data_dir', ''))
inter_data_dir = resolve_path(inter_data_dir, INTERMEDIATE_METHYLATION_DIR)
project_inter_dir = os.path.join(inter_data_dir, args['project'])
Path(project_inter_dir).mkdir(parents = True, exist_ok = True)

In [None]:
# Persist the merged CpG matrix in the project's intermediate directory
cpg_matrix_name = f"{args['project']}_cpg_matrix_raw.parquet"
inter_file = os.path.join(project_inter_dir, cpg_matrix_name)
beta_values.to_parquet(inter_file)

#### 3. Preprocess Methylation

In [11]:
from MethylCDM.utils.utils import load_annotations, load_beta_file
from MethylCDM.constants import (
    CONFIG_DIR,
    ANNOTATION_27K,
    ANNOTATION_450K,
    ANNOTATION_EPIC
)

from MethylCDM.preprocessing.process_methylation import (
    sample_qc,
    probe_qc,
    impute_missing,
    aggregate_genes,
    clip_beta_values
)

In [12]:
# Read the project's saved metadata table from the metadata directory
metadata_dir = resolve_path(
    preproc_cfg.get('download', {}).get('metadata_dir', ''),
    METADATA_METHYLATION_DIR
)
project_metadata = os.path.join(
    metadata_dir,
    args['project'],
    f"{args['project']}_metadata.csv"
)
metadata = pd.read_csv(project_metadata)

In [127]:
from pathlib import Path
import pandas as pd
import numpy as np
import anndata as ad
from collections import defaultdict
import os
from MethylCDM.utils.utils import load_annotations, resolve_path, load_beta_file
from MethylCDM.constants import RAW_METHYLATION_DIR
from MethylCDM.preprocessing.process_methylation import (
    process_array_methylation
)

In [14]:
# Locate the project's raw data directory from the configuration
raw_data_dir = resolve_path(
    config.get('download', {}).get('raw_data_dir', ''),
    RAW_METHYLATION_DIR
)
project_data_dir = os.path.join(raw_data_dir, str(project))

# Fail early when the directory is missing, or exists but holds nothing
if not os.path.isdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was not found at "
                            f"{project_data_dir}.")
elif not os.listdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was empty at "
                            f"{project_data_dir}.")

# List the beta value .parquet files directly inside the project directory
# (paths only; nothing is loaded here)
beta_files = list(Path(project_data_dir).glob("*.level3betas.parquet"))

In [15]:
# Index the metadata by file name and retain only the first row for each
# file name (later repeats are redundant)
metadata = metadata.set_index('file_name')
is_repeat = metadata.index.duplicated(keep = 'first')
metadata = metadata.loc[~is_repeat]

In [16]:
# Group the beta value files by the array platform recorded in the metadata
array_groups = defaultdict(list)

for beta_path in beta_files:
    # The metadata is keyed by file names ending in .txt, so rebuild that name
    source_name = f"{beta_path.stem}.txt"
    if source_name not in metadata.index:
        # No metadata row for this file: it is left out of every group
        continue
    array_type = metadata.loc[source_name, 'platform']
    array_groups[array_type].append(beta_path)

In [132]:
# Hard-coded array platform to process in the cells below; the value is used
# as the key into `array_groups` and is passed to `load_annotations`
array_type = "Illumina Human Methylation 450"

In [133]:
# Load the beta values of every file grouped under the selected array type
# and join them column-wise into one matrix
array_frames = [load_beta_file(path) for path in array_groups[array_type]]
cpg_matrix = pd.concat(array_frames, axis = 1)

# Fetch the annotation associated with the selected array type
annotation = load_annotations(array_type)

: 

: 

In [None]:
# Display the dimensions of the concatenated CpG matrix as a sanity check
cpg_matrix.shape

In [130]:
# Run the array-level preprocessing on the CpG matrix with its annotation and
# the preprocessing configuration
# NOTE(review): the variable name suggests a gene-level matrix is returned —
# confirm against MethylCDM.preprocessing.process_methylation
gene_matrix = process_array_methylation(cpg_matrix, annotation, config)