In [None]:
# ==============================================================================
# Script:           process_methylation.py
# Purpose:          Entry-point to download and preprocess DNA methylation data
#                   for a specified project
# Author:           Sophia Li
# Affiliation:      CCG Lab, Princess Margaret Cancer Center, UHN, UofT
# Date:             11/18/2025
#Okay
# Configurations:   methylation_preproc.yaml
#
# Notes:            Checks to see if data needs to be downloaded by simply 
#                   checking if the raw data folder is empty, does not support
#                   partial downloads
# ==============================================================================

### Environment Initialization

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
import argparse
import os

from MethylCDM.utils.utils import init_environment, load_config, resolve_path
from MethylCDM.data.load_methylation import download_methylation, merge_cohort
from MethylCDM.preprocessing.process_methylation import process_methylation
from MethylCDM.constants import (
    RAW_METHYLATION_DIR, 
    INTERMEDIATE_METHYLATION_DIR,
    PROCESSED_METHYLATION_DIR,
    METADATA_METHYLATION_DIR
)

In [7]:
# Mimic command-line arguments
args = {
    "project": "TCGA-BRCA",
    "config_pipeline": "pipeline.yaml",
    "config_preproc": "methylation_preproc.yaml",
    "verbose": True
}

In [8]:
args['config_pipeline']

'pipeline.yaml'

In [9]:
# Load the relevant configuration files 
pipeline_cfg = load_config(args['config_pipeline'])
preproc_cfg = load_config(args['config_preproc'])

In [10]:
# Initialize the environment for reproducible analysis
init_environment(pipeline_cfg)

### Data Downloading and Loading

download_methylation(args.project, preproc_cfg, args.verbose)

In [11]:
from pathlib import Path
import pandas as pd
from functools import reduce
import subprocess
import requests
import json
import os

from MethylCDM.utils.utils import resolve_path
from MethylCDM.constants import (
    GDC_CLIENT_PATH, 
    RAW_METHYLATION_DIR,
    METADATA_METHYLATION_DIR,
    METADATA_METHYLATION
)

In [12]:
# Mimic function parameters
project = args['project']
config = preproc_cfg

In [13]:
# Fetch all relevant values from the configurations object
download_cfg = config.get('download', {})
raw_data_dir = download_cfg.get('raw_data_dir', '')
metadata_dir = download_cfg.get('metadata_dir', '')

# Resolve directory paths relative to the project root
raw_data_dir = resolve_path(raw_data_dir, RAW_METHYLATION_DIR)
metadata_dir = resolve_path(metadata_dir, METADATA_METHYLATION_DIR)

# Initialize the output directories if necessary
project_data_dir = os.path.join(raw_data_dir, project)
project_metadata_dir = os.path.join(metadata_dir, project)

if raw_data_dir: os.makedirs(project_data_dir, exist_ok = True)
if metadata_dir: os.makedirs(project_metadata_dir, exist_ok = True)

In [None]:
# Query the GDC API for DNA methylation beta values from the projects
query_url = "https://api.gdc.cancer.gov/files"

filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": "cases.project.project_id", 
                                    "value": [project]}},
        {"op": "in", "content": {"field": "files.experimental_strategy", 
                                    "value": "Methylation Array"}},
        {"op": "in", "content": {"field": "files.data_category", 
                                    "value": ["dna methylation"]}},
        {"op": "in", "content": {"field": "files.data_type", 
                                    "value": ["Methylation Beta Value"]}},
        {"op": "in", "content": {"field": "files.access", 
                                    "value": ["open"]}},
    ]
}
data_query = {
    "filters": json.dumps(filters),
    "fields": "file_id,file_name",
    "format": "JSON",
    "size": 10000
}

In [None]:
# Query the GDC API for methylation files, filtered by configurations
response = requests.post(query_url, json = data_query)
data = response.json()
files = data['data']['hits']

In [None]:
# -----| Generate DNA Methylation Data Manifest |-----
manifest_file = os.path.join(project_metadata_dir, 
                                f"{project}_manifest.txt")

with open(manifest_file, "w") as f:
    f.write("id\tfilename\n")
    for file in files:
        f.write(f"{file['file_id']}\t{file['file_name']}\n")

In [None]:
# Require the GDC-client tool to be downloaded 
if not GDC_CLIENT_PATH.exists():
    raise FileNotFoundError("`gdc-client` was not found. Please download"
                            " and place it in the `tools/` directory.")

# Spawn a sub-process to call the `gdc-client` with the manifest
subprocess.run([GDC_CLIENT_PATH, "download", "-m", 
                manifest_file, "-d", project_data_dir], 
                check = True)

##### Clean Methylation Data

In [15]:
import shutil
dir_path = project_data_dir

In [18]:
sample_id

'db68a2ca-9969-4cc6-aa7a-4ab188e28fc6.methylation_array.sesame'

In [17]:
# Recursively find all beta value *.txt files
txt_files = list(Path(dir_path).rglob("*.level3betas.txt"))
parquet_paths = []

for txt_path in txt_files:

    # Build the parquet filename to match the original
    sample_id = txt_path.stem.replace(".level3betas", "")
    parquet_path = dir_path / f"{sample_id}.parquet"

    # Convert the .txt to .parquet
    txt = pd.read_csv(txt_path, sep = "\t", dtype = str)
    txt.to_parquet(parquet_path, index = False)
    parquet_paths.append(parquet_path)

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# Delete all non-parquet files
for item in dir_path.iterdir():
    if item.is_file() and item.suffix != ".parquet":
        item.unlink()
    elif item.isdir():
        shutil.rmtree(item)

In [None]:
# Query for the metadata fields provided in the configurations
metadata_query = {
    "filters": json.dumps(filters),
    "fields": ",".join(METADATA_METHYLATION),
    "format": "JSON",
    "size": 10000
}

response = requests.post(query_url, json = metadata_query)
data = response.json()
metadata = pd.json_normalize(data['data']['hits'])

In [None]:
# -----| Normalize Metadata |-----
metadata = metadata.explode('cases', ignore_index = True)
cases_metadata = pd.json_normalize(metadata['cases'])

metadata = pd.concat([
    metadata.drop(columns = ["cases"]).reset_index(drop = True),
    cases_metadata.reset_index(drop = True)
], axis = 1)

# Expand the diagnoses and samples fields nested in the cases
metadata = metadata.explode('diagnoses').explode('samples')
diagnoses_metadata = pd.json_normalize(metadata['diagnoses'])
samples_metadata = pd.json_normalize(metadata['samples'])

metadata = pd.concat([
    metadata.drop(columns = ['diagnoses', 'samples']).reset_index(drop = True),
    diagnoses_metadata.reset_index(drop = True),
    samples_metadata.reset_index(drop = True)
], axis = 1)

# Clean any remaining prefixed column names
metadata.columns = [col.split('.')[-1] for col in metadata.columns]

In [None]:
# Save the metadata to the designated data folder
metadata_file = os.path.join(project_metadata_dir,
                                f"{project}_metadata.csv")
metadata.to_csv(metadata_file, index = False)

### Merging Cohort

cpg_matrix = merge_cohort(args.project, preproc_cfg)

In [8]:
from pathlib import Path
import pandas as pd
from functools import reduce
import subprocess
import requests
import json
import os

from MethylCDM.utils.utils import resolve_path
from MethylCDM.constants import (
    GDC_CLIENT_PATH, 
    RAW_METHYLATION_DIR,
    METADATA_METHYLATION_DIR,
    METADATA_METHYLATION
)

from MethylCDM.data.load_methylation import load_beta_file

In [9]:
# Mimic parameters
project = args['project']
config = preproc_cfg

In [10]:
# Resolve the project's raw data directory
raw_data_dir = config.get('download', {}).get('raw_data_dir', '')
raw_data_dir = resolve_path(raw_data_dir, RAW_METHYLATION_DIR)
project_data_dir = os.path.join(raw_data_dir, f"{project}")

# Verify the raw data exists and is not empty
if not os.path.isdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was not found at "
                            f"{project_data_dir}.")
if not os.listdir(project_data_dir):
    raise FileNotFoundError(f"Raw data directory was empty at "
                            f"{project_data_dir}.")

In [11]:
# Identify and load all nested beta value .txt files
beta_files = [f for f in Path(project_data_dir).glob("*/*.level3betas.txt")]
beta_values = [load_beta_file(f) for f in beta_files]


: 

: 

In [None]:
beta_files = list(Path(project_data_dir).glob("*/**/*.level3betas.txt"))

#### 3. Preprocess Methylation