In [None]:
# ==============================================================================
# Script:           process_methylation.py
# Purpose:          Entry-point to download and preprocess DNA methylation data
#                   for a specified project
# Author:           Sophia Li
# Affiliation:      CCG Lab, Princess Margaret Cancer Center, UHN, UofT
# Date:             11/18/2025
# Configurations:   methylation_preproc.yaml
#
# Notes:            Decides whether data needs to be downloaded simply by
#                   checking whether the raw data folder is empty; partial
#                   downloads are not supported
# ==============================================================================

### Environment Initialization

In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
#import argparse
#import os

from MethylCDM.utils.utils import (
    init_environment, 
    load_config, 
    build_meta_fields
)
from MethylCDM.data.load_methylation import (
    download_methylation, 
    load_raw_methylation
)

In [32]:
# Stand-in for the arguments the script would receive on the command line
args = dict(
    project = "TCGA-BRCA",
    config_pipeline = "pipeline.yaml",
    config_preproc = "methylation_preproc.yaml",
    verbose = True,
)

In [33]:
# Read the pipeline and preprocessing configuration files, in that order,
# using the file names supplied through `args`
pipeline_cfg, preproc_cfg = (
    load_config(args[cfg_key])
    for cfg_key in ('config_pipeline', 'config_preproc')
)

In [34]:
# Initialize the environment for reproducible analysis, driven by the settings
# loaded from the pipeline configuration file.
# NOTE(review): what `init_environment` actually sets up (seeds, paths,
# logging, ...) is not visible from this notebook; confirm against
# MethylCDM.utils.utils before relying on it.
init_environment(pipeline_cfg)

### Data Downloading and Loading

#### 1. Download Methylation, Manifest, and Metadata

In [35]:
import json
import os
import subprocess
import warnings

import pandas as pd
import requests

from MethylCDM.utils.utils import resolve_path
from MethylCDM.constants import DATA_DIR, GDC_CLIENT_PATH, METHYLATION_METADATA

In [36]:
# Bind the names that the download function would receive as its parameters:
# the project identifier and the preprocessing configuration
project, config = args['project'], preproc_cfg

In [37]:
# Pull the download settings out of the preprocessing configuration
download_cfg = config.get('download', {})

# Resolve each configured directory against DATA_DIR
raw_data_dir = resolve_path(download_cfg.get('raw_data_dir', ''), DATA_DIR)
metadata_dir = resolve_path(download_cfg.get('metadata_dir', ''), DATA_DIR)

# Project-specific output locations inside the resolved directories
project_data_dir = os.path.join(raw_data_dir, f"{project}_raw_methylation")
project_metadata_dir = os.path.join(
    metadata_dir, f"{project}_methylation_metadata"
)

# Create the output directories if necessary.
# NOTE(review): these guards test the *resolved* paths, not the configured
# values; confirm `resolve_path` can return something falsy, otherwise the
# guards never skip directory creation.
if raw_data_dir:
    os.makedirs(project_data_dir, exist_ok = True)
if metadata_dir:
    os.makedirs(project_metadata_dir, exist_ok = True)

In [38]:
# Query the GDC API for DNA methylation beta values from the projects
query_url = "https://api.gdc.cancer.gov/files"

# All clauses are combined with `and`. Each clause uses the `in` operator, so
# every `value` is given as a list of accepted values (the experimental
# strategy was previously a bare string, unlike the other clauses).
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": "cases.project.project_id", 
                                    "value": [project]}},
        {"op": "in", "content": {"field": "files.experimental_strategy", 
                                    "value": ["Methylation Array"]}},
        {"op": "in", "content": {"field": "files.data_category", 
                                    "value": ["dna methylation"]}},
        {"op": "in", "content": {"field": "files.data_type", 
                                    "value": ["Methylation Beta Value"]}},
        {"op": "in", "content": {"field": "files.access", 
                                    "value": ["open"]}},
    ]
}

# Request only the two fields needed to build a download manifest.
# `size` caps the number of hits returned in a single response.
data_query = {
    "filters": json.dumps(filters),
    "fields": "file_id,file_name",
    "format": "JSON",
    "size": 10000
}

In [39]:
# Query the GDC API for methylation files, filtered by configurations.
# The (connect, read) timeout keeps the cell from hanging indefinitely if the
# API stops responding.
response = requests.post(query_url, json = data_query, timeout = (10, 300))

# Surface HTTP errors here rather than as an opaque KeyError/JSON error below
response.raise_for_status()
data = response.json()
files = data['data']['hits']

# The query requests a fixed page size; warn if the API reports more matching
# files than were returned, since the manifest would then be incomplete
total_hits = data['data'].get('pagination', {}).get('total', len(files))
if total_hits > len(files):
    warnings.warn(
        f"GDC reported {total_hits} matching files but only {len(files)} "
        "were returned; increase `size` or paginate the query."
    )

In [40]:
# -----| Generate DNA Methylation Data Manifest |-----

# Write to local device rather than USB, avoid metadata corruption.
# The staging folder is derived from the current user's home directory so the
# notebook is not tied to a single machine.
# TODO: once writing to the project drive is safe, write the manifest to
#       os.path.join(project_metadata_dir,
#                    f"{project}_methylation_manifest.txt") instead.
local_staging_dir = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(local_staging_dir, exist_ok = True)

temp_manifest = os.path.join(
    local_staging_dir, f"{project}_methylation_manifest.txt"
)

# Tab-separated manifest: a header row, then one `file_id`/`file_name` row per
# hit returned by the file query
with open(temp_manifest, "w") as f:
    f.write("id\tfilename\n")
    f.writelines(
        f"{hit['file_id']}\t{hit['file_name']}\n" for hit in files
    )

In [None]:
# Require the GDC-client tool to be downloaded; failing here gives a clearer
# message than the error raised when the sub-process cannot be started
if not os.path.exists(GDC_CLIENT_PATH):
    raise FileNotFoundError("`gdc-client` was not found. Please download"
                            " and place it in the `tools/` directory.")

# Write to local device rather than USB, avoid metadata corruption.
# The folder is built from the user's home directory and the selected project
# (it was previously hardcoded to one user and to TCGA-BRCA).
# TODO: once writing to the project drive is safe, download into
#       `project_data_dir` instead.
temp_data = os.path.join(
    os.path.expanduser("~"), "Downloads", f"{project}_raw_methylation"
)
os.makedirs(temp_data, exist_ok = True)

# Spawn a sub-process to call the `gdc-client` with the manifest;
# `check = True` raises CalledProcessError on a non-zero exit status
subprocess.run([GDC_CLIENT_PATH, "download", "-m", 
                temp_manifest, "-d", temp_data], 
                check = True)

In [10]:
# Query for the metadata fields listed in METHYLATION_METADATA, reusing the
# same filters as the file query so both describe the same set of files
metadata_query = {
    "filters": json.dumps(filters),
    "fields": ",".join(METHYLATION_METADATA),
    "format": "JSON",
    "size": 10000
}

# The (connect, read) timeout keeps the cell from hanging indefinitely if the
# API stops responding
response = requests.post(query_url, json = metadata_query, timeout = (10, 300))

# Surface HTTP errors here rather than as an opaque KeyError/JSON error later
response.raise_for_status()
data = response.json()

# Warn if the API reports more matching records than fit in one response
returned_hits = len(data['data']['hits'])
total_hits = data['data'].get('pagination', {}).get('total', returned_hits)
if total_hits > returned_hits:
    warnings.warn(
        f"GDC reported {total_hits} matching files but only {returned_hits} "
        "metadata records were returned; increase `size` or paginate."
    )

In [26]:
# -----| Normalize Metadata |-----

# One row per API hit; nested dictionaries become dotted column names
metadata = pd.json_normalize(data['data']['hits'])

# Expand the cases field: one row per entry in each hit's `cases` list
metadata = metadata.explode('cases', ignore_index = True)
cases_metadata = pd.json_normalize(metadata['cases'])

metadata = pd.concat([
    metadata.drop(columns = ["cases"]).reset_index(drop = True),
    cases_metadata.reset_index(drop = True)
], axis = 1)

# Expand the diagnoses and samples fields nested in the cases. Only fields
# that are present are expanded, so a metadata field list that omits one of
# them no longer raises a KeyError.
nested_fields = [
    field for field in ('diagnoses', 'samples') if field in metadata.columns
]
for field in nested_fields:
    metadata = metadata.explode(field, ignore_index = True)

# Normalize each nested field only after every explode has run, so the
# resulting frames line up row-for-row with `metadata`
nested_metadata = [
    pd.json_normalize(metadata[field]).reset_index(drop = True)
    for field in nested_fields
]

metadata = pd.concat([
    metadata.drop(columns = nested_fields).reset_index(drop = True),
    *nested_metadata
], axis = 1)

# Clean any remaining prefixed column names (keep the text after the last dot)
metadata.columns = [col.split('.')[-1] for col in metadata.columns]

# Different nesting levels can share a field name, which leaves ambiguous
# duplicate columns once prefixes are stripped; report them instead of letting
# them pass silently into the saved table
duplicate_columns = (
    metadata.columns[metadata.columns.duplicated()].unique().tolist()
)
if duplicate_columns:
    warnings.warn(
        "Metadata contains duplicate column names after prefix stripping: "
        f"{duplicate_columns}"
    )

In [28]:
# Save the normalized metadata table as CSV.
# Write to local device rather than USB, avoid metadata corruption.
# The path is derived from the current user's home directory so the notebook
# is not tied to a single machine.
# TODO: once writing to the project drive is safe, save to
#       os.path.join(project_metadata_dir,
#                    f"{project}_methylation_metadata.csv") instead.
temp_metadata = os.path.join(
    os.path.expanduser("~"), "Downloads",
    f"{project}_methylation_metadata.csv"
)
os.makedirs(os.path.dirname(temp_metadata), exist_ok = True)
metadata.to_csv(temp_metadata, index = False)

#### 2. Load Raw Methylation

#### 3. Preprocess Methylation