## Preprocess the DNA methylation (epigenomic) dataset

In [1]:
import pandas as pd
import numpy as np
import os

We first need the path to our folder containing case-organized data and the destination for storing the processed epigenomic data.

In [2]:
# Folder of case-organized data; passed to create_epi_dataset below.
# NOTE(review): the leading "..." looks like a placeholder for a local root directory - confirm and replace before running.
ORGANIZED_BY_CASE_PATH = ".../TCGA/data_by_cases"
# CSV file that the processed epigenomic matrix is written to.
DESTINATION_DATA_PATH = ".../TCGA/data_processed/PRCSD_epigenomic_data.csv"

We use the following function to read in DNA methylation data. Adapt it to the format of the DNA methylation files used in your project.

In [3]:
def read_methylation(filepath, case_id):
    """
    Read one headerless, tab-separated methylation file into a DataFrame.

    The first column of the file becomes the row index, and the column
    labelled 1 (the second column) is renamed to ``case_id``. Any further
    columns keep their integer labels.
    """
    raw_table = pd.read_csv(filepath, sep="\t", header=None)
    by_feature = raw_table.set_index(0)
    return by_feature.rename(columns={1: case_id})

In [None]:
def create_epi_dataset(by_cases_path):
    """
    Build a case-by-feature DNA methylation matrix from a case-organized folder.

    Every sub-directory of ``by_cases_path`` is treated as one case. For each
    case, the first regular, non-hidden file (in sorted order) inside its
    ``dna_methylation`` sub-folder is read with ``read_methylation``. The
    per-case tables are concatenated side by side, features with a missing
    value in any case are dropped, and the result is transposed so that rows
    are cases and columns are features.

    Args:
        by_cases_path: Path to the folder holding one sub-directory per case.

    Returns:
        A DataFrame whose first column is ``case_id`` and whose remaining
        columns are the methylation features shared by every case.

    Raises:
        ValueError: If no case provides any methylation file.
    """
    epigenomic_data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and the file picked per case) reproducible between runs.
    for case in sorted(os.listdir(by_cases_path)):
        case_dir = os.path.join(by_cases_path, case)
        if not os.path.isdir(case_dir):
            # Stray files next to the case folders are not cases.
            continue

        meth_dir = os.path.join(case_dir, "dna_methylation")
        if os.path.isdir(meth_dir):
            # Ignore hidden files and sub-directories so they are never
            # mistaken for the methylation table.
            meth_files = sorted(
                name
                for name in os.listdir(meth_dir)
                if not name.startswith(".")
                and os.path.isfile(os.path.join(meth_dir, name))
            )
        else:
            # A case without the sub-folder simply has no methylation data.
            meth_files = []

        if not meth_files:
            print(f"{case} has no methylation data")
            continue

        path = os.path.join(meth_dir, meth_files[0])
        epigenomic_data.append(read_methylation(path, case))

    if not epigenomic_data:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No DNA methylation data found under {by_cases_path!r}")

    # Features are rows at this point, so dropna() removes every feature that
    # is missing for at least one case.
    all_epigenomic = pd.concat(epigenomic_data, axis=1).dropna()
    # Transpose so cases are rows, then expose the case id as a regular column.
    # Naming the index first avoids depending on reset_index's default
    # "index" label.
    all_epigenomic = all_epigenomic.transpose()
    return all_epigenomic.rename_axis(index="case_id").reset_index()

In [None]:
# Assemble the case-by-feature methylation table, then write it to the
# destination CSV without the positional row index.
epi_data = create_epi_dataset(by_cases_path=ORGANIZED_BY_CASE_PATH)
epi_data.to_csv(path_or_buf=DESTINATION_DATA_PATH, index=False)

---