# Batch Correction for TCGA DNA Methylation Data
In this notebook, I want to remove batch effects from the previously downloaded DNA methylation data.
The obtained data was already preprocessed such that we have two sample matrices $S_{tumor} \in \mathbb{R}^{N \times M_1}$ and $S_{normal} \in \mathbb{R}^{N \times M_2}$ with $M_1$ and $M_2$ being the number of samples for tumor and normal tissues across all cancer types and $N$ being the number of genes.

**That is, we already have computed the average promoter DNA methylation across all measured CpG sites per gene.**

Next, we want to remove the batch effects using *ComBat*. As batches, we use the plate IDs as suggested in multiple articles.

The workflow of that notebook is as follows:
1. Load the big sample matrices for tumor and normal
2. Split them into smaller gene-sample matrices for each cancer type and write them to disk
3. Call a R script which does the batch correction with ComBat

The results from batch correction are then read by another script and the final feature matrix is computed there.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
plt.rc('font', family='Times New Roman')
import h5py
import seaborn as sns
import os

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

%matplotlib inline

  from ._conv import register_converters as _register_converters


## 1. Load Tumor and Normal Sample Matrices

In [2]:
tumor_samples = pd.read_csv('../../data/pancancer/TCGA/methylation/download_new/tumor_sample_matrix.tsv.gz',
                               compression='gzip',
                               sep='\t')
# rename first column to 'Symbol'
tumor_samples.columns = ['Symbol'] + tumor_samples.columns[1:].tolist()
# get rid of all the support columns
tumor_betaval_cols = ['Symbol'] + [i for i in tumor_samples.columns[1:] if "mean_beta_value_promoter" in i]
tumor_samples = tumor_samples[tumor_betaval_cols]
# put the symbol as index
tumor_samples.set_index('Symbol', inplace=True)
# get cancer type, transform and add it as column
ctype_tumor = [i.split('|')[1].upper() for i in tumor_samples.columns]
tumor_samples_t = tumor_samples.T
tumor_samples_t['ctype'] = ctype_tumor
tumor_samples.head()

Unnamed: 0_level_0,TCGA-F2-6880-01A-11D-2157-05|paad|mean_beta_value_promoter,TCGA-2E-A9G8-01A-11D-A409-05|ucec|mean_beta_value_promoter,TCGA-DQ-7588-01A-11D-2079-05|hnsc|mean_beta_value_promoter,TCGA-78-7166-01A-12D-2064-05|luad|mean_beta_value_promoter,TCGA-CQ-7068-01A-11D-2079-05|hnsc|mean_beta_value_promoter,TCGA-P3-A6T4-01A-11D-A34K-05|hnsc|mean_beta_value_promoter,TCGA-CX-7082-01A-11D-2014-05|hnsc|mean_beta_value_promoter,TCGA-55-8621-01A-11D-2398-05|luad|mean_beta_value_promoter,TCGA-BA-4077-01B-01D-1433-05|hnsc|mean_beta_value_promoter,TCGA-44-7671-01A-11D-2064-05|luad|mean_beta_value_promoter,...,TCGA-J4-AATZ-01A-11D-A41L-05|prad|mean_beta_value_promoter,TCGA-ET-A39N-01A-11D-A19K-05|thca|mean_beta_value_promoter,TCGA-5L-AAT0-01A-12D-A41Q-05|brca|mean_beta_value_promoter,TCGA-G2-A3VY-01A-11D-A231-05|blca|mean_beta_value_promoter,TCGA-G9-6338-01A-12D-1963-05|prad|mean_beta_value_promoter,TCGA-BS-A0V4-01A-11D-A14H-05|ucec|mean_beta_value_promoter,TCGA-B0-4819-01A-01D-1275-05|kirc|mean_beta_value_promoter,TCGA-06-0210-01A-01D-A45W-05|gbm|mean_beta_value_promoter,TCGA-44-2665-01A-01D-A276-05|luad|mean_beta_value_promoter,TCGA-D1-A17K-01A-11D-A12K-05|ucec|mean_beta_value_promoter
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.443866,0.320773,0.72326,0.531928,0.908139,0.780224,0.405876,0.717752,0.73637,0.600592,...,0.557431,0.80208,0.680144,0.212928,0.735104,0.46706,0.709056,0.739527,0.628191,0.693468
A1BG-AS1,0.110391,0.025817,0.050617,0.073089,0.116924,0.036841,0.216182,0.100127,0.104327,0.077859,...,0.091708,0.049971,0.041602,0.084231,0.197002,0.211159,0.098477,0.033417,0.059654,0.045232
A1CF,0.391441,0.250301,0.462136,0.567162,0.695086,0.54352,0.409346,0.785829,0.622589,0.6632,...,0.359442,0.891064,0.67527,0.36313,0.436892,0.403012,0.715297,0.700912,0.73437,0.234995
A2M,0.810426,0.464732,0.529455,0.606023,0.491957,0.59508,0.445457,0.499083,0.55793,0.581326,...,0.29514,0.534252,0.463106,0.73227,0.574303,0.522476,0.534992,0.205325,0.434673,0.445305
A2ML1,0.703449,0.297757,0.311103,0.529984,0.415163,0.499372,0.320599,0.727881,0.39772,0.674001,...,0.332043,0.778914,0.792049,0.192367,0.34738,0.310234,0.803575,0.751523,0.708201,0.415714


In [3]:
normal_samples = pd.read_csv('../../data/pancancer/TCGA/methylation/download_new/normal_sample_matrix.tsv.gz',
                             compression='gzip',
                             sep='\t')
# put symbol in first column name
normal_samples.columns = ['Symbol'] + normal_samples.columns[1:].tolist()
# get rid of all the support columns
normal_betaval_cols = ['Symbol'] + [i for i in normal_samples.columns[1:] if "mean_beta_value_promoter" in i]
normal_samples = normal_samples[normal_betaval_cols]
# put the symbol as index
normal_samples.set_index('Symbol', inplace=True)
# get cancer type, transform and add it as column
ctype_normal = [i.split('|')[1].upper() for i in normal_samples.columns]
normal_samples_t = normal_samples.T
normal_samples_t['ctype'] = ctype_normal
normal_samples_t.head()

Symbol,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,yR211F11.2,ctype
TCGA-22-5471-11A-01D-1633-05|lusc|mean_beta_value_promoter,0.624236,0.044528,0.838009,0.59834,0.761133,0.69398,0.178805,0.06379,0.784948,0.898372,...,0.068541,0.187936,0.076922,0.291098,0.060295,0.047897,0.075362,0.033015,0.919028,LUSC
TCGA-GC-A3WC-11A-11D-A231-05|blca|mean_beta_value_promoter,0.796221,0.060968,0.853203,0.426811,0.751024,0.773104,0.205047,0.068452,0.889046,0.88944,...,0.490259,0.453815,0.110322,0.33883,0.072566,0.056229,0.086861,0.036836,0.925634,BLCA
TCGA-AA-3713-11A-01D-1721-05|coad|mean_beta_value_promoter,0.703644,0.061137,0.627578,0.530384,0.61338,0.55186,0.234964,0.047511,0.865786,0.905003,...,0.130354,0.15429,0.082387,0.404636,0.054379,0.041391,0.070617,0.032585,0.922868,COAD
TCGA-CU-A0YN-11A-11D-A10W-05|blca|mean_beta_value_promoter,0.745175,0.051093,0.810508,0.222311,0.834544,0.742291,0.181212,0.056555,0.837956,0.837963,...,0.095871,0.245532,0.088191,0.304986,0.061216,0.056205,0.083569,0.043757,0.882693,BLCA
TCGA-38-4631-11A-01D-1756-05|luad|mean_beta_value_promoter,0.684194,0.06542,0.662524,0.52289,0.730755,0.685777,0.204331,0.091474,0.683887,0.823476,...,0.47471,0.45091,0.117562,0.35038,0.085787,0.063076,0.074156,0.067302,0.876185,LUAD


## 2. Split per Cancer Type and write small matrices to disk

In [14]:
cols_of_interest = [i for i in tumor_samples.columns[1:] if i.split('|')[1] == 'blca']
tumor_samples[cols_of_interest].shape

(28352, 419)

In [56]:
base_dir = '../../data/pancancer/TCGA/methylation/gene_sample_matrices_cancertype'

processed_cancertypes = []
for ctype in tumor_samples_t.ctype.unique():
    # tumor samples
    cols_of_interest = [i for i in tumor_samples.columns if i.split('|')[1].upper() == ctype]
    tumor_samples_ctype = tumor_samples[cols_of_interest]
    
    # normal samples
    cols_of_interest = [i for i in normal_samples.columns if i.split('|')[1].upper() == ctype]
    normal_samples_ctype = normal_samples[cols_of_interest]
    
    # don't process the cancer type when there are no samples
    if tumor_samples_ctype.shape[1] <= 1 or normal_samples_ctype.shape[1] <= 1:
        print ("Warning: No normal or tumor data for {}... Not processing it".format(ctype))
        continue

    # directory
    ctype_dir = os.path.join(base_dir, ctype)
    if not os.path.isdir(ctype_dir):
        os.mkdir(ctype_dir)

    # write them to disk
    tumor_samples_ctype.dropna(axis=0).to_csv(os.path.join(ctype_dir, 'tumor_samples.tsv'), sep='\t')
    normal_samples_ctype.dropna(axis=0).to_csv(os.path.join(ctype_dir, 'normal_samples.tsv'), sep='\t')
    
    # pheno data
    pheno_t = pd.DataFrame(tumor_samples_ctype.columns, columns=['Name'])
    pheno_t['index'] = np.arange(pheno_t.shape[0])
    pheno_t['cancer'] = 'tumor'
    pheno_t['batch'] = [i[0].split('-')[5] for i in pheno_t.Name.str.split('|')]
    pheno_t.set_index('Name', inplace=True)
    pheno_t.to_csv(os.path.join(ctype_dir, 'pheno_tumor.tsv'), sep='\t')

    pheno_n = pd.DataFrame(normal_samples_ctype.columns, columns=['Name'])
    pheno_n['index'] = np.arange(pheno_n.shape[0])
    pheno_n['cancer'] = 'normal'
    pheno_n['batch'] = [i[0].split('-')[5] for i in pheno_n.Name.str.split('|')]
    pheno_n.set_index('Name', inplace=True)
    pheno_n.to_csv(os.path.join(ctype_dir, 'pheno_normal.tsv'), sep='\t')

    print ("Wrote matrices for cancer type {}".format(ctype))
    processed_cancertypes.append(ctype)

Wrote matrices for cancer type PAAD
Wrote matrices for cancer type UCEC
Wrote matrices for cancer type HNSC
Wrote matrices for cancer type LUAD
Wrote matrices for cancer type KIRP
Wrote matrices for cancer type KIRC
Wrote matrices for cancer type PRAD
Wrote matrices for cancer type COAD
Wrote matrices for cancer type GBM
Wrote matrices for cancer type LUSC
Wrote matrices for cancer type CESC
Wrote matrices for cancer type SARC
Wrote matrices for cancer type STAD
Wrote matrices for cancer type BRCA
Wrote matrices for cancer type BLCA
Wrote matrices for cancer type THCA
Wrote matrices for cancer type LIHC
Wrote matrices for cancer type SKCM
Wrote matrices for cancer type READ
Wrote matrices for cancer type ESCA
Wrote matrices for cancer type OV


## 3. Do batch correction

In [59]:
import subprocess

base_path = '../../data/pancancer/TCGA/methylation/gene_sample_matrices_cancertype/{}'
call = 'Rscript batch_correction.R {} {} {}'
for ctype in processed_cancertypes:
    ctype_dir = base_path.format(ctype)
    tumor_sample_path = os.path.join(ctype_dir, 'tumor_samples.tsv')
    tumor_pheno_path = os.path.join(ctype_dir, 'pheno_tumor.tsv')
    tumor_out_path = os.path.join(ctype_dir, 'tumor_samples.adjusted.tsv')
    subprocess.call(call.format(tumor_sample_path, tumor_pheno_path, tumor_out_path), shell=True)

    normal_sample_path = os.path.join(ctype_dir, 'normal_samples.tsv')
    normal_pheno_path = os.path.join(ctype_dir, 'pheno_normal.tsv')
    normal_out_path = os.path.join(ctype_dir, 'normal_samples.adjusted.tsv')
    subprocess.call(call.format(normal_sample_path, normal_pheno_path, normal_out_path), shell=True)
    print ("Processed {}".format(ctype))

Processed PAAD
Processed UCEC
Processed HNSC
Processed LUAD
Processed KIRP
Processed KIRC
Processed PRAD
Processed COAD
Processed GBM
Processed LUSC
Processed CESC
Processed SARC
Processed STAD
Processed BRCA
Processed BLCA
Processed THCA
Processed LIHC
Processed SKCM
Processed READ
Processed ESCA


## 4. Plot a PCA to see how well the batches cluster

In [60]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
emb = TSNE(n_components=2).fit_transform(tumor_samples_ctype.dropna(axis=0).T)
emb.shape

(10, 2)

In [None]:
fig = plt.figure(figsize=(20, 10))
cmap = pd.DataFrame(pd.Series([i[0].split('-')[5] for i in tumor_samples_ctype.columns[1:].str.split('|')]).unique(), columns=['ID'])
cmap['number'] = np.arange(cmap.shape[0])
cmap.set_index('ID', inplace=True)
codes = pd.DataFrame([i[0].split('-')[5] for i in tumor_samples_ctype.columns.str.split('|')], columns=['ID'])
colors = codes.join(mapping, on='ID').number
plt.scatter(emb[:, 0], emb[:, 1], c=colors)

In [62]:
adjusted_tumor = pd.read_csv('../../data/pancancer/TCGA/methylation/gene_sample_matrices_cancertype/PAAD/tumor_samples.adjusted.tsv', sep='\t')
fig = plt.figure(figsize=(20, 10))
emb_new = TSNE(n_components=2).fit_transform(adjusted_tumor.T)
plt.scatter(emb_new[:, 0], emb_new[:, 1], c=colors)

NameError: name 'colors' is not defined

<Figure size 1440x720 with 0 Axes>