# In [2]:
import os as os
import io as io

import numpy as np
import pandas as pd

# Root of the local repository checkout; every path below is derived from it.
repo_base = '/home/pebert/work/code/mpggit/statediff'
annotations = os.path.join(repo_base, 'annotation')

# Input: tab-separated summary table parsed by read_remc_table().
remc_table = os.path.join(
    annotations,
    'jul2013.roadmapData.qc - Consolidated_EpigenomeIDs_summary_Table.tsv',
)
# Input: HDF5 store whose 'samples' entry is read by read_remc_table().
cache_file = os.path.join(annotations, 'remc_cache.h5')

# Outputs: cleaned metadata table and the design matrix derived from it.
remc_clean = os.path.join(annotations, 'remc_metadata.tsv')
remc_design = os.path.join(annotations, 'remc_design.tsv')

# Original column header -> normalized column name. Columns of the summary
# table that are not listed here are dropped by read_remc_table().
# NOTE: the trailing space in the 'AGE ...' key is intentional; keys must
# match the header text of the input file exactly.
norm_cols = {
    'Epigenome ID (EID)': 'EID',
    'GROUP': 'group',
    'COLOR': 'hex_color',
    'Epigenome Mnemonic': 'mnemonic',
    'Quality Rating': 'quality_rating',
    'Standardized Epigenome name': 'sample_name',
    'ANATOMY': 'anatomy',
    'TYPE': 'biotype',
    'SEX (Male, Female, Mixed, Unknown)': 'sex',
    'AGE (Post Birth in YEARS/ Fetal in GESTATIONAL WEEKS/CELL LINE CL) ': 'age',
    'Epigenome name (from EDACC Release 9 directory)': 'sample_label',
}

def read_remc_table():
    """Load the summary table, normalize it and keep only cached samples.

    Reads the tab-separated file at ``remc_table``. The first line supplies
    the column names, the next two lines are discarded, and everything after
    that is parsed as data. Columns that are not keys of ``norm_cols`` are
    dropped and the remaining ones are renamed to their normalized names.

    Rows are then restricted to those whose ``EID`` occurs in the ``EID``
    column of the ``'samples'`` table stored in ``cache_file``. A few ``age``
    values are rewritten (missing values become ``'n/a'``), and the result is
    sorted by ``EID``.

    Returns:
        pandas.DataFrame: the cleaned metadata table, sorted by ``EID``.
    """
    with open(remc_table, 'r') as tab:
        header = tab.readline().strip().split('\t')
        # The two lines after the header are skipped and never parsed as
        # data (presumably extra header/annotation rows - confirm against
        # the input file).
        tab.readline()
        tab.readline()
        content = io.StringIO(tab.read())
    df = pd.read_csv(content, sep='\t', header=None, names=header)

    # Keep only the columns we know how to normalize, then rename them.
    to_drop = [c for c in df.columns if c not in norm_cols]
    df = df.drop(to_drop, axis=1)
    df.columns = [norm_cols[c] for c in df.columns]

    with pd.HDFStore(cache_file, 'r') as hdf:
        smp = hdf['samples']

    # Rebind the results instead of mutating in place: the boolean selection
    # yields a derived frame, and in-place replace/sort on it is the pattern
    # that triggers pandas' SettingWithCopyWarning.
    df = df.loc[df['EID'].isin(smp['EID']), :]
    df = df.replace({'age': {'Fetus (GW unknown)': 'fetal',
                             np.nan: 'n/a', '3Y, 34Y': '3Y_34Y'}})
    df = df.sort_values('EID', axis=0)
    return df


def derive_design_matrix(table):
    """Build a 0/1 indicator (design) matrix from the metadata table.

    The result has one row per entry of ``table['EID']`` (used as the index)
    and these columns, in order: ``sex_Male``, ``sex_Female``, ``sex_Other``,
    ``phenotype_normal``, ``phenotype_cancer``, then one ``biotype_<value>``
    column per distinct biotype and one ``anatomy_<value>`` column per
    distinct anatomy (each group in sorted order).

    Any sex other than ``'Male'`` or ``'Female'`` is counted as
    ``sex_Other``. A sample is flagged ``phenotype_cancer`` when its mnemonic
    ends with ``'CNCR'`` and ``phenotype_normal`` otherwise.

    Args:
        table: DataFrame providing the columns ``EID``, ``sex``, ``biotype``,
            ``anatomy`` and ``mnemonic``.

    Returns:
        pandas.DataFrame: indicator matrix created with dtype ``int8``.
    """
    col_names = ['sex_Male', 'sex_Female', 'sex_Other', 'phenotype_normal', 'phenotype_cancer']
    col_names += ['biotype_' + label for label in sorted(table['biotype'].unique())]
    col_names += ['anatomy_' + label for label in sorted(table['anatomy'].unique())]

    zeros = np.zeros((table.shape[0], len(col_names)), dtype=np.int8)
    design = pd.DataFrame(zeros, index=table['EID'], columns=col_names)

    samples = zip(table['EID'], table['sex'], table['biotype'],
                  table['anatomy'], table['mnemonic'])
    for eid, sex, biotype, anatomy, mnemonic in samples:
        sex_col = 'sex_' + sex if sex in ('Male', 'Female') else 'sex_Other'
        biotype_col = 'biotype_' + biotype
        anatomy_col = 'anatomy_' + anatomy
        if mnemonic.endswith('CNCR'):
            phenotype_col = 'phenotype_cancer'
        else:
            phenotype_col = 'phenotype_normal'
        # Exactly one indicator per category is switched on for this sample.
        for col in (sex_col, biotype_col, anatomy_col, phenotype_col):
            design.loc[eid, col] = 1
    return design


# Build the cleaned metadata table and the design matrix derived from it.
md = read_remc_table()
dm = derive_design_matrix(md)

# Write both as tab-separated files: the metadata table without its row
# index, the design matrix with its index written under an 'EID' header.
md.to_csv(remc_clean, sep='\t', index=False)
dm.to_csv(remc_design, sep='\t', index_label='EID')