# In [2]:

import os as os

import numpy as np
import pandas as pd

# --- Input locations -------------------------------------------------------
# Root of the annotation repository checkout; `ar` is a short alias for it.
annotation_root = ar = '/home/pebert/work/code/mpggit/crossspecies/annotation'

# Project data directory (not referenced by the code visible in this cell).
data_root = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species'

# Project-internal dataset table plus one metadata table per external source.
dataset_file = os.path.join(ar, 'exec', 'datasets.tsv')
_datasrc_dir = os.path.join(ar, 'datasrc')
blueprint_file = os.path.join(_datasrc_dir, 'blueprint', 'blueprint_metadata_ro.tsv')
deep_file = os.path.join(_datasrc_dir, 'deep', '20170516_deep.tsv')
encode_file = os.path.join(_datasrc_dir, 'encode', 'encode_metadata_ro.tsv')
sra_file = os.path.join(_datasrc_dir, 'sra', '20170818_SRA_expression_fastq.tsv')

# --- Output locations ------------------------------------------------------
base_out = '/TL/deep-external01/nobackup/pebert/cloudshare/mpiinf/phd/chapter_projects/crossspecies'
supp_out = os.path.join(base_out, 'supplement', 'supp_tables')

# Destination of the supplementary table produced by this cell.
table_out = os.path.join(supp_out, 'Add-file-2_Table-S1_file-sources.tsv')

def read_project_datasets(fpath):
    """Build the lookup from dataset attributes to project-internal ids.

    The tab-separated table at ``fpath`` is read, rows whose ``comment`` is
    'ignore' or 'dnase' are skipped, and the columns 'multid', 'layout',
    'lifestage' and 'experiment' are discarded. The first remaining column
    is treated as the short dataset id, whatever its header says.

    Returns a dict mapping
    ``(assembly, type, project, biosample, lab)`` to ``(short_id, comment)``,
    all values as strings. A 'histone' comment is reported as 'complete',
    and rows with lab 'AFSCAM' are registered once for 'WTSI' and once for
    'NCMLS'.
    """
    table = pd.read_csv(fpath, sep='\t')
    skipped = table['comment'].isin(['ignore', 'dnase'])
    table = table.loc[~skipped, :].copy()
    table = table.drop(['multid', 'layout', 'lifestage', 'experiment'], axis=1)

    # Positional rename: only the first header is replaced.
    renamed = table.columns.tolist()
    renamed[0] = 'short_id'
    table.columns = renamed
    table = table.astype(str)

    # Labs that are recorded under one name here but looked up under several.
    lab_aliases = {'AFSCAM': ('WTSI', 'NCMLS')}

    lookup = {}
    for rec in table.itertuples():
        status = 'complete' if rec.comment == 'histone' else rec.comment
        for lab in lab_aliases.get(rec.lab, (rec.lab,)):
            key = (rec.assembly, rec.type, rec.project, rec.biosample, lab)
            lookup[key] = (rec.short_id, status)
    return lookup

def read_blueprint_metadata(fpath):
    """Load the BLUEPRINT metadata table and reduce it to the ncd4 histone rows.

    Only rows with ``short_biosample == 'ncd4'`` and an ``Experiment_target``
    of H3K4me3, H3K27ac or H3K36me3 are kept. Constant columns are added
    (assembly 'mm9', biosample 'ncd4', project 'BLUEPRINT', type 'epigenome'),
    ``lab`` is derived from the prefix of ``CenterName`` ('Nijmegen' ->
    'NCMLS', 'THE' -> 'WTSI', anything else 'unknown'), and
    'download_path' / 'Biological_replicate' are renamed to 'url' / 'rep'.

    Returns the resulting DataFrame with every value converted to ``str``.
    """
    unused_columns = [
        'AssemblyName', 'ReadHash', 'InsertDev', 'LibrarySelection', 'RunHash',
        'fastq_path', 'LoadDate', 'spots_with_mates', 'bases', 'Model',
        'LibraryStrategy', 'Platform', 'SampleName', 'ScientificName',
        'InsertSize', 'Tumor', 'SampleType', 'LibrarySource', 'Consent',
        'avgLength', 'ProjectID', 'size_MB', 'spots', 'Sex', 'Submission',
        'ReleaseDate', 'Subject_ID', 'TaxID', 'LibraryLayout',
    ]
    meta = pd.read_csv(fpath, sep='\t')
    meta = meta.drop(unused_columns, axis=1)

    is_ncd4 = meta['short_biosample'] == 'ncd4'
    is_wanted_mark = meta['Experiment_target'].isin(['H3K4me3', 'H3K27ac', 'H3K36me3'])
    meta = meta.loc[is_ncd4 & is_wanted_mark, :].copy()

    # Constant annotation columns, appended in this order.
    constants = (
        ('assembly', 'mm9'),
        ('biosample', 'ncd4'),
        ('project', 'BLUEPRINT'),
        ('type', 'epigenome'),
        ('lab', 'unknown'),
    )
    for column, value in constants:
        meta[column] = value

    # Sequencing centre name prefix -> lab label used elsewhere in this file.
    for prefix, lab in (('Nijmegen', 'NCMLS'), ('THE', 'WTSI')):
        meta.loc[meta['CenterName'].str.startswith(prefix), 'lab'] = lab

    meta = meta.drop(['short_biosample', 'CenterName'], axis=1)
    meta = meta.rename(columns={'download_path': 'url', 'Biological_replicate': 'rep'})
    return meta.astype(str)

def read_deep_metadata(fpath):
    """Load the DEEP metadata table and keep the histone / mRNA libraries.

    The library tag (H3K36me3, H3K4me3, H3K27ac or mRNA) is extracted from
    ``filename`` and stored in ``Experiment_target``. Rows whose filename
    carries none of these tags, or has no filename at all, are discarded.
    ``project`` is set to 'DEEP'; ``type`` is 'transcriptome' for
    '.fastq.gz' files, 'epigenome' for '.bw' files and 'unknown' otherwise.

    Returns the resulting DataFrame with every value converted to ``str``.
    """
    df = pd.read_csv(fpath, sep='\t')
    df = df.drop(['lifestage'], axis=1)

    # Extract the tag once and filter on the result. This avoids calling
    # str.contains with a capturing group (pandas warns about that) and
    # treats a missing filename as "no match" instead of a NaN mask entry.
    targets = df['filename'].str.extract(r'(H3K36me3|H3K4me3|H3K27ac|mRNA)', expand=False)
    has_target = targets.notna()
    df = df.loc[has_target, :].copy()
    df['Experiment_target'] = targets.loc[has_target]

    # Substring replacement: every occurrence of 'local' in the URL is expanded.
    # NOTE(review): presumably the field is exactly 'local' for files without
    # a public URL - confirm no real URL contains that substring.
    df['url'] = df['url'].str.replace('local', 'local file - see www.epigenomesportal.ca/ihec for public version')

    df['project'] = 'DEEP'
    df['type'] = 'unknown'
    df.loc[df['filename'].str.endswith('.fastq.gz'), 'type'] = 'transcriptome'
    df.loc[df['filename'].str.endswith('.bw'), 'type'] = 'epigenome'
    return df.astype(str)

def read_encode_metadata(fpath):
    """Load the ENCODE metadata table and keep the ChIP-seq / RNA-seq files.

    Rows for the biosamples CH12, MEL, GM12878 and K562 and all DNase-seq
    rows are removed first. Of the remainder, a row is kept when its
    experiment target is H3K4me3, H3K36me3 or H3K27ac, or when its assay is
    ChIP-seq or RNA-seq. ``type`` is derived from the assay, ``assembly``
    from the organism code ('hsa' -> 'hg19', 'mmu' -> 'mm9', otherwise
    'unknown'), and ``filename`` is parsed from the download URL.

    Returns the resulting DataFrame with spaces in column names replaced by
    underscores, a few columns renamed (biosample, lab, species, url), and
    every value converted to ``str``.
    """
    df = pd.read_csv(fpath, sep='\t')

    # Both comparisons already yield boolean Series; no cast through the
    # removed ``np.bool`` alias is needed.
    deselect_samples = df['Biosample term name'].isin(['CH12', 'MEL', 'GM12878', 'K562'])
    deselect_libs = df['Assay'] == 'DNase-seq'
    df = df.loc[~(deselect_samples | deselect_libs), :].copy()

    select_libs = df['Experiment target'].isin(['H3K4me3', 'H3K36me3', 'H3K27ac'])
    select_assay = df['Assay'].isin(['ChIP-seq', 'RNA-seq'])
    # NOTE(review): the two criteria are OR-combined, so a ChIP-seq row with
    # any other target also passes - confirm this is intended.
    df = df.loc[select_libs | select_assay, :].copy()

    df['type'] = 'unknown'
    df.loc[df['Assay'] == 'ChIP-seq', 'type'] = 'epigenome'
    df.loc[df['Assay'] == 'RNA-seq', 'type'] = 'transcriptome'

    df = df.drop(['Technical replicate', 'Read length', 'Run type', 'Paired end',
                  'Paired with', 'Derived from', 'Platform', 'Antibody accession',
                  'Biological replicate(s)', 'Biosample Age', 'Library size range',
                  'Library depleted in', 'Library made from', 'Assay', 'Assembly',
                  'File format'],
                 axis=1)

    df['project'] = 'ENCODE'
    df['assembly'] = 'unknown'
    df.loc[df['Biosample organism'] == 'hsa', 'assembly'] = 'hg19'
    df.loc[df['Biosample organism'] == 'mmu', 'assembly'] = 'mm9'

    # Explicit renames; every other header just gets spaces -> underscores.
    renames = {
        'Biosample term name': 'biosample',
        'Lab': 'lab',
        'Biosample organism': 'species',
        'File download URL': 'url',
    }
    df.columns = [renames.get(c, c.replace(' ', '_')) for c in df.columns]

    # Raw string so that ``\.`` reaches the regex engine as written; the
    # inner group is non-capturing so extract() returns the name directly.
    df['filename'] = df['url'].str.extract(r'/(ENC[A-Z0-9]+\.(?:bigWig|fastq\.gz))', expand=False)
    return df.astype(str)

def read_sra_metadata(fpath):
    """Load the SRA expression metadata table for the selected tissues.

    Only rows whose ``cell`` is liver, kidney, ncd4, heart or blood are
    kept. ``project`` is set to 'SRA' and ``type`` to 'transcriptome'.
    Where ``GEO_Sample`` is the string 'na', both ``GEO_Sample`` and
    ``GEO_Series`` are set to 'N/A'. The original 'url' column is dropped
    and 'download_path' takes over that name; 'cell' becomes 'biosample'.

    Returns the resulting DataFrame with every value converted to ``str``.
    """
    wanted_cells = ['liver', 'kidney', 'ncd4', 'heart', 'blood']
    unused_columns = ['comment', 'use', 'date', 'runtype', 'readlength',
                      'SRA_ReadClass', 'platform', 'url', 'experiment']

    runs = pd.read_csv(fpath, sep='\t')
    runs = runs.loc[runs['cell'].isin(wanted_cells), :].copy()
    runs = runs.drop(unused_columns, axis=1)

    runs['project'] = 'SRA'
    without_geo = runs['GEO_Sample'] == 'na'
    runs.loc[without_geo, ['GEO_Sample', 'GEO_Series']] = 'N/A'
    runs['type'] = 'transcriptome'

    runs = runs.rename(columns={'download_path': 'url', 'cell': 'biosample'})
    return runs.astype(str)
 
# Assemble the supplementary file-source table: load every metadata source,
# merge them, attach the project-internal dataset id / status / species,
# order columns and rows, and write the result to `table_out`.
dset_lut = read_project_datasets(dataset_file)

bp = read_blueprint_metadata(blueprint_file)
deep = read_deep_metadata(deep_file)
enc = read_encode_metadata(encode_file)
sra = read_sra_metadata(sra_file)

# Outer merges (on the column names the two frames share) keep every row of
# every source; columns a source does not provide stay missing for its rows.
final = bp.merge(deep, how='outer')
final = final.merge(enc, how='outer')
final = final.merge(sra, how='outer')

# Assembly name -> common species name.
spec_lut = {'hg19': 'human', 'mm9': 'mouse', 'rn5': 'rat', 'canFam3': 'dog',
            'monDom5': 'opossum', 'galGal3': 'chicken', 'felCat5': 'cat',
            'bosTau7': 'cow', 'susScr2': 'pig', 'oviAri3': 'sheep', 'rheMac2': 'rhesus',
            'equCab2': 'horse', 'oryCun2': 'rabbit'}

# Every merged row must resolve in both lookups; a KeyError here means a
# source row has no matching entry in the dataset table or an unknown assembly.
sids = []
comm = []
species = []
for row in final.itertuples():
    lut_key = row.assembly, row.type, row.project, row.biosample, row.lab
    spec = spec_lut[row.assembly]
    sid, com = dset_lut[lut_key]
    sids.append(sid)
    comm.append(com)
    species.append(spec)

final['short_id'] = sids
final['comment'] = comm
# Replaces any species column that came in through the merge.
final['species'] = species

final.fillna(value='N/A', inplace=True)
final = final.astype(str)

# Column order: key columns get fixed ranks, 'comment' a high rank, and every
# other column is ranked by its number of 'N/A' entries. Ties are broken by
# column name.
# NOTE(review): a data column with fewer than 7 'N/A' entries shares its rank
# with a key column and ends up interleaved with them - confirm this is wanted.
fixed_rank = {'short_id': 0, 'assembly': 1, 'type': 2, 'project': 3,
              'biosample': 4, 'lab': 5, 'species': 6, 'comment': 1000}
sort_cols = sorted(
    (fixed_rank[c] if c in fixed_rank else int((final[c] == 'N/A').sum()), c)
    for c in final.columns
)
# Explicit copy: helper columns are assigned to this frame right below.
final = final[[c for _, c in sort_cols]].copy()

# Temporary sort keys derived from short_id: its first run of digits and its
# first 'E' or 'T' character.
final['data_id'] = final['short_id'].str.extract('([0-9]+)', expand=False).astype(np.int16)
final['data_type'] = final['short_id'].str.extract('(E|T){1}', expand=False)

final.sort_values(['data_type', 'data_id', 'type', 'short_id', 'project', 'assembly'], inplace=True)

final.drop(['data_id', 'data_type'], axis=1, inplace=True)

# The target is a .tsv file, so write tab-separated output
# (to_csv defaults to commas).
final.to_csv(table_out, sep='\t', header=True, index=False)


