In [3]:

import os as os

import numpy as np
import pandas as pd

# --- Input / output locations -------------------------------------------
project_root = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff'
mirna_data_root = '/TL/deep/fhgfs/data/incoming/mirror/MDC/human/GRCh38'
reference_root = os.path.join(project_root, 'references')

# Reference annotation files, all located below ``reference_root``.
pc_gene_file = os.path.join(
    reference_root, 'gencode_v21_pcg_bglist.bed')
refseq_file = os.path.join(
    reference_root, '20180711_ENSv78_Ensembl-to-RefSeq.tsv')
target_file = os.path.join(
    reference_root, 'miRDB_v5.0_prediction_result.txt.gz')

# HDF5 file that receives the tables assembled by this notebook cell.
cache_file = os.path.join(
    project_root, 'caching', 'notebooks', '20180711_mirna_exp_targets.h5')

# --- Sample selection -----------------------------------------------------
# Only expression files whose sample prefix is listed here are loaded.
use_samples = [
    '01_HepG2_LiHG_Ct1',
    '01_HepG2_LiHG_Ct2',
    '43_Hm01_BlMo_Ct',
    '43_Hm03_BlMo_Ct',
    '43_Hm05_BlMo_Ct',
    '43_Hm03_BlMa_Ct',
    '43_Hm05_BlMa_Ct',
    '41_Hf02_LiHe_Ct',
    '41_Hf03_LiHe_Ct',
]

# Explicit storage labels for samples whose name does not follow the
# "<number>_<label>" scheme used for all other samples.
short_label = {
    '01_HepG2_LiHG_Ct1': 'Hc01_LiHG_Ct',
    '01_HepG2_LiHG_Ct2': 'Hc02_LiHG_Ct',
}

def collect_mirna_expression(root_path):
    """Collect miRNA expression tables for the selected samples.

    Walks ``root_path`` recursively and reads every ``*.known.tsv`` file
    whose sample prefix (file name without its last three ``_``-separated
    fields) is listed in the module-level ``use_samples``. For sample
    ``41_Hf02_LiHe_Ct`` only files containing ``20150923`` in their name
    are used.

    Returns a single DataFrame (per-file row indices are kept) with the
    columns ``mirna_id``, ``raw_count``, ``precursor``, ``norm_count``,
    ``sample`` and ``celltype``.

    Raises AssertionError if a ``*.known.tsv`` file name lacks ``snRNA``
    or if the combined table contains missing values; ``pd.concat`` raises
    ValueError when no file was selected at all.
    """
    # Column layout of the input files (their first line is skipped):
    # #miRNA  read_count  precursor   total   ID1 ID1(norm)
    # Example file name prefix: 43_Hm01_BlMo_Ct_snRNA_M_1
    column_names = ['mirna_id', 'raw_count', 'precursor',
                    'total_count', 'same_count', 'norm_count']
    frames = []
    for folder, _subdirs, filenames in os.walk(root_path):
        for fname in filenames:
            if not fname.endswith('.known.tsv'):
                continue
            assert 'snRNA' in fname, 'Unexpected library: {}'.format(fname)
            sample = fname.rsplit('_', 3)[0]
            if sample not in use_samples:
                continue
            if sample == '41_Hf02_LiHe_Ct' and '20150923' not in fname:
                continue
            # Default label: the sample name without its leading number.
            fallback_label = sample.split('_', 1)[1]
            store_label = short_label.get(sample, fallback_label)
            table = pd.read_csv(os.path.join(folder, fname),
                                sep='\t', header=None, skiprows=1,
                                names=column_names)
            table = table.drop(['total_count', 'same_count'], axis=1)
            table['sample'] = store_label
            # Characters 7-8 of the label hold the two-letter cell type code.
            table['celltype'] = store_label[7:9]
            frames.append(table)
    combined = pd.concat(frames, axis=0, ignore_index=False)
    assert not combined.isnull().any().any(), 'missing data'
    return combined


def load_gene_mirna_annotation(pc_genes, refseq_map, mirna_targets):
    """Build a table linking miRNAs to annotated target genes.

    Parameters are file paths:
    ``pc_genes`` -- tab-separated gene table with one header line and the
        columns chrom, start, end, name, score, strand, symbol.
    ``refseq_map`` -- tab-separated table with one header line and the
        columns name, pct_gc, refseq.
    ``mirna_targets`` -- tab-separated, header-less table with the columns
        mirna_id, refseq, confidence.

    Only genes present in both gene tables and only targets whose
    ``refseq`` ID belongs to such a gene are kept. The returned DataFrame
    has a fresh 0..n-1 index and the columns mirna_id, refseq, confidence,
    name, pct_gc, chrom, symbol.

    Raises AssertionError if either merge step yields missing values.
    """
    gene_columns = ['chrom', 'start', 'end', 'name',
                    'score', 'strand', 'symbol']
    gene_table = pd.read_csv(pc_genes, sep='\t', skiprows=1, header=None,
                             names=gene_columns)
    gene_table = gene_table[['chrom', 'name', 'symbol']]

    refseq_table = pd.read_csv(refseq_map, sep='\t', skiprows=1, header=None,
                               names=['name', 'pct_gc', 'refseq'])

    # Restrict both tables to the gene names they have in common.
    in_gene_table = refseq_table['name'].isin(gene_table['name'])
    refseq_table = refseq_table[in_gene_table].copy()
    in_refseq_table = gene_table['name'].isin(refseq_table['name'])
    gene_table = gene_table[in_refseq_table].copy()

    annotated = refseq_table.merge(gene_table, on='name', how='outer')
    assert not annotated.isnull().any().any(), 'Missing data'

    target_table = pd.read_csv(mirna_targets, sep='\t', header=None,
                               names=['mirna_id', 'refseq', 'confidence'])

    # Same mutual restriction, now on the RefSeq identifier.
    has_gene = target_table['refseq'].isin(annotated['refseq'])
    target_table = target_table[has_gene].copy()
    is_targeted = annotated['refseq'].isin(target_table['refseq'])
    annotated = annotated[is_targeted].copy()

    target_table = target_table.merge(annotated, on='refseq', how='outer')
    target_table = target_table.reset_index(drop=True)

    assert not target_table.isnull().any().any(), 'Targets: missing data'

    return target_table
    
    
    

# Load the expression values and the miRNA -> target gene annotation, then
# keep only annotation rows for miRNAs that appear in the expression data.
mirna_exp = collect_mirna_expression(mirna_data_root)
mirna_targets = load_gene_mirna_annotation(pc_gene_file, refseq_file, target_file)
mirna_targets = mirna_targets.loc[mirna_targets['mirna_id'].isin(mirna_exp['mirna_id']), :].copy()

# Attach the target gene columns to every expression row. Because of the
# outer merge, miRNAs without an annotated target are kept and carry NaN
# in 'name', 'symbol' and 'pct_gc'.
mirna_exp = mirna_exp.merge(mirna_targets[['mirna_id', 'name', 'symbol', 'pct_gc']],
                            on='mirna_id', how='outer')

# Write everything to the cache file; mode 'w' overwrites an existing file.
with pd.HDFStore(cache_file, 'w') as hdf:
    hdf.put('mirna_targets', mirna_targets, format='table')
    hdf.put('mirna_exp/raw', mirna_exp, format='table')

    # One subset per pairwise comparison of two-letter cell type codes:
    # He_vs_Mo, He_vs_Ma and Ma_vs_Mo (the Ma/Ma combination is skipped).
    for s1 in ['He', 'Ma']:
        for s2 in ['Mo', 'Ma']:
            if s1 == s2:
                continue
            comp = s1 + '_vs_' + s2
            sub_select = np.logical_or(mirna_exp['celltype'] == s1, mirna_exp['celltype'] == s2)
            sub = mirna_exp.loc[sub_select, ['name', 'mirna_id', 'pct_gc', 'norm_count', 'celltype']].copy()
            sub.reset_index(drop=True, inplace=True)
            # HDF5 node keys are always '/'-separated; os.path.join would
            # insert the OS-specific separator and is therefore not used.
            store_path = '/'.join(['mirna_exp', comp])
            hdf.put(store_path, sub, format='table')

print('Done')

            

Done
