In [37]:

import sys as sys
import os as os
import pickle as pck
import collections as col

import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Input locations
# ---------------------------------------------------------------------------

# Root of the project tree on the shared file system
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis'

# Aggregated model statistics; read by collect_model_stat_perf
stat_folder = os.path.join(
    fhgfs_base, 'projects/cross_species/processing/norm/task_summarize'
)
stat_file = os.path.join(stat_folder, 'agg_expstat_est.h5')

# Folder holding the pickled list of HDF keys (see collect_model_stat_perf)
cache_dir = '/home/pebert/.jupyter/cache'

# Per assembly-pair feature files; read by add_conservation_feature
feature_dir = os.path.join(
    fhgfs_base, 'projects/cross_species/processing/norm/task_testdata_exp/test_datasets'
)

# ---------------------------------------------------------------------------
# Species / naming tables
# ---------------------------------------------------------------------------

# Species passed to collect_model_stat_perf as the set of admissible
# reference / query species
annotated_species = [
    'human',
    'mouse',
    'cow',
    'opossum',
    'chicken',
    'rhesus',
    'rat',
    'dog',
]

# Species name -> assembly name; used to build the feature folder names
assm_map = {
    'human': 'hg19',
    'mouse': 'mm9',
    'opossum': 'monDom5',
    'rhesus': 'rheMac2',
    'rat': 'rn5',
    'dog': 'canFam3',
    'cow': 'bosTau7',
    'chicken': 'galGal3',
}

# Raw annotation column name -> normalized feature name. The same table is
# used by normalize_annotation to translate the values of the "modeAge" column.
column_map = {
    'Cellular_organisms': 'ftage_abs_cellorg',
    'Euk_Archaea': 'ftage_abs_eukarch',
    'Euk+Bac': 'ftage_abs_eukbac',
    'Eukaryota': 'ftage_abs_euk',
    'Opisthokonta': 'ftage_abs_opist',
    'Eumetazoa': 'ftage_abs_eumeta',
    'Vertebrata': 'ftage_abs_vert',
    'Mammalia': 'ftage_abs_mamm',
    'entropy': 'ftconf_abs_entropy',
    'Bimodality': 'ftconf_abs_bimodality',
    'NodeError': 'ftconf_abs_nodeerror',
    'HGT_flag': 'ftconf_bin_hgt',
}

# ---------------------------------------------------------------------------
# Output location and stage switches
# ---------------------------------------------------------------------------

# Destination of the per-species "<species>_agefeat.h5" files
out_dir = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/geneage/norm'

# Toggle the individual pipeline stages executed at the bottom of the file
run_norm = True
run_collect = True
run_finalize = True

def map_identifiers():
    """
    Normalize every raw gene age annotation file and attach gene names.

    For each file in the raw annotation folder the species is taken from the
    second "_"-separated token of the file name (extension stripped). The
    annotation is normalized via normalize_annotation and then merged with
    two identifier tables:

    - "<species>_uniprot_ensgene.txt": tab-separated, its columns are
      renamed to [<species>_entity, <species>_name]
    - "<species>_ensprot_ensgene.txt": tab-separated, the column
      "Ensembl Transcript ID" is dropped, rows are restricted to protein IDs
      present as entity, and the remaining columns are renamed to
      [<species>_name, <species>_entity]

    Rows with any missing value are dropped and only the first row per gene
    name is kept. The result is written as table "feat" to
    "<species>_agefeat.h5" in out_dir; an existing file is overwritten.

    :return: True
    :raises ValueError: if NULL entries remain after dropping incomplete rows
    """
    ga_annot = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/geneage/raw'
    up_map = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/geneage/map_uniprot'
    ens_map = '/TL/deep/fhgfs/projects/pebert/thesis/refdata/geneage/map_ensembl'

    for fn in os.listdir(ga_annot):
        species = fn.split('.')[0].split('_')[1]
        ent_col = '{}_entity'.format(species)
        name_col = '{}_name'.format(species)

        fp = os.path.join(ga_annot, fn)
        df = normalize_annotation(fp, species)

        # first source of gene names: UniProt ID -> gene
        up_fp = os.path.join(up_map, '{}_uniprot_ensgene.txt'.format(species))
        up_df = pd.read_csv(up_fp, sep='\t')
        up_df.columns = [ent_col, name_col]
        df = df.merge(up_df, on=ent_col, how='outer')

        # second source of gene names: Ensembl protein ID -> gene
        ens_fp = os.path.join(ens_map, '{}_ensprot_ensgene.txt'.format(species))
        ens_df = pd.read_csv(ens_fp, sep='\t')
        ens_df.drop(['Ensembl Transcript ID'], axis=1, inplace=True)
        ens_df = ens_df.loc[ens_df['Ensembl Protein ID'].isin(df[ent_col]), :].copy()
        ens_df.columns = [name_col, ent_col]
        df = df.merge(ens_df, on=[ent_col, name_col], how='outer')

        # the outer merges leave incomplete rows (entity without name or
        # name without annotation); discard them
        df.dropna(axis=0, how='any', inplace=True)
        df.drop_duplicates(subset=[name_col], inplace=True)

        # sanity check - should not trigger after the dropna above
        null_mask = pd.isnull(df)
        if null_mask.any(axis=0).any():
            # builtin bool: the np.bool alias no longer exists in current NumPy
            null_col = np.array(null_mask.any(axis=0), dtype=bool)
            null_row = np.array(null_mask.any(axis=1), dtype=bool)
            null_sub = df.loc[null_row, null_col]
            print(null_sub.shape)
            print(df.columns[null_col])
            print(null_sub.head())
            print(null_sub.tail())
            raise ValueError('NULL after dropping')

        if species == 'chicken':
            # the mammalian age column is set to zero for all chicken genes
            df['ftage_abs_mamm'] = 0

        outname = '{}_agefeat.h5'.format(species)
        outpath = os.path.join(out_dir, outname)
        with pd.HDFStore(outpath, 'w', complevel=9) as hdf:
            hdf.put('feat', df, format='table')
    return True
  

def _raise_on_null_entries(df, message):
    """
    Print a short report about the NULL entries of df and raise
    ValueError(message). Does nothing if df contains no NULL entries.
    """
    null_mask = pd.isnull(df)
    if not null_mask.any(axis=0).any():
        return
    # builtin bool: the np.bool alias no longer exists in current NumPy
    null_col = np.array(null_mask.any(axis=0), dtype=bool)
    null_row = np.array(null_mask.any(axis=1), dtype=bool)
    null_sub = df.loc[null_row, null_col]
    print(null_sub.shape)
    print(df.columns[null_col])
    print(null_sub.head())
    print(null_sub.tail())
    raise ValueError(message)


def normalize_annotation(fp, species):
    """
    Read a raw gene age annotation (CSV) and turn it into a feature table.

    Processing steps:

    - the string "None" is read as missing value; missing "Bimodality" and
      "NodeError" values are replaced with the respective column mean
    - columns are renamed according to column_map (unknown names are kept)
    - all columns containing "DB" in their name are dropped
    - values of "modeAge" are translated with column_map and expanded into
      one 0/1 indicator column ("ftage_bin_*") per age class; "modeAge"
      itself is dropped afterwards
    - "ftconf_bin_hgt" is cast to int8

    :param fp: path to the comma-separated annotation file
    :param species: species name; the file is expected to contain a column
     "<species>_entity" that is used to align the indicator columns
    :return: DataFrame with a fresh integer index
    :raises ValueError: if NULL entries are present after reading/imputing
     or after normalizing
    """
    df = pd.read_csv(fp, sep=',', na_values='None')
    # impute missing values
    bimod_mean = df['Bimodality'].mean()
    nderr_mean = df['NodeError'].mean()

    df.loc[pd.isnull(df['Bimodality']), 'Bimodality'] = bimod_mean
    df.loc[pd.isnull(df['NodeError']), 'NodeError'] = nderr_mean

    df.columns = [column_map.get(c, c) for c in df.columns]
    df.index = df['{}_entity'.format(species)]
    to_drop = [c for c in df.columns if 'DB' in c]
    df.drop(to_drop, axis=1, inplace=True)

    _raise_on_null_entries(df, 'NULL after reading')

    # "normalize" df entries
    df.replace({'modeAge': column_map}, inplace=True)
    df['ftconf_bin_hgt'] = df['ftconf_bin_hgt'].astype(np.int8, copy=True)
    ga_feat_names = sorted(v.replace('_abs_', '_bin_')
                           for v in column_map.values()
                           if v.startswith('ftage'))
    # add gene age feature
    ga_feat = pd.DataFrame(np.zeros((df.shape[0], len(ga_feat_names)), dtype=np.int8),
                           index=df.index, columns=ga_feat_names)
    for age_class in df['modeAge'].unique():
        entities = df.loc[df['modeAge'] == age_class, :].index
        # an age class that is not one of the ga_feat_names creates a new,
        # partially empty column here - this is caught by the check below
        age_cat = age_class.replace('_abs_', '_bin_')
        ga_feat.loc[entities, age_cat] = 1
    df = df.join(ga_feat, how='inner')
    df.drop(['modeAge'], axis=1, inplace=True)
    df.reset_index(inplace=True, drop=True)

    _raise_on_null_entries(df, 'NULL after normalizing')

    return df


def collect_model_stat_perf(fpath, species):
    """
    Count per gene how often it was predicted correctly / incorrectly and
    store the result together with orthology and conservation features.

    Datasets are selected from the HDF file by key: the key has to start
    with "/pos/can", end with "/data" and must not contain any of
    GM12878, CH12, K562, MEL or brain. Splitting the key at "/", components
    3 and 4 are used as reference and query species, components 5 and 6
    as reference and query dataset. Every (reference dataset, query dataset)
    combination is counted once.

    A gene counts as correctly predicted in a dataset if its "tp" or "tn"
    entry is set, as incorrectly predicted otherwise. For each
    (reference, query) species pair the counts, the fraction of correct
    predictions ("target"), the columns "pair_ortho"/"group_ortho" of the
    first dataset of that pair (renamed to "ftorth_pair"/"ftorth_group") and
    the conservation features are written to "model/<reference>" in
    "<query>_agefeat.h5" in out_dir.

    For query species other than human and mouse for which both a human and
    a mouse reference are available, the two tables are combined (counts and
    "ftorth_pair" summed, conservation features averaged) and written to
    "model/joint".

    NOTE(review): the list of selected keys is cached as pickle in cache_dir
    under a fixed file name, independent of fpath; a stale cache has to be
    removed manually. pickle.load must only be used on trusted files -
    presumably cache_dir is private to the user; confirm.

    :param fpath: path to the HDF file with the aggregated model statistics
    :param species: container of species names; a dataset is used only if
     both reference and query species are contained
    :return: True
    """
    done = set()
    cache_keys = os.path.join(cache_dir, 'geneage_keys.pck')
    count_collect = dict()
    data_collect = dict()
    excluded = ['GM12878', 'CH12', 'K562', 'MEL', 'brain']
    with pd.HDFStore(fpath, 'r') as hdf:
        if os.path.isfile(cache_keys):
            with open(cache_keys, 'rb') as cache:
                all_keys = pck.load(cache)
        else:
            all_keys = [k for k in hdf.keys()
                        if k.startswith('/pos/can') and k.endswith('/data')
                        and not any(c in k for c in excluded)]
            with open(cache_keys, 'wb') as dump:
                pck.dump(all_keys, dump)
        for k in all_keys:
            path_comp = k.split('/')
            ref_spec, qry_spec = path_comp[3], path_comp[4]
            if not (ref_spec in species and qry_spec in species):
                continue
            ref_data, qry_data = path_comp[5], path_comp[6]
            if (ref_data, qry_data) in done:
                continue
            done.add((ref_data, qry_data))
            # load only after all skip conditions have been checked
            data = hdf[k]
            pair = (ref_spec, qry_spec)
            if pair not in data_collect:
                sub = data.loc[:, ['pair_ortho', 'group_ortho']].copy()
                sub.columns = ['ftorth_pair', 'ftorth_group']
                sub['true_pred'] = 0
                sub['false_pred'] = 0
                data_collect[pair] = sub
            if pair not in count_collect:
                count_collect[pair] = {'true_pred': col.Counter(),
                                       'false_pred': col.Counter()}
            gene_names = data.index

            # builtin bool: the np.bool alias no longer exists in current NumPy
            true_pos = np.array(data['tp'].values, dtype=bool)
            true_neg = np.array(data['tn'].values, dtype=bool)
            true_select = np.logical_or(true_pos, true_neg)

            count_collect[pair]['true_pred'].update(gene_names[true_select].tolist())
            count_collect[pair]['false_pred'].update(gene_names[~true_select].tolist())

    final_collect = dict()
    for ref, qry in data_collect.keys():
        base_data = data_collect[(ref, qry)]
        for label in ('true_pred', 'false_pred'):
            counts = pd.Series(count_collect[(ref, qry)][label], dtype=np.int32)
            # nothing to assign if no gene was ever counted for this label
            if not counts.empty:
                # Series assignment is aligned on the gene names
                base_data.loc[counts.index, label] = counts

        base_data['total_pred'] = base_data['true_pred'] + base_data['false_pred']
        # genes that were never counted yield 0 / 0 = NaN and trigger the
        # assertion below
        base_data['target'] = base_data['true_pred'] / base_data['total_pred']

        base_data = add_conservation_feature(base_data, ref, qry)

        assert not (pd.isnull(base_data).any(axis=0).any()), 'Base data has NULL'

        cache_file = os.path.join(out_dir, '{}_agefeat.h5'.format(qry))
        with pd.HDFStore(cache_file, 'a') as hdf:
            hdf.put('model/{}'.format(ref), base_data, format='table')
        final_collect[(ref, qry)] = base_data

    done_queries = set()
    for ref, qry in final_collect.keys():
        if qry in ['human', 'mouse']:
            continue
        if qry in done_queries:
            continue
        ref2 = 'mouse' if ref == 'human' else 'human'
        data2 = final_collect.get((ref2, qry))
        if data2 is None:
            # only one of the two references is available for this query
            continue
        # the per-reference table has been written above already, so it can
        # be updated in place to hold the joint values
        final = final_collect[(ref, qry)]
        done_queries.add(qry)
        final['true_pred'] += data2['true_pred']
        final['false_pred'] += data2['false_pred']
        final['total_pred'] = final['true_pred'] + final['false_pred']
        final['target'] = final['true_pred'] / final['total_pred']

        final['ftorth_pair'] += data2['ftorth_pair']
        final['ftcons_pct_reg5p'] += data2['ftcons_pct_reg5p']
        final['ftcons_pct_reg5p'] /= 2
        final['ftcons_pct_body'] += data2['ftcons_pct_body']
        final['ftcons_pct_body'] /= 2

        assert not pd.isnull(final).any(axis=0).any(), 'Final has NULL'

        cache_file = os.path.join(out_dir, '{}_agefeat.h5'.format(qry))
        with pd.HDFStore(cache_file, 'a') as hdf:
            hdf.put('model/joint', final, format='table')

    return True


def add_conservation_feature(dataframe, ref, qry):
    """
    Append two conservation features to dataframe.

    The features are read from the first "*.feat.h5" file listed in the
    folder "<query assembly>_from_<reference assembly>" below feature_dir.
    All groups of that file except "/metadata" are concatenated, indexed by
    their "name" column and reduced to the columns

    - ftmsig_H3K4me3_pct_cons_reg5p (returned as ftcons_pct_reg5p)
    - ftmsig_H3K36me3_pct_cons_body (returned as ftcons_pct_body)

    which are then concatenated column-wise to dataframe (aligned on the
    index). If the folder contains no matching file, dataframe is returned
    unchanged.

    NOTE(review): which file is "first" depends on the order returned by
    os.listdir - presumably all files of a folder carry identical
    conservation values; confirm.

    :param dataframe: table indexed by gene name
    :param ref: reference species, key of assm_map
    :param qry: query species, key of assm_map
    :return: dataframe extended by the two conservation columns
    """
    ref_assm = assm_map[ref]
    qry_assm = assm_map[qry]
    folder = os.path.join(feature_dir, '{}_from_{}'.format(qry_assm, ref_assm))
    feat_files = [fn for fn in os.listdir(folder) if fn.endswith('.feat.h5')]
    if not feat_files:
        return dataframe

    # a single file is sufficient, use the first one listed
    first_path = os.path.join(folder, feat_files[0])
    with pd.HDFStore(first_path, 'r') as hdf:
        chunks = [hdf[key] for key in hdf.keys() if key != '/metadata']

    features = pd.concat(chunks, axis=0, ignore_index=False)
    features.index = features['name']
    cons = features.loc[:, ['ftmsig_H3K4me3_pct_cons_reg5p',
                            'ftmsig_H3K36me3_pct_cons_body']].copy()
    cons.columns = ['ftcons_pct_reg5p', 'ftcons_pct_body']
    return pd.concat([dataframe, cons], ignore_index=False, axis=1)


def finalize_featuresets():
    """
    Combine gene age features and model data into final feature tables.

    Every file in out_dir is opened as HDF store. The table "/feat" is
    indexed by its gene name column (the first column ending in "name");
    that column and all columns ending in "entity" are removed. For each
    table stored below "/model", the genes present in both the model table
    and the gene age table are selected, the two tables are concatenated
    column-wise and the gene names are moved from the index to a column
    "name". The result is stored as "<model key>/final" in the same file.

    Tables written by a previous run of this function (keys ending in
    "/final") are not treated as model input, so the function can be
    executed repeatedly on the same files.

    :return: None
    :raises AssertionError: if the gene age table, a model table or a
     combined table contains NULL entries
    """
    for ff in os.listdir(out_dir):
        fp = os.path.join(out_dir, ff)
        with pd.HDFStore(fp, 'a') as hdf:
            age_feat = hdf['/feat']
            assert not (pd.isnull(age_feat).any(axis=0).any()), 'NULL before finalize'
            name_col = [c for c in age_feat.columns if c.endswith('name')][0]
            age_feat.index = age_feat[name_col]
            rem_cols = [c for c in age_feat.columns if c.endswith('entity')]
            rem_cols.append(name_col)
            age_feat.drop(rem_cols, axis=1, inplace=True)
            # collect the keys first since new tables are added in the loop;
            # skip the output of earlier runs
            model_keys = [k for k in hdf.keys()
                          if k.startswith('/model') and not k.endswith('/final')]
            for k in model_keys:
                model_data = hdf[k]
                assert not (pd.isnull(model_data).any(axis=0).any()), 'Model data NULL: {}'.format(k)
                # restrict both tables to the genes they have in common
                sub_feat = age_feat.loc[age_feat.index.isin(model_data.index), :].copy()
                sub_model = model_data.loc[model_data.index.isin(sub_feat.index), :].copy()

                final = pd.concat([sub_model, sub_feat], axis=1, ignore_index=False)
                final['name'] = final.index
                final.reset_index(drop=True, inplace=True)
                assert not (pd.isnull(final).any(axis=0).any()), 'Final data NULL: {}'.format(k)
                # HDF keys are always separated by "/", irrespective of the OS
                out_group = '{}/final'.format(k)
                hdf.put(out_group, final, format='table')
    return

# Execute the enabled stages. The order matters: each stage works on the
# "<species>_agefeat.h5" files in out_dir produced by the preceding one.

# Stage 1: (re-)create the files and write the gene age features ("feat")
if run_norm:           
    map_identifiers()
    
# Stage 2: append the model performance tables ("model/...") to these files
if run_collect:
    collect_model_stat_perf(stat_file, annotated_species)

# Stage 3: join "feat" with every model table and store the combined tables
if run_finalize:
    finalize_featuresets()

    
