In [1]:
%matplotlib inline

import os as os
import json as js
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Input side: project base directory and the working directory of the
# 'task_summarize' step below it.
fhgfs_base = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species'
workdir = os.path.join(fhgfs_base, 'processing/norm/task_summarize')

# Output side: figure folder inside the cloud share.
cloud_base = '/TL/deep-external01/nobackup/pebert/cloudshare'
outdir = os.path.join(cloud_base, 'mpiinf/phd/chapter_projects/crossspecies/figures/pub')

# Local checkout of the code repository; only used to locate the color file.
repodir = '/home/pebert/work/code/mpggit/crossspecies'

metrics_file = os.path.join(workdir, 'train_test_perf_agg.h5')
color_file = os.path.join(repodir, 'graphics', 'colors', 'cs_colors.json')
# Read the color definitions inside a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open(color_file, 'r') as color_fh:
    color_codes = js.load(color_fh)


def plot_classifier_perf(data, colors, title):
    """
    Draw a box plot of the 'cv_perf' values, one box per combination of
    model type ('seq', 'sig', 'full') and query assembly.

    Boxes of the same model type are placed next to each other; within a
    group the queries appear in sorted order. Each box is filled with
    colors[query]['rgb'] and labeled with the species abbreviation of the
    query assembly.

    :param data: table (accessed like a pandas DataFrame) providing the
        columns 'query', 'model_type' and 'cv_perf'
    :param colors: mapping from query assembly name to a dict that has
        an 'rgb' entry
    :param title: title put above the plot; nothing is set if it is
        empty or None
    :return: None
    :raises ValueError: if a model type / query combination has no rows
    """
    queries = sorted(data['query'].unique())
    tasks = ['seq', 'sig', 'full']
    # Every group provides four slots (0.5 apart); a fifth query would run
    # past the end of the position array and raise an IndexError below.
    x_pos = {'seq': np.arange(1,3,0.5), 'sig': np.arange(4,6,0.5), 'full': np.arange(7,9,0.5)}
    assm_to_spec = {'mm9': 'mmu', 'bosTau7': 'bta', 'hg19': 'hsa', 'canFam3': 'cfa', 'susScr2': 'ssc'}
    _, ax = plt.subplots(1, figsize=(11, 4))
    ax.set_xlim(0, 11)
    ax.set_ylim(0, 1.1)
    if title:
        ax.set_title(title)
    box_data = []
    box_positions = []
    box_colors = []
    box_labels = []
    box_props = dict(linestyle='solid', linewidth=1, color='white')
    whisk_props = dict(linestyle='solid', linewidth=1, color='black')
    for t in tasks:
        task_x = x_pos[t]
        for idx, q in enumerate(queries):
            plotdata = data.loc[((data['query'] == q) & (data.model_type == t)), 'cv_perf']
            # Explicit raise instead of assert: asserts vanish under `python -O`.
            if plotdata.empty:
                raise ValueError('No data to plot')
            box_data.append(plotdata)
            box_positions.append(task_x[idx])
            box_colors.append(colors[q]['rgb'])
            box_labels.append(assm_to_spec[q])
    drawn = ax.boxplot(box_data, positions=box_positions, widths=0.4, patch_artist=True,
                       boxprops=box_props, whiskerprops=whisk_props, labels=box_labels)
    # patch_artist=True yields patch objects, which can take a face color.
    for patch, col in zip(drawn['boxes'], box_colors):
        patch.set_facecolor(col)
    return
            

def rank_features(data):
    """
    Score every column of `data` and return the scores in ascending order.

    Within each row the values are ranked in descending order (largest
    value gets rank 1, ties share the average rank, missing values are
    ranked last). A column's score is its worst (largest) rank over all
    rows, multiplied by the number of rows divided by the number of
    non-missing entries in that column.

    :param data: pandas DataFrame
    :return: pandas Series indexed by the column labels of `data`,
        sorted ascending with missing scores at the end
    """
    n_rows = data.shape[0]
    # Author's note: count() only skips numpy.nan - a literal 'NA' entry
    # is counted as a regular value.
    n_present = data.count(axis=0)
    per_row_rank = data.rank(axis=1, method='average', ascending=False, na_option='bottom')
    scores = per_row_rank.max(axis=0) * (n_rows / n_present)
    return scores.sort_values(na_position='last')

    
    
def plot_feature_imp(data):
    """
    Print the first 20 entries of the ranking that rank_features()
    returns for `data`, framed by a header and a footer line.

    Despite the name, nothing is drawn; the output goes to stdout only.

    :param data: passed on unchanged to rank_features()
    :return: None
    """
    ranking = rank_features(data)
    head = ranking[:20]
    print('===== Feature set', head, '======', sep='\n')
        



def plot_featimp_per_task(models, feats):
    """
    Report the feature ranking separately for every model type.

    For each distinct value in models['model_type'] (in order of first
    appearance) the value is printed, the matching rows of `models` are
    determined, and the rows of `feats` with the same index labels are
    handed to plot_feature_imp().

    :param models: table with a 'model_type' column
    :param feats: table whose index contains the index labels of `models`
    :return: None
    """
    for task_name in models['model_type'].unique():
        print(task_name)
        is_task = models['model_type'] == task_name
        task_rows = models[is_task].index
        plot_feature_imp(feats.loc[task_rows, :])
    
# NOTE(review): `subset` and `subset_feat` are not defined in this cell;
# presumably they were created in an earlier notebook cell - confirm before
# running this cell on its own.
plot_featimp_per_task(subset, subset_feat)

(1296, 19)
(648, 19)
(234, 19)
(234, 3360)
full
===== Feature set
ftoecpg_rat_oeCpG_body            13.0
ftmsig_H3K36me3_abs_mean_body     39.0
ftprm_abs_dpeM9_body              48.0
ftmsig_H3K4me3_abs_mean_body     104.0
ftrep_pct_repcon_body            114.0
ftkmf_pct_kGAT_body              126.0
ftkmf_pct_kTTCG_body             129.0
ftprm_abs_dtieMx_body            133.0
ftkmf_pct_kATC_body              149.0
ftkmf_pct_kACTG_body             155.0
ftkmf_pct_kGATC_body             166.0
ftkmf_pct_kGGAT_body             179.0
ftkmf_pct_kCGAT_body             185.0
ftkmf_pct_kTCCA_body             191.0
ftkmf_pct_kAACC_body             193.0
ftprm_abs_inrM4_body             194.0
ftkmf_pct_kCTTG_body             201.0
ftkmf_pct_kTGGA_body             201.0
ftkmf_pct_kGTCA_body             206.0
ftkmf_pct_kTGAT_body             207.0
dtype: float64
seq
===== Feature set
ftoecpg_rat_oeCpG_body     12.0
ftkmf_pct_kGATC_body       37.0
ftprm_abs_dpeM9_body       39.0
ftkmf_pct_kCCAA_body 