In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from functools import  reduce
from itertools import combinations
from IPython.display import display

In [2]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    s = pd.to_numeric(s, errors='coerce')
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

In [5]:
def summarize_results(expts_folder, show_figures=True, select_best_on_valid=True):
    res_folders = [el for el in os.listdir(expts_folder) 
               if os.path.isdir(os.path.join(expts_folder, el))]
    metric = "pcc_mean"
    dfs = []
    for res_folder in res_folders:
        
        if select_best_on_valid:
            template = os.path.join(expts_folder, res_folder) + '/log*'
            fnames = glob(template)
            best_val_loss = float('Inf')
            best_hp =None
            for fname in fnames:
                try:
                    temp = pd.read_csv(fname, delimiter='\t')
                except pd.errors.EmptyDataError:
                    continue
                best_val_loss = temp.val_loss.min()
                best_hp = fname
            fnames = [best_hp.replace('log', 'results')]
        else:            
            template = os.path.join(expts_folder, res_folder) + '/results*'
            fnames = glob(template)
        res_all_hps = []
        temp = None
        for fname in fnames:
            try:
                temp = pd.read_csv(fname, delimiter='\t')
            except pd.errors.EmptyDataError:
                continue
            if temp.shape[1] < 2:
                temp = pd.read_csv(fname, delimiter='\t', skiprows=1)
                temp = temp.dropna(how='any', axis=0)
                temp = temp.drop_duplicates(keep=False)
                # temp = temp.loc[[temp.pcc_median.argmax()]] 
                temp = temp.loc[temp.groupby(["name"])[metric].idxmax()] 
            res_all_hps.append(temp)
        
        if len(res_all_hps) > 0:
            # print(res_all_hps)
            algo_name = '_'.join(res_folder.split('_')[-2:])
            res = pd.concat(res_all_hps, ignore_index=True)
            # print(res)
            # res['hps'] = ['_'.join(f.split('/')[-1].split('_')[5:]) for f in fnames]
            res[metric] = pd.to_numeric(res[metric])
            res['size'] = pd.to_numeric(res['size'])
            res = res.loc[res.groupby(["name"])[metric].idxmax()] 
            res.to_csv(os.path.join(expts_folder, 'recap_'+res_folder+'.txt'), 
                       index=False,  float_format='%.3f')
            if show_figures:
                res.plot.scatter('size', metric)
                plt.title(algo_name)
                plt.show()
            dfs.append(res[['name', metric]].set_index('name').rename(
                columns=dict(pcc_median=algo_name)))

    res = pd.concat(dfs, axis=1).apply(pd.to_numeric)
    if show_figures:
        for x, y in combinations(res.columns.tolist(), 2):
            print('% x > y', res[x].gt(res[y]).mean())
            ax = res.plot.scatter(x, y)
            ax.plot((-1, 1), (-1, 1), ls="-", c=".3")
            plt.show()
    else:
        # print(res.to_latex(float_format='%.3f'))
        print(res)
        display(res.style.apply(highlight_max, axis=1).set_precision(3))

In [6]:
summarize_results(expts_folder='/home/prtos/workspace/code/few_shot_regression/expt_results/mhc', show_figures=False)
# summarize_results(expts_folder='/home/prtos/workspace/code/few_shot_regression/expt_results/expts_helios', show_figures=True)

               pcc_mean  pcc_mean  pcc_mean  pcc_mean
HLA-DRB1*0101   0.49587       NaN       NaN       NaN
HLA-DRB1*1501       NaN       NaN       NaN  0.562256
HLA-DRB3*0101       NaN  0.182476       NaN       NaN
HLA-DRB4*0101       NaN       NaN  0.301347       NaN


ValueError: style is not supported for non-unique indicies.