In [60]:
%load_ext autoreload
%autoreload 2
!cd ../src
import sys  
sys.path.insert(0, '../src')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
#
DATASET = 'SGD' if mode <10 else 'MultiWOZ'
SPLIT = 'val' if DATASET == 'SGD' else 'test'
GT_DIR = f'../datasets/{DATASET}/action_prediction_gt_labels_{SPLIT}_only'
GT_GRAPH_DIR = f'../datasets/{DATASET}/gt_graph'
rows = ['model', 'graph']
columns = ['postprocess']
USE_NEG_PCOND = False

if mode % 10 == 1:
    # Graph method comparison on SGD on all the models w/ single/multi sampling.
    INF_GRAPH_DIR = f'../graphs/{DATASET}/inferred_graph'
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
    GRAPH_ALG_NAMES = ["MaxRILP", "SKRILP"]
elif mode % 10 == 2:
    #INF_GRAPH_DIR = f'../graphs/{DATASET}/{GRAPH_ALG_NAME}_hparam_search/trial{EXP_NUM}'
    #PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_org/'
    INF_GRAPH_DIR = f'../graphs/{DATASET}/{GRAPH_ALG_NAME}_hparam_search/trial{EXP_NUM}'
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
    #PREDICTIONS_DIR = f'../outputs/MWF_combined/'
elif mode % 10 == 3: # should search
    if mode < 10: # SGD
        PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
        GT_DIR = f'../datasets/{DATASET}/action_prediction_gt_labels_{SPLIT}_only'
    else:
        PREDICTIONS_DIR = f'../outputs/MWF_combined/'
        GT_DIR = f'../datasets/{DATASET}_full/action_prediction_gt_labels_{SPLIT}_only'
    INF_GRAPH_DIR = f'../graphs/{DATASET}_final/{GRAPH_ALG_NAME}/'
    SHD_ALG_NAME = "SHDILP"
    SHD_DIR = f'../graphs/{DATASET}_full/{SHD_ALG_NAME}_hparam_search/trial{SHD_EXP_NUM}'
    
elif mode % 10 == 3:
    USE_NEG_PCOND = True
    GRAPH_ALG_NAME = "SKRILP"
    INF_GRAPH_DIR = f'../graphs/{DATASET}/inferred_graph'
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
    NEG_HPARAM="b1e-03_th3e-05"
    

In [62]:
from pathlib import Path
import json
import os
import warnings
from tqdm import tqdm

import numpy as np
import pandas as pd

## Find and organize the predictions by the models

In [63]:
domains = set()
models = set()
prompts = set()

In [64]:
len(list(Path(PREDICTIONS_DIR).glob(FILTER)))

96

In [65]:
predictions, mapped_predictions = {}, {}
predictions_by_domain, mapped_predictions_by_domain = {}, {}
print(f'searching @ {PREDICTIONS_DIR}')
for pth in Path(PREDICTIONS_DIR).glob(FILTER):
    for config_pth in pth.glob('config*.json'):
        try:
            with config_pth.open() as cf:
                config = json.load(cf)
        except Exception as e:
            print(f'Error while reading {config_pth}')
            print(e)
        for seed in config['seed']:
            for pred_pth in config_pth.parent.glob(f'DM_prediction_S{seed}.json'):
                dataset = config['dataset']
                if '_trajectories.json' in config['traj_path']:
                    domain = config['traj_path'].rsplit('/', 1)[-1].split('_trajectories.json')[0]
                else:
                    domain = config['traj_path'].rsplit('/', 1)[-1].split('.json')[0]
                model = config['model']
                prompt = (config['prompt_style'], config['num_shot'], config['use_mask_prompt'])
                temp = 0.0 if 'temperature' not in config else config['temperature']
                sampling = 'multi' if 'sampling' not in config else config['sampling']
                domains.add(domain)
                models.add(model)
                prompts.add(prompt)
                key = (domain, model, prompt, temp, sampling, seed)
                """
                if temp > 1e-3 and 'merged' not in str(pred_pth):
                    continue"""
                try:
                    with pred_pth.open() as f:
                        predictions[key] = json.load(f)
                    predictions_by_domain[domain] = predictions[key]
                    #print(pred_pth)
                except Exception as e:
                    print(f'Error while reading {pred_pth}')
                    print(e)
                mapped_file_pth = str(pred_pth).replace("_prediction_", "_mapped_prediction_").replace(".json",".npy")
                if key in mapped_predictions:
                    print('duplicated!!')
                    print(key)
                    print(mapped_file_pth)
                    dup_key = key
                    assert False
                try:
                    mapped_predictions[key] = np.load(mapped_file_pth, allow_pickle=True)
                    mapped_predictions_by_domain[domain] = mapped_predictions[key]
                    #print(f'loading @ {mapped_file_pth}')
                except Exception as e:
                    print(f'Error while reading {mapped_file_pth}')
                    print(e)
print(f"num data: {len(mapped_predictions)}")

searching @ ../outputs/FINAL_SGD_single_multi_sample/
num data: 96


## Load GT files and GT graphs (if available)

In [66]:
gt_labels = {}
# GT files
print(f'Finding GT @ {GT_DIR}')
for domain in domains:
    matching = list(Path(GT_DIR).glob(f'{domain}_labels.json'))
    if len(matching) == 0:
        warnings.warn(f'{domain}: GT labels not found!')
    else:
        pth = matching[0]
        with pth.open('r') as f:
            gt_labels[domain] = json.load(f)
        print(f'loaded @ {pth}')

Finding GT @ ../datasets/SGD/action_prediction_gt_labels_val_only
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Events_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Hotels_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/RentalCars_2_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Services_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Flights_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/RideSharing_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Restaurants_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Flights_2_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Movies_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Homes_1_labels.json
loaded @ ../datasets/SGD/action_prediction_gt_labels_val_only/Events_2_labels.json
load

In [67]:
from util.graph_utils import get_graph_sop
from util.eval_utils import validate_trajs

gt_graphs = {}
"""
for domain in domains:
    matching = list(Path(GT_GRAPH_DIR).glob(f'*{domain}_gt_graph.npy'))
    if len(matching) == 0:
        warnings.warn(f'{domain} GT graph not found!')
    else:
        pth = matching[0]
        gt_raw = np.load(str(pth), allow_pickle=True).item()
        all_acts, all_statuses = validate_trajs(gt_labels[domain], predictions_by_domain[domain])
        gt_sop = get_graph_sop(
            gt_raw,
            subtask_list=all_statuses,
            option_list=all_acts,
        )
        graphs.setdefault(domain, {}).setdefault('GT', gt_sop)"""
pass

## Load inferred graphs and neg precondition

In [68]:
def get_neg_precond_mat(neg_precond, all_acts, all_statuses):
    assert len(set(all_acts) - set(neg_precond['option_labels'])) == 0
    assert len(set(all_statuses) - set(neg_precond['subtask_labels'])) == 0
    op_inv_index = [neg_precond['option_labels'].index(s) for s in all_acts]
    su_inv_index = [neg_precond['subtask_labels'].index(s) for s in all_statuses]
    neg_precond_mat = np.stack(neg_precond['precondition_vectors'])
    neg_precond_mat = neg_precond_mat[op_inv_index, :]
    neg_precond_mat = neg_precond_mat[:, su_inv_index]
    return neg_precond_mat

In [69]:
def load_graphs(domains_list, root_dir, dataset, graph_alg_name):
    load_count = 0
    num_graph_per_domain = None
    graphs = {}
    for domain in domains_list:
        print(f'loading @ {root_dir}/{dataset}_{domain}*/*{graph_alg_name}*.npy')
        graph_algo_dict = {}
        matchings = list(Path(root_dir).glob(f'{dataset}_{domain}/*{graph_alg_name}*.npy'))
        if len(matchings) == 0:
            txt =f'{domain} inferred graph not found!'
            print(f'{root_dir}/{dataset}_{domain}*/*{graph_alg_name}*.npy')
            assert False, txt
        else:
            #matchings = [matchings[0]]
            if num_graph_per_domain is None:
                num_graph_per_domain = len(matchings)
            if num_graph_per_domain !=len(matchings):
                print(f'Error! number of graphs in {domains_list[0]} and {domain} are different: {num_graph_per_domain} != {len(matchings)}')
                assert False
            for matching in matchings:
                graph_path = str(matching)
                #print(f'loading inferred graph for {domain}')
                graph_raw = np.load(graph_path, allow_pickle=True).item()
                alg_name = graph_path.split('/')[-1].replace('.npy', '')
                all_acts, all_statuses = validate_trajs(gt_labels[domain], predictions_by_domain[domain])
                graph_sop = get_graph_sop(
                    graph_raw,
                    subtask_list=all_statuses,
                    option_list=all_acts,
                )
                graph_algo_dict[alg_name] = graph_sop
                load_count += 1
        graphs[domain] = graph_algo_dict
    return graphs, load_count

In [71]:
domains_list = list(domains)
#
graphs, load_count = load_graphs(domains_list, INF_GRAPH_DIR, DATASET, GRAPH_ALG_NAME)
print(f"Loaded {load_count} inferred CAN+SHDNT graphs from {len(domains)} domains")
#
#shd_sops, load_count = load_graphs(domains_list, SHD_DIR, DATASET, SHD_ALG_NAME)
#print(f"Loaded {load_count} inferred SHD graphs from {len(domains)} domains")
shd_sops = None
    

loading @ ../graphs/SGD_final/CSILP//SGD_Events_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Hotels_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_RentalCars_2*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Services_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Flights_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_RideSharing_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Restaurants_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Flights_2*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Movies_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Homes_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Events_2*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Music_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Calendar_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Banks_1*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SGD_Music_2*/*CSILP*.npy
loading @ ../graphs/SGD_final/CSILP//SG

In [72]:
print('Loaded graphs:', {domain: vals.keys() for domain, vals in graphs.items()})

Loaded graphs: {'Events_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Hotels_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'RentalCars_2': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Services_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Flights_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'RideSharing_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Restaurants_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Flights_2': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Movies_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Homes_1': dict_keys(['inferred_graph_CSILP_fgam

## Calculate metrics

In [73]:
# need to install multiprocess, a fork of multiprocessing (built in) because of this issue:
# https://stackoverflow.com/questions/8804830/python-multiprocessing-picklingerror-cant-pickle-type-function
!pip install multiprocess



In [74]:
from multiprocess import Pool# use multiprocessing to speed up evaluation!

In [83]:
# %%timeit -r 1 -n 1
from util.eval_utils import dact_traj_metrics_report, dact_traj_multi_sample_metrics_report, standardize_dact
from copy import deepcopy

def eval_job(args):
    pred_params, graph_params, traj = args
    domain, model, prompt_params, temp, sampling, seed = pred_params
    prompt_style, num_shot, use_mask_prompt = prompt_params
    is_multisampling = float(temp) > 0 and ('repeat' not in sampling)
    if domain not in gt_labels:
        print(f"domain not in gt_label: {domain}, {gt_labels}")
        return None
    graph_names, graph_tuples = graph_params
    graphs, neg_pcond_mats, should_sops = [], [], []
    for graph_tuple in graph_tuples:
        if graph_tuple is not None:
            graph, should_sop = graph_tuple
        else:
            graph, should_sop = None, None
        graphs.append(graph)
        neg_pcond_mats.append(None)
        should_sops.append(should_sop)
    
    if not isinstance(traj, tuple):
        gt_processed_label_tuple = tuple(traj)
    else:
        gt_processed_label_tuple = traj
    #print(f'In {pred_params} with multisampling={is_multisampling}')
    if is_multisampling:
        report_list = dact_traj_multi_sample_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, should_sops=should_sops, verbose=False)
    else:
        report_list = dact_traj_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, should_sops=should_sops, verbose=False)
    """
    try:
        #print(f'In {pred_params} with multisampling={is_multisampling}')
        if is_multisampling:
            report_list = dact_traj_multi_sample_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, should_sops=should_sops, verbose=False)
        else:
            report_list = dact_traj_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, verbose=False)
    except Exception as e:
        print(f'Error in {pred_params} {prompt_params} with multisampling={is_multisampling}')
        warnings.warn(f'Error in traj metrics: {e}')
        return None"""
    
    metrics_list = []
    for report, graph_name in zip(report_list, graph_names):
        stats = report['Predicted']
        post = report['post']
        metrics = {
            'domain': domain[:4]+domain[-1],
            'model': model,
            'prompt': prompt_style,
            'shot': num_shot,
            'use_mask_prompt': use_mask_prompt,
            'temp': temp,
            'sampling': sampling,
            'seed': seed,
            'graph': graph_name,    
            'precision': stats['precision'],
            'recall': stats['recall'],
            'f1': stats['f1-score'],
            'support': stats['support'],
            'postprocess': post
        }
        metrics_list.append(metrics)
    return metrics_list
    
jobs = []
for pred_params, traj_pred in mapped_predictions.items():
    domain, model, prompt_params, temp, sampling, seed = pred_params
    if shd_sops is not None:
        if temp > 0: # in case multi sampling, we cannot run without graph
            graph_list = list(graphs.get(pred_params[0], {}).items())
            shd_list = list(shd_sops.get(pred_params[0], {}).items())
        else:
            graph_list = [('(None)', None)] + list(graphs.get(pred_params[0], {}).items())
            shd_list = [('(None)', None)] + list(shd_sops.get(pred_params[0], {}).items())
        
        graph_names = [graph[0]+shd[0] for graph in graph_list for shd in shd_list]
        graph_tuples = [(graph[1], shd[1]) for graph in graph_list for shd in shd_list]
    else:
        if temp > 0: # in case multi sampling, we cannot run without graph
            graph_list = list(graphs.get(pred_params[0], {}).items())
        else:
            graph_list = [('(None)', None)] + list(graphs.get(pred_params[0], {}).items())
        graph_names = [graph[0] for graph in graph_list]
        graph_tuples = [(graph[1], None) for graph in graph_list]
    graph_params = (graph_names, graph_tuples)
    jobs.append((pred_params, graph_params, traj_pred))
print(f"# jobs={len(jobs)}")

with Pool(min(60, len(jobs))) as p:
    raw_metrics = [result for result in tqdm(p.imap(eval_job, jobs)) if result is not None]
#raw_metrics = [eval_job(job) for job in jobs]
metrics = []
for elem in raw_metrics:
    for metric_dict in elem:
        metrics.append(metric_dict)
print(f"output={len(metrics)}")

# jobs=96


96it [00:00, 124.98it/s]


In [76]:
metrics_df = pd.DataFrame(metrics)
#metrics_df.head()

### 1. Aggregated performance (averaged over schemas)

In [77]:
base_df = metrics_df.loc[metrics_df['graph'] == '(None)']
gpt_base_df = base_df.loc[base_df['model'] == 'gpt-turbo']
t5_base_df = base_df.loc[base_df['model'] == 'flan-t5-xxl']
base_performance = base_df['f1'].mean()
gpt_base_performance = gpt_base_df['f1'].mean()
t5_base_performance = t5_base_df['f1'].mean()
print(base_performance, gpt_base_performance, t5_base_performance)

0.643341730858631 0.7875129507787983 0.49917051093846404


In [78]:
display_df = metrics_df.pivot_table(index=rows, columns=columns, values='f1', aggfunc='mean')

In [79]:

if mode % 10 == 2 or mode % 10 == 3:
    new_metrics = []
    for metric_dict in metrics:
        hparam_str = metric_dict['graph']
        if hparam_str == '(None)' or (metric_dict['postprocess'] != 'max' and 't5' in metric_dict['model']) or (metric_dict['postprocess'] != 'None' and 'gpt' in metric_dict['model']):
            continue
        hparam_str = hparam_str[len("inferred_graph_"):].replace('.npy', '')
        tokens = hparam_str.split('_')
        if GRAPH_ALG_NAME == 'NAILP':
            #inferred_graph_NAILP_gam=0.9_eneg=2_cneg=2_pos=1.0_bw=5_bd=1_cp=0.0_dep=8_leaf=0.01_vth=0.5_vw=0.05
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'vth', 'vw']
        elif GRAPH_ALG_NAME == 'ACCILP':
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'mss', 'minp']
        elif GRAPH_ALG_NAME == 'VARILP': # inferred_graph_VARILP_gam=0.9_eneg=2_cneg=2_pos=1.0_bw=5_bd=2_cp=0.0_dep=8_leaf=0.01_sbias=0.1_mins=0.0
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'sbias', 'mins']
        elif GRAPH_ALG_NAME == 'CSILP': # inferred_graph_CSILP_fgam=0.6_bgam=0.6_cneg=0_pos=1.0_dep=8_leaf=0.01
            param_names = ['fgam', 'bgam', 'cneg', 'pos', 'dep', 'leaf']
        elif GRAPH_ALG_NAME == 'BCILP': # inferred_graph_BCILP_cneg=0_dep=8_leaf=0.01
            param_names = ['cneg', 'dep', 'leaf']
        elif GRAPH_ALG_NAME == 'CCAOILP': # inferred_graph_BCILP_cneg=0_dep=8_leaf=0.01
            param_names = ['th']
        elif GRAPH_ALG_NAME == 'ILP': # inferred_graph_BCILP_cneg=0_dep=8_leaf=0.01
            param_names = ['dep', 'leaf']
        hparam_dict = dict(algo=tokens[0])
        for par, token in zip(param_names, tokens[1:]):
            value = token.split('=')[1]
            try:
                hparam_dict[par] = int(value)
                continue
            except Exception as e:
                pass
            try:
                hparam_dict[par] = float(value)
                continue
            except Exception as e:
                pass
            hparam_dict[par] = value
        metric_dict.update(hparam_dict)
        new_metrics.append(metric_dict)
    print(f'[{GRAPH_ALG_NAME}-v{EXP_NUM} @{DATASET}] num runs: {len(new_metrics)}')
    new_metrics_df = pd.DataFrame(new_metrics)
    graph_columns = param_names[1::2]
    graph_rows = param_names[::2]
    df_agg = new_metrics_df.pivot_table(index=graph_rows, columns=graph_columns, values='f1', aggfunc='mean')
    df_agg -= base_performance
    display(df_agg.round(decimals=3))


[CSILP-v1 @SGD] num runs: 48


Unnamed: 0_level_0,Unnamed: 1_level_0,bgam,0.95
Unnamed: 0_level_1,Unnamed: 1_level_1,pos,3.0
Unnamed: 0_level_2,Unnamed: 1_level_2,leaf,0.0003
fgam,cneg,dep,Unnamed: 3_level_3
0.8,0,12,0.102


In [80]:
if mode % 10 == 2:
    for hparam_label in param_names:
        new_df = new_metrics_df.groupby(hparam_label).agg('mean', numeric_only=True)['f1'].to_frame()
        new_df -= base_performance # subtract no graph performance
        if len(new_df.index) == 1 or hparam_label == 'model':
            continue
        display_df = new_df.T.round(decimals=3)
        display(display_df)

In [81]:
mode = 3 # 2: SGD / 12: MW
EXP_NUM = "1"
SHD_EXP_NUM = "1"
GRAPH_ALG_NAME = 'CSILP' # NAILP ACCILP VARILP CSILP CCAOILP
FILTER="*"

In [82]:
graph_columns = ['model'] + param_names[1::2]
graph_columns = param_names[1::2]
graph_rows = ['model'] + param_names[::2]
df_agg2 = new_metrics_df.pivot_table(index=graph_rows, columns=graph_columns, values='f1', aggfunc='mean')
display(df_agg2.round(decimals=3))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bgam,0.95
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,pos,3.0
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,leaf,0.0003
model,fgam,cneg,dep,Unnamed: 4_level_3
flan-t5-xxl,0.8,0,12,0.656
gpt-turbo,0.8,0,12,0.835


### 2. schema-wise performance

In [None]:
from IPython.display import display
print(rows+columns)
display_df = metrics_df.pivot(index=rows+columns, columns=['domain'], values='f1')
display_df = display_df.round(3)
num_columns = len(display_df.columns)
if num_columns > 12:
    df1 = display_df.iloc[:, :num_columns//2]
    df2 = display_df.iloc[:, num_columns//2:]
    display(df1)
    display(df2)
else:
    display(display_df)


['model', 'graph', 'postprocess']


Unnamed: 0_level_0,Unnamed: 1_level_0,domain,Bank1,Buse1,Buse2,Cale1,Even1,Even2,Flig1,Flig2,Home1,Hote1,Hote2,Hote3
model,graph,postprocess,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
flan-t5-xxl,(None),,0.514,0.432,0.434,0.321,0.345,0.4,0.166,0.45,0.749,0.516,0.437,0.492
flan-t5-xxl,inferred_graph_CCAOILP_th=0.6,,0.529,0.422,0.492,0.314,0.339,0.394,0.179,0.53,0.759,0.512,0.431,0.503
flan-t5-xxl,inferred_graph_CCAOILP_th=0.6,max,0.545,0.384,0.442,0.429,0.266,0.353,0.244,0.511,0.662,0.402,0.383,0.368
flan-t5-xxl,inferred_graph_CCAOILP_th=0.7,,0.529,0.422,0.492,0.312,0.344,0.394,0.176,0.5,0.759,0.531,0.473,0.515
flan-t5-xxl,inferred_graph_CCAOILP_th=0.7,max,0.547,0.384,0.442,0.463,0.284,0.353,0.24,0.488,0.663,0.409,0.441,0.415
flan-t5-xxl,inferred_graph_CCAOILP_th=0.8,,0.525,0.435,0.492,0.312,0.337,0.394,0.176,0.491,0.759,0.535,0.485,0.515
flan-t5-xxl,inferred_graph_CCAOILP_th=0.8,max,0.523,0.385,0.442,0.463,0.271,0.362,0.242,0.478,0.663,0.409,0.445,0.415
flan-t5-xxl,inferred_graph_CCAOILP_th=0.85,,0.525,0.435,0.492,0.312,0.337,0.39,0.176,0.478,0.761,0.535,0.485,0.515
flan-t5-xxl,inferred_graph_CCAOILP_th=0.85,max,0.523,0.385,0.441,0.463,0.271,0.362,0.246,0.474,0.663,0.409,0.445,0.415
flan-t5-xxl,inferred_graph_CCAOILP_th=0.9,,0.525,0.419,0.48,0.312,0.337,0.39,0.176,0.478,0.761,0.535,0.446,0.515


Unnamed: 0_level_0,Unnamed: 1_level_0,domain,Medi1,Movi1,Musi1,Musi2,Rent1,Rent2,Rest1,Ride1,Ride2,Serv1,Serv2,Serv3
model,graph,postprocess,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
flan-t5-xxl,(None),,0.66,0.554,0.748,0.637,0.326,0.415,0.38,0.574,0.535,0.641,0.624,0.631
flan-t5-xxl,inferred_graph_CCAOILP_th=0.6,,0.645,0.594,0.748,0.634,0.327,0.28,0.4,0.574,0.535,0.618,0.634,0.648
flan-t5-xxl,inferred_graph_CCAOILP_th=0.6,max,0.585,0.565,0.759,0.667,0.315,0.287,0.265,0.477,0.596,0.433,0.429,0.518
flan-t5-xxl,inferred_graph_CCAOILP_th=0.7,,0.679,0.614,0.748,0.629,0.327,0.309,0.405,0.574,0.535,0.618,0.64,0.644
flan-t5-xxl,inferred_graph_CCAOILP_th=0.7,max,0.615,0.584,0.759,0.661,0.311,0.311,0.262,0.477,0.596,0.433,0.44,0.534
flan-t5-xxl,inferred_graph_CCAOILP_th=0.8,,0.737,0.615,0.748,0.629,0.327,0.418,0.416,0.574,0.535,0.663,0.645,0.643
flan-t5-xxl,inferred_graph_CCAOILP_th=0.8,max,0.658,0.587,0.759,0.661,0.311,0.403,0.271,0.477,0.596,0.478,0.44,0.538
flan-t5-xxl,inferred_graph_CCAOILP_th=0.85,,0.738,0.614,0.748,0.629,0.327,0.418,0.4,0.574,0.535,0.663,0.645,0.643
flan-t5-xxl,inferred_graph_CCAOILP_th=0.85,max,0.673,0.584,0.759,0.661,0.311,0.403,0.261,0.477,0.596,0.478,0.446,0.538
flan-t5-xxl,inferred_graph_CCAOILP_th=0.9,,0.705,0.614,0.748,0.629,0.327,0.418,0.413,0.574,0.535,0.659,0.645,0.651


### 3. Prec and Rec

In [None]:
display_df = metrics_df.pivot_table(index=rows, columns=columns, values=['precision', 'recall'], aggfunc='mean')
display_df.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,precision,recall,recall
Unnamed: 0_level_1,postprocess,None,max,None,max
model,graph,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
flan-t5-xxl,(None),0.57,,0.464,
flan-t5-xxl,inferred_graph_CCAOILP_th=0.6,0.611,0.475,0.441,0.439
flan-t5-xxl,inferred_graph_CCAOILP_th=0.7,0.619,0.487,0.445,0.446
flan-t5-xxl,inferred_graph_CCAOILP_th=0.8,0.626,0.492,0.455,0.455
flan-t5-xxl,inferred_graph_CCAOILP_th=0.85,0.625,0.493,0.454,0.455
flan-t5-xxl,inferred_graph_CCAOILP_th=0.9,0.621,0.491,0.45,0.452
gpt-turbo,(None),0.795,,0.783,
gpt-turbo,inferred_graph_CCAOILP_th=0.6,0.824,0.751,0.741,0.759
gpt-turbo,inferred_graph_CCAOILP_th=0.7,0.836,0.765,0.743,0.764
gpt-turbo,inferred_graph_CCAOILP_th=0.8,0.839,0.77,0.755,0.778


### 3. Paired t-test over all schema

In [None]:
display_df.query("graph == '(None)' and model == 'flan-t5-xxl'")

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,precision,recall,recall
Unnamed: 0_level_1,postprocess,None,max,None,max
model,graph,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
flan-t5-xxl,(None),0.569896,,0.464041,


In [None]:
# ref: https://pythonfordatascienceorg.wordpress.com/paired-samples-t-test-python/
from scipy import stats
if mode == 1: # with/without graph for gpt & T5 on SGD
    model='gpt-turbo' # 'flan-t5-xxl'
    df1 = display_df.query(f"graph == '(None)' and model == '{model}'")
    df1 = df1.squeeze()
    df2 = display_df.query(f"graph == 'RILP' and model == '{model}'")
    df2 = df2.squeeze()
else:
    df1 = display_df.loc[display_df['graph']=='(None)']
    df2 = display_df.loc[display_df['graph']=='RILP']
stat, pval = stats.ttest_rel(df1, df2)
print(f'Mean before={df1.mean()} Mean after={df2.mean()}')
if df1.mean() < df2.mean():
    change_text = "increased"
else:
    change_text = "decreased"
print(f'pval = {pval}')
if pval < 0.05:
    print(f'Statistically significant {change_text} by {df2.mean() - df1.mean()}!')
else:
    print('Not significant')

KeyError: 'graph'

In [None]:
t5_models=['flan-t5-xxl', 't5-xxl-lm-adapt']
mterics_t5 = metrics_df[metrics_df["model"].isin(t5_models)]
mterics_t5_5shot = mterics_t5[mterics_t5["shot"]==5]
display_df = mterics_t5_5shot.pivot(index=['model', 'graph'], columns=['domain'], values='f1')
rounded_df = display_df.round(decimals=3)
rounded_df

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
print(metrics_df.pivot(index=['model', 'graph', 'shot'], columns=['domain'], values='f1').to_latex())


ValueError: Index contains duplicate entries, cannot reshape