In [37]:
%load_ext autoreload
%autoreload 2
!cd ../src
import sys  
sys.path.insert(0, '../src')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
#
DATASET = 'SGD' if mode <10 else 'MultiWOZ'
SPLIT = 'val' if DATASET == 'SGD' else 'test'
GT_DIR = f'../datasets/{DATASET}/action_prediction_gt_labels_{SPLIT}_only'
GT_GRAPH_DIR = f'../datasets/{DATASET}/gt_graph'
INF_GRAPH_DIR = f'../datasets/{DATASET}/inferred_graph'
FILTER="*"
rows = ['model', 'graph']
columns = ['postprocess']
USE_NEG_PCOND = False

if mode % 10 == 1:
    # Graph method comparison on SGD on all the models w/ single/multi sampling.
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
    GRAPH_ALG_NAMES = ["MaxRILP", "SKRILP"]
elif mode % 10 == 2:
    INF_GRAPH_DIR = f'../datasets/{DATASET}/{GRAPH_ALG_NAME}_hparam_search/trial{EXP_NUM}'
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
elif mode % 10 == 3:
    USE_NEG_PCOND = True
    GRAPH_ALG_NAME = "SKRILP"
    INF_GRAPH_DIR = f'../datasets/{DATASET}/inferred_graph'
    PREDICTIONS_DIR = f'../outputs/FINAL_{DATASET}_single_multi_sample/'
    NEG_HPARAM="b1e-03_th3e-05"
    

In [6]:
from pathlib import Path
import json
import os
import warnings
from tqdm import tqdm

import numpy as np
import pandas as pd

## Find and organize the predictions by the models

In [93]:
domains = set()
models = set()
prompts = set()

In [77]:
len(list(Path(PREDICTIONS_DIR).glob(FILTER)))

96

In [94]:
predictions, mapped_predictions = {}, {}
predictions_by_domain, mapped_predictions_by_domain = {}, {}
for pth in Path(PREDICTIONS_DIR).glob(FILTER):
    for config_pth in pth.glob('config*.json'):
        if 'merged' in str(config_pth):
            continue
        print(config_pth)
        try:
            with config_pth.open() as cf:
                config = json.load(cf)
        except Exception as e:
            print(f'Error while reading {config_pth}')
            print(e)
        for seed in config['seed']:
            for pred_pth in config_pth.parent.glob(f'DM_prediction_S{seed}.json'):
                dataset = config['dataset']
                if '_trajectories.json' in config['traj_path']:
                    domain = config['traj_path'].rsplit('/', 1)[-1].split('_trajectories.json')[0]
                else:
                    domain = config['traj_path'].rsplit('/', 1)[-1].split('.json')[0]
                model = config['model']
                prompt = (config['prompt_style'], config['num_shot'], config['use_mask_prompt'])
                temp = 0.0 if 'temperature' not in config else config['temperature']
                sampling = 'multi' if 'sampling' not in config else config['sampling']
                domains.add(domain)
                models.add(model)
                prompts.add(prompt)
                key = (domain, model, prompt, temp, sampling, seed)
                if temp > 1e-3 and 'merged' not in str(pred_pth):
                    continue
                try:
                    with pred_pth.open() as f:
                        predictions[key] = json.load(f)
                    predictions_by_domain[domain] = predictions[key]
                    #print(pred_pth)
                except Exception as e:
                    print(f'Error while reading {pred_pth}')
                    print(e)
                mapped_file_pth = str(pred_pth).replace("_prediction_", "_mapped_prediction_").replace(".json",".npy")
                try:
                    mapped_predictions[key] = np.load(mapped_file_pth, allow_pickle=True)
                    mapped_predictions_by_domain[domain] = mapped_predictions[key]
                except Exception as e:
                    print(f'Error while reading {mapped_file_pth}')
                    print(e)
print(f"num data: {len(mapped_predictions)}")

../outputs/FINAL_MultiWOZ_single_multi_sample/Taxi_gpt-turbo_entire-concise_5shot_T0_0515/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Attraction_flan-t5-xxl_entire-concise_5shot_T1_0524/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Hotel_flan-t5-xxl_entire-concise_5shot_T1_0524/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Hotel_flan-t5-xxl_entire-concise_5shot_T0_0510/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Taxi_gpt-turbo_entire-concise_5shot_T1_0524/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Taxi_flan-t5-xxl_entire-concise_5shot_T1_0524/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Train_gpt-turbo_entire-concise_5shot_T1_0524/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Hotel+Train_gpt-turbo_entire-concise_5shot_T1_0619/config_S1636423.json
../outputs/FINAL_MultiWOZ_single_multi_sample/Attraction_gpt-turbo_entire-concise_5shot_

## Load GT files and GT graphs (if available)

In [79]:
gt_labels = {}
# GT files
for domain in domains:
    matching = list(Path(GT_DIR).glob(f'{domain}_labels.json'))
    if len(matching) == 0:
        warnings.warn(f'{domain}: GT labels not found!')
    else:
        pth = matching[0]
        with pth.open('r') as f:
            gt_labels[domain] = json.load(f)

In [81]:
from util.graph_utils import get_graph_sop
from util.eval_utils import validate_trajs

gt_graphs = {}
"""
for domain in domains:
    matching = list(Path(GT_GRAPH_DIR).glob(f'*{domain}_gt_graph.npy'))
    if len(matching) == 0:
        warnings.warn(f'{domain} GT graph not found!')
    else:
        pth = matching[0]
        gt_raw = np.load(str(pth), allow_pickle=True).item()
        all_acts, all_statuses = validate_trajs(gt_labels[domain], predictions_by_domain[domain])
        gt_sop = get_graph_sop(
            gt_raw,
            subtask_list=all_statuses,
            option_list=all_acts,
        )
        graphs.setdefault(domain, {}).setdefault('GT', gt_sop)"""
#pass



## Load inferred graphs and neg precondition

In [80]:
def get_neg_precond_mat(neg_precond, all_acts, all_statuses):
    assert len(set(all_acts) - set(neg_precond['option_labels'])) == 0
    assert len(set(all_statuses) - set(neg_precond['subtask_labels'])) == 0
    op_inv_index = [neg_precond['option_labels'].index(s) for s in all_acts]
    su_inv_index = [neg_precond['subtask_labels'].index(s) for s in all_statuses]
    neg_precond_mat = np.stack(neg_precond['precondition_vectors'])
    neg_precond_mat = neg_precond_mat[op_inv_index, :]
    neg_precond_mat = neg_precond_mat[:, su_inv_index]
    return neg_precond_mat

In [82]:
load_count = 0
num_graph_per_domain = None
graphs = {}
domains_list = list(domains)
for domain in domains_list:
    # print(f'loading @ {DATASET}_{domain}*/*{GRAPH_ALG_NAME}*.npy')
    graph_algo_dict = {}
    matchings = list(Path(INF_GRAPH_DIR).glob(f'{DATASET}_{domain}*/*{GRAPH_ALG_NAME}*.npy'))
    if len(matchings) == 0:
        txt =f'{domain} inferred graph not found!'
        print(f'{INF_GRAPH_DIR}/{DATASET}_{domain}*/*{GRAPH_ALG_NAME}*.npy')
        warnings.warn(txt)
    else:
        #matchings = [matchings[0]]
        if num_graph_per_domain is None:
            num_graph_per_domain = len(matchings)
        if num_graph_per_domain !=len(matchings):
            print(f'Error! number of graphs in {domains_list[0]} and {domain} are different: {num_graph_per_domain} != {len(matchings)}')
            assert False
        for matching in matchings:
            graph_path = str(matching)
            #print(f'loading inferred graph for {domain}')
            graph_raw = np.load(graph_path, allow_pickle=True).item()
            alg_name = graph_path.split('/')[-1].replace('.npy', '')
            all_acts, all_statuses = validate_trajs(gt_labels[domain], predictions_by_domain[domain])
            graph_sop = get_graph_sop(
                graph_raw,
                subtask_list=all_statuses,
                option_list=all_acts,
            )
            if USE_NEG_PCOND:
                alg_name = alg_name + '+NEG'
                neg_precond_path = graph_path[:graph_path.rfind('/')] + f"/inferred_negative_precondition_{NEG_HPARAM}.json"
                neg_precond = json.load(open(neg_precond_path))
                neg_pcond_mat = get_neg_precond_mat(neg_precond, all_acts, all_statuses)
            else:
                neg_pcond_mat = None
            graph_algo_dict[alg_name] = (graph_sop, neg_pcond_mat)
            load_count += 1
    graphs[domain] = graph_algo_dict
print(f"Loaded {load_count} inferred graphs from {len(domains)} domains")

Loaded 24 inferred graphs from 24 domains


In [83]:
print('Loaded graphs:', {domain: vals.keys() for domain, vals in graphs.items()})

Loaded graphs: {'Services_2': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Calendar_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'RentalCars_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'RentalCars_2': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Buses_2': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Music_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Hotels_3': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Homes_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Movies_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003']), 'Events_1': dict_keys(['inferred_graph_CSILP_fgam=0.8_bga

## Calculate metrics

In [22]:
# need to install multiprocess, a fork of multiprocessing (built in) because of this issue:
# https://stackoverflow.com/questions/8804830/python-multiprocessing-picklingerror-cant-pickle-type-function
!pip install multiprocess

Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 5.2 MB/s eta 0:00:01
[?25hCollecting dill>=0.3.6
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[K     |████████████████████████████████| 110 kB 38.4 MB/s eta 0:00:01
[?25hInstalling collected packages: dill, multiprocess
Successfully installed dill-0.3.6 multiprocess-0.70.14


In [84]:
from multiprocess import Pool# use multiprocessing to speed up evaluation!

In [85]:
# %%timeit -r 1 -n 1
from util.eval_utils import dact_traj_metrics_report, dact_traj_multi_sample_metrics_report, standardize_dact
from copy import deepcopy

def eval_job(args):
    pred_params, graph_params, traj = args
    domain, model, prompt_params, temp, sampling, seed = pred_params
    prompt_style, num_shot, use_mask_prompt = prompt_params
    is_multisampling = float(temp) > 0 and ('repeat' not in sampling)
    if domain not in gt_labels:
        print(f"domain not in gt_label: {domain}, {gt_labels}")
        return None
    graph_names, graph_tuples = graph_params
    graphs, neg_pcond_mats = [], []
    for graph_tuple in graph_tuples:
        if graph_tuple is not None:
            graph, neg_pcond_mat = graph_tuple
        else:
            graph, neg_pcond_mat = None, None
        graphs.append(graph)
        neg_pcond_mats.append(neg_pcond_mat)
    
    if not isinstance(traj, tuple):
        gt_processed_label_tuple = tuple(traj)
    else:
        gt_processed_label_tuple = traj
    try:
        #print(f'In {pred_params} with multisampling={is_multisampling}')
        if is_multisampling:
            report_list = dact_traj_multi_sample_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, verbose=False)
        else:
            report_list = dact_traj_metrics_report(*gt_processed_label_tuple, graph_sop=graphs, neg_precond_mat=neg_pcond_mats, verbose=False)
    except Exception as e:
        print(f'Error in {pred_params} {prompt_params} with multisampling={is_multisampling}')
        warnings.warn(f'Error in traj metrics: {e}')
        return None
    
    metrics_list = []
    for report, graph_name in zip(report_list, graph_names):
        stats = report['Predicted']
        post = report['post']
        metrics = {
            'domain': domain[:4]+domain[-1],
            'model': model,
            'prompt': prompt_style,
            'shot': num_shot,
            'use_mask_prompt': use_mask_prompt,
            'temp': temp,
            'sampling': sampling,
            'seed': seed,
            'graph': graph_name,    
            'precision': stats['precision'],
            'recall': stats['recall'],
            'f1': stats['f1-score'],
            'support': stats['support'],
            'postprocess': post
        }
        metrics_list.append(metrics)
    return metrics_list
    
jobs = []
for pred_params, traj_pred in mapped_predictions.items():
    domain, model, prompt_params, temp, sampling, seed = pred_params
    if temp > 0: # in case multi sampling, we cannot run without graph
        graph_list = list(graphs.get(pred_params[0], {}).items())
    else:
        graph_list = [('(None)', None)] + list(graphs.get(pred_params[0], {}).items())
    graph_names = [graph[0] for graph in graph_list]
    graph_tuples = [graph[1] for graph in graph_list]
    graph_params = (graph_names, graph_tuples)
    jobs.append((pred_params, graph_params, traj_pred))
print(f"# jobs={len(jobs)}")
with Pool(min(60, len(jobs))) as p:
    raw_metrics = [result for result in tqdm(p.imap(eval_job, jobs)) if result is not None]
#raw_metrics = [eval_job(job) for job in jobs]
metrics = []
for elem in raw_metrics:
    for metric_dict in elem:
        metrics.append(metric_dict)
print(f"output={len(metrics)}")

# jobs=48


48it [00:00, 236.57it/s]

output=96





In [86]:
metrics_df = pd.DataFrame(metrics)
#metrics_df.head()

### 1. Aggregated performance (averaged over schemas)

In [87]:
display_df = metrics_df.pivot_table(index=rows, columns=columns, values='f1', aggfunc='mean')
#display_df.round(3)

In [89]:
display_df

Unnamed: 0_level_0,postprocess,None
model,graph,Unnamed: 2_level_1
flan-t5-xxl,(None),0.499171
flan-t5-xxl,inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003,0.58399
gpt-turbo,(None),0.787513
gpt-turbo,inferred_graph_CSILP_fgam=0.8_bgam=0.95_cneg=0_pos=3.0_dep=12_leaf=0.0003,0.835492


In [27]:
if mode == 12 or mode == 2:
    new_metrics = []
    for metric_dict in metrics:
        hparam_str = metric_dict['graph']
        if hparam_str == '(None)' or (metric_dict['postprocess'] != 'max' and 't5' in metric_dict['model']) or (metric_dict['postprocess'] != 'None' and 'gpt' in metric_dict['model']):
            continue
        hparam_str = hparam_str[len("inferred_graph_"):].replace('.npy', '')
        tokens = hparam_str.split('_')
        if GRAPH_ALG_NAME == 'NAILP':
            #inferred_graph_NAILP_gam=0.9_eneg=2_cneg=2_pos=1.0_bw=5_bd=1_cp=0.0_dep=8_leaf=0.01_vth=0.5_vw=0.05
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'vth', 'vw']
        elif GRAPH_ALG_NAME == 'ACCILP':
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'mss', 'minp']
        elif GRAPH_ALG_NAME == 'VARILP': # inferred_graph_VARILP_gam=0.9_eneg=2_cneg=2_pos=1.0_bw=5_bd=2_cp=0.0_dep=8_leaf=0.01_sbias=0.1_mins=0.0
            param_names = ['gamma', 'eneg', 'cneg', 'pos', 'bw', 'bd', 'cmplx', 'dep', 'leaf', 'sbias', 'mins']
        elif GRAPH_ALG_NAME == 'CSILP': # inferred_graph_CSILP_fgam=0.6_bgam=0.6_cneg=0_pos=1.0_dep=8_leaf=0.01
            param_names = ['fgam', 'bgam', 'cneg', 'pos', 'dep', 'leaf']
        elif GRAPH_ALG_NAME == 'BCILP': # inferred_graph_BCILP_cneg=0_dep=8_leaf=0.01
            param_names = ['cneg', 'dep', 'leaf']
        elif GRAPH_ALG_NAME == 'ILP': # inferred_graph_BCILP_cneg=0_dep=8_leaf=0.01
            param_names = ['dep', 'leaf']
        hparam_dict = dict(algo=tokens[0])
        for par, token in zip(param_names, tokens[1:]):
            hparam_dict[par] = token.split('=')[1]  
        metric_dict.update(hparam_dict)
        new_metrics.append(metric_dict)
    print(f'[{GRAPH_ALG_NAME}-v{EXP_NUM} @{DATASET}] num runs: {len(new_metrics)}')
    new_metrics_df = pd.DataFrame(new_metrics)
    graph_columns = param_names[1::2]
    graph_rows = param_names[::2]
    df_agg = new_metrics_df.pivot_table(index=graph_rows, columns=graph_columns, values='f1', aggfunc='mean')
    if mode >= 10:
        df_agg = df_agg - 0.3715 # subtract SKRILP performance
    else:
        df_agg = df_agg - 0.679 # subtract SKRILP performance
    display(df_agg.round(decimals=3))


In [28]:
if mode % 10 == 2:
    for hparam_label in param_names:
        new_df = new_metrics_df.groupby(hparam_label).agg('mean', numeric_only=True)['f1'].to_frame()
        if mode >=10:
            new_df = new_df - 0.3715 # subtract SKRILP performance
        else:
            new_df = new_df - 0.679 # subtract SKRILP performance
        if len(new_df.index) == 1 or hparam_label == 'model':
            continue
        display(new_df.T.round(decimals=3))

In [None]:
mode = 12 # 2: SGD / 12: MW
EXP_NUM = 17
GRAPH_ALG_NAME = 'CSILP' # NAILP ACCILP VARILP CSILP

In [88]:
graph_columns = ['model'] + param_names[1::2]
graph_columns = param_names[1::2]
graph_rows = ['model'] + param_names[::2]
df_agg2 = new_metrics_df.pivot_table(index=graph_rows, columns=graph_columns, values='f1', aggfunc='mean')
display(df_agg2.round(decimals=3))

NameError: name 'param_names' is not defined

### 2. schema-wise performance

In [None]:
from IPython.display import display
display_df = metrics_df.pivot(index=rows+columns, columns=['domain'], values='f1')
display_df = display_df.round(3)
num_columns = len(display_df.columns)
if num_columns > 12:
    df1 = display_df.iloc[:, :num_columns//2]
    df2 = display_df.iloc[:, num_columns//2:]
    display(df1)
    display(df2)
else:
    display(display_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,domain,Attrn,Hotel,Restt,Taxii,Train
model,graph,postprocess,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
flan-t5-xxl,(None),,0.495,0.16,0.242,0.026,0.085
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=3_leaf=0.01,,0.509,0.128,0.223,0.026,0.053
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=3_leaf=0.01,max,0.503,0.244,0.364,0.197,0.184
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=4_leaf=0.01,,0.526,0.12,0.207,0.026,0.048
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=4_leaf=0.01,max,0.527,0.251,0.37,0.232,0.193
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=5_leaf=0.01,,0.543,0.116,0.212,0.026,0.044
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=5_leaf=0.01,max,0.581,0.265,0.372,0.238,0.173
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=6_leaf=0.01,,0.543,0.109,0.213,0.026,0.044
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=6_leaf=0.01,max,0.601,0.256,0.379,0.234,0.178
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=4.0_dep=3_leaf=0.01,,0.549,0.115,0.223,0.026,0.049


### 3. Prec and Rec

In [None]:
display_df = metrics_df.pivot_table(index=rows, columns=columns, values=['precision', 'recall'], aggfunc='mean')
display_df.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,precision,recall,recall
Unnamed: 0_level_1,postprocess,None,max,None,max
model,graph,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
flan-t5-xxl,(None),0.388,,0.164,
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=3_leaf=0.01,0.463,0.372,0.136,0.25
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=4_leaf=0.01,0.491,0.426,0.135,0.251
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=5_leaf=0.01,0.515,0.452,0.139,0.258
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=3.0_dep=6_leaf=0.01,0.506,0.452,0.138,0.263
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=4.0_dep=3_leaf=0.01,0.475,0.382,0.143,0.263
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=4.0_dep=4_leaf=0.01,0.495,0.435,0.138,0.263
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=4.0_dep=5_leaf=0.01,0.503,0.474,0.133,0.281
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=4.0_dep=6_leaf=0.01,0.509,0.475,0.138,0.298
flan-t5-xxl,inferred_graph_CSILP_fgam=0.95_bgam=0.9_cneg=4_pos=5.0_dep=3_leaf=0.01,0.488,0.397,0.142,0.266


### 3. Paired t-test over all schema

In [None]:
display_df.query("graph == '(None)' and model == 'flan-t5-xxl'")

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,precision,recall,recall
Unnamed: 0_level_1,postprocess,None,max,None,max
model,graph,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
flan-t5-xxl,(None),0.387579,,0.16351,


In [None]:
# ref: https://pythonfordatascienceorg.wordpress.com/paired-samples-t-test-python/
from scipy import stats
if mode == 1: # with/without graph for gpt & T5 on SGD
    model='gpt-turbo' # 'flan-t5-xxl'
    df1 = display_df.query(f"graph == '(None)' and model == '{model}'")
    df1 = df1.squeeze()
    df2 = display_df.query(f"graph == 'RILP' and model == '{model}'")
    df2 = df2.squeeze()
else:
    df1 = display_df.loc[display_df['graph']=='(None)']
    df2 = display_df.loc[display_df['graph']=='RILP']
stat, pval = stats.ttest_rel(df1, df2)
print(f'Mean before={df1.mean()} Mean after={df2.mean()}')
if df1.mean() < df2.mean():
    change_text = "increased"
else:
    change_text = "decreased"
print(f'pval = {pval}')
if pval < 0.05:
    print(f'Statistically significant {change_text} by {df2.mean() - df1.mean()}!')
else:
    print('Not significant')

KeyError: 'graph'

In [None]:
t5_models=['flan-t5-xxl', 't5-xxl-lm-adapt']
mterics_t5 = metrics_df[metrics_df["model"].isin(t5_models)]
mterics_t5_5shot = mterics_t5[mterics_t5["shot"]==5]
display_df = mterics_t5_5shot.pivot(index=['model', 'graph'], columns=['domain'], values='f1')
rounded_df = display_df.round(decimals=3)
rounded_df

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
print(metrics_df.pivot(index=['model', 'graph', 'shot'], columns=['domain'], values='f1').to_latex())


ValueError: Index contains duplicate entries, cannot reshape