## Zero Shot Evaluation

In [4]:
import sys
import os
sys.path.append('..')

from Parser import parse_LLM_output
from evaluate_tasks import *
import json
from nltk.sem.logic import *
import nltk
from nltk.sem.logic import LogicParser, Expression
from nltk.sem.evaluate import Valuation, Model
import pandas as pd

## Evaluation (via Parsers)

In [5]:
# Score every zero-shot result file and collect one summary record per file.
#
# Result files are expected to be named "<model>_<task>.json"; the task part
# picks which evaluator (star-imported from evaluate_tasks) scores the file.
# Side effects: each scored file is rewritten in place with two extra columns,
# 'Correct' and 'Gibberish', and the file name and its accuracy are printed.
RESULTS_DIR = '../results/zeroshot-eval'
TASK_EVALUATORS = {
    "task1": eval_task1,
    "task2": eval_task2,
    "task3": eval_task3,
}

files = [f for f in os.listdir(RESULTS_DIR) if f.endswith('.json')]
results = []
for f in files:
    print(f)
    names = f.split('_')
    if len(names) < 2:
        # No '_' separator, so there is no task part to dispatch on.
        print(f"Skipping {f}: expected a '<model>_<task>.json' file name")
        continue
    model_name = names[0]
    task_name = os.path.splitext(names[1])[0]

    evaluator = TASK_EVALUATORS.get(task_name)
    if evaluator is None:
        # Without this guard the loop would fall through and reuse the
        # previous file's scores (or hit a NameError on the first file) and
        # then write those unrelated scores into this file.
        print(f"Skipping {f}: no evaluator for task '{task_name}'")
        continue

    file_path = os.path.join(RESULTS_DIR, f)
    dataset = pd.read_json(file_path)

    # Each evaluator returns two per-row sequences that are stored as columns;
    # presumably 'gibberish' flags outputs that could not be parsed -- TODO
    # confirm against evaluate_tasks.
    correctIncorrect, gibberish = evaluator(dataset)

    # add two new columns to df and change original file
    dataset['Correct'] = correctIncorrect
    dataset['Gibberish'] = gibberish
    dataset.to_json(file_path)

    # calculate overall acc + acc without gibberish
    total = len(correctIncorrect)
    num_correct = sum(correctIncorrect)
    # An empty result file scores 0.0 instead of raising ZeroDivisionError.
    accuracy = num_correct / total if total else 0.0
    print(accuracy)

    # Same numerator as the overall accuracy, but rows flagged as gibberish
    # are removed from the denominator; guard the all-gibberish case.
    non_gibberish = total - sum(gibberish)
    if accuracy > 0.0 and non_gibberish > 0:
        accuracyNoGibberish = num_correct / non_gibberish
    else:
        accuracyNoGibberish = 0.0

    results.append({'Task': task_name, 'Model': model_name, 'Accuracy': accuracy, 'AccuracyNoGibberish': accuracyNoGibberish})



Llama-2-13b-chat-hf_task2.json
0.0
orca-13b_task3.json
0.474
falcon-7b_task3.json
0.027
Wizard-15b_task3.json
0.49
Wizard-15b_task2.json
0.0
falcon-7b_task2.json
0.0
orca-13b_task2.json
0.0
Llama-2-13b-chat-hf_task3.json
0.401
flan-ul2_task3.json
0.513
flan-ul2_task2.json
0.0
flan-ul2_task1.json
0.0
Wizard-15b_task1.json
0.0
falcon-7b_task1.json
0.0
orca-13b_task1.json
0.0
Llama-2-13b-chat-hf_task1.json
0.0


In [6]:
# Show the raw per-file records collected by the evaluation loop: one dict per
# result file with the keys 'Task', 'Model', 'Accuracy' and 'AccuracyNoGibberish'.
results

[{'Task': 'task2',
  'Model': 'Llama-2-13b-chat-hf',
  'Accuracy': 0.0,
  'AccuracyNoGibberish': 0.0},
 {'Task': 'task3',
  'Model': 'orca-13b',
  'Accuracy': 0.474,
  'AccuracyNoGibberish': 0.474},
 {'Task': 'task3',
  'Model': 'falcon-7b',
  'Accuracy': 0.027,
  'AccuracyNoGibberish': 0.02710843373493976},
 {'Task': 'task3',
  'Model': 'Wizard-15b',
  'Accuracy': 0.49,
  'AccuracyNoGibberish': 0.49},
 {'Task': 'task2',
  'Model': 'Wizard-15b',
  'Accuracy': 0.0,
  'AccuracyNoGibberish': 0.0},
 {'Task': 'task2',
  'Model': 'falcon-7b',
  'Accuracy': 0.0,
  'AccuracyNoGibberish': 0.0},
 {'Task': 'task2',
  'Model': 'orca-13b',
  'Accuracy': 0.0,
  'AccuracyNoGibberish': 0.0},
 {'Task': 'task3',
  'Model': 'Llama-2-13b-chat-hf',
  'Accuracy': 0.401,
  'AccuracyNoGibberish': 0.46845794392523366},
 {'Task': 'task3',
  'Model': 'flan-ul2',
  'Accuracy': 0.513,
  'AccuracyNoGibberish': 0.513},
 {'Task': 'task2',
  'Model': 'flan-ul2',
  'Accuracy': 0.0,
  'AccuracyNoGibberish': 0.0},
 {'Tas

## Table

#### Table / Summary:

In [7]:
# Reshape the per-file records into one row per model, with a two-level column
# header: metric ('Accuracy' / 'AccuracyNoGibberish') on top, task underneath.
summary_df = (
    pd.DataFrame(results, columns=['Task', 'Model', 'Accuracy', 'AccuracyNoGibberish'])
    .pivot(index='Model', columns='Task', values=['Accuracy', 'AccuracyNoGibberish'])
)

display(summary_df)

Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,AccuracyNoGibberish,AccuracyNoGibberish,AccuracyNoGibberish
Task,task1,task2,task3,task1,task2,task3
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Llama-2-13b-chat-hf,0.0,0.0,0.401,0.0,0.0,0.468458
Wizard-15b,0.0,0.0,0.49,0.0,0.0,0.49
falcon-7b,0.0,0.0,0.027,0.0,0.0,0.027108
flan-ul2,0.0,0.0,0.513,0.0,0.0,0.513
orca-13b,0.0,0.0,0.474,0.0,0.0,0.474
