In [2]:
import os
import json
from collections import defaultdict

In [3]:
def compare_files(completion_path, task_path):
    """Grade one saved model completion against its task's expected output.

    The expected answer is ``task['test'][0]['output']`` from the task JSON.
    The predicted answer is the text found in the completion file between the
    first ``"GPT4 Correct Assistant:"`` marker and the first ``"]]"`` after it
    (inclusive), with surrounding whitespace stripped. The expected answer is
    always printed; the extracted answer is printed when one is found.

    Args:
        completion_path: Path to the text file containing the completion.
        task_path: Path to the task JSON file.

    Returns:
        One of the strings:
        ``"Corrupted Output"``  - the marker is not present in the completion;
        ``"Incomplete Output"`` - no ``"]]"`` follows the marker;
        ``"Correct"`` / ``"Incorrect"`` - the extracted answer does / does not
        match the expected output.
    """
    marker = "GPT4 Correct Assistant:"

    with open(completion_path, 'r') as f:
        completion = f.read()
    with open(task_path, 'r') as f:
        task = json.load(f)

    expected_output = task['test'][0]['output']
    print("Expected:", expected_output)

    start_ix = completion.find(marker)
    if start_ix == -1:
        return "Corrupted Output"
    start_ix += len(marker)

    end_ix = completion.find("]]", start_ix)
    if end_ix == -1:
        return "Incomplete Output"

    # +2 keeps the closing "]]" in the extracted answer.
    output = completion[start_ix:end_ix + 2].strip()
    print("Actual:  ", output)

    # Compare the parsed grid rather than its text, so an answer that differs
    # from str(expected_output) only in spacing or line breaks is not marked
    # wrong. Text that is not valid JSON falls back to the exact string match.
    try:
        is_match = json.loads(output) == expected_output
    except ValueError:
        is_match = output == str(expected_output)
    return "Correct" if is_match else "Incorrect"
        

In [4]:
def compare_results(completions_path, tasks_path):
    """Grade every task in a directory and tally the outcomes.

    For each ``*.json`` file in ``tasks_path`` (in sorted name order), the
    completion with the same name but a ``.txt`` extension is looked up in
    ``completions_path`` and graded with ``compare_files``. The task file name
    and its outcome are printed as each task is processed.

    Args:
        completions_path: Directory containing the ``.txt`` completions.
        tasks_path: Directory containing the ``.json`` task files.

    Returns:
        A ``defaultdict(int)`` mapping each outcome string returned by
        ``compare_files`` to the number of tasks with that outcome.
    """
    tally = defaultdict(int)
    task_files = sorted(
        name for name in os.listdir(tasks_path) if name.endswith(".json")
    )

    for task_file in task_files:
        print(task_file)
        completion_file = task_file.replace(".json", ".txt")
        verdict = compare_files(
            os.path.join(completions_path, completion_file),
            os.path.join(tasks_path, task_file),
        )
        tally[verdict] += 1
        print(f"{verdict}\n")

    return tally


# Fine-Tuned Model

## Evaluation dataset.

In [46]:
# Grade the fine-tuned model's completions on the evaluation tasks.
# compare_results prints each task's expected/actual grids and returns the outcome counts.
results = compare_results('fine_tuned_results/evaluation', 'data/evaluation')

00576224.json
Expected: [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]
Actual:   [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 7, 2, 7, 2, 7], [8, 3, 8, 3, 8, 3], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3]]
Incorrect

009d5c81.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 0, 7, 0, 0, 0, 7, 0, 7, 0, 7], [0, 0, 0, 0, 0, 7, 0, 7, 0, 7, 0, 0, 0, 7], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Actual:   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 3

In [47]:
# Outcome counts for the fine-tuned model on the evaluation tasks.
results

defaultdict(int, {'Incorrect': 296, 'Incomplete Output': 96, 'Correct': 8})

In [48]:
# Share of correct answers among the completions that produced a full answer:
# "Incomplete Output" (and any "Corrupted Output") results are not in the denominator.
# NOTE(review): confirm this is the intended metric rather than Correct / all tasks.
results["Correct"] / (results["Correct"] + results["Incorrect"])

0.02631578947368421

## Train dataset.

In [49]:
# Grade the fine-tuned model's completions on the training tasks.
# compare_results prints each task's expected/actual grids and returns the outcome counts.
results = compare_results('fine_tuned_results/training', 'data/training')

007bbfb7.json
Expected: [[7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 7, 0, 7, 7, 0, 0, 0, 0]]
Actual:   [[7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 7, 0, 0, 7, 7, 0, 0, 0], [7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 7, 0, 0, 7, 7, 0, 0, 0], [7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 0, 7, 7, 0, 7, 7, 0, 7], [7, 7, 0, 0, 7, 7, 0, 0, 0]]
Incorrect

00d62c1b.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 3, 4, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 3, 4, 4, 4, 4, 3, 4, 4, 3, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [50]:
# Outcome counts for the fine-tuned model on the training tasks.
results

defaultdict(int, {'Incorrect': 244, 'Correct': 125, 'Incomplete Output': 31})

In [51]:
# Share of correct answers among the completions that produced a full answer:
# "Incomplete Output" (and any "Corrupted Output") results are not in the denominator.
# NOTE(review): confirm this is the intended metric rather than Correct / all tasks.
results["Correct"] / (results["Correct"] + results["Incorrect"])

0.33875338753387535

## ConceptARC dataset.

In [52]:
# Category names are the immediate subdirectories of the ConceptARC results folder:
# the first tuple yielded by os.walk is (dirpath, dirnames, filenames) for the top directory.
categories = next(os.walk("fine_tuned_results/ConceptARC"))[1]

In [53]:
# Grade each ConceptARC category separately for the fine-tuned model.
# `results` maps category name -> outcome counts returned by compare_results;
# the int default of the defaultdict is not used in this cell because every
# category is assigned explicitly.
results = defaultdict(int)
for category in categories:
    categoric_result = compare_results(f'fine_tuned_results/ConceptARC/{category}', f'data/ConceptARC/{category}')
    results[category] = categoric_result
    

InsideOutside1.json
Expected: [[0], [0], [0], [0]]
Actual:   [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 5, 5, 5, 5, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]]
Incorrect

InsideOutside10.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0], [0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0], [0, 7, 7, 0, 0, 0, 7, 7, 0, 0, 0, 0], [0, 7, 7, 6, 6, 6, 7, 7, 0, 0, 0, 0], [0, 7, 7, 0, 0, 0, 7, 7, 0, 0, 0, 0], [0, 7, 7, 0, 0, 0, 7, 7, 0, 0, 0, 0], [0, 7, 7, 0, 6, 0, 7, 7, 0, 0, 0, 0], [0, 7, 7, 6, 0, 0, 7, 7, 0, 0, 0, 0], [0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0], [0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Actual:   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0], [0, 0, 0, 0, 0, 7, 7, 0, 0, 7, 7, 0], [0, 0, 0, 0, 0, 

In [54]:
# Per-category outcome counts for the fine-tuned model on ConceptARC.
results

defaultdict(int,
            {'InsideOutside': defaultdict(int, {'Incorrect': 11}),
             'TopBottom3D': defaultdict(int, {'Incorrect': 10, 'Correct': 1}),
             'Order': defaultdict(int, {'Incorrect': 9, 'Correct': 2}),
             'FilledNotFilled': defaultdict(int,
                         {'Incorrect': 10, 'Correct': 1}),
             'CleanUp': defaultdict(int, {'Correct': 2, 'Incorrect': 9}),
             'ExtendToBoundary': defaultdict(int,
                         {'Incorrect': 10, 'Correct': 1}),
             'MoveToBoundary': defaultdict(int,
                         {'Incorrect': 10, 'Correct': 1}),
             'Copy': defaultdict(int, {'Correct': 1, 'Incorrect': 10}),
             'TopBottom2D': defaultdict(int, {'Correct': 3, 'Incorrect': 8}),
             'ExtractObjects': defaultdict(int,
                         {'Incorrect': 9, 'Correct': 2}),
             'HorizontalVertical': defaultdict(int, {'Incorrect': 11}),
             'AboveBelow': defaultdict(

In [55]:
# Sum the per-category tallies into overall ConceptARC totals.
# The three usual outcomes start at zero so they always appear in the summary;
# .get() lets any other outcome compare_files can return (e.g. "Corrupted Output")
# be counted instead of raising KeyError.
total_dict = {"Correct": 0, "Incorrect": 0, "Incomplete Output": 0}
for category in categories:
    for key in results[category]:
        total_dict[key] = total_dict.get(key, 0) + results[category][key]

In [56]:
# Overall ConceptARC outcome counts for the fine-tuned model.
total_dict

{'Correct': 23, 'Incorrect': 153, 'Incomplete Output': 0}

# Base Model

## ConceptARC dataset.

In [61]:
# Grade each ConceptARC category separately for the base model.
# results_path ends with "/" so that plain string concatenation with the
# category name below yields the category's results directory.
results_path = "basemodel_results/ConceptARC/"
# Category names are the immediate subdirectories of results_path.
categories = next(os.walk(results_path))[1]
# `results` maps category name -> outcome counts returned by compare_results.
results = defaultdict(int)

for category in categories:
    categoric_result = compare_results(results_path + category, f'data/ConceptARC/{category}')
    results[category] = categoric_result

SameDifferent1.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 4, 4, 4], [0, 0, 0, 3, 3, 3, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Actual:   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 3, 0, 0, 4, 4, 4], [0, 0, 0, 3, 3, 3, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4], [0, 4, 4, 0, 0, 0, 0, 0, 4, 0, 4], [0, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Incorrect

SameDifferent10.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0], [0, 2, 2, 2, 2, 2, 0, 5, 0, 2, 2, 2, 2, 2, 0], [0, 2, 3, 3, 3, 2, 0, 5, 0, 2, 3, 3, 3, 2, 0], [0, 

In [62]:
# Per-category outcome counts for the base model on ConceptARC.
results

defaultdict(int,
            {'SameDifferent': defaultdict(int,
                         {'Incorrect': 10, 'Correct': 1}),
             'CompleteShape': defaultdict(int,
                         {'Incorrect': 10, 'Incomplete Output': 1}),
             'TopBottom2D': defaultdict(int, {'Incorrect': 11}),
             'Center': defaultdict(int, {'Incorrect': 11}),
             'MoveToBoundary': defaultdict(int,
                         {'Incorrect': 10, 'Incomplete Output': 1}),
             'CleanUp': defaultdict(int,
                         {'Incorrect': 9, 'Incomplete Output': 2}),
             'TopBottom3D': defaultdict(int, {'Incorrect': 11}),
             'Count': defaultdict(int,
                         {'Incorrect': 7,
                          'Incomplete Output': 3,
                          'Correct': 1}),
             'ExtractObjects': defaultdict(int, {'Incorrect': 11}),
             'Copy': defaultdict(int,
                         {'Incorrect': 10, 'Incomplete Output': 1}

In [63]:
# Sum the per-category tallies into overall ConceptARC totals.
# The three usual outcomes start at zero so they always appear in the summary;
# .get() lets any other outcome compare_files can return (e.g. "Corrupted Output")
# be counted instead of raising KeyError.
total_dict = {"Correct": 0, "Incorrect": 0, "Incomplete Output": 0}
for category in categories:
    for key in results[category]:
        total_dict[key] = total_dict.get(key, 0) + results[category][key]

In [64]:
# Overall ConceptARC outcome counts for the base model.
total_dict

{'Correct': 4, 'Incorrect': 162, 'Incomplete Output': 10}

## Evaluation dataset.

In [4]:
# Grade the base model's completions on the evaluation tasks.
# compare_results prints each task's expected/actual grids and returns the outcome counts.
results = compare_results('basemodel_results/evaluation', 'data/evaluation')

00576224.json
Expected: [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]
Actual:   [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]
Correct

009d5c81.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 0, 7, 0, 0, 0, 7, 0, 7, 0, 7], [0, 0, 0, 0, 0, 7, 0, 7, 0, 7, 0, 0, 0, 7], [0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Actual:   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 8, 

In [5]:
# Outcome counts for the base model on the evaluation tasks.
results

defaultdict(int, {'Correct': 7, 'Incorrect': 270, 'Incomplete Output': 123})

In [6]:
# Share of correct answers among the completions that produced a full answer:
# "Incomplete Output" (and any "Corrupted Output") results are not in the denominator.
# NOTE(review): confirm this is the intended metric rather than Correct / all tasks.
results["Correct"] / (results["Correct"] + results["Incorrect"])

0.02527075812274368

## Train dataset.

In [5]:
# Grade the base model's completions on the training tasks.
# NOTE(review): tasks are read from 'data/train' here, whereas the fine-tuned
# training run reads 'data/training' — confirm both refer to the same task set.
results = compare_results('basemodel_results/training_with_prompt', 'data/train')

007bbfb7.json
Expected: [[7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 7, 0, 7, 7, 0, 0, 0, 0]]
Actual:   [[7, 7, 7, 7, 7, 7, 7, 7, 7], [7, 0, 7, 0, 7, 0, 7, 0, 7], [7, 7, 0, 7, 7, 0, 7, 7, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0], [7, 7, 0, 7, 7, 0, 7, 7, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0]]
Incorrect

00d62c1b.json
Expected: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 3, 4, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 3, 4, 4, 4, 4, 3, 4, 4, 3, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [6]:
# Outcome counts for the base model on the training tasks.
results

defaultdict(int, {'Incorrect': 328, 'Incomplete Output': 59, 'Correct': 13})