# GPT-4 zero-shot and few-shot

Compute the GPT-4 zero-shot and few-shot accuracies.

In [1]:
import numpy as np
from tqdm import tqdm
from utils import load_responses, responses_to_acc, majority_acc, compute_averages

In [2]:
# Load the saved responses for the zero-shot ('gpt4') and few-shot ('gpt4-fewshot') runs.
MODEL_NAMES = ['gpt4', 'gpt4-fewshot']
RESPONSES_DIR = 'results/model_responses/'
responses = load_responses({}, MODEL_NAMES, base_dir=RESPONSES_DIR)

# 'gpt4-2' refers to the very same object as 'gpt4' (an alias, not an independent copy).
# It exists so the zero-shot responses can be scored a second time under a separate key.
responses['gpt4-2'] = responses['gpt4']

100%|██████████| 260/260 [00:00<00:00, 4540.93it/s]


gpt4 has 260 tasks


100%|██████████| 260/260 [00:00<00:00, 7970.06it/s]

gpt4-fewshot has 260 tasks





In [3]:
# Number of leading responses per task that are scored for the "same examples"
# comparison. GPT-4 few-shot is evaluated on fewer examples due to the much higher
# cost of evaluation, so zero-shot is scored on the same leading subset as well.
N_SHARED_EXAMPLES = 5
# The one entry that is scored on every available response rather than the subset.
ALL_EXAMPLES_MODEL = 'gpt4-2'

# accs[model][file] -> accuracy of `model` on the task stored under `file`.
accs = {}
for model, model_responses in responses.items():
    accs[model] = {}
    for file, response in tqdm(model_responses.items()):
        if model == ALL_EXAMPLES_MODEL:
            scored = response
        else:
            # evaluated on the very same examples as gpt-4 few-shot
            scored = response[:N_SHARED_EXAMPLES]
        accs[model][file] = responses_to_acc(scored, gpt4=True)

# Per-task `majority_acc` (presumably a majority-class baseline -- confirm in utils),
# computed from the first entry of `responses` using all of its responses.
first_model_responses = next(iter(responses.values()))
accs['maj'] = {file: majority_acc(resp) for file, resp in first_model_responses.items()}

100%|██████████| 260/260 [00:00<00:00, 72706.12it/s]
100%|██████████| 260/260 [00:00<00:00, 187890.94it/s]
100%|██████████| 260/260 [00:00<00:00, 14666.19it/s]


In [4]:
# sc_issue and songer_casetyp1 are reported as the average over a group of tasks.
# Keys look like task-name prefixes selecting the group (assumption -- confirm in
# compute_averages); values are the names under which the averages are stored.
tasks_to_average = {
    'sc_issue_': 'sc_issue',
    'songer_casetyp1_': 'songer_casetyp1',
}
averages = compute_averages(responses, tasks_to_average)

# NOTE(review): `averages` is indexed by every key of `accs`, including 'maj', which is
# not a key of `responses` -- confirm that compute_averages provides it.
# NOTE(review): the averages come from the raw `responses`, not from `accs`; confirm they
# use the same example subset as the per-task accuracies.
for model_name, model_accs in accs.items():
    model_averages = averages[model_name]
    for averaged_task in tasks_to_average.values():
        model_accs[averaged_task] = model_averages[averaged_task]

In [5]:
# Task-name prefix -> label printed for that group ('' matches every task).
prefixes = {
    '': 'All',
    'songer_': 'Songer',
    'sc_': 'SC',
}

# For each group, print the mean per-task accuracy of the three settings, taken over
# the tasks present in the few-shot results whose name starts with the prefix.
for prefix, prefix_name in prefixes.items():
    group_tasks = [task for task in accs['gpt4-fewshot'] if task.startswith(prefix)]
    few_shot = [accs['gpt4-fewshot'][task] for task in group_tasks]
    zero_shot_same = [accs['gpt4'][task] for task in group_tasks]
    zero_shot_all = [accs['gpt4-2'][task] for task in group_tasks]

    print(f'- {prefix_name}')
    print(f'  * GPT4 32k few-shot:                 {np.mean(few_shot):.4f}')
    print(f'  * GPT4 8k zero-shot (same examples): {np.mean(zero_shot_same):.4f}')
    print(f'  * GPT4 8k zero-shot (all examples):  {np.mean(zero_shot_all):.4f}')

- All
  * GPT4 32k few-shot:                 0.5838
  * GPT4 8k zero-shot (same examples): 0.6161
  * GPT4 8k zero-shot (all examples):  0.6289
- Songer
  * GPT4 32k few-shot:                 0.5912
  * GPT4 8k zero-shot (same examples): 0.6284
  * GPT4 8k zero-shot (all examples):  0.6342
- SC
  * GPT4 32k few-shot:                 0.5399
  * GPT4 8k zero-shot (same examples): 0.5436
  * GPT4 8k zero-shot (all examples):  0.5978


**Note:** The `same examples` line reports GPT-4 zero-shot performance on the same examples that were used for the GPT-4 few-shot evaluation; this is a subset of all cases. The `all examples` line reports GPT-4 zero-shot performance on all examples.