# Statistical Evaluation of the LLM 4 Model Completion Paper

## Experiment 3
The third experiment aims at answering the research question "Is domain-specific fine-tuning a viable alternative to few-shot learning?"

In [None]:
# Locations of the fine-tuning result CSVs, one file per base model (ada, curie, davinci).
# The paths are relative, so they resolve against the current working directory.
PATH_TO_RESULTS_ADA = './../../model_completion_dataset/Synthetic/results/results_finetuning/all_results_ada.csv'
PATH_TO_RESULTS_CURIE = './../../model_completion_dataset/Synthetic/results/results_finetuning/all_results_curie.csv'
PATH_TO_RESULTS_DAVINCI = './../../model_completion_dataset/Synthetic/results/results_finetuning/all_results_davinci.csv'


First, we import all required libraries. Make sure they are installed (e.g., via pip).

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt # for plotting
from scipy.stats import pearsonr # for statistics

Then, we load the three result files (one per base model) and merge them into a single table:

In [None]:
# Read the three semicolon-separated result files, one frame per base model.
results_ada, results_curie, results_davinci = (
    pd.read_csv(csv_path, delimiter=';')
    for csv_path in (PATH_TO_RESULTS_ADA, PATH_TO_RESULTS_CURIE, PATH_TO_RESULTS_DAVINCI)
)

# Stack the per-model frames row-wise into one combined table and report its size.
results_all = pd.concat([results_ada, results_curie, results_davinci])
print(f"We have {len(results_all)} datapoints")

We add a numeric encoding of the base model (ada = 1, curie = 2, davinci = 3) so that it can be included in the correlation analysis.

In [None]:
def base_model_to_numeric(base_model:str):
    """Translate a base model name into its numeric code.

    Args:
        base_model: Name of the base model; 'ada', 'curie' and 'davinci' are recognised.

    Returns:
        1 for 'ada', 2 for 'curie', 3 for 'davinci'. For any other value an
        error line is printed and None is returned.
    """
    # Lookup table scanned in order; the first matching name wins.
    known_models = (('ada', 1), ('curie', 2), ('davinci', 3))
    for model_name, numeric_code in known_models:
        if base_model == model_name:
            return numeric_code
    # Unknown name: report it on stdout and fall through to None.
    print("ERROR: invalid base model")
    return None

In [None]:
# Store the numeric code (1, 2 or 3) of every row's base model in a new column.
results_all['base_model_numeric'] = results_all['Base_Model'].apply(base_model_to_numeric)

We now want to identify the best models according to average token accuracy.

In [None]:
# Order the combined table from highest to lowest average token accuracy (sorts the frame itself).
results_all.sort_values(by='Average_Token_Acc', ascending=False, inplace=True)
# Print the identifying settings of the ten top-ranked rows.
print(results_all.head(10)[['Id', 'Epochs', 'Base_Model', 'Number_Tokens']])

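We correlate the dataset properties, base model, and number of epochs with the average token accuracy. The table shows pairwise Pearson correlation coefficients; asterisks mark significance at the 0.05 (\*), 0.01 (\*\*), and 0.001 (\*\*\*) levels.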

In [None]:
# Keep only the columns that enter the correlation analysis.
results_relevant_columns = results_all[['Diffs', 'EOs', 'Pertubation', 'Epochs', 'Number_Tokens', 'base_model_numeric', 'Average_Token_Acc']]

# Pairwise Pearson correlation coefficients (pandas' default method).
rho = results_relevant_columns.corr()
# Matching p-values: pearsonr(x, y)[1] is the p-value of each pair. pandas puts 1 on the
# diagonal for a callable method, so subtracting the identity matrix sets the diagonal to 0.
pval = results_relevant_columns.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One asterisk per threshold the p-value meets: * <= .05, ** <= .01, *** <= .001.
# Series.map per column is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
p = pval.apply(lambda column: column.map(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t])))
# Cell output: rounded coefficients with their significance stars appended.
rho.round(2).astype(str) + p

### Experiment 3 - Part 2
Fixing the dataset, do larger models always provide better accuracies? And,
fixing the language model, do larger datasets always provide better accuracies?

In [None]:
# NOTE(review): all_columns and property_of_interest are not used in the cells visible here.
all_columns = ['Diffs', 'EOs', 'Pertubation', 'Epochs', 'Number_Tokens', 'base_model_numeric', 'Average_Token_Acc']
property_of_interest = 'EOs'

# Split the correlation table by base model using its numeric code (1 = ada, 2 = curie, 3 = davinci).
data_ada, data_curie, data_davinci = (
    results_relevant_columns.loc[results_relevant_columns['base_model_numeric'] == model_code]
    for model_code in (1, 2, 3)
)

Ada:

In [None]:
# Pairwise Pearson correlations within the ada subset.
# NOTE(review): base_model_numeric is constant here (the subset is filtered to == 1), so its
# correlations are presumably undefined (NaN) - consider dropping that column; TODO confirm.
rho = data_ada.corr()
# p-value of each pair; subtracting the identity matrix zeroes the diagonal pandas fills with 1.
pval = data_ada.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One asterisk per threshold met (* <= .05, ** <= .01, *** <= .001). Series.map per column
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
p = pval.apply(lambda column: column.map(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t])))
# Cell output: rounded coefficients with their significance stars appended.
rho.round(2).astype(str) + p



Curie:

In [None]:
# Pairwise Pearson correlations within the curie subset.
# NOTE(review): base_model_numeric is constant here (the subset is filtered to == 2), so its
# correlations are presumably undefined (NaN) - consider dropping that column; TODO confirm.
rho = data_curie.corr()
# p-value of each pair; subtracting the identity matrix zeroes the diagonal pandas fills with 1.
pval = data_curie.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One asterisk per threshold met (* <= .05, ** <= .01, *** <= .001). Series.map per column
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
p = pval.apply(lambda column: column.map(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t])))
# Cell output: rounded coefficients with their significance stars appended.
rho.round(2).astype(str) + p


Davinci:

In [None]:
# Pairwise Pearson correlations within the davinci subset.
# NOTE(review): base_model_numeric is constant here (the subset is filtered to == 3), so its
# correlations are presumably undefined (NaN) - consider dropping that column; TODO confirm.
rho = data_davinci.corr()
# p-value of each pair; subtracting the identity matrix zeroes the diagonal pandas fills with 1.
pval = data_davinci.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One asterisk per threshold met (* <= .05, ** <= .01, *** <= .001). Series.map per column
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
p = pval.apply(lambda column: column.map(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t])))
# Cell output: rounded coefficients with their significance stars appended.
rho.round(2).astype(str) + p
