In [1]:
import pandas as pd
from typing import List, Optional
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import levene
from pingouin import welch_anova
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
# Turn on rpy2's automatic pandas <-> R conversion before any frame is handed to R.
pandas2ri.activate()

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): several cells below assign columns on filtered slices; this setting hides those warnings.
pd.options.mode.chained_assignment = None  # default='warn'

# Import the R packages up front (presumably so a missing R dependency fails here
# rather than in the middle of an analysis cell -- TODO confirm).
ARTool = importr('ARTool')
base = importr('base')
stats = importr('stats')

In [2]:
def read_gsheet(url: Optional[str]=None, sheet_id: Optional[str]=None, gid: Optional[str]=None):
    """
    Load one tab of a Google Sheet into a DataFrame through the sheet's CSV export URL.

    :param url: browser URL of the sheet; the sheet id and the tab gid are extracted from it.
    The gid may appear as ``?gid=``, ``&gid=`` or ``#gid=``.
    :param sheet_id: spreadsheet id, used when ``url`` is not given (or cannot be parsed)
    :param gid: tab id, used when ``url`` is not given (or cannot be parsed)
    :return: whatever ``pd.read_csv`` returns for the export URL
    :raises ValueError: if no sheet id / gid can be determined from the arguments
    """
    if url is not None:
        match = re.search(r"spreadsheets/d/([^/]+)/.*?[?&#]gid=(\d+)", url)
        if match:
            sheet_id, gid = match.group(1), match.group(2)
        else:
            # Fall through: explicitly supplied sheet_id / gid (if any) are still usable.
            print("can't parse url to get sheet id and gid")
    # Fail before touching the network instead of requesting ".../d/None/...&gid=None".
    if sheet_id is None or gid is None:
        raise ValueError(
            "Could not determine sheet id and gid: pass a parseable Google Sheets url "
            "or both sheet_id and gid"
        )
    _url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}"
    return pd.read_csv(_url)

In [3]:
def multi_way_anova(_filter_df: pd.DataFrame, independent_vars: Optional[List[str]] = None,
                    dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do multi way anova on the provided dataframe.

    Levene's test (centered on the mean) is run across the cells formed by ``independent_vars``.
    If it rejects equal variances (p < 0.05) an aligned-rank-transform ANOVA is fitted in R via ARTool,
    otherwise a type-2 OLS ANOVA is fitted with statsmodels. The input frame is never modified.

    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_vars: contributing factors; defaults to ``["fine_tune_dataset"]``
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return: the ART ANOVA table converted to pandas, or the statsmodels ANOVA table with an
    extra ``eta_sq`` column
    """
    if independent_vars is None:
        independent_vars = ["fine_tune_dataset"]
    # Work on a private copy: collapsing used to overwrite the caller's frame, which forced
    # the notebook to re-download the data before any per-dataset analysis.
    _filter_df = _filter_df.copy()
    if collapse:
        mapping = {'Base': 'Base'}
        _filter_df['fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')
    # levene
    grouped_data = _filter_df.groupby(independent_vars)[dependent_var].apply(list)
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05: # pval must be < 0.05 for us to reject the H_0 at Levenes.
        print("Robust ANOVA")
        r_df = pandas2ri.py2rpy(_filter_df)
        r.assign("rdf", r_df)

        # ART needs every factor to be an R factor, not a character / numeric column.
        for var in independent_vars:
            r(f"rdf${var} <- as.factor(rdf${var})")
        formula_str = f"{dependent_var} ~ {' * '.join(independent_vars)}"
        r(f'''
            library(ARTool)
            model <- art({formula_str}, data = rdf)
            art_result <- anova(model, type=2)
        ''')

        art_result = r('art_result')
        art_df = pandas2ri.rpy2py(art_result)
        return art_df
    # NOTE(review): this branch fits main effects only ('+'), while the ART branch above fits the
    # full factorial ('*') -- confirm whether the interaction term should be included here too.
    formula = f"{dependent_var} ~" + ' + '.join([f'C({col})' for col in independent_vars])
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [4]:
def one_way_anova(_filter_df: pd.DataFrame, independent_var: Optional[str] = "fine_tune_dataset",
                  dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do one way anova on the provided dataframe.

    Groups whose dependent variable takes a single value are dropped first (zero variance).
    Levene's test (centered on the mean) then decides between Welch's ANOVA (p < 0.05) and a
    type-2 OLS ANOVA. The input frame is never modified.

    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_var: contributing factor
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return: the pingouin Welch ANOVA table, the statsmodels ANOVA table with an extra ``eta_sq``
    column, or an empty DataFrame when fewer than two usable groups remain
    """
    assert independent_var is not None and dependent_var is not None
    # Private copy so the collapse below cannot leak into the caller's frame.
    _filter_df = _filter_df.copy()

    # suppose the independent variable is prompt, then, it is possible that the dependent variable
    # has the same value for all the prompts. in that case, we should filter out those rows as the
    # variance will be zero.
    distinct_values = _filter_df.groupby(independent_var)[dependent_var].nunique()
    degenerate_groups = list(distinct_values.index[distinct_values <= 1])
    for group_name in degenerate_groups:
        print(f"Group '{group_name}' has only one unique value for '{dependent_var}', removing it from analysis.")
    if degenerate_groups:
        _filter_df = _filter_df[~_filter_df[independent_var].isin(degenerate_groups)].copy()

    # Honour the flag: the collapse used to be applied unconditionally.
    if collapse:
        mapping = {'Base': 'Base'}
        _filter_df['fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')

    # levene -- filter out groups with only one observation and groups with no variance
    grouped_data = [group[dependent_var].values for name, group in _filter_df.groupby(independent_var)]
    grouped_data = [group for group in grouped_data if len(group) > 1 and len(set(group)) > 1]
    # Levene itself needs at least two samples, so this guard has to run before it.
    if len(grouped_data) < 2:
        print(f"Fewer than two usable levels of {independent_var} found, returning empty DataFrame.")
        return pd.DataFrame(columns=['F', 'PR(>F)', 'sum_sq', 'df', 'mean_sq', 'eta_sq'])

    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05:
        print("Welch")
        welch_results = welch_anova(dv=dependent_var, between=independent_var, data=_filter_df)
        return welch_results
    formula=f"{dependent_var} ~ C({independent_var})"
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


## Fine-tuning data analysis

In [5]:
# url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
url = "https://docs.google.com/spreadsheets/d/1L6WA2vUNvY3B7ClDqdYxCKmqTocDbbOz/edit?gid=126664231#gid=126664231"
df = read_gsheet(url=url)
print(len(df))
# De-duplicate, drop the Phi rows and force the prompt id to a string in one pass.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame.model != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame['prompt'].astype(str))
)
print(len(df))
print(set(df['fine_tune_dataset']))

3175
2693
{'Base', 'Newsarticles', 'Newsroom', 'Canadian-QA', 'OpenR1', 'Imdb', 'Pol-Convo', 'FineTome', 'Scisumm'}


In [6]:
## fine tuning data analysis

# Long column names of the two scores and the short names used by the analysis helpers.
output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"
dependent_vars = ["output_x", "output_y"]

# One frame per model, with the two score columns renamed to output_x / output_y.
model_split_df = {}
for model in set(df['model']):
    _df = df.loc[df['model'] == model].rename(columns={output_x: "output_x", output_y: "output_y"})
    print(model, len(_df))
    model_split_df[model] = _df

Falcon 633
gemma 720
Mistral 627
Llama3 713


### Independent t-test

Determine whether the fine-tuned and base versions of each model differ in their PCT scores.

In [7]:
from scipy.stats import ttest_ind

# Fixed reporting order used by every per-model loop below.
models = [
    "gemma",
    "Llama3",
    "Falcon",
    "Mistral",
]
def t_test(_df: pd.DataFrame, _dependent_var: str):
    """
    Welch's t-test (unequal variances) of base rows against fine-tuned rows.

    A row counts as base when its ``fine_tune_dataset`` label contains the text 'Base';
    every other row counts as fine-tuned. The input frame is left untouched (no helper
    column is written into it).

    NOTE: a later cell defines another function with this same name, which replaces this one
    once that cell has run.

    :param _df: frame with a ``fine_tune_dataset`` column and the dependent variable column
    :param _dependent_var: name of the column to compare
    :return: ``(t_stat, p_value)`` as returned by ``scipy.stats.ttest_ind``
    """
    # Substring match is kept from the original implementation.
    is_base = _df['fine_tune_dataset'].apply(lambda name: 'Base' in name).astype(bool)
    base_dep_var = _df.loc[is_base, _dependent_var]
    finetuned_dep_var = _df.loc[~is_base, _dependent_var]
    t_stat, p_value = ttest_ind(base_dep_var, finetuned_dep_var, equal_var=False)
    return (t_stat, p_value)

# Print the base-vs-fine-tuned t-test for every (dependent variable, model) pair.
for dependent_var in dependent_vars:
    print("=:" * 20, dependent_var, "=:" * 20)
    for model in models:
        print("-" * 20, model, "-" * 20)
        outcome = t_test(_df=model_split_df[model], _dependent_var=dependent_var)
        print(outcome)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(-6.127505279754074), np.float64(1.055762172068916e-08))
-------------------- Llama3 --------------------
(np.float64(5.082108758340775), np.float64(1.1090455663368004e-06))
-------------------- Falcon --------------------
(np.float64(8.371913648092294), np.float64(2.9543018299527145e-15))
-------------------- Mistral --------------------
(np.float64(-5.235095992765232), np.float64(6.911225653954734e-07))
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(5.966800986687974), np.float64(2.0666166496433783e-08))
-------------------- Llama3 --------------------
(np.float64(-9.474745646468088), np.float64(2.0860475515015573e-17))
-------------------- Falcon --------------------
(np.float64(-0.5598663913560933), np.float64(0.57653193374933))
---

In [8]:
## code to convert it to csv format
import csv
import re


# Stdout of the t-test loop above, pasted verbatim so it can be re-parsed into a CSV below.
# NOTE(review): this copy goes stale if the t-test cell is re-run on different data.
text = """
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(-6.127505279754074), np.float64(1.055762172068916e-08))
-------------------- Llama3 --------------------
(np.float64(5.082108758340775), np.float64(1.1090455663368004e-06))
-------------------- Falcon --------------------
(np.float64(8.371913648092294), np.float64(2.9543018299527145e-15))
-------------------- Mistral --------------------
(np.float64(-5.235095992765232), np.float64(6.911225653954734e-07))
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(5.966800986687974), np.float64(2.0666166496433783e-08))
-------------------- Llama3 --------------------
(np.float64(-9.474745646468088), np.float64(2.0860475515015573e-17))
-------------------- Falcon --------------------
(np.float64(-0.5598663913560933), np.float64(0.57653193374933))
-------------------- Mistral --------------------
(np.float64(-22.67638706620761), np.float64(1.4972304988349182e-53))
"""

# Extract results
def extract_results(section_text):
    """
    Pull ``model -> (t_stat, p_value)`` out of one section of the pasted t-test printout.

    A result is only attributed to a model when the ``(np.float64(..), np.float64(..))`` line
    follows that model's dashed banner directly (whitespace only in between). A banner with no
    result line is skipped instead of being paired with the next model's numbers.

    :param section_text: text containing ``----- <model> -----`` banners and result tuples
    :return: dict mapping model name to a ``(t_stat, p_value)`` pair of *strings*
    """
    # Plain / scientific notation numbers, plus the nan / inf spellings numpy can print.
    number = r"([-+eE\d\.]+|nan|[-+]?inf)"
    pattern = (
        r"-{5,}\s*(\w+)\s*-{5,}"
        r"\s*"
        rf"\(np\.float64\({number}\), np\.float64\({number}\)\)"
    )
    return {name: (stat, p_val) for name, stat, p_val in re.findall(pattern, section_text)}

# Split sections
output_x_text = text.split("output_x =")[-1].split("output_y =")[0]
output_y_text = text.split("output_y =")[-1]

x_results = extract_results(output_x_text)
y_results = extract_results(output_y_text)


def _fmt_sci(value):
    """Format a numeric string in 2-decimal scientific notation; pass a missing ("") value through."""
    return "" if value == "" else "{:.2e}".format(float(value))


models = ["gemma", "Llama3", "Falcon", "Mistral"]
rows = []
for model in models:
    # A model missing from the pasted text yields empty cells instead of crashing on float("").
    x_stat, x_p = x_results.get(model, ("", ""))
    y_stat, y_p = y_results.get(model, ("", ""))
    rows.append([model, _fmt_sci(x_stat), _fmt_sci(x_p), _fmt_sci(y_stat), _fmt_sci(y_p)])

# Write CSV
# NOTE(review): the header repeats "p-value" for both axes; kept as-is so existing readers of
# t_stats.csv are unaffected -- consider distinct names.
with open("t_stats.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["model", "t-stat-x", "p-value", "t-stat-y", "p-value"])
    writer.writerows(rows)
print("text written")

text written


### Multi-way ANOVA to understand the joint effect of prompts and fine-tune dataset

In [9]:
# Prompt x (collapsed) fine-tune-dataset ANOVA for every (dependent variable, model) pair.
for dependent_var in dependent_vars:
    print("=:" * 20, dependent_var, "=:" * 20)
    for model in models:
        print("-" * 20, model, "-" * 20)
        anova_result = multi_way_anova(
            _filter_df=model_split_df[model],
            independent_vars=["prompt", "fine_tune_dataset"],
            dependent_var=dependent_var,
            collapse=True,
        )
        print(anova_result)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Levene’s Test: Statistic = 8.4102, p-value = 1.513e-21
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   700.0  8.444127e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.387190e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  8.575971e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.259571e+07  29.065929  4.154417e-43  
fine_tune_dataset         2.964537e+07  32.754960  1.550529e-08  
prompt:fine_tune_dataset  3.016229e+07   2.211437  1.970095e-02  
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 3.5789, p-value = 4.779e-07
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt

In [10]:
## convert the results to csv format
# Pasted stdout of the multi-way ANOVA loop above for output_x; it is re-parsed further down in this cell.
# NOTE: this is a regular (non-raw) string, so each header line that ends in a backslash is joined
# with the row that follows it when Python evaluates the literal.
output_x_text = """
-------------------- gemma --------------------
Levene’s Test: Statistic = 8.4102, p-value = 1.513e-21
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   700.0  8.444127e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.387190e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  8.575971e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.259571e+07  29.065929  4.154417e-43  
fine_tune_dataset         2.964537e+07  32.754960  1.550529e-08  
prompt:fine_tune_dataset  3.016229e+07   2.211437  1.970095e-02  
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 3.5789, p-value = 4.779e-07
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   693.0  1.665446e+06   
fine_tune_dataset                fine_tune_dataset  1.0   693.0  1.139668e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   693.0  3.695426e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.842045e+07   4.512221  8.745441e-06  
fine_tune_dataset         2.894055e+07  27.290088  2.318861e-07  
prompt:fine_tune_dataset  2.962474e+07   0.960507  4.716849e-01  
-------------------- Falcon --------------------
Levene’s Test: Statistic = 16.8733, p-value = 1.57e-44
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   613.0  2.491218e+06   
fine_tune_dataset                fine_tune_dataset  1.0   613.0  6.257565e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   613.0  1.482095e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    1.844981e+07   9.196825  3.888562e-13  
fine_tune_dataset         2.022138e+07  18.969461  1.556795e-05  
prompt:fine_tune_dataset  2.082047e+07   0.484845  8.851998e-01  
-------------------- Mistral --------------------
Levene’s Test: Statistic = 11.5625, p-value = 3.207e-30
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   607.0  1.143454e+06   
fine_tune_dataset                fine_tune_dataset  1.0   607.0  4.401117e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   607.0  9.901578e+05   

                            Sum Sq.res    F value    Pr(>F)  
prompt                    1.938750e+07   3.977801  0.000060  
fine_tune_dataset         2.008808e+07  13.298823  0.000288  
prompt:fine_tune_dataset  1.952883e+07   3.419593  0.000407  
"""
# Pasted stdout of the multi-way ANOVA loop for output_y.
# The gemma banner and the leading "L" of "Levene" had been lost in the paste; without the banner
# the parser drops the whole gemma block, so both are restored here.
output_y_text = """
-------------------- gemma --------------------
Levene’s Test: Statistic = 6.5959, p-value = 4.651e-16
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   700.0  6.715908e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.840088e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  1.726181e+06   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.429742e+07  21.498102  2.771317e-32  
fine_tune_dataset         2.921293e+07  44.092181  6.287845e-11  
prompt:fine_tune_dataset  2.935353e+07   4.573845  6.994484e-06  
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 4.3555, p-value = 2.634e-09
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   693.0  1.791252e+06   
fine_tune_dataset                fine_tune_dataset  1.0   693.0  1.800514e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   693.0  7.204748e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.828995e+07   4.875456  2.386184e-06  
fine_tune_dataset         2.835922e+07  43.998262  6.622597e-11  
prompt:fine_tune_dataset  2.938120e+07   1.888165  5.074256e-02  
-------------------- Falcon --------------------
Levene’s Test: Statistic = 3.3552, p-value = 2.253e-06
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   613.0  2.293624e+06   
fine_tune_dataset                fine_tune_dataset  1.0   613.0  7.858100e+02   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   613.0  3.444248e+05   

                            Sum Sq.res   F value        Pr(>F)  
prompt                    1.881984e+07  8.300884  1.017352e-11  
fine_tune_dataset         2.108149e+07  0.022849  8.798988e-01  
prompt:fine_tune_dataset  2.072867e+07  1.131725  3.377401e-01  
-------------------- Mistral --------------------
Levene’s Test: Statistic = 6.7392, p-value = 2.882e-16
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   607.0  2.924756e+06   
fine_tune_dataset                fine_tune_dataset  1.0   607.0  5.470815e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   607.0  8.455530e+05   

                            Sum Sq.res     F value        Pr(>F)  
prompt                    1.758870e+07   11.215069  2.675569e-16  
fine_tune_dataset         1.500849e+07  221.260351  6.745058e-43  
prompt:fine_tune_dataset  1.961297e+07    2.907660  2.229715e-03
"""

import re


def return_dependent_var_df(text, x_y="x"):
    """
    Parse the pasted multi-way ANOVA printout into one row of F / p values per model.

    The text is split on the ``---------- <model> ----------`` banners. Within each model block
    only rows that sit under a header line containing ``Pr(>F)`` are read; the last two fields of
    such a row are taken as the F value and the p-value. Rows from the other half of a wrapped
    table (Df / Sum Sq columns) are ignored instead of being parsed as F / p values.

    :param text: pasted printout covering one dependent variable
    :param x_y: axis suffix used in the output column names (``output_<x_y>-...``)
    :return: DataFrame with a ``model`` column plus f-score / p-value columns for the prompt,
    fine-tune and interaction terms; values are strings in ``.2e`` format, missing terms are NaN
    """
    # ANOVA term as printed -> label used in the output column names.
    term_labels = {
        "prompt": "prompt",
        "fine_tune_dataset": "finetune",
        "prompt:fine_tune_dataset": "prompt-finetune-interaction",
    }
    columns = ["model"]
    for label in term_labels.values():
        columns += [f"output_{x_y}-{label}-f-score", f"output_{x_y}-{label}-p-value"]

    # re.split with one capture group yields [preamble, name, body, name, body, ...].
    blocks = re.split(r'-{10,} (.+?) -{10,}', text)[1:]

    results = []
    for model, content in zip(blocks[0::2], blocks[1::2]):
        row = {"model": model.strip()}
        under_f_header = False
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            parts = re.split(r'\s{2,}', line)
            term = parts[0]
            if term not in term_labels:
                # Any non-term line is a header or a log line; remember whether it opens the
                # half of the table that actually carries the F value and Pr(>F) columns.
                under_f_header = "Pr(>F)" in line
                continue
            if not under_f_header or len(parts) < 3:
                continue
            label = term_labels[term]
            row[f"output_{x_y}-{label}-f-score"] = f"{float(parts[-2]):.2e}"
            row[f"output_{x_y}-{label}-p-value"] = f"{float(parts[-1]):.2e}"
        results.append(row)

    return pd.DataFrame(results, columns=columns)
 
df_x = return_dependent_var_df(output_x_text)
df_y = return_dependent_var_df(output_y_text, x_y="y")
# Align the two tables on the model name rather than on row position, so a model missing from
# one printout cannot shift the other axis' numbers onto the wrong model. This also leaves a
# single "model" column in the CSV.
stacked = (
    pd.concat([df_x.set_index("model"), df_y.set_index("model")], axis=1)
    .rename_axis("model")
    .reset_index()
)
stacked.to_csv("multi_way_anova_results.csv", index=False)
print("output written")

output written


### Does the type of fine-tuning dataset make a difference?

Check if the type of the fine-tuning dataset makes a difference. We need to re-download the data because we have converted all fine-tuning datasets to "fine_tuned" in the last step.

In [11]:
# url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
url = "https://docs.google.com/spreadsheets/d/1L6WA2vUNvY3B7ClDqdYxCKmqTocDbbOz/edit?gid=126664231#gid=126664231"
# Fresh download with the same cleaning as before: de-duplicate, drop Phi, prompt id as string.
df = (
    read_gsheet(url=url)
    .drop_duplicates()
    .loc[lambda frame: frame.model != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame['prompt'].astype(str))
)
print(len(df))


# Long column names of the two scores; they are shortened to output_x / output_y per model.
output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

# Rebuild the per-model frames from the freshly downloaded data.
model_split_df = {}
for model in set(df['model']):
    _df = df.loc[df['model'] == model].rename(columns={output_x: "output_x", output_y: "output_y"})
    print(model, len(_df))
    model_split_df[model] = _df
    
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Fine-tuning datasets split into the two groups ("target" vs. "control") compared by the helpers below.
dataset_category = {"target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-Convo'}, "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}}
# Compare the target and control groups for every model and dependent variable.
def tukey_test(_df, _dependent_var='output_x'):
    """
    Tukey HSD comparison of the 'target' and 'control' fine-tuning groups.

    Base rows are excluded; every remaining row is labelled 'target' when its dataset is listed
    in ``dataset_category["target"]`` and 'control' otherwise.

    :param _df: frame with a ``fine_tune_dataset`` column and the dependent variable column
    :param _dependent_var: name of the column to compare
    :return: the result object of ``pairwise_tukeyhsd`` (alpha = 0.05)
    """
    finetuned = _df[_df['fine_tune_dataset'] != 'Base']
    target_names = dataset_category["target"]
    finetuned['dataset_group'] = finetuned['fine_tune_dataset'].map(
        lambda name: 'target' if name in target_names else "control"
    )
    return pairwise_tukeyhsd(
        endog=finetuned[_dependent_var],
        groups=finetuned["dataset_group"],
        alpha=0.05,
    )

from scipy.stats import ttest_ind

def t_test(_df, _dependent_var='output_x'):
    """
    Welch's t-test of the 'control' group against the 'target' group of fine-tuning datasets.

    Base rows are excluded; every remaining row is labelled 'target' when its dataset is listed
    in ``dataset_category["target"]`` and 'control' otherwise.

    :param _df: frame with a ``fine_tune_dataset`` column and the dependent variable column
    :param _dependent_var: name of the column to compare
    :return: dict with ``mean_diff`` (target mean minus control mean), ``t_stat``, ``p_value``,
    ``n_control`` and ``n_target``
    """
    finetuned = _df[_df['fine_tune_dataset'] != 'Base']
    target_names = dataset_category["target"]
    finetuned['dataset_group'] = finetuned['fine_tune_dataset'].map(
        lambda name: 'target' if name in target_names else "control"
    )

    group_label = finetuned['dataset_group']
    control_vals = finetuned.loc[group_label == 'control', _dependent_var]
    target_vals = finetuned.loc[group_label == 'target', _dependent_var]

    t_stat, p_val = ttest_ind(control_vals, target_vals, equal_var=False)

    return dict(
        mean_diff=target_vals.mean() - control_vals.mean(),
        t_stat=t_stat,
        p_value=p_val,
        n_control=len(control_vals),
        n_target=len(target_vals),
    )

from pprint import pprint
# Target-vs-control Tukey HSD for every (dependent variable, model) pair.
for dependent_var in dependent_vars:
    print("=:" * 20, dependent_var, "=:" * 20)
    for model in models:
        print("-" * 20, model, "-" * 20)
        tukey_summary = tukey_test(_df=model_split_df[model], _dependent_var=dependent_var)
        print(tukey_summary)
        #pprint(t_test(_df=model_split_df[model], _dependent_var = dependent_var))

2693
Falcon 633
gemma 720
Mistral 627
Llama3 713
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj lower  upper  reject
--------------------------------------------------
control target   1.1544   0.0 0.9374 1.3714   True
--------------------------------------------------
-------------------- Llama3 --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj  lower   upper  reject
----------------------------------------------------
control target  -0.5978   0.0 -0.8094 -0.3862   True
----------------------------------------------------
-------------------- Falcon --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target  -0.1053 0.2342 -0.

In [32]:
import pandas as pd
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Pooled grouping of the fine-tuning datasets into "target" and "control" sets.
dataset_category = dict(
    target={'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-Convo'},
    control={"Imdb", "OpenR1", "Scisumm", "FineTome"},
)

# Per-task pairing: exactly one target dataset and one control dataset for each task type.
task_dataset_category = {
    task_name: {"target": target_name, "control": control_name}
    for task_name, target_name, control_name in (
        ("classification", 'Newsarticles', "Imdb"),
        ("summarization", "Newsroom", "Scisumm"),
        ("conversational", "Pol-Convo", "FineTome"),
        ("qa", "Canadian-QA", "OpenR1"),
    )
}

def games_howell_test_base(_df, _dependent_var='output_x'):
    """
    Games-Howell pairwise comparison between the 'Base', 'target' and 'control' groups.

    Rows are labelled 'Base' when ``fine_tune_dataset`` equals 'Base', 'target' when the dataset
    is listed in ``dataset_category["target"]`` and 'control' otherwise. The labels are written
    to a copy, so the caller's frame does not gain a ``dataset_group`` column.

    :param _df: frame with a ``fine_tune_dataset`` column and the dependent variable column
    :param _dependent_var: name of the column to compare
    :return: the table returned by ``pg.pairwise_gameshowell``
    :raises AssertionError: if any of the three groups is absent from the data
    """
    target_names = dataset_category["target"]
    labelled = _df.copy()
    labelled['dataset_group'] = labelled['fine_tune_dataset'].apply(
        lambda x: 'Base' if x == 'Base' else ('target' if x in target_names else "control")
    )
    assert set(labelled['dataset_group']) == {'target', 'control', 'Base'}, \
        f"Dataset groups must be exactly 'target', 'control' and 'Base', got {set(labelled['dataset_group'])}"
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=labelled)
    return gh_result
    
def games_howell_test_task_base(_df, task, _dependent_var='output_x'):
    """
    Games-Howell pairwise comparison between 'Base' and the target / control dataset of one task.

    Only rows whose ``fine_tune_dataset`` is 'Base' or one of the two datasets registered for
    ``task`` in ``task_dataset_category`` are used.

    :param _df: frame with a ``fine_tune_dataset`` column and the dependent variable column
    :param task: key of ``task_dataset_category``
    :param _dependent_var: name of the column to compare
    :return: the table returned by ``pg.pairwise_gameshowell``, or ``None`` when fewer than three
    of the expected datasets are present
    :raises AssertionError: if the labelled groups are not exactly 'target', 'control' and 'Base'
    """
    target_name = task_dataset_category[task]["target"]
    task_datasets = set(task_dataset_category[task].values()) | {'Base'}
    _df = _df[_df['fine_tune_dataset'].isin(task_datasets)].copy()
    n_datasets = _df['fine_tune_dataset'].nunique()
    if n_datasets < 3:
        print(f"Only {n_datasets} datasets found for task '{task}', returning None.")
        return None
    # The per-task target is a single dataset name (a string), so compare with equality:
    # `x in target_name` would be a substring test.
    _df['dataset_group'] = _df['fine_tune_dataset'].apply(
        lambda x: 'Base' if x == 'Base' else ('target' if x == target_name else "control")
    )
    assert set(_df['dataset_group']) == {'target', 'control', 'Base'}, f"Dataset groups must be either 'target' or 'control' or 'Base', got {set(_df['dataset_group'])}"
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=_df)
    return gh_result


# dfs = []
# for dependent_var in dependent_vars:
#     print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
#     for model in models:
#         print("-" * 20 + f" {model} " + "-" * 20)
#         result = games_howell_test_base(_df=model_split_df[model], _dependent_var=dependent_var)
#         result['model'] = model
#         result['dependent_var'] = dependent_var
#         result['setup'] = [f"{x.lower()}-{y.lower()}" for x,y in zip(result['A'], result['B'])]
#         result['diff'] = [f"{float(x):.2e}" for x in result['diff']]
#         result['pval'] = [f"{float(x):.2e}" for x in result['pval']]
#         #print(result[['diff', 'pval']])
#         dfs.append(result[['model', 'setup', 'diff', 'pval', 'dependent_var']])

# pd.concat(dfs, axis=0).to_csv("diff_pval_summary_incl_base.csv", index=False)
# print("output written")

# Per-task Games-Howell comparison (Base vs. target vs. control) for every dependent variable
# and model; the formatted diff / p-value of each pair is collected into one CSV.
dfs = []
for dependent_var in dependent_vars:
    print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
    for model in models:
        print("-" * 20 + f" {model} " + "-" * 20)
        for task in task_dataset_category:
            print(f"Task: {task}")
            result = games_howell_test_task_base(_df=model_split_df[model], task=task, _dependent_var=dependent_var)
            if result is None:
                # Not all three datasets are available for this model / task.
                continue
            result['model'] = model
            result['task'] = task
            result['dependent_var'] = dependent_var
            result['setup'] = [f"{x.lower()}-{y.lower()}" for x,y in zip(result['A'], result['B'])]
            result['diff'] = [f"{float(x):.2e}" for x in result['diff']]
            result['pval'] = [f"{float(x):.2e}" for x in result['pval']]
            print(result[['diff', 'pval']])
            dfs.append(result[['model', 'task', 'setup', 'diff', 'pval', 'dependent_var']])

# pd.concat raises on an empty list, which happens when every task was skipped above.
if dfs:
    pd.concat(dfs, axis=0).to_csv("diff_pval_summary_task_incl_base.csv", index=False)
    print("output written")
else:
    print("no Games-Howell results to write")

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Task: classification
        diff      pval
0   3.53e-01  4.19e-02
1  -2.00e+00  3.53e-14
2  -2.35e+00  0.00e+00
Task: summarization
        diff      pval
0  -4.42e-01  1.38e-02
1  -1.75e+00  5.41e-13
2  -1.31e+00  1.30e-08
Task: conversational
        diff      pval
0  -9.02e-01  1.37e-05
1  -2.03e+00  0.00e+00
2  -1.13e+00  1.02e-07
Task: qa
       diff      pval
0  1.18e-01  7.92e-01
1  2.90e-01  2.10e-01
2  1.72e-01  6.31e-01
-------------------- Llama3 --------------------
Task: classification
        diff      pval
0   1.39e+00  0.00e+00
1  -2.00e-01  4.51e-01
2  -1.59e+00  8.22e-15
Task: summarization
        diff      pval
0   9.39e-01  5.77e-13
1   4.68e-01  1.26e-03
2  -4.71e-01  4.32e-04
Task: conversational
        diff      pval
0  -7.73e-01  1.29e-08
1   1.48e+00  0.00e+00
2   2.25e+00  0.00e+00
Task: qa
        diff      pval
0  -7.2