In [1]:
import pandas as pd
from typing import List, Optional
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import levene
from pingouin import welch_anova
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
pandas2ri.activate()

pd.options.mode.chained_assignment = None  # default='warn'

ARTool = importr('ARTool')
base = importr('base')
stats = importr('stats')

In [4]:
def read_gsheet(url: Optional[str]=None, sheet_id: Optional[str]=None, gid: Optional[str]=None):
    if url is not None:
        match = re.search(r"spreadsheets/d/([^/]+)/.*?[?&]gid=(\d+)", url)
        if match:
            sheet_id = match.group(1)
            gid = match.group(2)
        else:
            print("can't parse url to get sheet id and gid")
    else:
        assert sheet_id is not None and gid is not None, "Sheet id an gid must be not None when url is not None"
    _url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}"
    return pd.read_csv(_url)

In [None]:
def multi_way_anova(_filter_df: pd.DataFrame, independent_vars: Optional[List[str]] = ["fine_tune_dataset"],
                    dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do multi way anova on the provided dataframe.
    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_vars: contributing factors
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return:
    """
    if collapse:
        mapping = {'Base': 'Base'}
        _filter_df.loc[:, 'fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')
    # print(f"df size: {_filter_df.shape}")
    # levene
    grouped_data =  _filter_df.groupby(independent_vars)[dependent_var].apply(list)
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05: # pval must be < 0.05 for us to reject the H_0 at Levenes.
        print("Robust ANOVA")
        r_df = pandas2ri.py2rpy(_filter_df)
        r.assign("rdf", r_df)

        for var in independent_vars:
            r(f"rdf${var} <- as.factor(rdf${var})")
        formula_str = f"{dependent_var} ~ {' * '.join(independent_vars)}"
        r(f'''
            library(ARTool)
            model <- art({formula_str}, data = rdf)
            art_result <- anova(model, type=2)
        ''')

        art_result = r('art_result')
        art_df = pandas2ri.rpy2py(art_result)
        return art_df
    formula = f"{dependent_var} ~" + ' + '.join([f'C({col})' for col in independent_vars])
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [48]:
def one_way_anova(_filter_df: pd.DataFrame, independent_var: Optional[str] = "fine_tune_dataset",
                  dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do one way anova on the provided dataframe.
    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_var: contributing factor
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return:
    """
    #print(f"df size: {_filter_df.shape}")
    # suppose the independent variable is prompt, then, it is possible that the dependent variable has the same value for all the prompts. 
    # in that case, we should filter out those rows as the variance will be zero.
    grouped_data = _filter_df.groupby(independent_var)
    for group_name, group in grouped_data:
        if group[dependent_var].nunique() <= 1:
            print(f"Group '{group_name}' has only one unique value for '{dependent_var}', removing it from analysis.")
            _filter_df = _filter_df[_filter_df[independent_var] != group_name]
    #print(f"df size after filtering: {_filter_df.shape}")
    mapping = {'Base': 'Base'}
    _filter_df.loc[:, 'fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')
    # levene
    grouped_data = [group[dependent_var].values for name, group in _filter_df.groupby(independent_var)]
    grouped_data = [group for group in grouped_data if len(group) > 1 and len(set(group)) > 1]  
    # filter out groups with only one observation and groups with no variance
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05:
        print("Welch")
        assert independent_var is not None and dependent_var is not None
        if len(set(_filter_df[independent_var])) < 2:
            print(f"Only one level of {independent_var} found, returning empty DataFrame.")
            return pd.DataFrame(columns=['F', 'PR(>F)', 'sum_sq', 'df', 'mean_sq', 'eta_sq'])
        welch_results = welch_anova(dv=dependent_var, between=independent_var, data=_filter_df)
        return welch_results
    formula=f"{dependent_var} ~ C({independent_var})"
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [2]:
#url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
url = "https://docs.google.com/spreadsheets/d/1L6WA2vUNvY3B7ClDqdYxCKmqTocDbbOz/edit?gid=126664231#gid=126664231"
df = read_gsheet(url=url)
df = df.drop_duplicates()
df = df[df.model != "Phi"]  # filter out Phi
print(len(df))
df['prompt'] = df['prompt'].astype(str)

2693


### base data analysis

1. one way anova.
2. multi way anova.

In [42]:
## base data analysis

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

for model in set(df['model']):
    _df = df[(df['model']== model) & (df['fine_tune_dataset'] == "Base")]
    _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)
    print(model, len(_df))
    model_split_df[model] = _df

gemma 80
Llama3 80
Falcon 73
Mistral 79


In [43]:
print(model_split_df["gemma"])

      output_y  output_x  n_beams  tmp  top_k fine_tune_dataset  model prompt
3095     -3.50     -5.79        5  1.0     50              Base  gemma      1
3096     -2.63     -5.08        1  0.5     50              Base  gemma      8
3097     -3.88     -6.82        1  1.0    100              Base  gemma      5
3098     -2.25     -4.77        1  1.0    100              Base  gemma      4
3099     -4.13     -7.33        1  0.5     50              Base  gemma      9
...        ...       ...      ...  ...    ...               ...    ...    ...
3170     -2.75     -7.33        1  1.0    100              Base  gemma      9
3171     -1.38     -3.74        5  1.0    100              Base  gemma      0
3172     -3.50     -5.79        5  1.0    100              Base  gemma      1
3173     -2.75     -5.79        1  1.0    100              Base  gemma      8
3174     -2.75     -6.36        1  0.5     50              Base  gemma      5

[80 rows x 8 columns]


### Effect of different parameters on the output variables.
for the generation parameters, there's no reason to expect interactions, and we also have less data. proceed with one way anova.


In [82]:
gen_factors = ["tmp", "top_k", "n_beams"]
models = ["gemma", "Llama3", "Falcon", "Mistral"]
dependent_vars = ["output_x", "output_y"]
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for gen_factor in gen_factors:
        print("="*20+f" {gen_factor} "+"="*20)
        for model in models:
            #print(model_split_df[model])
            print("-"*20+f" {model} "+"-"*20)
            print(one_way_anova(_filter_df=model_split_df[model], independent_var = gen_factor,
                    dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.0045, p-value = 0.9469
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.031601   1.0  0.029357  0.864402  0.000376
Residual  83.964247  78.0       NaN       NaN  0.999624
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 1.0567, p-value = 0.3071
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.046561   1.0  0.072264  0.788778  0.000926
Residual  50.257058  78.0       NaN       NaN  0.999074
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0031, p-value = 0.9558
            sum_sq    df         F    PR(>F)    eta_sq
C(tmp)    0.000177   1.0  0.001343  0.970871  0.000019
Residual  9.364724  71.0       NaN       NaN  0.999981
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0022, p-value = 0.963
     

In [83]:
#  code to convert it to csv format
raw_text = """
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.0045, p-value = 0.9469
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.031601   1.0  0.029357  0.864402  0.000376
Residual  83.964247  78.0       NaN       NaN  0.999624
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 1.0567, p-value = 0.3071
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.046561   1.0  0.072264  0.788778  0.000926
Residual  50.257058  78.0       NaN       NaN  0.999074
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0031, p-value = 0.9558
            sum_sq    df         F    PR(>F)    eta_sq
C(tmp)    0.000177   1.0  0.001343  0.970871  0.000019
Residual  9.364724  71.0       NaN       NaN  0.999981
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0022, p-value = 0.963
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.001317   1.0  0.001363  0.970645  0.000018
Residual  74.400177  77.0       NaN       NaN  0.999982
==================== top_k ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.0154, p-value = 0.9016
             sum_sq    df       F    PR(>F)    eta_sq
C(top_k)   0.132031   1.0  0.1228  0.726962  0.001572
Residual  83.863818  78.0     NaN       NaN  0.998428
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 1.1740, p-value = 0.2819
             sum_sq    df         F    PR(>F)    eta_sq
C(top_k)   0.043711   1.0  0.067837  0.795199  0.000869
Residual  50.259908  78.0       NaN       NaN  0.999131
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0544, p-value = 0.8163
            sum_sq    df         F    PR(>F)    eta_sq
C(top_k)  0.000583   1.0  0.004421  0.947173  0.000062
Residual  9.364318  71.0       NaN       NaN  0.999938
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0022, p-value = 0.963
             sum_sq    df         F    PR(>F)    eta_sq
C(top_k)   0.001317   1.0  0.001363  0.970645  0.000018
Residual  74.400177  77.0       NaN       NaN  0.999982
==================== n_beams ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.2601, p-value = 0.6115
               sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)   0.376751   1.0  0.351434  0.555017  0.004485
Residual    83.619098  78.0       NaN       NaN  0.995515
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 5.7678, p-value = 0.0187
Welch
    Source  ddof1      ddof2          F     p-unc       np2
0  n_beams      1  71.442937  16.263571  0.000136  0.172533
-------------------- Falcon --------------------
Levene’s Test: Statistic = 3.9696, p-value = 0.05018
              sum_sq    df        F        PR(>F)    eta_sq
C(n_beams)  3.528448   1.0  42.9233  7.714651e-09  0.376774
Residual    5.836453  71.0      NaN           NaN  0.623226
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.8188, p-value = 0.3684
               sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)   3.001519   1.0  3.236934  0.075913  0.040342
Residual    71.399974  77.0       NaN       NaN  0.959658
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.0524, p-value = 0.8195
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.056711   1.0  0.055967  0.813608  0.000717
Residual  79.037257  78.0       NaN       NaN  0.999283
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 0.0444, p-value = 0.8336
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.504031   1.0  0.534366  0.466967  0.006804
Residual  73.572108  78.0       NaN       NaN  0.993196
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0745, p-value = 0.7857
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.001874   1.0  0.003238  0.954779  0.000046
Residual  41.091824  71.0       NaN       NaN  0.999954
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0063, p-value = 0.937
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.002362   1.0  0.003016  0.956347  0.000039
Residual  60.296854  77.0       NaN       NaN  0.999961
==================== top_k ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 0.0192, p-value = 0.8902
             sum_sq    df         F    PR(>F)   eta_sq
C(top_k)   0.032401   1.0  0.031966  0.858565  0.00041
Residual  79.061567  78.0       NaN       NaN  0.99959
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 0.0169, p-value = 0.8969
             sum_sq    df         F    PR(>F)    eta_sq
C(top_k)   0.090451   1.0  0.095359  0.758295  0.001221
Residual  73.985687  78.0       NaN       NaN  0.998779
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0010, p-value = 0.9747
             sum_sq    df         F    PR(>F)    eta_sq
C(top_k)   0.035509   1.0  0.061404  0.805006  0.000864
Residual  41.058190  71.0       NaN       NaN  0.999136
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0063, p-value = 0.937
             sum_sq    df         F    PR(>F)    eta_sq
C(top_k)   0.002362   1.0  0.003016  0.956347  0.000039
Residual  60.296854  77.0       NaN       NaN  0.999961
==================== n_beams ====================
-------------------- gemma --------------------
Levene’s Test: Statistic = 1.3672, p-value = 0.2459
               sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)   0.331531   1.0  0.328322  0.568297  0.004192
Residual    78.762438  78.0       NaN       NaN  0.995808
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 0.8596, p-value = 0.3567
               sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)   0.961411   1.0  1.025649  0.314314  0.012979
Residual    73.114728  78.0       NaN       NaN  0.987021
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0962, p-value = 0.7573
               sum_sq    df          F        PR(>F)    eta_sq
C(n_beams)  22.296481   1.0  84.217263  1.106546e-13  0.542577
Residual    18.797217  71.0        NaN           NaN  0.457423
-------------------- Mistral --------------------
Levene’s Test: Statistic = 15.6265, p-value = 0.00017
Welch
    Source  ddof1      ddof2         F     p-unc       np2
0  n_beams      1  69.285649  0.493315  0.484805  0.006416
"""
## ^^ the raw text block containing the results of the ANOVA tests from the previous step.

import pandas as pd
import re
import numpy as np
import itertools


def parse_results(text_block):
    results = {}
    sections = re.split(r"=+ ([a-z_]+) =+", text_block)
    for i in range(1, len(sections), 2):
        param = sections[i]
        models_data = re.split(r"-+ ([A-Za-z0-9]+) -+", sections[i+1])
        results[param] = {}
        for j in range(1, len(models_data), 2):
            model = models_data[j]
            content = models_data[j+1]
            welch_match = "welch" in content.lower()
            if welch_match:
                pattern = r'(\d+\.\d+)\s+(\d+\.\d+(?:e[+-]?\d+)?)\s+(\d+\.\d+)$'
                match = re.search(pattern, content.strip())
                if match:
                    F = float(match.group(1))      # F value: 16.263571
                    p = float(match.group(2))  # p-unc value: 0.000136
            else:
                match = re.search(r'F\s*=\s*([0-9.]+)[,\s]+p-value\s*=\s*([0-9.]+)', content)
                if match:
                    F = float(match.group(1))
                    p = float(match.group(2))
                else:
                    match2 = re.search(r'C\([a-z_]+\).*?([0-9.]+)\s+[0-9.]+\s+([0-9.eE+-]+)\s+([0-9.eE+-]+)', content, re.DOTALL)
                    if match2:
                        F = float(match2.group(2))
                        p = float(match2.group(3))
                    else:
                        F, p = np.nan, np.nan
            results[param][model] = (F, p)
    return results


output_x = re.search(r'output_x.*?=+', raw_text, re.DOTALL).end()
output_y = re.search(r'output_y.*?=+', raw_text, re.DOTALL).end()
text_x = raw_text[output_x:output_y-10]
text_y = raw_text[output_y:]

results_x = parse_results(text_x)
results_y = parse_results(text_y)

rows = []
models = ["gemma", "Llama3", "Falcon", "Mistral"]
params = results_x.keys()

for param in params:
    for i, model in enumerate(models):
        fx, px = results_x.get(param, {}).get(model, (np.nan, np.nan))
        fy, py = results_y.get(param, {}).get(model, (np.nan, np.nan))
        rows.append({
            'Decoding Param': param if i == 0 else '',
            'Model': model,
            'F-stat X': fx,
            'p-value X': px,
            'F-stat Y': fy,
            'p-value Y': py
        })

df = pd.DataFrame(rows)


def convert_csv_to_scientific_notation(_df):
    for col in _df.columns:
        if _df[col].dtype in ['float64', 'float32', 'int64', 'int32']:
            _df[col] = _df[col].apply(lambda x: f"{x:.2e}")
    return _df


# df = sort_models(df, models)
print(df)
df = convert_csv_to_scientific_notation(df)
df.to_csv("one_way_anova_results_base_except_prompt.csv", index=False)

   Decoding Param    Model   F-stat X     p-value X   F-stat Y     p-value Y
0             tmp    gemma   0.029357  8.644020e-01   0.055967  8.136080e-01
1                   Llama3   0.072264  7.887780e-01   0.534366  4.669670e-01
2                   Falcon   0.001343  9.708710e-01   0.003238  9.547790e-01
3                  Mistral   0.001363  9.706450e-01   0.003016  9.563470e-01
4           top_k    gemma   0.122800  7.269620e-01   0.031966  8.585650e-01
5                   Llama3   0.067837  7.951990e-01   0.095359  7.582950e-01
6                   Falcon   0.004421  9.471730e-01   0.061404  8.050060e-01
7                  Mistral   0.001363  9.706450e-01   0.003016  9.563470e-01
8         n_beams    gemma   0.351434  5.550170e-01   0.328322  5.682970e-01
9                   Llama3  16.263571  1.360000e-04   1.025649  3.143140e-01
10                  Falcon  42.923300  7.714651e-09  84.217263  1.106546e-13
11                 Mistral   3.236934  7.591300e-02   0.493315  4.848050e-01

In [84]:
## what is the effect of prompt?

for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(one_way_anova(_filter_df=model_split_df[model], independent_var = "prompt", dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Levene’s Test: Statistic = 3.3080, p-value = 0.002046
Welch
   Source  ddof1      ddof2           F         p-unc       np2
0  prompt      9  26.994029  818.463289  1.646926e-30  0.953824
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 3.6131, p-value = 0.0009597
Welch
   Source  ddof1      ddof2         F     p-unc       np2
0  prompt      9  28.336067  1.755564  0.122112  0.127021
-------------------- Falcon --------------------
Group '0' has only one unique value for 'output_x', removing it from analysis.
Levene’s Test: Statistic = 121.5935, p-value = 3.857e-34
Welch
   Source  ddof1      ddof2         F     p-unc      np2
0  prompt      8  24.574243  1.528834  0.198284  0.17153
-------------------- Mistral --------------------
Levene’s Test: Statistic = 484.6359, p-value = 1.062e-58
Welch
   Source  ddof1      ddof2  

In [85]:
raw_text = """
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Levene’s Test: Statistic = 3.3080, p-value = 0.002046
Welch
   Source  ddof1      ddof2           F         p-unc       np2
0  prompt      9  26.994029  818.463289  1.646926e-30  0.953824
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 3.6131, p-value = 0.0009597
Welch
   Source  ddof1      ddof2         F     p-unc       np2
0  prompt      9  28.336067  1.755564  0.122112  0.127021
-------------------- Falcon --------------------
Levene’s Test: Statistic = 121.5935, p-value = 3.857e-34
Welch
   Source  ddof1      ddof2         F     p-unc      np2
0  prompt      8  24.574243  1.528834  0.198284  0.17153
-------------------- Mistral --------------------
Levene’s Test: Statistic = 484.6359, p-value = 1.062e-58
Welch
   Source  ddof1      ddof2           F         p-unc       np2
0  prompt      9  27.210306  230.811706  2.639164e-23  0.943831
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
Levene’s Test: Statistic = 4.7409, p-value = 6.216e-05
Welch
   Source  ddof1      ddof2           F         p-unc       np2
0  prompt      9  27.834362  102.075943  6.378794e-19  0.879945
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 4.6550, p-value = 7.625e-05
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      9  28.122914  34.965386  6.076650e-13  0.721607
-------------------- Falcon --------------------
Levene’s Test: Statistic = 408.1929, p-value = 2.235e-44
Welch
   Source  ddof1      ddof2         F     p-unc       np2
0  prompt      7  23.005145  3.395906  0.012227  0.246062
-------------------- Mistral --------------------
Levene’s Test: Statistic = 629.2575, p-value = 7.447e-44
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      6  19.693397  20.403909  1.735803e-07  0.542234
"""
## ^^ the raw text block containing the results of the ANOVA tests from the previous step.

import pandas as pd
import re
import numpy as np
import itertools


def parse_results(text_block):
    results = {}
    section = text_block.strip()
    models_data = re.split(r"-+ ([A-Za-z0-9]+) -+", section)
    param = "prompt"
    results[param] = {}
    for j in range(1, len(models_data), 2):
        model = models_data[j]
        content = models_data[j+1]
        welch_match = "welch" in content.lower()
        if welch_match:
            pattern = r'(\d+\.\d+)\s+(\d+\.\d+(?:e[+-]?\d+)?)\s+(\d+\.\d+)$'
            match = re.search(pattern, content.strip())
            if match:
                F = float(match.group(1))      # F value: 16.263571
                p = float(match.group(2))  # p-unc value: 0.000136
        else:
            match = re.search(r'F\s*=\s*([0-9.]+)[,\s]+p-value\s*=\s*([0-9.]+)', content)
            if match:
                F = float(match.group(1))
                p = float(match.group(2))
            else:
                match2 = re.search(r'C\([a-z_]+\).*?([0-9.]+)\s+[0-9.]+\s+([0-9.eE+-]+)\s+([0-9.eE+-]+)', content, re.DOTALL)
                if match2:
                    F = float(match2.group(2))
                    p = float(match2.group(3))
                else:
                    F, p = np.nan, np.nan
        results[param][model] = (F, p)
    return results


output_x = re.search(r'output_x.*?=+', raw_text, re.DOTALL).end()
output_y = re.search(r'output_y.*?=+', raw_text, re.DOTALL).end()
text_x = raw_text[output_x:output_y-10]
text_y = raw_text[output_y:]
results_x = parse_results(text_x)
results_y = parse_results(text_y)

rows = []
models = ["gemma", "Llama3", "Falcon", "Mistral"]
params = results_x.keys()

for param in params:
    for i, model in enumerate(models):
        fx, px = results_x.get(param, {}).get(model, (np.nan, np.nan))
        fy, py = results_y.get(param, {}).get(model, (np.nan, np.nan))
        rows.append({
            'Decoding Param': param if i == 0 else '',
            'Model': model,
            'F-stat X': fx,
            'p-value X': px,
            'F-stat Y': fy,
            'p-value Y': py
        })

df = pd.DataFrame(rows)


def convert_csv_to_scientific_notation(_df):
    for col in _df.columns:
        if _df[col].dtype in ['float64', 'float32', 'int64', 'int32']:
            _df[col] = _df[col].apply(lambda x: f"{x:.2e}")
    return _df


# df = sort_models(df, models)
print(df)
df = convert_csv_to_scientific_notation(df)
df.to_csv("one_way_anova_results_base_only_prompt.csv", index=False)

  Decoding Param    Model    F-stat X     p-value X    F-stat Y     p-value Y
0         prompt    gemma  818.463289  1.646926e-30  102.075943  6.378794e-19
1                  Llama3    1.755564  1.221120e-01   34.965386  6.076650e-13
2                  Falcon    1.528834  1.982840e-01    3.395906  1.222700e-02
3                 Mistral    1.528834  1.982840e-01   20.403909  1.735803e-07


In [81]:
# we don't _need_ to see the interaction effect of variables, but just to be sure
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(multi_way_anova(_filter_df=model_split_df[model], independent_vars= ["prompt", "top_k"], dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
df size: (80, 8)
Levene’s Test: Statistic = 4.1442, p-value = 1.252e-05
Robust ANOVA
                      Term   Df  Df.res    Sum Sq  Sum Sq.res    F value  \
prompt              prompt  9.0    60.0  37749.75      4764.0  52.826406   
top_k                top_k  1.0    60.0   1620.00     38337.5   2.535377   
prompt:top_k  prompt:top_k  9.0    60.0   4707.25     34108.0   0.920068   

                    Pr(>F)  
prompt        3.329243e-25  
top_k         1.165756e-01  
prompt:top_k  5.143277e-01  
-------------------- Llama3 --------------------
df size: (80, 8)
Levene’s Test: Statistic = 3.7918, p-value = 4.041e-05
Robust ANOVA
                      Term   Df  Df.res     Sum Sq  Sum Sq.res   F value  \
prompt              prompt  9.0    60.0  5300.4375   37035.625  0.954115   
top_k                top_k  1.0    60.0     4.0500   41817.625  0.005

In [93]:
## convert the results to csv format
text = """
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
df size: (720, 8)
Levene’s Test: Statistic = 8.4102, p-value = 1.513e-21
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   700.0  8.444127e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.387190e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  8.575971e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.259571e+07  29.065929  4.154417e-43  
fine_tune_dataset         2.964537e+07  32.754960  1.550529e-08  
prompt:fine_tune_dataset  3.016229e+07   2.211437  1.970095e-02  
-------------------- Llama3 --------------------
df size: (713, 8)
Levene’s Test: Statistic = 3.5789, p-value = 4.779e-07
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   693.0  1.665446e+06   
fine_tune_dataset                fine_tune_dataset  1.0   693.0  1.139668e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   693.0  3.695426e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.842045e+07   4.512221  8.745441e-06  
fine_tune_dataset         2.894055e+07  27.290088  2.318861e-07  
prompt:fine_tune_dataset  2.962474e+07   0.960507  4.716849e-01  
-------------------- Falcon --------------------
df size: (633, 8)
Levene’s Test: Statistic = 16.8733, p-value = 1.57e-44
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   613.0  2.491218e+06   
fine_tune_dataset                fine_tune_dataset  1.0   613.0  6.257565e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   613.0  1.482095e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    1.844981e+07   9.196825  3.888562e-13  
fine_tune_dataset         2.022138e+07  18.969461  1.556795e-05  
prompt:fine_tune_dataset  2.082047e+07   0.484845  8.851998e-01  
-------------------- Mistral --------------------
df size: (627, 8)
Levene’s Test: Statistic = 11.5625, p-value = 3.207e-30
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   607.0  1.143454e+06   
fine_tune_dataset                fine_tune_dataset  1.0   607.0  4.401117e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   607.0  9.901578e+05   

                            Sum Sq.res    F value    Pr(>F)  
prompt                    1.938750e+07   3.977801  0.000060  
fine_tune_dataset         2.008808e+07  13.298823  0.000288  
prompt:fine_tune_dataset  1.952883e+07   3.419593  0.000407  
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
df size: (720, 8)
Levene’s Test: Statistic = 6.5959, p-value = 4.651e-16
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   700.0  6.715908e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.840088e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  1.726181e+06   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.429742e+07  21.498102  2.771317e-32  
fine_tune_dataset         2.921293e+07  44.092181  6.287845e-11  
prompt:fine_tune_dataset  2.935353e+07   4.573845  6.994484e-06  
-------------------- Llama3 --------------------
df size: (713, 8)
Levene’s Test: Statistic = 4.3555, p-value = 2.634e-09
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   693.0  1.791252e+06   
fine_tune_dataset                fine_tune_dataset  1.0   693.0  1.800514e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   693.0  7.204748e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.828995e+07   4.875456  2.386184e-06  
fine_tune_dataset         2.835922e+07  43.998262  6.622597e-11  
prompt:fine_tune_dataset  2.938120e+07   1.888165  5.074256e-02  
-------------------- Falcon --------------------
df size: (633, 8)
Levene’s Test: Statistic = 3.3552, p-value = 2.253e-06
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   613.0  2.293624e+06   
fine_tune_dataset                fine_tune_dataset  1.0   613.0  7.858100e+02   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   613.0  3.444248e+05   

                            Sum Sq.res   F value        Pr(>F)  
prompt                    1.881984e+07  8.300884  1.017352e-11  
fine_tune_dataset         2.108149e+07  0.022849  8.798988e-01  
prompt:fine_tune_dataset  2.072867e+07  1.131725  3.377401e-01  
-------------------- Mistral --------------------
df size: (627, 8)
Levene’s Test: Statistic = 6.7392, p-value = 2.882e-16
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   607.0  2.924756e+06   
fine_tune_dataset                fine_tune_dataset  1.0   607.0  5.470815e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   607.0  8.455530e+05   

                            Sum Sq.res     F value        Pr(>F)  
prompt                    1.758870e+07   11.215069  2.675569e-16  
fine_tune_dataset         1.500849e+07  221.260351  6.745058e-43  
prompt:fine_tune_dataset  1.961297e+07    2.907660  2.229715e-03
"""
output_x_text = re.search(r"output_x\s*=+:([\s\S]+?)=+: output_y", text).group(1)
output_y_text = re.search(r"output_y\s*=+:([\s\S]+)", text).group(1)
print(output_x_text)
print("="*40)
print(output_y_text)


=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
df size: (720, 8)
Levene’s Test: Statistic = 8.4102, p-value = 1.513e-21
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  prompt                                      prompt  9.0   700.0  8.444127e+06   
fine_tune_dataset                fine_tune_dataset  1.0   700.0  1.387190e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   700.0  8.575971e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.259571e+07  29.065929  4.154417e-43  
fine_tune_dataset         2.964537e+07  32.754960  1.550529e-08  
prompt:fine_tune_dataset  3.016229e+07   2.211437  1.970095e-02  
-------------------- Llama3 --------------------
df size: (713, 8)
Levene’s Test: Statistic = 3.5789, p-value = 4.779e-07
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  prompt                    

In [69]:
from scipy.stats import ttest_ind
def t_test(_df: pd.DataFrame, _dependent_var: str):
    _df['is_finetuned'] = df['fine_tune_dataset'].apply(lambda x: 'Base' not in x)
    base_dep_var = _df[_df['is_finetuned'] == False][_dependent_var]
    finetuned_dep_var = _df[_df['is_finetuned'] == True][_dependent_var]
    t_stat, p_value = ttest_ind(base_dep_var, finetuned_dep_var, equal_var=False)
    return (t_stat, p_value)    

for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(t_test(_df=model_split_df[model], _dependent_var = dependent_var))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(-6.127505279754074), np.float64(1.055762172068916e-08))
-------------------- Llama3 --------------------
(np.float64(5.082108758340775), np.float64(1.1090455663368004e-06))
-------------------- Falcon --------------------
(np.float64(8.371913648092294), np.float64(2.9543018299527145e-15))
-------------------- Mistral --------------------
(np.float64(-5.235095992765232), np.float64(6.911225653954734e-07))
-------------------- Phi --------------------
(np.float64(12.417752026120633), np.float64(2.1585652574624705e-27))
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
(np.float64(5.966800986687974), np.float64(2.0666166496433783e-08))
-------------------- Llama3 --------------------
(np.float64(-9.474745646468088), np.float64(2.0860475515015573e-17))
-

In [43]:
import re
import csv

output_x_raw = """
-------------------- gemma --------------------
(np.float64(-6.127505279754074), np.float64(1.055762172068916e-08))
-------------------- Llama3 --------------------
(np.float64(5.082108758340775), np.float64(1.1090455663368004e-06))
-------------------- Falcon --------------------
(np.float64(8.371913648092294), np.float64(2.9543018299527145e-15))
-------------------- Mistral --------------------
(np.float64(-5.30810768144648), np.float64(4.723292695384559e-07))
-------------------- Phi --------------------
(np.float64(12.417752026120633), np.float64(2.1585652574624705e-27))
"""

output_y_raw = """
-------------------- gemma --------------------
(np.float64(5.966800986687974), np.float64(2.0666166496433783e-08))
-------------------- Llama3 --------------------
(np.float64(-9.474745646468088), np.float64(2.0860475515015573e-17))
-------------------- Falcon --------------------
(np.float64(-0.5598663913560933), np.float64(0.57653193374933))
-------------------- Mistral --------------------
(np.float64(-22.823010125410967), np.float64(1.3589373117604956e-54))
-------------------- Phi --------------------
(np.float64(5.14785348670747), np.float64(3.1538938438164265e-06))
"""

def parse_output(raw_text):
    pattern = r"--------------------\s+(.*?)\s+--------------------\s+\(np\.float64\(([-\d.eE+]+)\),\s+np\.float64\(([\d.eE+-]+)\)\)"
    return {match[0]: (float(match[1]), float(match[2])) for match in re.findall(pattern, raw_text, re.DOTALL)}

output_x = parse_output(output_x_raw)
output_y = parse_output(output_y_raw)
print(output_x)
print(output_y)
# Write to CSV
with open("t_stats.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["model", "t-stat-x", "p-value", "t-stat-y", "p-value"])
    for model in output_x:
        t_x, p_x = output_x[model]
        t_y, p_y = output_y[model]
        writer.writerow([
            model,
            f"{t_x:.1e}",
            f"{p_x:.1e}",
            f"{t_y:.1e}",
            f"{p_y:.1e}"
        ])


{'gemma': (-6.127505279754074, 1.055762172068916e-08), 'Llama3': (5.082108758340775, 1.1090455663368004e-06), 'Falcon': (8.371913648092294, 2.9543018299527145e-15), 'Mistral': (-5.30810768144648, 4.723292695384559e-07), 'Phi': (12.417752026120633, 2.1585652574624705e-27)}
{'gemma': (5.966800986687974, 2.0666166496433783e-08), 'Llama3': (-9.474745646468088, 2.0860475515015573e-17), 'Falcon': (-0.5598663913560933, 0.57653193374933), 'Mistral': (-22.823010125410967, 1.3589373117604956e-54), 'Phi': (5.14785348670747, 3.1538938438164265e-06)}


In [1]:
# url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
url = "https://docs.google.com/spreadsheets/d/1L6WA2vUNvY3B7ClDqdYxCKmqTocDbbOz/edit?gid=126664231#gid=126664231"
df = read_gsheet(url=url)
df = df.drop_duplicates()
print(len(df))
df['prompt'] = df['prompt'].astype(str)


model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

for model in set(df['model']):
    _df = df[(df['model']== model)]
    _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)
    print(model, len(_df))
    model_split_df[model] = _df
    
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
dataset_category = {"target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-convo'}, "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}}
# we know prompt and fine-tuning is important
def tukey_test(_df, _dependent_var='output_x'):
    _df = _df[_df['fine_tune_dataset'] != 'Base']
    #print(len(_df), list(_df), set(_df['fine_tune_dataset']))
    _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
    #model = ols(f'{_dependent_var} ~ C(prompt) + C(fine_tune_dataset) + C(prompt):C(fine_tune_dataset)', data=_df).fit()
    #anova_table = sm.stats.anova_lm(model, typ=2)
    tukey = pairwise_tukeyhsd(endog=_df[_dependent_var],
                              groups=_df["dataset_group"],
                              alpha=0.05)
    return tukey

for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(tukey_test(_df=model_split_df[model], _dependent_var = dependent_var))

3173
Llama3 713
gemma 720
Phi 480
Falcon 633
Mistral 627


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)


NameError: name 'dependent_vars' is not defined

In [71]:
import pandas as pd
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm

dataset_category = {
    "target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-convo'},
    "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}
}

def games_howell_test(_df, _dependent_var='output_x'):
    _df = _df[_df['fine_tune_dataset'] != 'Base'].copy()
    _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
    # model = ols(f'{_dependent_var} ~ C(prompt) + C(fine_tune_dataset) + C(prompt):C(fine_tune_dataset)', data=_df).fit()
    # anova_table = sm.stats.anova_lm(model, typ=2)
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=_df)
    return gh_result

for dependent_var in dependent_vars:
    print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
    for model in models:
        print("-" * 20 + f" {model} " + "-" * 20)
        result = games_howell_test(_df=model_split_df[model], _dependent_var=dependent_var)
        #print(result)
        print(result[['diff', 'pval']])

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
       diff      pval
0 -0.573642  0.000008
-------------------- Llama3 --------------------
       diff      pval
0  0.125732  0.244809
-------------------- Falcon --------------------
       diff      pval
0 -0.231071  0.007725
-------------------- Mistral --------------------
       diff      pval
0  0.373778  0.000422
-------------------- Phi --------------------
       diff  pval
0 -0.697567   0.0
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
       diff  pval
0  1.264442   0.0
-------------------- Llama3 --------------------
       diff      pval
0 -0.017108  0.911435
-------------------- Falcon --------------------
       diff      pval
0  0.076148  0.467119
-------------------- Mistral --------------------
       diff     pval
0 -0.051392  0.70409
--