In [2]:
import pandas as pd
from typing import List, Optional
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import levene
from pingouin import welch_anova
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
# Enable rpy2's pandas <-> R conversion for the whole session (used by the ART ANOVA path below).
pandas2ri.activate()

# Silences pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Handles to R packages loaded through rpy2. ARTool provides the aligned-rank-transform ANOVA;
# NOTE(review): `base` and `stats` are not referenced in the cells visible here - confirm they are needed.
ARTool = importr('ARTool')
base = importr('base')
stats = importr('stats')

In [4]:
def read_gsheet(url: Optional[str]=None, sheet_id: Optional[str]=None, gid: Optional[str]=None):
    """
    Load one tab of a Google Sheet into a DataFrame through the sheet's CSV export endpoint.

    :param url: browser URL of the sheet (``.../spreadsheets/d/<sheet_id>/...gid=<gid>``). The gid may
    appear in the query string (``?gid=`` / ``&gid=``) or in the fragment (``#gid=``). When the url
    can be parsed, the ids extracted from it take precedence over ``sheet_id`` / ``gid``.
    :param sheet_id: spreadsheet id; used when ``url`` is None or cannot be parsed
    :param gid: id of the tab inside the spreadsheet; used together with ``sheet_id``
    :return: whatever ``pd.read_csv`` returns for the export URL
    :raises ValueError: if no sheet id / gid can be determined from the arguments
    """
    if url is not None:
        match = re.search(r"spreadsheets/d/([^/]+)/.*?[?&#]gid=(\d+)", url)
        if match:
            sheet_id, gid = match.group(1), match.group(2)
        elif sheet_id is None or gid is None:
            # Previously this only printed and then requested ".../d/None/...gid=None".
            raise ValueError(f"can't parse url to get sheet id and gid: {url!r}")
        else:
            # Unparseable url, but explicit ids were supplied: fall back to them.
            print("can't parse url to get sheet id and gid")
    elif sheet_id is None or gid is None:
        raise ValueError("sheet_id and gid must both be given when url is None")
    _url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}"
    return pd.read_csv(_url)

In [6]:
def multi_way_anova(_filter_df: pd.DataFrame, independent_vars: Optional[List[str]] = ["fine_tune_dataset"],
                    dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    Run a multi-way ANOVA of ``dependent_var`` on ``independent_vars``.

    Levene's test (centered on the mean) is run over every cell of the design first. If it rejects
    equal variances (p < 0.05), an aligned-rank-transform ANOVA is fitted in R through ARTool;
    otherwise a type-2 OLS ANOVA is fitted with statsmodels and an ``eta_sq`` column is appended.

    The dataframe passed in is never modified.

    :param _filter_df: output from a model
    :param independent_vars: contributing factors (column names in the df)
    :param dependent_var: predicted variable (column name in the df)
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for ``fine_tune_dataset`` -- 'Base' or 'fine_tuned'
    :return: the ART ANOVA table converted back to pandas (unequal variances), or the statsmodels
    ANOVA table with an extra ``eta_sq`` column (equal variances)
    """
    if collapse:
        # .assign builds a new frame, so the caller's dataset labels are left intact
        # (the previous in-place .loc write overwrote them for every later analysis).
        collapsed = _filter_df['fine_tune_dataset'].map({'Base': 'Base'}).fillna('fine_tuned')
        _filter_df = _filter_df.assign(fine_tune_dataset=collapsed)

    # Levene: one sample per combination of factor levels.
    grouped_data = _filter_df.groupby(independent_vars)[dependent_var].apply(list)
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")

    if pval < 0.05:  # equal-variance assumption rejected -> rank-based ANOVA
        print("Robust ANOVA")
        r_df = pandas2ri.py2rpy(_filter_df)
        r.assign("rdf", r_df)

        # ART needs the predictors as R factors.
        for var in independent_vars:
            r(f"rdf${var} <- as.factor(rdf${var})")
        formula_str = f"{dependent_var} ~ {' * '.join(independent_vars)}"
        r(f'''
            library(ARTool)
            model <- art({formula_str}, data = rdf)
            art_result <- anova(model, type=2)
        ''')

        art_result = r('art_result')
        art_df = pandas2ri.rpy2py(art_result)
        return art_df

    # NOTE(review): this branch fits main effects only ('+'), while the ART branch above fits the
    # full factorial ('*'); confirm whether interaction terms are wanted here as well.
    formula = f"{dependent_var} ~" + ' + '.join([f'C({col})' for col in independent_vars])
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    # eta squared: each term's share of the summed sum-of-squares column
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [7]:
def one_way_anova(_filter_df: pd.DataFrame, independent_var: Optional[str] = "fine_tune_dataset",
                  dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    Run a one-way ANOVA of ``dependent_var`` on ``independent_var``.

    Levels of ``independent_var`` whose ``dependent_var`` has at most one unique value are dropped
    first (they have zero variance). Levene's test (centered on the mean) then decides between
    Welch's ANOVA (p < 0.05) and a type-2 OLS ANOVA with an added ``eta_sq`` column.

    The dataframe passed in is never modified.

    :param _filter_df: output from a model
    :param independent_var: contributing factor (column name in the df)
    :param dependent_var: predicted variable (column name in the df)
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for ``fine_tune_dataset`` -- 'Base' or 'fine_tuned'. Only applied when True.
    :return: the Welch result table, the OLS ANOVA table with ``eta_sq``, or an empty DataFrame when
    fewer than two levels with variance remain
    :raises ValueError: if ``independent_var`` or ``dependent_var`` is None
    """
    if independent_var is None or dependent_var is None:
        raise ValueError("independent_var and dependent_var must not be None")

    # suppose the independent variable is prompt, then, it is possible that the dependent variable
    # has the same value for all rows of a prompt; such a level has zero variance and is removed.
    unique_counts = _filter_df.groupby(independent_var)[dependent_var].nunique()
    constant_levels = unique_counts[unique_counts <= 1].index.tolist()
    for group_name in constant_levels:
        print(f"Group '{group_name}' has only one unique value for '{dependent_var}', removing it from analysis.")
    # Boolean indexing yields a new frame, so the caller's data is not touched from here on.
    _filter_df = _filter_df[~_filter_df[independent_var].isin(constant_levels)]

    if collapse:
        # Previously this ran unconditionally (ignoring `collapse`) and wrote into the caller's frame.
        collapsed = _filter_df['fine_tune_dataset'].map({'Base': 'Base'}).fillna('fine_tuned')
        _filter_df = _filter_df.assign(fine_tune_dataset=collapsed)

    # levene: one sample per level; skip levels with a single observation or no variance
    grouped_data = [group[dependent_var].values for _, group in _filter_df.groupby(independent_var)]
    grouped_data = [group for group in grouped_data if len(group) > 1 and len(set(group)) > 1]
    if len(grouped_data) < 2:
        # levene() raises on fewer than two samples, so bail out before calling it.
        print(f"Fewer than two levels of {independent_var} with variance found, returning empty DataFrame.")
        return pd.DataFrame(columns=['F', 'PR(>F)', 'sum_sq', 'df', 'mean_sq', 'eta_sq'])

    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05:  # equal-variance assumption rejected
        print("Welch")
        return welch_anova(dv=dependent_var, between=independent_var, data=_filter_df)

    formula = f"{dependent_var} ~ C({independent_var})"
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    # eta squared: each term's share of the summed sum-of-squares column
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [8]:
# Pull the results sheet, drop exact duplicate rows and leave the Phi model out of the analysis.
url = "https://docs.google.com/spreadsheets/d/1XgO8oV-78ct0aaHGdGYg3RiTVIXE3aQw/edit?gid=1451271972#gid=1451271972"
df = read_gsheet(url=url).drop_duplicates()
df = df[df["model"] != "Phi"]  # filter out Phi
print(len(df))
# Prompt ids are used as group labels, so store them as strings.
df['prompt'] = df['prompt'].astype(str)

2704


### Base data analysis

1. One-way ANOVA.
2. Multi-way ANOVA.

In [13]:
## base data analysis

# model name -> rows of that model with fine_tune_dataset == "Base"
model_split_df = {}

# lowercase name used in the analysis -> capitalized column name as it appears in `df`
columns = {
    'equality': "Equality",
    'nation': "Nation",
    'liberty': "Liberty",
    'tradition': "Tradition",
}

for model in set(df['model']):
    _df = df[(df['model'] == model) & (df['fine_tune_dataset'] == "Base")]
    # rename the capitalized columns to their lowercase analysis names
    _df = _df.rename(columns={v: k for k, v in columns.items()})
    print(model, len(_df))
    model_split_df[model] = _df

print(set(df['model']))
# fixed reporting order of the models for the loops below
models = ["Gemma", "Llama", "Falcon", "Mistral"]

Mistral 80
Gemma 80
Llama 80
Falcon 80
{'Mistral', 'Gemma', 'Llama', 'Falcon'}


In [14]:
# Sanity check: show the Base-only frame stored for one model.
print(model_split_df["Gemma"])

      model  n_beams  tmp  top_k prompt fine_tune_dataset  equality  \
1984  Gemma        1  0.5     50      0              Base      71.8   
1985  Gemma        1  0.5     50      1              Base      79.5   
1986  Gemma        1  0.5     50      2              Base      74.4   
1987  Gemma        1  0.5     50      3              Base      69.9   
1988  Gemma        1  0.5     50      4              Base      78.2   
...     ...      ...  ...    ...    ...               ...       ...   
2059  Gemma        5  1.0    100      5              Base      79.5   
2060  Gemma        5  1.0    100      6              Base      78.8   
2061  Gemma        5  1.0    100      7              Base      79.5   
2062  Gemma        5  1.0    100      8              Base      78.8   
2063  Gemma        5  1.0    100      9              Base      79.5   

     Equality_binned  nation Nation_binned  liberty Liberty_binned  tradition  \
1984            left    46.1      moderate     57.0       moderate

### Effect of different parameters on the output variables.
For the generation parameters there is no reason to expect interactions, and we also have less data, so we proceed with one-way ANOVA.


In [47]:
# One-way ANOVA for every (output variable, generation parameter, model) combination.
# The three banner styles ("=:", "=", "-") mark the nesting level in the printed log.
gen_factors = ["tmp", "top_k", "n_beams", "prompt"]
dependent_vars = ["equality", "nation", "liberty", "tradition"]
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    for gen_factor in gen_factors:
        print(f"{'=' * 20} {gen_factor} {'=' * 20}")
        for model in models:
            print(f"{'-' * 20} {model} {'-' * 20}")
            anova_result = one_way_anova(
                _filter_df=model_split_df[model],
                independent_var=gen_factor,
                dependent_var=dependent_var,
                collapse=False,
            )
            print(anova_result)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: equality =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0656, p-value = 0.7986
           sum_sq    df         F    PR(>F)    eta_sq
C(tmp)      1.682   1.0  0.153425  0.696352  0.001963
Residual  855.116  78.0       NaN       NaN  0.998037
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.5230, p-value = 0.2209
               sum_sq    df         F    PR(>F)    eta_sq
C(tmp)      13.366125   1.0  0.619664  0.433557  0.007882
Residual  1682.455750  78.0       NaN       NaN  0.992118
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    7.987217e-29   1.0  3.171609e-30     1.0  4.066165e-32
Residual  1.964312e+03  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.000

In [49]:
# Parse the printed one-way ANOVA output (pasted into `raw_text` below) and export the F / p values as CSV.
raw_text = """
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: equality =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0656, p-value = 0.7986
           sum_sq    df         F    PR(>F)    eta_sq
C(tmp)      1.682   1.0  0.153425  0.696352  0.001963
Residual  855.116  78.0       NaN       NaN  0.998037
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.5230, p-value = 0.2209
               sum_sq    df         F    PR(>F)    eta_sq
C(tmp)      13.366125   1.0  0.619664  0.433557  0.007882
Residual  1682.455750  78.0       NaN       NaN  0.992118
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    7.987217e-29   1.0  3.171609e-30     1.0  4.066165e-32
Residual  1.964312e+03  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    7.987217e-29   1.0  5.887038e-30     1.0  7.547485e-32
Residual  1.058262e+03  78.0           NaN     NaN  1.000000e+00
==================== top_k ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.1342, p-value = 0.7151
           sum_sq    df         F    PR(>F)    eta_sq
C(top_k)    4.608   1.0  0.421765  0.517966  0.005378
Residual  852.190  78.0       NaN       NaN  0.994622
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.1201, p-value = 0.7299
               sum_sq    df         F    PR(>F)    eta_sq
C(top_k)     2.701125   1.0  0.124438  0.725222  0.001593
Residual  1693.120750  78.0       NaN       NaN  0.998407
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  7.730837e-28   1.0  3.069804e-29     1.0  3.935646e-31
Residual  1.964312e+03  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  8.346148e-27   1.0  6.151592e-28     1.0  7.886656e-30
Residual  1.058262e+03  78.0           NaN     NaN  1.000000e+00
==================== n_beams ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.4589, p-value = 0.5001
             sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    1.922   1.0  0.175366  0.676537  0.002243
Residual    854.876  78.0       NaN       NaN  0.997757
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.5288, p-value = 0.22
                 sum_sq    df         F   PR(>F)   eta_sq
C(n_beams)     0.153125   1.0  0.007044  0.93333  0.00009
Residual    1695.668750  78.0       NaN      NaN  0.99991
-------------------- Falcon --------------------
Levene’s Test: Statistic = 5.1812, p-value = 0.02558
Welch
    Source  ddof1      ddof2           F         p-unc       np2
0  n_beams      1  57.187375  128.696044  2.882433e-16  0.622634
-------------------- Mistral --------------------
Levene’s Test: Statistic = 1.3209, p-value = 0.2539
              sum_sq    df         F   PR(>F)    eta_sq
C(n_beams)    48.050   1.0  3.710013  0.05773  0.045405
Residual    1010.212  78.0       NaN      NaN  0.954595
==================== prompt ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 3.2325, p-value = 0.00247
Welch
   Source  ddof1      ddof2         F         p-unc       np2
0  prompt      9  27.576333  66.24044  2.570565e-16  0.790026
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.6068, p-value = 0.1302
               sum_sq    df         F        PR(>F)    eta_sq
C(prompt)  929.305625   9.0  9.429588  3.046495e-09  0.547997
Residual   766.516250  70.0       NaN           NaN  0.452003
-------------------- Falcon --------------------
Group '0' has only one unique value for 'equality', removing it from analysis.
Levene’s Test: Statistic = 1326923169309211855317001830400.0000, p-value = 0
Welch
   Source  ddof1      ddof2         F     p-unc     np2
0  prompt      8  26.115796  1.376481  0.252459  0.0946
-------------------- Mistral --------------------
Group '3' has only one unique value for 'equality', removing it from analysis.
Group '6' has only one unique value for 'equality', removing it from analysis.
Group '7' has only one unique value for 'equality', removing it from analysis.
Group '9' has only one unique value for 'equality', removing it from analysis.
Levene’s Test: Statistic = 1901228312334481903345498849280.0000, p-value = 0
Welch
   Source  ddof1      ddof2           F         p-unc       np2
0  prompt      5  19.232704  144.301056  1.468175e-14  0.778568
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: nation =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.3539, p-value = 0.5537
               sum_sq    df         F    PR(>F)    eta_sq
C(tmp)       0.378125   1.0  0.020755  0.885821  0.000266
Residual  1421.071750  78.0       NaN       NaN  0.999734
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.0280, p-value = 0.8675
               sum_sq    df         F    PR(>F)  eta_sq
C(tmp)       4.560125   1.0  0.242551  0.623753  0.0031
Residual  1466.453750  78.0       NaN       NaN  0.9969
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    2.073225e-28   1.0  3.091349e-29     1.0  3.963268e-31
Residual  5.231100e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    1.577722e-29   1.0  8.923714e-31     1.0  1.144066e-32
Residual  1.379048e+03  78.0           NaN     NaN  1.000000e+00
==================== top_k ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.2383, p-value = 0.6268
               sum_sq    df        F    PR(>F)    eta_sq
C(top_k)     0.990125   1.0  0.05437  0.816238  0.000697
Residual  1420.459750  78.0      NaN       NaN  0.999303
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.4726, p-value = 0.4938
               sum_sq    df         F    PR(>F)    eta_sq
C(top_k)     4.656125   1.0  0.247673  0.620118  0.003165
Residual  1466.357750  78.0       NaN       NaN  0.996835
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  5.918922e-28   1.0  8.825599e-29     1.0  1.131487e-30
Residual  5.231100e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  1.952677e-27   1.0  1.104449e-28     1.0  1.415960e-30
Residual  1.379048e+03  78.0           NaN     NaN  1.000000e+00
==================== n_beams ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.4879, p-value = 0.487
                 sum_sq    df         F   PR(>F)    eta_sq
C(n_beams)     0.276125   1.0  0.015155  0.90234  0.000194
Residual    1421.173750  78.0       NaN      NaN  0.999806
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.7971, p-value = 0.184
                 sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    62.128125   1.0  3.439593  0.067431  0.042235
Residual    1408.885750  78.0       NaN       NaN  0.957765
-------------------- Falcon --------------------
Levene’s Test: Statistic = 1.7798, p-value = 0.1861
            sum_sq    df          F        PR(>F)    eta_sq
C(n_beams)  211.25   1.0  52.836209  2.415547e-10  0.403835
Residual    311.86  78.0        NaN           NaN  0.596165
-------------------- Mistral --------------------
Levene’s Test: Statistic = 6.4391, p-value = 0.01316
Welch
    Source  ddof1      ddof2         F     p-unc       np2
0  n_beams      1  76.973552  1.445289  0.232971  0.018192
==================== prompt ====================
-------------------- Gemma --------------------
Group '6' has only one unique value for 'nation', removing it from analysis.
Levene’s Test: Statistic = 1.5741, p-value = 0.1507
                sum_sq    df          F        PR(>F)    eta_sq
C(prompt)  1235.417778   8.0  70.012971  2.054999e-28  0.898893
Residual    138.958750  63.0        NaN           NaN  0.101107
-------------------- Llama --------------------
Levene’s Test: Statistic = 2.1163, p-value = 0.03931
Welch
   Source  ddof1      ddof2         F    p-unc       np2
0  prompt      9  28.191967  5.778082  0.00015  0.496678
-------------------- Falcon --------------------
Group '7' has only one unique value for 'nation', removing it from analysis.
Levene’s Test: Statistic = 1976861496974665441774009319424.0000, p-value = 0
Welch
   Source  ddof1      ddof2        F         p-unc      np2
0  prompt      8  25.655894  24.2165  4.242525e-10  0.30901
-------------------- Mistral --------------------
Group '8' has only one unique value for 'nation', removing it from analysis.
Levene’s Test: Statistic = 783483997930123045373564944384.0000, p-value = 0
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      8  25.688647  75.716834  6.658616e-16  0.787019
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: liberty =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0278, p-value = 0.868
               sum_sq    df         F    PR(>F)    eta_sq
C(tmp)       0.300125   1.0  0.015925  0.899902  0.000204
Residual  1469.981750  78.0       NaN       NaN  0.999796
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.4049, p-value = 0.5264
               sum_sq    df         F    PR(>F)    eta_sq
C(tmp)       0.630125   1.0  0.043131  0.836019  0.000553
Residual  1139.533750  78.0       NaN       NaN  0.999447
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    1.193152e-28   1.0  3.331300e-29     1.0  4.270898e-31
Residual  2.793680e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    3.944305e-30   1.0  2.555637e-31     1.0  3.276458e-33
Residual  1.203832e+03  78.0           NaN     NaN  1.000000e+00
==================== top_k ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0077, p-value = 0.9301
               sum_sq    df         F   PR(>F)    eta_sq
C(top_k)     0.021125   1.0  0.001121  0.97338  0.000014
Residual  1470.260750  78.0       NaN      NaN  0.999986
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.2233, p-value = 0.6379
               sum_sq    df         F    PR(>F)    eta_sq
C(top_k)     0.780125   1.0  0.053406  0.817845  0.000684
Residual  1139.383750  78.0       NaN       NaN  0.999316
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  2.086537e-27   1.0  5.825646e-28     1.0  7.468776e-30
Residual  2.793680e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  6.793078e-27   1.0  4.401446e-28     1.0  5.642879e-30
Residual  1.203832e+03  78.0           NaN     NaN  1.000000e+00
==================== n_beams ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.3792, p-value = 0.5398
                 sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)     5.460125   1.0  0.290745  0.591279  0.003714
Residual    1464.821750  78.0       NaN       NaN  0.996286
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.4136, p-value = 0.522
                 sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    66.066125   1.0  4.797662  0.031485  0.057944
Residual    1074.097750  78.0       NaN       NaN  0.942056
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0431, p-value = 0.8361
             sum_sq    df          F        PR(>F)    eta_sq
C(n_beams)  100.352   1.0  43.724896  4.260992e-09  0.359211
Residual    179.016  78.0        NaN           NaN  0.640789
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.2038, p-value = 0.6529
              sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    15.488   1.0  1.016595  0.316446  0.012866
Residual    1188.344  78.0       NaN       NaN  0.987134
==================== prompt ====================
-------------------- Gemma --------------------
Group '2' has only one unique value for 'liberty', removing it from analysis.
Levene’s Test: Statistic = 1.0265, p-value = 0.4258
               sum_sq    df          F        PR(>F)   eta_sq
C(prompt)  393.198611   8.0  27.599648  7.771534e-18  0.77801
Residual   112.191250  63.0        NaN           NaN  0.22199
-------------------- Llama --------------------
Levene’s Test: Statistic = 2.5323, p-value = 0.01414
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      9  28.043858  27.877303  1.078209e-11  0.571166
-------------------- Falcon --------------------
Levene’s Test: Statistic = 854901217292806499294215405568.0000, p-value = 0
Welch
   Source  ddof1     ddof2         F     p-unc       np2
0  prompt      9  27.51081  1.123136  0.380132  0.019358
-------------------- Mistral --------------------
Group '1' has only one unique value for 'liberty', removing it from analysis.
Group '7' has only one unique value for 'liberty', removing it from analysis.
Group '8' has only one unique value for 'liberty', removing it from analysis.
Levene’s Test: Statistic = 279436078601090981130378674176.0000, p-value = 0
Welch
   Source  ddof1      ddof2          F         p-unc      np2
0  prompt      6  21.030938  122.94645  2.887789e-15  0.86452
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: tradition =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
==================== tmp ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0145, p-value = 0.9044
           sum_sq    df         F    PR(>F)    eta_sq
C(tmp)      0.050   1.0  0.004243  0.948234  0.000054
Residual  919.268  78.0       NaN       NaN  0.999946
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.4440, p-value = 0.5071
            sum_sq    df         F   PR(>F)    eta_sq
C(tmp)      3.2805   1.0  0.845582  0.36064  0.010725
Residual  302.6070  78.0       NaN      NaN  0.989275
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(tmp)    2.073225e-28   1.0  2.740279e-29     1.0  3.513179e-31
Residual  5.901280e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
           sum_sq    df    F  PR(>F)  eta_sq
C(tmp)      0.000   1.0  0.0     1.0     0.0
Residual  535.238  78.0  NaN     NaN     1.0
==================== top_k ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0678, p-value = 0.7953
           sum_sq    df         F    PR(>F)    eta_sq
C(top_k)    0.288   1.0  0.024443  0.876167  0.000313
Residual  919.030  78.0       NaN       NaN  0.999687
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.3099, p-value = 0.5793
            sum_sq    df         F   PR(>F)    eta_sq
C(top_k)    1.2005   1.0  0.307329  0.58091  0.003925
Residual  304.6870  78.0       NaN      NaN  0.996075
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  5.445605e-28   1.0  7.197713e-29     1.0  9.227838e-31
Residual  5.901280e+02  78.0           NaN     NaN  1.000000e+00
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.0000, p-value = 1
                sum_sq    df             F  PR(>F)        eta_sq
C(top_k)  1.781100e-27   1.0  2.595589e-28     1.0  3.327679e-30
Residual  5.352380e+02  78.0           NaN     NaN  1.000000e+00
==================== n_beams ====================
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.6163, p-value = 0.4348
             sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    4.418   1.0  0.376658  0.541183  0.004806
Residual    914.900  78.0       NaN       NaN  0.995194
-------------------- Llama --------------------
Levene’s Test: Statistic = 0.6080, p-value = 0.4379
              sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)    9.8000   1.0  2.581669  0.112149  0.032038
Residual    296.0875  78.0       NaN       NaN  0.967962
-------------------- Falcon --------------------
Levene’s Test: Statistic = 26.0703, p-value = 2.27e-06
Welch
    Source  ddof1      ddof2          F         p-unc       np2
0  n_beams      1  56.188219  63.571478  8.370965e-11  0.449042
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.4863, p-value = 0.4877
             sum_sq    df          F    PR(>F)    eta_sq
C(n_beams)   74.498   1.0  12.611981  0.000654  0.139187
Residual    460.740  78.0        NaN       NaN  0.860813
==================== prompt ====================
-------------------- Gemma --------------------
Group '9' has only one unique value for 'tradition', removing it from analysis.
Levene’s Test: Statistic = 4.9470, p-value = 8.955e-05
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      8  25.384629  69.667921  2.480302e-15  0.874663
-------------------- Llama --------------------
Levene’s Test: Statistic = 1.4926, p-value = 0.1678
             sum_sq    df         F        PR(>F)    eta_sq
C(prompt)  143.1300   9.0  6.839828  5.453170e-07  0.467917
Residual   162.7575  70.0       NaN           NaN  0.532083
-------------------- Falcon --------------------
Group '3' has only one unique value for 'tradition', removing it from analysis.
Levene’s Test: Statistic = 3228864535106331453396671791104.0000, p-value = 0
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      8  25.743495  13.088397  2.523812e-07  0.139313
-------------------- Mistral --------------------
Levene’s Test: Statistic = 384034193352036085841128325120.0000, p-value = 0
Welch
   Source  ddof1     ddof2           F         p-unc       np2
0  prompt      9  27.73492  199.909491  8.218998e-23  0.816568
"""
## ^^ the raw text block containing the results of the ANOVA tests from the previous step.

from pydoc import text
import pandas as pd
import re
import numpy as np
import itertools


def _to_float(token):
    """Convert one table cell to float; anything unparseable becomes NaN."""
    try:
        return float(token)
    except (TypeError, ValueError):
        return np.nan


def _parse_welch_table(content):
    """Read (F, p-unc) from the last row of a Welch table whose trailing columns are F, p-unc, np2."""
    lines = [line for line in content.strip().splitlines() if line.strip()]
    cells = lines[-1].split() if lines else []
    if len(cells) < 3:
        return np.nan, np.nan
    return _to_float(cells[-3]), _to_float(cells[-2])


def _parse_ols_table(content):
    """Read (F, p) from an inline 'F = .., p-value = ..' line or from the first 'C(...)' table row."""
    inline = re.search(r'F\s*=\s*([0-9.]+)[,\s]+p-value\s*=\s*([0-9.]+)', content)
    if inline:
        return _to_float(inline.group(1)), _to_float(inline.group(2))
    # Row layout: C(<factor>)  sum_sq  df  F  PR(>F)  eta_sq
    row = re.search(r'^\s*C\([^)]*\)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)', content, re.MULTILINE)
    if row:
        return _to_float(row.group(3)), _to_float(row.group(4))
    return np.nan, np.nan


def parse_results(text_block):
    """
    Extract the F statistic and p-value of every model from one printed ANOVA block.

    The block is split on '==== <param> ====' headers and each part on '---- <model> ----' headers.
    A model's text is read as a Welch table when it contains the word 'welch' (case-insensitive),
    otherwise as an OLS ANOVA table. Values that cannot be read are reported as NaN; they never
    carry over from the previously parsed model.

    :param text_block: printed output covering one dependent variable
    :return: dict ``{param: {model: (F, p)}}``
    """
    results = {}
    sections = re.split(r"=+ ([a-z_]+) =+", text_block)
    # re.split with one capture group yields [prefix, name1, body1, name2, body2, ...]
    for param, section in zip(sections[1::2], sections[2::2]):
        models_data = re.split(r"-+ ([A-Za-z0-9]+) -+", section)
        results[param] = {}
        for model, content in zip(models_data[1::2], models_data[2::2]):
            if "welch" in content.lower():
                F, p = _parse_welch_table(content)
            else:
                F, p = _parse_ols_table(content)
            results[param][model] = (F, p)
            print(f"Model: {model}, Param: {param}, F-stat: {F}, p-value: {p}")
    return results

def get_block(_text):
    """
    Split the pasted output into one text block per '=: <name> =:' header.

    :param _text: the full printed output
    :return: dict mapping each header word to the text that follows it, up to the next header
    (or the end of the text); a repeated header keeps the last block
    """
    pattern = r"(=:+\s*(\w+)\s*=:+)(.*?)(?==:+\s*\w+\s*=:+|\Z)"  # updated
    # str.strip treats its argument as a set of characters: leading/trailing '=' and ':' are removed.
    marker_chars = "=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:"
    return {
        found.group(2).strip(): found.group(3).strip().strip(marker_chars)
        for found in re.finditer(pattern, _text, flags=re.DOTALL)
    }

# Flatten the parsed results into one record per (output variable, decoding param, model).
# NOTE(review): each record only carries the F/p columns of its own output variable, so the
# resulting frame is NaN everywhere else - confirm whether one merged row per (param, model) was intended.
rows = []
blocks = get_block(raw_text)
for block_name, block_text in blocks.items():
    results = parse_results(block_text)
    params = results.keys()
    for param in params:
        per_model = results[param]
        for i, model in enumerate(models):
            fx, px = per_model.get(model, (np.nan, np.nan))
            rows.append({
                # the param label is written on the first model's row only
                'Decoding Param': '' if i else param,
                'Model': model,
                f'F-stat {block_name}': fx,
                f'p-value {block_name}': px,
            })
print(len(rows))
# NOTE(review): this rebinds `df`, the name an earlier cell uses for the raw sheet data;
# consider a distinct name so earlier cells can be re-run safely.
df = pd.DataFrame(rows)


def convert_csv_to_scientific_notation(_df):
    """
    Rewrite every float64/float32/int64/int32 column as strings in ``.2e`` scientific notation.

    The frame is modified in place and also returned; other columns are left untouched.

    :param _df: table to format
    :return: the same DataFrame object
    """
    numeric_dtypes = ['float64', 'float32', 'int64', 'int32']
    to_scientific = "{:.2e}".format
    numeric_cols = [name for name in _df.columns if _df[name].dtype in numeric_dtypes]
    for name in numeric_cols:
        _df[name] = _df[name].apply(to_scientific)
    return _df


# df = sort_models(df, models)
# Print the table while its values are still numeric; the conversion below turns the numeric
# columns into formatted strings before the CSV is written.
print(df)
df = convert_csv_to_scientific_notation(df)
df.to_csv("one_way_anova_results_base_8values.csv", index=False)

Model: Gemma, Param: tmp, F-stat: 0.153425, p-value: 0.696352
Model: Llama, Param: tmp, F-stat: 0.619664, p-value: 0.433557
Model: Falcon, Param: tmp, F-stat: 3.171609e-30, p-value: 1.0
Model: Mistral, Param: tmp, F-stat: 5.887038e-30, p-value: 1.0
Model: Gemma, Param: top_k, F-stat: 0.421765, p-value: 0.517966
Model: Llama, Param: top_k, F-stat: 0.124438, p-value: 0.725222
Model: Falcon, Param: top_k, F-stat: 3.069804e-29, p-value: 1.0
Model: Mistral, Param: top_k, F-stat: 6.151592e-28, p-value: 1.0
Model: Gemma, Param: n_beams, F-stat: 0.175366, p-value: 0.676537
Model: Llama, Param: n_beams, F-stat: 0.007044, p-value: 0.93333
Model: Falcon, Param: n_beams, F-stat: 128.696044, p-value: 2.882433e-16
Model: Mistral, Param: n_beams, F-stat: 3.710013, p-value: 0.05773
Model: Gemma, Param: prompt, F-stat: 66.24044, p-value: 2.570565e-16
Model: Llama, Param: prompt, F-stat: 9.429588, p-value: 3.046495e-09
Model: Falcon, Param: prompt, F-stat: 1.376481, p-value: 0.252459
Model: Mistral, Par

In [None]:
# we don't _need_ to see the interaction effect of variables, but just to be sure
# Multi-way ANOVA on prompt and top_k for every (output variable, model) pair.
interaction_factors = ["prompt", "top_k"]
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    for model in models:
        print(f"{'-' * 20} {model} {'-' * 20}")

        multi_way_result = multi_way_anova(
            _filter_df=model_split_df[model],
            independent_vars=interaction_factors,
            dependent_var=dependent_var,
            collapse=False,
        )
        print(multi_way_result)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- gemma --------------------
df size: (80, 8)
Levene’s Test: Statistic = 4.1442, p-value = 1.252e-05
Robust ANOVA
                      Term   Df  Df.res    Sum Sq  Sum Sq.res    F value  \
prompt              prompt  9.0    60.0  37749.75      4764.0  52.826406   
top_k                top_k  1.0    60.0   1620.00     38337.5   2.535377   
prompt:top_k  prompt:top_k  9.0    60.0   4707.25     34108.0   0.920068   

                    Pr(>F)  
prompt        3.329243e-25  
top_k         1.165756e-01  
prompt:top_k  5.143277e-01  
-------------------- Llama3 --------------------
df size: (80, 8)
Levene’s Test: Statistic = 3.7918, p-value = 4.041e-05
Robust ANOVA
                      Term   Df  Df.res     Sum Sq  Sum Sq.res   F value  \
prompt              prompt  9.0    60.0  5300.4375   37035.625  0.954115   
top_k                top_k  1.0    60.0     4.0500   41817.625  0.005