In [79]:
import pandas as pd
from typing import List, Optional
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import levene
from pingouin import welch_anova
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
pandas2ri.activate()

ARTool = importr('ARTool')
base = importr('base')
stats = importr('stats')

In [80]:
def read_gsheet(url: Optional[str]=None, sheet_id: Optional[str]=None, gid: Optional[str]=None):
    if url is not None:
        match = re.search(r"spreadsheets/d/([^/]+)/.*?[?&]gid=(\d+)", url)
        if match:
            sheet_id = match.group(1)
            gid = match.group(2)
        else:
            print("can't parse url to get sheet id and gid")
    else:
        assert sheet_id is not None and gid is not None, "Sheet id an gid must be not None when url is not None"
    _url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}"
    return pd.read_csv(_url)

In [116]:
def multi_way_anova(_filter_df: pd.DataFrame, independent_vars: Optional[List[str]] = ["fine_tune_dataset"],
                    dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do one way anova on the provided dataframe.
    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_vars: contributing factors
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return:
    """
    if collapse:
        mapping = {'Base': 'Base'}
        _filter_df.loc[:, 'fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')
    # levene
    grouped_data =  _filter_df.groupby(independent_vars)[dependent_var].apply(list)
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05: # pval must be < 0.05 for us to reject the H_0 at Levenes.
        print("Robust ANOVA")
        r_df = pandas2ri.py2rpy(_filter_df)
        r.assign("rdf", r_df)

        for var in independent_vars:
            r(f"rdf${var} <- as.factor(rdf${var})")
        formula_str = f"{dependent_var} ~ {' * '.join(independent_vars)}"
        r(f'''
            library(ARTool)
            model <- art({formula_str}, data = rdf)
            art_result <- anova(model, type=2)
        ''')

        art_result = r('art_result')
        art_df = pandas2ri.rpy2py(art_result)
        return art_df
    formula = f"{dependent_var} ~" + ' + '.join([f'C({col})' for col in independent_vars])
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [105]:
def one_way_anova(_filter_df: pd.DataFrame, independent_var: Optional[str] = "fine_tune_dataset",
                  dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do one way anova on the provided dataframe.
    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_var: contributing factor
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for this factor -- whether it is finetuned or not
    :return:
    """
    if collapse:
        mapping = {'Base': 'Base'}
        _filter_df.loc[:, 'fine_tune_dataset'] = _filter_df['fine_tune_dataset'].map(mapping).fillna('fine_tuned')
    # levene
    grouped_data = [group[dependent_var].values for name, group in _filter_df.groupby(independent_var)]
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05:
        print("Welch")
        assert independent_var is not None and dependent_var is not None
        welch_results = welch_anova(dv=dependent_var, between=independent_var, data=_filter_df)
        return welch_results
    formula=f"{dependent_var} ~ C({independent_var})"
    _model = ols(formula, data=_filter_df).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [127]:
url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
df = read_gsheet(url=url)
print(len(df))
df['prompt'] = df['prompt'].astype(str)

3660


### base data analysis

1. one way anova.
2. multi way anova.

In [91]:
## base data analysis

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

for model in set(df['model']):
    _df = df[(df['model']== model) & (df['fine_tune_dataset'] == "Base")]
    _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)
    print(model, len(_df))
    model_split_df[model] = _df

Falcon 170
Gemma 178
Mistral 177
Phi 119
Llama3 176


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)


In [92]:
print(model_split_df["Gemma"])

      model fine_tune_dataset  n_beams  tmp  top_k  n_beam_groups prompt  \
78    Gemma              Base        5  1.5    100              1      5   
79    Gemma              Base        5  1.5     50              1      0   
80    Gemma              Base        5  1.5     50              1      1   
81    Gemma              Base        5  1.5    100              1      4   
82    Gemma              Base        5  1.5    100              1      6   
...     ...               ...      ...  ...    ...            ...    ...   
3548  Gemma              Base        4  1.0    100              2      5   
3549  Gemma              Base        4  1.0     50              2      9   
3550  Gemma              Base        4  1.0     50              2      8   
3551  Gemma              Base        4  1.5    100              2      8   
3552  Gemma              Base        4  1.5    100              2      9   

      output_y  output_x  
78       -2.75     -6.36  
79       -1.38     -3.74  
80    

In [76]:
# for the generation parameters, there's no reason to expect interactions, and we also have less data. proceed with one way anova.
gen_factors = ["tmp", "top_k", "n_beams", "n_beam_groups"]
models = ["Gemma", "Llama3", "Falcon", "Mistral", "Phi"]
dependent_vars = ["output_x", "output_y"]
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for gen_factor in gen_factors:
        print("="*20+f" {gen_factor} "+"="*20)
        for model in models:
            #print(model_split_df[model])
            print("-"*20+f" {model} "+"-"*20)
            print(one_way_anova(_filter_df=model_split_df[model], independent_var = gen_factor,
                    dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 0.0006, p-value = 0.9994
              sum_sq     df         F    PR(>F)    eta_sq
C(tmp)      0.320376    2.0  0.147899  0.862626  0.001687
Residual  189.540734  175.0       NaN       NaN  0.998313
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 0.7200, p-value = 0.4882
              sum_sq     df         F    PR(>F)    eta_sq
C(tmp)      0.976332    2.0  0.670863  0.512592  0.007696
Residual  125.886641  173.0       NaN       NaN  0.992304
-------------------- Falcon --------------------
Levene’s Test: Statistic = 0.1487, p-value = 0.8619
             sum_sq     df         F    PR(>F)    eta_sq
C(tmp)     0.106443    2.0  0.501175  0.606727  0.005966
Residual  17.734287  167.0       NaN       NaN  0.994034
-------------------- Mistral --------------------
Levene’s Test: Statistic = 0.1814, p-v

In [72]:
print(multi_way_anova(_filter_df=model_split_df["Falcon"], independent_vars = ["n_beams", "tmp", "top_k", "n_beam_groups", "prompt"],
                    dependent_var = "output_x", collapse=False))

Levene’s Test: Statistic = nan, p-value = nan
                    sum_sq     df          F        PR(>F)    eta_sq
C(n_beams)        3.966579    2.0  53.383654  2.302834e-18  0.231030
C(tmp)            0.091553    2.0   1.232148  2.945060e-01  0.005332
C(top_k)          0.002111    1.0   0.056832  8.118897e-01  0.000123
C(n_beam_groups)  1.055191    1.0  28.402284  3.419472e-07  0.061459
C(prompt)         6.295194    9.0  18.827333  5.727170e-21  0.366658
Residual          5.758502  155.0        NaN           NaN  0.335399


  W = numer / denom


In [46]:
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(one_way_anova(_filter_df=model_split_df[model], independent_var = "prompt", dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 4.8471, p-value = 2.478e-06
Welch
   Source  ddof1       ddof2         F         p-unc       np2
0  prompt      9  328.891977  41.94176  1.493547e-49  0.326098
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 2.8477, p-value = 0.002618
Welch
   Source  ddof1       ddof2           F          p-unc       np2
0  prompt      9  328.069165  207.710946  9.498003e-130  0.692112
-------------------- Falcon --------------------
Levene’s Test: Statistic = 8.0638, p-value = 2.012e-11
Welch
   Source  ddof1       ddof2          F         p-unc       np2
0  prompt      9  280.671324  17.113354  1.798242e-22  0.132591
-------------------- Mistral --------------------
Levene’s Test: Statistic = 12.1912, p-value = 3.351e-18
Welch
   Source  ddof1       ddof2          F         p-unc       np2
0  prompt      9  3

In [119]:
# we don't _need_ to see the interaction effect of variables, but just to be sure
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(multi_way_anova(_filter_df=model_split_df[model], independent_vars= ["prompt", "top_k"], dependent_var = dependent_var, collapse=False))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 1.2025, p-value = 0.2619
               sum_sq     df           F         PR(>F)    eta_sq
C(prompt)  181.915291    9.0  424.950408  3.450407e-110  0.958149
C(top_k)     0.002447    1.0    0.051449   8.208377e-01  0.000013
Residual     7.943372  167.0         NaN            NaN  0.041838
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 2.0025, p-value = 0.01102
Robust ANOVA
                      Term   Df  Df.res         Sum Sq     Sum Sq.res  \
prompt              prompt  9.0   156.0  322547.998013  131310.194444   
top_k                top_k  1.0   156.0     221.064670  448423.402778   
prompt:top_k  prompt:top_k  9.0   156.0   17054.567899  431706.361111   

                F value        Pr(>F)  
prompt        42.577288  1.191321e-37  
top_k          0.076905  7.819032e-01  
prompt:top_k   0.

## Fine-tuning data analysis

In [146]:
url = "https://docs.google.com/spreadsheets/d/14dHuq-Z52B-RYvYX0IDFgC9ISXDDr9ID/edit?gid=801390513#gid=801390513"
df = read_gsheet(url=url)
print(len(df))
df['prompt'] = df['prompt'].astype(str)

3660


In [147]:
## fine tuning data analysis

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

for model in set(df['model']):
    _df = df[(df['model']== model)]
    _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)
    print(model, len(_df))
    model_split_df[model] = _df

Falcon 707
Gemma 818
Mistral 817
Phi 502
Llama3 816


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df.rename(columns={output_x: "output_x", output_y: "output_y"}, inplace=True)


In [126]:
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(multi_way_anova(_filter_df=model_split_df[model], independent_vars= ["prompt", "fine_tune_dataset"], dependent_var = dependent_var, collapse=True))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 18.2172, p-value = 1.784e-50
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   798.0  1.412035e+07   
fine_tune_dataset                fine_tune_dataset  1.0   798.0  3.418362e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   798.0  2.293793e+06   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    3.123087e+07  40.088684  4.377073e-59  
fine_tune_dataset         4.190622e+07  65.094223  2.615594e-15  
prompt:fine_tune_dataset  4.313508e+07   4.715026  4.011799e-06  
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 2.9251, p-value = 2.982e-05
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
promp

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison



                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   687.0  4.036177e+06   
fine_tune_dataset                fine_tune_dataset  1.0   687.0  7.734288e+03   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   687.0  3.594816e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.466377e+07  12.491798  1.802639e-18  
fine_tune_dataset         2.879735e+07   0.184512  6.676588e-01  
prompt:fine_tune_dataset  2.819385e+07   0.973277  4.608198e-01  
-------------------- Mistral --------------------
Levene’s Test: Statistic = 9.2211, p-value = 2.505e-24
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   797.0  2.481027e+07   
fine_tune_dataset                fine_tune_dataset  1.0   797.0  6.058998e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison



                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   687.0  3.982931e+06   
fine_tune_dataset                fine_tune_dataset  1.0   687.0  4.333371e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   687.0  4.489030e+05   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    2.536527e+07  11.986090  1.134206e-17  
fine_tune_dataset         2.888431e+07  10.306725  1.387194e-03  
prompt:fine_tune_dataset  2.888350e+07   1.186361  3.005397e-01  
-------------------- Mistral --------------------
Levene’s Test: Statistic = 9.4034, p-value = 6.946e-25
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   797.0  1.380868e+07   
fine_tune_dataset                fine_tune_dataset  1.0   797.0  5.011084e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset

In [142]:
for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(multi_way_anova(_filter_df=model_split_df[model], independent_vars= ["prompt", "fine_tune_dataset"], dependent_var = dependent_var, collapse=True))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Levene’s Test: Statistic = 18.2172, p-value = 1.784e-50
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   798.0  1.412035e+07   
fine_tune_dataset                fine_tune_dataset  1.0   798.0  3.418362e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   798.0  2.293793e+06   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    3.123087e+07  40.088684  4.377073e-59  
fine_tune_dataset         4.190622e+07  65.094223  2.615594e-15  
prompt:fine_tune_dataset  4.313508e+07   4.715026  4.011799e-06  
-------------------- Llama3 --------------------
Levene’s Test: Statistic = 2.9251, p-value = 2.982e-05
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
promp

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison



                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   797.0  2.481027e+07   
fine_tune_dataset                fine_tune_dataset  1.0   797.0  6.058998e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   797.0  3.260060e+06   

                            Sum Sq.res     F value         Pr(>F)  
prompt                    2.037709e+07  107.821427  2.018801e-131  
fine_tune_dataset         3.673257e+07  131.464281   2.812075e-28  
prompt:fine_tune_dataset  3.921975e+07    7.360997   2.342795e-10  
-------------------- Phi --------------------
Levene’s Test: Statistic = 12.5185, p-value = 4.51e-29
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   484.0  2.689663e+06   
fine_tune_dataset                fine_tune_dataset  1.0   484.0  9.468081e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dat

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison



                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   797.0  1.380868e+07   
fine_tune_dataset                fine_tune_dataset  1.0   797.0  5.011084e+05   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  9.0   797.0  1.655553e+06   

                            Sum Sq.res    F value        Pr(>F)  
prompt                    3.084562e+07  39.643729  1.745995e-58  
fine_tune_dataset         4.285659e+07   9.319066  2.343155e-03  
prompt:fine_tune_dataset  4.066695e+07   3.605100  2.037865e-04  
-------------------- Phi --------------------
Levene’s Test: Statistic = 12.1271, p-value = 3.902e-28
Robust ANOVA
                                              Term   Df  Df.res        Sum Sq  \
prompt                                      prompt  9.0   484.0  4.361576e+06   
fine_tune_dataset                fine_tune_dataset  1.0   484.0  1.283695e+06   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  7

In [153]:
from scipy.stats import ttest_ind
def t_test(_df: pd.DataFrame, _dependent_var: str):
    _df['is_finetuned'] = df['fine_tune_dataset'].apply(lambda x: 'Base' not in x)
    base_dep_var = _df[_df['is_finetuned'] == False][_dependent_var]
    finetuned_dep_var = _df[_df['is_finetuned'] == True][_dependent_var]
    t_stat, p_value = ttest_ind(base_dep_var, finetuned_dep_var, equal_var=False)
    return (t_stat, p_value)    

for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(t_test(_df=model_split_df[model], _dependent_var = dependent_var))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
(np.float64(-7.961883482265759), np.float64(1.6504537420848558e-14))
-------------------- Llama3 --------------------
(np.float64(-2.8137255152509466), np.float64(0.0052731254773178676))
-------------------- Falcon --------------------
(np.float64(4.662984095184156), np.float64(3.7362266578756533e-06))
-------------------- Mistral --------------------
(np.float64(-5.986063349178471), np.float64(6.245681125240809e-09))
-------------------- Phi --------------------
(np.float64(8.742483909167175), np.float64(3.4774373386495e-17))
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
(np.float64(8.653359009804992), np.float64(8.279097832307584e-17))
-------------------- Llama3 --------------------
(np.float64(-2.4369787484676895), np.float64(0.015301078978165008))
-----

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['is_finetuned'] = df['fine_tune_dataset'].apply(lambda x: 'Base' not in x)


In [170]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
dataset_category = {"target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-convo'}, "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}}
# we know prompt and fine-tuning is important
def tukey_test(_df, _dependent_var='output_x'):
    _df = _df[_df['fine_tune_dataset'] != 'Base']
    #print(len(_df), list(_df), set(_df['fine_tune_dataset']))
    _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
    #model = ols(f'{_dependent_var} ~ C(prompt) + C(fine_tune_dataset) + C(prompt):C(fine_tune_dataset)', data=_df).fit()
    #anova_table = sm.stats.anova_lm(model, typ=2)
    tukey = pairwise_tukeyhsd(endog=_df[_dependent_var],
                              groups=_df["dataset_group"],
                              alpha=0.05)
    return tukey

for dependent_var in dependent_vars:
    print("=:"*20+f" {dependent_var} "+"=:"*20)
    for model in models:
        print("-"*20+f" {model} "+"-"*20)
        print(tukey_test(_df=model_split_df[model], _dependent_var = dependent_var))

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj lower  upper  reject
--------------------------------------------------
control target   1.1011   0.0 0.8799 1.3223   True
--------------------------------------------------
-------------------- Llama3 --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target   0.0681 0.2631 -0.0513 0.1874  False
----------------------------------------------------
-------------------- Falcon --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target   0.0232 0.7698 -0.1324 0.1788  False
------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target  -0.1459 0.0955 -0.3175 0.0257  False
----------------------------------------------------
-------------------- Phi --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target  -0.0647 0.2759 -0.1812 0.0519  False
----------------------------------------------------
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj  lower   upper  reject
----------------------------------------------------
control target  -1.0043   0.0 -1.2275 -0.7812   True
----------------------------------------------------
--------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj  lower  upper  reject
---------------------------------------------------
control target   0.2443 0.0053 0.0728 0.4158   True
---------------------------------------------------
-------------------- Mistral --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1 group2 meandiff p-adj   lower  upper  reject
----------------------------------------------------
control target   0.2454 0.0681 -0.0183 0.5092  False
----------------------------------------------------
-------------------- Phi --------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj  lower  upper reject
--------------------------------------------------
control target    0.344 0.0036 0.1131 0.575   True
--------------------------------------------------


In [174]:
import pandas as pd
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm

dataset_category = {
    "target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-convo'},
    "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}
}

def games_howell_test(_df, _dependent_var='output_x'):
    _df = _df[_df['fine_tune_dataset'] != 'Base'].copy()
    _df['dataset_group'] = _df['fine_tune_dataset'].apply(lambda x: 'target' if x in dataset_category["target"] else "control")
    # model = ols(f'{_dependent_var} ~ C(prompt) + C(fine_tune_dataset) + C(prompt):C(fine_tune_dataset)', data=_df).fit()
    # anova_table = sm.stats.anova_lm(model, typ=2)
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=_df)
    return gh_result

for dependent_var in dependent_vars:
    print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
    for model in models:
        print("-" * 20 + f" {model} " + "-" * 20)
        result = games_howell_test(_df=model_split_df[model], _dependent_var=dependent_var)
        #print(result)
        print(result[['diff', 'pval']])

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
       diff  pval
0 -1.101125   0.0
-------------------- Llama3 --------------------
       diff      pval
0 -0.068063  0.263087
-------------------- Falcon --------------------
       diff      pval
0 -0.023195  0.772777
-------------------- Mistral --------------------
       diff     pval
0  0.145906  0.09557
-------------------- Phi --------------------
       diff      pval
0  0.064677  0.192566
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
-------------------- Gemma --------------------
       diff  pval
0  1.004344   0.0
-------------------- Llama3 --------------------
       diff      pval
0  0.090469  0.240688
-------------------- Falcon --------------------
       diff      pval
0 -0.244304  0.006388
-------------------- Mistral --------------------
       diff      pval
0 -0.245437  0.068133
--