In [6]:
import pandas as pd
from typing import List, Optional
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import levene
from pingouin import welch_anova
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
pandas2ri.activate()

pd.options.mode.chained_assignment = None  # default='warn'

ARTool = importr('ARTool')
base = importr('base')
stats = importr('stats')

In [7]:
def read_gsheet(url: Optional[str]=None, sheet_id: Optional[str]=None, gid: Optional[str]=None):
    """
    Load one tab of a Google Sheet as a DataFrame through the sheet's CSV export URL.
    :param url: browser URL of the sheet. It must contain ``spreadsheets/d/<sheet id>/`` followed by
    ``gid=<digits>`` in either the query string (``?gid=``/``&gid=``) or the fragment (``#gid=``)
    :param sheet_id: spreadsheet id; used when ``url`` is None or cannot be parsed
    :param gid: tab id; used when ``url`` is None or cannot be parsed
    :return: whatever ``pd.read_csv`` returns for the export URL
    :raises ValueError: if ``url`` cannot be parsed and no complete ``sheet_id``/``gid`` pair was given
    :raises AssertionError: if ``url`` is None and ``sheet_id`` or ``gid`` is missing
    """
    if url is not None:
        # `#` is accepted as well so that links of the form `.../edit#gid=0` are recognised.
        match = re.search(r"spreadsheets/d/([^/]+)/.*?[?&#]gid=(\d+)", url)
        if match:
            sheet_id = match.group(1)
            gid = match.group(2)
        elif sheet_id is None or gid is None:
            # Fail here rather than requesting an export URL that contains the text "None".
            raise ValueError(f"can't parse url to get sheet id and gid: {url!r}")
        else:
            # Keep the previous best-effort behaviour: fall back to the explicitly supplied ids.
            print("can't parse url to get sheet id and gid")
    else:
        assert sheet_id is not None and gid is not None, "sheet_id and gid must not be None when url is None"
    _url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}"
    return pd.read_csv(_url)

In [33]:
def multi_way_anova(_filter_df: pd.DataFrame, independent_vars: Optional[List[str]] = ["fine_tune_dataset"],
                    dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do multi way anova on the provided dataframe.

    Levene's test (centred on the mean) is run on the cells formed by all factor combinations.
    If it rejects equal variances (p < 0.05) an aligned-rank-transform ANOVA is computed in R
    through ARTool; otherwise a type-2 OLS ANOVA is computed with statsmodels.

    The input dataframe is never modified: collapsing is applied to a new frame.

    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_vars: contributing factors (column names in the df)
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for the ``fine_tune_dataset`` factor -- 'Base' or 'fine_tuned'
    :return: the ARTool ANOVA table converted to pandas when variances are unequal, otherwise the
    statsmodels ANOVA table with an extra ``eta_sq`` column (sum_sq / total sum_sq)
    """
    # Private list so the shared default argument can never be altered.
    factors = list(independent_vars)
    data = _filter_df
    if collapse:
        # `assign` returns a new frame, so the caller keeps the original dataset names.
        collapsed = _filter_df['fine_tune_dataset'].map({'Base': 'Base'}).fillna('fine_tuned')
        data = _filter_df.assign(fine_tune_dataset=collapsed)

    # levene: one sample per observed combination of factor levels
    grouped_data = data.groupby(factors)[dependent_var].apply(list)
    stat, pval = levene(*grouped_data, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05: # pval must be < 0.05 for us to reject the H_0 at Levenes.
        print("Robust ANOVA")
        r_df = pandas2ri.py2rpy(data)
        r.assign("rdf", r_df)

        # every factor is converted to an R factor before the model is fitted
        for var in factors:
            r(f"rdf${var} <- as.factor(rdf${var})")
        # full-factorial formula (main effects and all interactions)
        formula_str = f"{dependent_var} ~ {' * '.join(factors)}"
        r(f'''
            library(ARTool)
            model <- art({formula_str}, data = rdf)
            art_result <- anova(model, type=2)
        ''')

        art_result = r('art_result')
        art_df = pandas2ri.rpy2py(art_result)
        return art_df
    # NOTE(review): this branch fits main effects only (`+`), whereas the robust branch above
    # fits the full factorial (`*`) -- confirm the asymmetry is intended.
    formula = f"{dependent_var} ~" + ' + '.join([f'C({col})' for col in factors])
    _model = ols(formula, data=data).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


In [9]:
def one_way_anova(_filter_df: pd.DataFrame, independent_var: Optional[str] = "fine_tune_dataset",
                  dependent_var: Optional[str] = "output_x", collapse: bool=True):
    """
    do one way anova on the provided dataframe.

    Groups of ``independent_var`` whose ``dependent_var`` takes a single value are removed first
    (their variance is zero). Levene's test (centred on the mean) then decides between Welch's
    ANOVA (p < 0.05) and a type-2 OLS ANOVA.

    The input dataframe is never modified.

    :param _filter_df: output from a model
    :param dependent_var: predicted variable (column name in the df)
    :param independent_var: contributing factor (column name in the df)
    :param collapse: instead of saying which dataset it is finetuned on, we will just have 2 values
    for the ``fine_tune_dataset`` factor -- 'Base' or 'fine_tuned'. When False the column is left
    untouched.
    :return: pingouin's Welch ANOVA table, or the statsmodels ANOVA table with an extra ``eta_sq``
    column, or an empty DataFrame when fewer than two usable groups remain
    """
    assert independent_var is not None and dependent_var is not None

    # suppose the independent variable is prompt, then, it is possible that the dependent variable
    # has the same value for all the prompts. in that case, we filter out those rows as the
    # variance will be zero. Filtering always produces a new frame, never an in-place edit.
    data = _filter_df
    for group_name, group in _filter_df.groupby(independent_var):
        if group[dependent_var].nunique() <= 1:
            print(f"Group '{group_name}' has only one unique value for '{dependent_var}', removing it from analysis.")
            data = data[data[independent_var] != group_name]

    if collapse:
        # Honour the flag (it used to be ignored) and build a new frame instead of writing into the input.
        collapsed = data['fine_tune_dataset'].map({'Base': 'Base'}).fillna('fine_tuned')
        data = data.assign(fine_tune_dataset=collapsed)

    # levene: keep only groups with more than one observation and non-zero spread
    samples = [group[dependent_var].values for _, group in data.groupby(independent_var)]
    samples = [sample for sample in samples if len(sample) > 1 and len(set(sample)) > 1]
    if len(samples) < 2:
        # Levene's test (and any ANOVA) needs at least two groups; bail out before scipy raises.
        print(f"Fewer than two usable levels of {independent_var} found, returning empty DataFrame.")
        return pd.DataFrame(columns=['F', 'PR(>F)', 'sum_sq', 'df', 'mean_sq', 'eta_sq'])

    stat, pval = levene(*samples, center='mean')
    print(f"Levene’s Test: Statistic = {stat:.4f}, p-value = {pval:.4g}")
    if pval < 0.05:
        # unequal variances -> Welch's ANOVA
        print("Welch")
        welch_results = welch_anova(dv=dependent_var, between=independent_var, data=data)
        return welch_results
    formula=f"{dependent_var} ~ C({independent_var})"
    _model = ols(formula, data=data).fit()
    anova_table = sm.stats.anova_lm(_model, typ=2)
    anova_table['eta_sq'] = anova_table['sum_sq'] / anova_table['sum_sq'].sum()
    return anova_table


## Fine-tuning data analysis

### generalization studies

#### Q1: effect of model size
To test the effect of model size, we would ideally compare the base-model experiments between:
- llama-1b-quant and llama-8b-quant
- llama-1b-full and llama-8b-full

We don't have access to these.

Similarly, we can do the fine-tuning experiments with:
- llama-1b-quant-peft and llama-8b-quant-peft
- llama-1b-full-peft and llama-8b-full-peft

We only have llama1b-quant-peft so let's do the experiments with that.



In [49]:
url = "https://docs.google.com/spreadsheets/d/16M6E6upoQi3jjy54-qW3JxasDUs2MOrA/edit?gid=948710219#gid=948710219"
df = read_gsheet(url=url)
print(len(df))  # rows as downloaded
# Drop exact duplicate rows, remove every Phi row, and cast `prompt` to str.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame["model"] != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame["prompt"].astype(str))
)
print(len(df))  # rows after cleaning
print(set(df['fine_tune_dataset']))

966
966
{'Pol-Convo', 'Scisumm', 'Imdb', 'Newsarticles', 'Canadian-QA', 'Newsroom', 'Base', 'FineTome'}


In [54]:
model_split_df = {}

# Column headers used in the sheet for the two scores; they are renamed to the short names
# that the ANOVA helpers use as `dependent_var`.
output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"

df = df.rename(columns={output_x: "output_x", output_y: "output_y"})

# Base-model rows, split by the value of the `type` column.
is_base_quant = df['type'] == "base-quant"
is_base_full = df['type'] == "base-full"
llama_1b_base_quant = df[is_base_quant]
llama_1b_base_full = df[is_base_full]
print(len(llama_1b_base_full))
print(len(llama_1b_base_quant))

70
71


In [56]:
# One-way ANOVA of each score against each generation factor, on the base-full rows.
gen_factors = ["tmp", "top_k", "n_beams", "prompt"]
dependent_vars = ["output_x", "output_y"]
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    for gen_factor in gen_factors:
        print(f"{'=' * 20} {gen_factor} {'=' * 20}")
        anova_result = one_way_anova(
            _filter_df=llama_1b_base_full,
            independent_var=gen_factor,
            dependent_var=dependent_var,
            collapse=False,
        )
        print(anova_result)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Levene’s Test: Statistic = 0.2775, p-value = 0.6
             sum_sq    df         F    PR(>F)    eta_sq
C(tmp)     0.475200   1.0  0.389078  0.534872  0.005689
Residual  83.051887  68.0       NaN       NaN  0.994311
Levene’s Test: Statistic = 0.3417, p-value = 0.5608
             sum_sq    df        F    PR(>F)    eta_sq
C(top_k)   0.156516   1.0  0.12766  0.721977  0.001874
Residual  83.370571  68.0      NaN       NaN  0.998126
Levene’s Test: Statistic = 0.1986, p-value = 0.6573
               sum_sq    df         F    PR(>F)    eta_sq
C(n_beams)   0.001750   1.0  0.001425  0.970001  0.000021
Residual    83.525337  68.0       NaN       NaN  0.999979
Group '2' has only one unique value for 'output_x', removing it from analysis.
Levene’s Test: Statistic = 2.8840, p-value = 0.01237
Welch
   Source  ddof1      ddof2          F         p-unc       np2
0  prompt      7  22.715332  13.503933  9.179571

In [59]:
url = "https://docs.google.com/spreadsheets/d/16M6E6upoQi3jjy54-qW3JxasDUs2MOrA/edit?gid=948710219#gid=948710219"
df = read_gsheet(url=url)
print(len(df))  # rows as downloaded
# Drop exact duplicate rows, remove every Phi row, and cast `prompt` to str.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame["model"] != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame["prompt"].astype(str))
)
print(len(df))  # rows after cleaning
print(set(df['fine_tune_dataset']))

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"
score_renames = {output_x: "output_x", output_y: "output_y"}

# One frame per model, restricted to the "peft" and "base-full" run types.
# (For the quantized runs, select ["peft-quant", "base-quant"] instead.)
for model in set(df['model']):
    is_selected = (df['model'] == model) & (df['type'].isin(["peft", "base-full"]))
    _df = df[is_selected].rename(columns=score_renames)
    print(model, len(_df))
    model_split_df[model] = _df
dependent_vars = ["output_x", "output_y"]

966
966
{'Pol-Convo', 'Scisumm', 'Imdb', 'Newsarticles', 'Canadian-QA', 'Newsroom', 'Base', 'FineTome'}
Llama1B 471


### Independent t-test

Determine whether the fine-tuned and base versions differ on the PCT.

In [60]:
from scipy.stats import ttest_ind

def t_test(_df: pd.DataFrame, _dependent_var: str):
    """
    Welch's t-test (unequal variances) of base rows against fine-tuned rows.

    A row counts as base when its ``fine_tune_dataset`` value contains the substring 'Base';
    every other row counts as fine-tuned. The input dataframe is not modified.

    :param _df: dataframe with a ``fine_tune_dataset`` column and the ``_dependent_var`` column
    :param _dependent_var: name of the column holding the compared values
    :return: tuple ``(t_stat, p_value)``, with the base sample passed first to ``ttest_ind``
    """
    # Local boolean mask instead of a new `is_finetuned` column: the frame passed in is shared
    # (an entry of `model_split_df`), so writing into it would leak state into later cells.
    is_finetuned = _df['fine_tune_dataset'].apply(lambda name: 'Base' not in name).astype(bool)
    base_dep_var = _df.loc[~is_finetuned, _dependent_var]
    finetuned_dep_var = _df.loc[is_finetuned, _dependent_var]
    t_stat, p_value = ttest_ind(base_dep_var, finetuned_dep_var, equal_var=False)
    return (t_stat, p_value)

# Base vs. fine-tuned t-test for each score, on the Llama1B rows.
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    t_test_result = t_test(_df=model_split_df["Llama1B"], _dependent_var=dependent_var)
    print(t_test_result)

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
(np.float64(-32.74110826453176), np.float64(5.969774780743185e-57))
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
(np.float64(-8.385910965573048), np.float64(5.973144981266671e-13))


### multi way anova to understand the joint effect of prompts and fine-tune dataset

In [63]:
url = "https://docs.google.com/spreadsheets/d/16M6E6upoQi3jjy54-qW3JxasDUs2MOrA/edit?gid=948710219#gid=948710219"
df = read_gsheet(url=url)
print(len(df))  # rows as downloaded
# Drop exact duplicate rows, remove every Phi row, and cast `prompt` to str.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame["model"] != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame["prompt"].astype(str))
)
print(len(df))  # rows after cleaning
print(set(df['fine_tune_dataset']))

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"
score_renames = {output_x: "output_x", output_y: "output_y"}

# One frame per model, restricted to the "peft" and "base-full" run types.
# (For the quantized runs, select ["peft-quant", "base-quant"] instead.)
for model in set(df['model']):
    is_selected = (df['model'] == model) & (df['type'].isin(["peft", "base-full"]))
    _df = df[is_selected].rename(columns=score_renames)
    print(model, len(_df))
    model_split_df[model] = _df
dependent_vars = ["output_x", "output_y"]

966
966
{'Pol-Convo', 'Scisumm', 'Imdb', 'Newsarticles', 'Canadian-QA', 'Newsroom', 'Base', 'FineTome'}
Llama1B 471


In [64]:
# Joint effect of prompt and fine-tuning dataset on each score, for the Llama1B rows.
llama_1b_runs = model_split_df["Llama1B"]
print(len(llama_1b_runs), set(llama_1b_runs['fine_tune_dataset']))
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    anova_result = multi_way_anova(
        _filter_df=model_split_df["Llama1B"],
        independent_vars=["prompt", "fine_tune_dataset"],
        dependent_var=dependent_var,
        collapse=False,
    )
    print(anova_result)

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison

R[write to console]: Note: model has aliased coefficients
      sums of squares computed by model comparison



471 {'Pol-Convo', 'Scisumm', 'Imdb', 'Newsarticles', 'Canadian-QA', 'Newsroom', 'Base', 'FineTome'}
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Levene’s Test: Statistic = 4.3453, p-value = 8.22e-21
Robust ANOVA
                                              Term    Df  Df.res  \
prompt                                      prompt   9.0   402.0   
fine_tune_dataset                fine_tune_dataset   7.0   402.0   
prompt:fine_tune_dataset  prompt:fine_tune_dataset  52.0   402.0   

                                Sum Sq    Sum Sq.res     F value  \
prompt                    3.459763e+06  4.987889e+06   30.982261   
fine_tune_dataset         7.323279e+06  1.224297e+06  343.515838   
prompt:fine_tune_dataset  4.555424e+06  3.480247e+06   10.119091   

                                 Pr(>F)  
prompt                     4.718145e-41  
fine_tune_dataset         2.777726e-165  
prompt:fine_tune_dataset   7.912064e-47  
=:=:=:=:=:=:=:=:=:=:=:=:=:=:

### Does the type of fine-tuning dataset make a difference?

Check if the type of the fine-tuning dataset makes a difference. We need to re-download the data because we have converted all fine-tuning datasets to "fine_tuned" in the last step.

In [65]:
url = "https://docs.google.com/spreadsheets/d/16M6E6upoQi3jjy54-qW3JxasDUs2MOrA/edit?gid=948710219#gid=948710219"
df = read_gsheet(url=url)
print(len(df))  # rows as downloaded
# Drop exact duplicate rows, remove every Phi row, and cast `prompt` to str.
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame["model"] != "Phi"]  # filter out Phi
    .assign(prompt=lambda frame: frame["prompt"].astype(str))
)
print(len(df))  # rows after cleaning
print(set(df['fine_tune_dataset']))

model_split_df = {}

output_x = "Social Libertarian/Authoritarian"
output_y = "Economic Left/Right"
score_renames = {output_x: "output_x", output_y: "output_y"}

# One frame per model, restricted to the "peft" and "base-full" run types.
# (For the quantized runs, select ["peft-quant", "base-quant"] instead.)
for model in set(df['model']):
    is_selected = (df['model'] == model) & (df['type'].isin(["peft", "base-full"]))
    _df = df[is_selected].rename(columns=score_renames)
    print(model, len(_df))
    model_split_df[model] = _df
dependent_vars = ["output_x", "output_y"]
    
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Fine-tuning datasets split into a "target" group and a "control" group; the tests defined
# below compare the two groups after dropping the 'Base' rows.
# NOTE(review): "OpenR1" is listed as a control dataset but does not appear in the
# `fine_tune_dataset` values printed when the sheet is loaded -- confirm whether it is expected.
dataset_category = {"target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-Convo'}, "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}}
# Every fine-tuned row is included in the comparison (no per-task split here).
def tukey_test(_df, _dependent_var='output_x'):
    """
    Tukey HSD comparison of the 'target' and 'control' fine-tuning dataset groups.

    Rows whose ``fine_tune_dataset`` is 'Base' are excluded. A dataset is labelled 'target' when
    it is listed in ``dataset_category["target"]`` and 'control' otherwise.

    :param _df: dataframe with a ``fine_tune_dataset`` column and the ``_dependent_var`` column
    :param _dependent_var: name of the column holding the compared values
    :return: the result object of ``pairwise_tukeyhsd`` (alpha = 0.05)
    """
    target_names = dataset_category["target"]
    finetuned_rows = _df[_df['fine_tune_dataset'] != 'Base']
    group_labels = [
        'target' if name in target_names else "control"
        for name in finetuned_rows['fine_tune_dataset']
    ]
    labelled = finetuned_rows.assign(dataset_group=group_labels)
    return pairwise_tukeyhsd(
        endog=labelled[_dependent_var],
        groups=labelled["dataset_group"],
        alpha=0.05,
    )

from scipy.stats import ttest_ind

def t_test(_df, _dependent_var='output_x'):
    """
    Welch's t-test (unequal variances) of control-group rows against target-group rows.

    Rows whose ``fine_tune_dataset`` is 'Base' are excluded. A dataset is labelled 'target' when
    it is listed in ``dataset_category["target"]`` and 'control' otherwise.

    NOTE: this redefines the ``t_test`` from the earlier base-vs-fine-tuned cell, with a different
    meaning and return type; once this cell has run, the earlier definition is shadowed.

    :param _df: dataframe with a ``fine_tune_dataset`` column and the ``_dependent_var`` column
    :param _dependent_var: name of the column holding the compared values
    :return: dict with ``mean_diff`` (target mean minus control mean), ``t_stat`` and ``p_value``
    (control sample passed first to ``ttest_ind``), ``n_control`` and ``n_target``
    """
    target_names = dataset_category["target"]
    finetuned_rows = _df[_df['fine_tune_dataset'] != 'Base']
    group_labels = [
        'target' if name in target_names else "control"
        for name in finetuned_rows['fine_tune_dataset']
    ]
    labelled = finetuned_rows.assign(dataset_group=group_labels)

    values_by_group = {
        label: labelled.loc[labelled['dataset_group'] == label, _dependent_var]
        for label in ('control', 'target')
    }
    control_vals = values_by_group['control']
    target_vals = values_by_group['target']

    t_stat, p_val = ttest_ind(control_vals, target_vals, equal_var=False)

    summary = {}
    summary['mean_diff'] = target_vals.mean() - control_vals.mean()
    summary['t_stat'] = t_stat
    summary['p_value'] = p_val
    summary['n_control'] = len(control_vals)
    summary['n_target'] = len(target_vals)
    return summary

from pprint import pprint
# Target vs. control comparison for each score, on the Llama1B rows.
for dependent_var in dependent_vars:
    print(f"{'=:' * 20} {dependent_var} {'=:' * 20}")
    tukey_result = tukey_test(_df=model_split_df["Llama1B"], _dependent_var=dependent_var)
    print(tukey_result)
    # Alternative (disabled): pprint(t_test(_df=model_split_df[model], _dependent_var=dependent_var))

966
966
{'Pol-Convo', 'Scisumm', 'Imdb', 'Newsarticles', 'Canadian-QA', 'Newsroom', 'Base', 'FineTome'}
Llama1B 471
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj lower  upper  reject
--------------------------------------------------
control target   1.5478   0.0 1.3414 1.7542   True
--------------------------------------------------
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj lower  upper  reject
--------------------------------------------------
control target    0.554   0.0 0.4046 0.7033   True
--------------------------------------------------


In [66]:
import pandas as pd
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Fine-tuning datasets split into a "target" group and a "control" group.
# This repeats the `dataset_category` definition from the Tukey-test cell above.
dataset_category = {
    "target": {'Canadian-QA', 'Newsarticles', 'Newsroom', 'Pol-Convo'},
    "control": {"Imdb", "OpenR1", "Scisumm", "FineTome"}
}

# For each task, the single target dataset and the single control dataset it is paired with.
# Keys are the task names iterated over by the Games-Howell loop at the end of this cell.
task_dataset_category = {
    "classification": {
        "target": 'Newsarticles',
        "control": "Imdb"
    },
    "summarization": {
        "target": "Newsroom",
        "control": "Scisumm"
    },
    "conversational": {
        "target": "Pol-Convo",
        "control": "FineTome"
    },
    # NOTE(review): "OpenR1" does not appear in the `fine_tune_dataset` values printed when the
    # sheet is loaded, and the saved output reports only 2 datasets for 'qa', so this task is skipped.
    "qa": {
        "target": "Canadian-QA",
        "control": "OpenR1"
    }
}

def games_howell_test_base(_df, _dependent_var='output_x'):
    """
    Games-Howell pairwise comparison between the 'Base', 'target' and 'control' groups.

    Each row is labelled 'Base' when its ``fine_tune_dataset`` is 'Base', 'target' when the
    dataset is listed in ``dataset_category["target"]``, and 'control' otherwise. The labels are
    attached to a new frame; the input dataframe is not modified.

    :param _df: dataframe with a ``fine_tune_dataset`` column and the ``_dependent_var`` column
    :param _dependent_var: name of the column holding the compared values
    :return: the table returned by ``pg.pairwise_gameshowell``
    :raises AssertionError: if the three groups are not all present
    """
    target_names = dataset_category["target"]

    def _group_of(name):
        # 'Base' takes precedence; anything that is not a listed target counts as control.
        if name == 'Base':
            return 'Base'
        return 'target' if name in target_names else "control"

    # `assign` builds a new frame so the caller's (shared) frame gains no extra column.
    labelled = _df.assign(dataset_group=_df['fine_tune_dataset'].apply(_group_of))
    found_groups = set(labelled['dataset_group'])
    assert found_groups == {'target', 'control', 'Base'}, \
        f"Dataset groups must be exactly 'target', 'control' and 'Base', got {found_groups}"
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=labelled)
    return gh_result
    
def games_howell_test_task_base(_df, task, _dependent_var='output_x'):
    """
    Games-Howell pairwise comparison between 'Base', the task's target dataset and the task's
    control dataset.

    Only rows whose ``fine_tune_dataset`` is 'Base' or one of the two datasets registered for
    ``task`` in ``task_dataset_category`` are used. The input dataframe is not modified.

    :param _df: dataframe with a ``fine_tune_dataset`` column and the ``_dependent_var`` column
    :param task: key of ``task_dataset_category``
    :param _dependent_var: name of the column holding the compared values
    :return: the table returned by ``pg.pairwise_gameshowell``, or None when fewer than three of
    the expected datasets are present in ``_df``
    :raises KeyError: if ``task`` is not a key of ``task_dataset_category``
    """
    task_pair = task_dataset_category[task]
    task_datasets = set(task_pair.values()) | {'Base'}
    task_rows = _df[_df['fine_tune_dataset'].isin(task_datasets)]
    n_datasets = task_rows['fine_tune_dataset'].nunique()
    if n_datasets < 3:
        # The caller checks for None, so the message now says what is actually returned.
        print(f"Only {n_datasets} datasets found for task '{task}', returning None.")
        return None

    def _group_of(name):
        if name == 'Base':
            return 'Base'
        # The target is a single dataset name (a str): compare for equality. `in` on a str
        # would be a substring test and could match unrelated dataset names.
        return 'target' if name == task_pair["target"] else "control"

    labelled = task_rows.assign(dataset_group=task_rows['fine_tune_dataset'].apply(_group_of))
    found_groups = set(labelled['dataset_group'])
    assert found_groups == {'target', 'control', 'Base'}, f"Dataset groups must be either 'target' or 'control' or 'Base', got {found_groups}"
    gh_result = pg.pairwise_gameshowell(dv=_dependent_var, between="dataset_group", data=labelled)
    return gh_result


# dfs = []
# for dependent_var in dependent_vars:
#     print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
#     for model in models:
#         print("-" * 20 + f" {model} " + "-" * 20)
#         result = games_howell_test_base(_df=model_split_df[model], _dependent_var=dependent_var)
#         result['model'] = model
#         result['dependent_var'] = dependent_var
#         result['setup'] = [f"{x.lower()}-{y.lower()}" for x,y in zip(result['A'], result['B'])]
#         result['diff'] = [f"{float(x):.2e}" for x in result['diff']]
#         result['pval'] = [f"{float(x):.2e}" for x in result['pval']]
#         #print(result[['diff', 'pval']])
#         dfs.append(result[['model', 'setup', 'diff', 'pval', 'dependent_var']])

# pd.concat(dfs, axis=0).to_csv("diff_pval_summary_incl_base.csv", index=False)
# print("output written")

# Games-Howell comparison per score and per task on the Llama1B rows; the formatted
# differences and p-values of all tasks are collected and written to one CSV file.
dfs = []
for dependent_var in dependent_vars:
    print("=:" * 20 + f" {dependent_var} " + "=:" * 20)
    for task in task_dataset_category.keys():
        print(f"Task: {task}")
        result = games_howell_test_task_base(_df=model_split_df["Llama1B"], task=task, _dependent_var=dependent_var)
        if result is None:
            # task skipped: not all three datasets (base, target, control) are available
            continue
        # Build a new frame instead of editing the pingouin result in place.
        result = result.assign(
            model="Llama1B",
            task=task,
            dependent_var=dependent_var,
            setup=[f"{x.lower()}-{y.lower()}" for x, y in zip(result['A'], result['B'])],
            diff=[f"{float(x):.2e}" for x in result['diff']],
            pval=[f"{float(x):.2e}" for x in result['pval']],
        )
        print(result[['diff', 'pval']])
        dfs.append(result[['model', 'task', 'setup', 'diff', 'pval', 'dependent_var']])

if dfs:
    pd.concat(dfs, axis=0).to_csv("diff_pval_summary_task_incl_base_llama1b_full.csv", index=False)
    print("output written")
else:
    # pd.concat raises on an empty list; report instead of failing when every task was skipped.
    print("no Games-Howell results to write")

=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_x =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Task: classification
        diff      pval
0  -4.11e+00  2.22e-15
1  -5.59e+00  1.07e-14
2  -1.48e+00  0.00e+00
Task: summarization
        diff      pval
0  -4.31e+00  3.44e-15
1  -4.72e+00  3.55e-14
2  -4.12e-01  3.12e-01
Task: conversational
        diff      pval
0  -3.32e+00  3.44e-15
1  -5.06e+00  0.00e+00
2  -1.74e+00  2.12e-14
Task: qa
Only 2 datasets found for task 'qa', returning empty DataFrame.
=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=: output_y =:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:=:
Task: classification
        diff      pval
0  -1.12e+00  0.00e+00
1  -1.42e+00  3.22e-14
2  -3.05e-01  8.85e-04
Task: summarization
        diff      pval
0  -3.54e-02  9.80e-01
1  -1.23e+00  8.38e-13
2  -1.19e+00  4.12e-08
Task: conversational
        diff      pval
0  -2.99e-01  1.95e-01
1  -1.40e+00  0.00e+00
2  -1.10e+00  5.92e-10
Task: qa
Only 2 datasets found for task 'qa', returning empty DataFra