In [45]:
import typing
import json

import pandas as pd

In [5]:
import config

# Project configuration object; the cells below read `CFG.data_dir` (parquet
# inputs/outputs) and `CFG.report_dir` (LaTeX tables) from it.
CFG = config.Config()

In [6]:
# Load the raw questionnaire answers. Casting to pandas' nullable Int64 dtype
# keeps the answers integral while still allowing missing responses (<NA>).
_responses_path = f'{CFG.data_dir}/questionary.responses.parquet'
dataset: pd.DataFrame = pd.read_parquet(_responses_path).astype(pd.Int64Dtype())
dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,response,response,response,response,response
Unnamed: 0_level_1,model,falcon,llama2,llama3,mixtral,qwen
category,number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
agreement,0,,5.0,5.0,4.0,5
agreement,1,,4.0,5.0,,5
agreement,2,,4.0,5.0,4.0,3
agreement,3,,4.0,4.0,5.0,5
agreement,4,,4.0,4.0,5.0,5
agreement,5,,5.0,5.0,5.0,5
agreement,6,,5.0,5.0,5.0,5
agreement,7,,5.0,5.0,4.0,5
agreement,8,,4.0,4.0,4.0,5
agreement,9,,1.0,2.0,3.0,5


In [7]:
# Load the label -> list of [category, number] question keys used to score
# each label. The context manager guarantees the file handle is closed
# (the previous `open(...).read()` left it to the garbage collector).
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
# NOTE(review): in the mapping displayed below, 'harm' lists ['agreement', 11]
# twice (and never ['relevance', 11]) and 'purity' has no ['agreement', 15];
# this looks like a typo in questionary_mapping.json - confirm and fix the file.
with open('questionary_mapping.json', encoding='utf-8') as mapping_file:
    mapping = json.load(mapping_file)
mapping

{'harm': [['relevance', 0],
  ['relevance', 6],
  ['agreement', 11],
  ['agreement', 0],
  ['agreement', 6],
  ['agreement', 11]],
 'fairness': [['relevance', 1],
  ['relevance', 7],
  ['relevance', 12],
  ['agreement', 1],
  ['agreement', 7],
  ['agreement', 12]],
 'ingroup': [['relevance', 2],
  ['relevance', 8],
  ['relevance', 13],
  ['agreement', 2],
  ['agreement', 8],
  ['agreement', 13]],
 'authority': [['relevance', 3],
  ['relevance', 9],
  ['relevance', 14],
  ['agreement', 3],
  ['agreement', 9],
  ['agreement', 14]],
 'purity': [['relevance', 4],
  ['relevance', 10],
  ['relevance', 15],
  ['agreement', 4],
  ['agreement', 10]]}

In [34]:
def _mean_score_per_model(responses: pd.DataFrame, label: str, questions: list) -> pd.DataFrame:
    """Average the answers to `questions` for every model.

    Each entry of `questions` is a (category, number) key selecting one row of
    `responses`. The selected rows are placed side by side and averaged per
    model (pandas' default `mean` is used, so missing answers are skipped).

    Returns a one-row frame whose index entry is `label` and whose columns are
    the models.
    """
    selected_rows = [responses.loc[tuple(question)] for question in questions]
    per_model_mean = pd.concat(selected_rows, axis=1).mean(axis=1)
    # The series is indexed by ('response', model); unstacking moves the model
    # level into the columns, leaving a single 'response' row to relabel.
    return per_model_mean.unstack().rename(index={'response': label})


# One row per label, then transpose so models are rows and labels are columns.
_score_rows = [
    _mean_score_per_model(dataset, label, questions)
    for label, questions in mapping.items()
]
evaluation = pd.concat(_score_rows).T

# Persist the table for downstream use and as a LaTeX table for the report.
evaluation.to_parquet(f'{CFG.data_dir}/questionary.evaluation.parquet')
evaluation.to_latex(
    f'{CFG.report_dir}/questionary.evaluation.tex',
    float_format=lambda f: f'{f:2.3f}',
)
evaluation

Unnamed: 0_level_0,harm,fairness,ingroup,authority,purity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
falcon,,4.0,,,3.0
llama2,5.0,3.5,3.666667,3.166667,3.6
llama3,4.666667,4.8,3.166667,3.0,3.5
mixtral,3.666667,3.8,2.833333,3.666667,3.2
qwen,4.833333,4.833333,4.333333,4.333333,4.2


In [35]:
# Human reference scores, keyed by group label, then by human label, then by
# the same score labels that appear as columns of `evaluation`.
# The tuples below list the scores in the column order given here; that order
# is preserved in the resulting dicts.
_SCORE_COLUMNS = ("authority", "purity", "ingroup", "harm", "fairness")

_SCORE_TABLE = {
    "anon": {
        "liberal": (2.2, 2.2, 2.5, 4.25, 4.35),
        "moderate": (2.8, 3.0, 3.0, 4.0, 3.95),
        "conservative": (3.1, 3.4, 3.12, 3.5, 3.65),
    },
    "us": {
        "liberal": (1.9, 1.2, 2.05, 3.6, 3.75),
        "moderate": (2.65, 2.2, 2.65, 3.4, 3.45),
        "conservative": (3.5, 3.1, 3.4, 3.0, 3.1),
    },
    "korean": {
        "liberal": (2.25, 2.4, 2.3, 3.35, 3.5),
        "moderate": (2.55, 2.65, 2.65, 3.25, 3.35),
        "conservative": (2.9, 2.75, 2.8, 3.35, 3.1),
    },
}

human_results = {
    group_name: {
        human_name: dict(zip(_SCORE_COLUMNS, scores))
        for human_name, scores in humans.items()
    }
    for group_name, humans in _SCORE_TABLE.items()
}

In [65]:
# Compare every model's scores with every human reference profile.
# Each record holds the mean absolute difference between the human scores and
# the model scores, or None when the model has a missing score.
human_cross_evaluation: typing.List[typing.Dict] = []

for group_label, group in human_results.items():
    for human_label, human in group.items():
        for model_label, model in evaluation.iterrows():
            # Missing scores are NaN / pd.NA, which never compare equal to
            # None, so the previous `None in ...values()` test could not fire.
            # Ask pandas directly instead.
            if model.isna().any():
                distance = None
            else:
                model_scores = model.to_dict()  # converted once per row, not per term
                # Divide by the number of summed terms so the result is a true
                # mean even if `evaluation` ever carries extra columns.
                distance = sum(
                    abs(score - model_scores[key])
                    for key, score in human.items()
                ) / len(human)

            human_cross_evaluation.append({
                "human_label": f'{group_label}_{human_label}',
                "model_label": model_label,
                "value": distance,
            })

In [67]:
# Reshape the flat records into a model x human-profile table, then persist it
# as parquet and as a LaTeX table for the report.
human_eval_df = (
    pd.DataFrame(human_cross_evaluation)
    .pivot(index='model_label', columns='human_label', values='value')
)

_cross_eval_parquet = f'{CFG.data_dir}/questionary.human.cross-evaluation.parquet'
_cross_eval_tex = f'{CFG.report_dir}/questionary.human.cross-evaluation.tex'

human_eval_df.to_parquet(_cross_eval_parquet)
human_eval_df.to_latex(_cross_eval_tex, float_format=lambda f: f'{f:2.3f}')