In [4]:
import json
import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import wasserstein_distance

from utils import FileIO
file_io = FileIO()

In [5]:
# Load the questionnaire definition; every record is read for its 'Dname'
# and 'Dquestion' fields below.
questionaire = file_io.load_json('../llm_baseline/prompts/envent_questionnaire_reader.json')

# NOTE: despite its name, this maps each record's 'Dname' to its 'Dquestion'.
question_to_name = dict(
    (item['Dname'], item['Dquestion']) for item in questionaire
)

# The 'Dname' values, in the order they first appear in the questionnaire
# (dicts preserve insertion order).
appraisal_d_lst = [*question_to_name]

In [6]:
def _save_kde_comparison(model_values, human_values, model_label, human_label,
                         title, xlabel, out_path, legend_loc=None):
    """Draw a model KDE and a human KDE on one figure and save it to ``out_path``."""
    sns.kdeplot(model_values, label=model_label)
    sns.kdeplot(human_values, label=human_label)
    plt.title(title)
    plt.xlabel(xlabel)
    if legend_loc is None:
        plt.legend()
    else:
        plt.legend(loc=legend_loc)
    plt.savefig(out_path)
    # Close the figure so consecutive plots do not draw on top of each other.
    plt.close()


def visualize_mean_and_var_dist(fpath: str, model_name: str):
    """Plot KDEs of per-entry rating means and variances, model vs. human.

    The JSON file at ``fpath`` must hold a list of entries, each with

    * ``'appraisal_d_pred_list'``: a list of dicts of predicted ratings, and
    * ``'appraisal_d_list'``: a list of dicts whose ``'appraisal_d'`` value is
      a dict of human ratings.

    For every entry, the first 5 (prediction, human) pairs are read. A pair is
    used only when the prediction has exactly 21 values; ratings are cast with
    ``int``. Per entry and per dimension, the mean and the population variance
    (``np.var``) over the used pairs are computed for both sides.

    Figures written to ``./figures/{model_name}/`` (created if missing):

    * ``mean.png`` / ``var.png``: all dimensions pooled.
    * ``{dimension}_mean.png`` / ``{dimension}_var.png``: one per dimension,
      named after the module-level ``appraisal_d_lst``.

    Args:
        fpath: Path of the JSON results file.
        model_name: Used in legend labels and as the output sub-directory.

    Raises:
        IndexError: If an entry has fewer than 5 predictions or annotations,
            or a human annotation has fewer than 21 values.
    """
    n_dims = 21         # appraisal dimensions per rating dict
    n_annotations = 5   # (prediction, human) pairs read per entry

    with open(fpath, 'r') as f:
        model_results = json.load(f)

    out_dir = f'./figures/{model_name}'
    os.makedirs(out_dir, exist_ok=True)

    # One list of per-dimension statistics per entry (flattened later for the
    # pooled plots).
    model_mean_lst, human_mean_lst = [], []
    model_var_lst, human_var_lst = [], []
    # The same statistics grouped by dimension for the per-dimension plots.
    model_dim_mean_lst = [[] for _ in range(n_dims)]
    human_dim_mean_lst = [[] for _ in range(n_dims)]
    model_dim_var_lst = [[] for _ in range(n_dims)]
    human_dim_var_lst = [[] for _ in range(n_dims)]

    for entry in model_results:
        model_appraisal_d = entry['appraisal_d_pred_list']
        human_appraisal_d = [ele['appraisal_d'] for ele in entry['appraisal_d_list']]

        model_ratings = [[] for _ in range(n_dims)]
        human_ratings = [[] for _ in range(n_dims)]
        for i in range(n_annotations):
            cur_model = list(model_appraisal_d[i].values())
            # Skip malformed predictions together with their human
            # counterpart so both sides stay paired.
            if len(cur_model) != n_dims:
                continue
            cur_human = list(human_appraisal_d[i].values())
            for j in range(n_dims):
                model_ratings[j].append(int(cur_model[j]))
                human_ratings[j].append(int(cur_human[j]))

        # With no usable pair, np.mean/np.var of the empty lists would only
        # yield NaN (plus a RuntimeWarning); leave such entries out.
        if not model_ratings[0]:
            continue

        model_means = [np.mean(r) for r in model_ratings]
        human_means = [np.mean(r) for r in human_ratings]
        model_vars = [np.var(r) for r in model_ratings]
        human_vars = [np.var(r) for r in human_ratings]

        for j in range(n_dims):
            model_dim_mean_lst[j].append(model_means[j])
            human_dim_mean_lst[j].append(human_means[j])
            model_dim_var_lst[j].append(model_vars[j])
            human_dim_var_lst[j].append(human_vars[j])

        model_mean_lst.append(model_means)
        human_mean_lst.append(human_means)
        model_var_lst.append(model_vars)
        human_var_lst.append(human_vars)

    def _flatten(nested):
        # Linear-time flatten (sum(nested, []) is quadratic).
        return [value for row in nested for value in row]

    model_label = f'{model_name} Model'

    # Pooled plots. The mean plot goes to mean.png and the variance plot to
    # var.png (the two file names used to be swapped).
    _save_kde_comparison(
        _flatten(model_mean_lst), _flatten(human_mean_lst),
        model_label, 'Human',
        'Model vs Human Mean', 'Mean',
        f'{out_dir}/mean.png',
    )
    _save_kde_comparison(
        _flatten(model_var_lst), _flatten(human_var_lst),
        model_label, 'Human',
        'Model vs Human Variance', 'Variance',
        f'{out_dir}/var.png',
    )

    # Per-dimension plots, labelled with the actual model name.
    for i in range(n_dims):
        cur_dim_name = appraisal_d_lst[i]
        _save_kde_comparison(
            model_dim_mean_lst[i], human_dim_mean_lst[i],
            f'{model_label} {cur_dim_name}', f'Human {cur_dim_name}',
            f'{cur_dim_name} Mean', 'Mean',
            f'{out_dir}/{cur_dim_name}_mean.png',
            legend_loc='upper right',
        )

    for i in range(n_dims):
        cur_dim_name = appraisal_d_lst[i]
        _save_kde_comparison(
            model_dim_var_lst[i], human_dim_var_lst[i],
            f'{model_label} {cur_dim_name}', f'Human {cur_dim_name}',
            f'{cur_dim_name} Variance', 'Variance',
            f'{out_dir}/{cur_dim_name}_var.png',
            legend_loc='upper right',
        )

In [8]:
# visualize_mean_and_var_dist(
#     fpath='./cache/deberta_large_unimodal_formatted.json',
#     model_name='unimodal-deberta'
# )
# visualize_mean_and_var_dist(
#     fpath='./cache/deberta_large_bimodal_formatted.json',
#     model_name='bimodal-deberta'
# )
# visualize_mean_and_var_dist(
#     fpath='../llm_baseline/cache/llama8_fomatted.json',
#     model_name='llama'
# )
# visualize_mean_and_var_dist(
#     fpath='../llm_baseline/cache/qwen7_fomatted.json',
#     model_name='qwen'
# )

In [9]:
# compute Wasserstein distance w.r.t. appraisal dimension
def rank_appraisal_dim_with_wasserstein(fpath: str):
    """Rank appraisal dimensions by mean model-vs-human Wasserstein distance.

    The JSON file at ``fpath`` must hold a list of entries, each with

    * ``'appraisal_d_pred_list'``: a list of ``{dimension: rating}`` dicts, and
    * ``'appraisal_d_list'``: a list of dicts whose ``'appraisal_d'`` value is
      a ``{dimension: rating}`` dict.

    For every entry, the first 5 predictions and the first 5 human annotations
    are pooled per dimension, and the 1-D Wasserstein distance between the two
    pools is computed. Distances are averaged over entries and rounded to 3
    decimals.

    Args:
        fpath: Path of the JSON results file.

    Returns:
        A dict ``{dimension: (mean_distance, rank)}`` ordered by ascending
        distance, where ``rank`` is 0 for the smallest mean distance.

    Raises:
        IndexError: If an entry has fewer than 5 predictions or annotations.
    """
    num_annotations = 5  # predictions / annotations read per entry

    with open(fpath, 'r') as f:
        model_results = json.load(f)

    distances_by_dim = {}
    for entry in model_results:
        predicted_appraisal_d = entry['appraisal_d_pred_list']
        human_appraisal_d = entry['appraisal_d_list']

        pred_rates_by_dim = {}
        human_rates_by_dim = {}
        for i in range(num_annotations):
            for name, rate in predicted_appraisal_d[i].items():
                pred_rates_by_dim.setdefault(name, []).append(rate)
            for name, rate in human_appraisal_d[i]['appraisal_d'].items():
                human_rates_by_dim.setdefault(name, []).append(rate)

        # Pair the two sides by dimension name, not by dict position: the
        # prediction and human dicts are not guaranteed to share key order.
        for name, pred_rate in pred_rates_by_dim.items():
            human_rate = human_rates_by_dim.get(name)
            if human_rate is None:
                # No human ratings for this dimension in this entry.
                continue
            distances_by_dim.setdefault(name, []).append(
                wasserstein_distance(pred_rate, human_rate)
            )

    # Cast to a plain float so the result compares and prints as a builtin.
    mean_distance_by_dim = {
        name: round(float(np.mean(dists)), 3)
        for name, dists in distances_by_dim.items()
    }
    ranked = sorted(mean_distance_by_dim.items(), key=lambda item: item[1])
    return {name: (dist, rank) for rank, (name, dist) in enumerate(ranked)}

In [10]:
# Dimension ranking for the predictions cached in deberta_large_unimodal_formatted.json.
ud_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('./cache/deberta_large_unimodal_formatted.json')

In [54]:
# Dimension ranking for the predictions cached in deberta_large_bimodal_formatted.json.
bd_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('./cache/deberta_large_bimodal_formatted.json')

In [15]:
# Dimension ranking for the LLM-baseline predictions cached in llama8_fomatted.json.
ll_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('../llm_baseline/cache/llama8_fomatted.json')

In [56]:
# Dimension ranking for the LLM-baseline predictions cached in qwen7_fomatted.json.
qw_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('../llm_baseline/cache/qwen7_fomatted.json')

In [12]:
# Dimension ranking for the LLM-baseline predictions cached in llama70_fomatted.json.
# NOTE(review): this result is not part of the aggregated ranking computed further down — confirm that is intended.
ll70_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('../llm_baseline/cache/llama70_fomatted.json')

In [18]:
# Dimension ranking for the LLM-baseline predictions cached in qwen72_fomatted.json.
# NOTE(review): this result is not part of the aggregated ranking computed further down — confirm that is intended.
qw72_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('../llm_baseline/cache/qwen72_fomatted.json')

In [22]:
# Dimension ranking for the predictions stored in envent_test_reader_pred.json (vae-exp).
# NOTE(review): this result is not part of the aggregated ranking computed further down — confirm that is intended.
vae_appraisal_d_wasserstein_dict = rank_appraisal_dim_with_wasserstein('../vae-exp/vae/predictions/envent_test_reader_pred.json')

In [23]:
# Display the ranking: each value is (mean Wasserstein distance rounded to
# 3 decimals, 0-based rank), with rank 0 being the smallest distance.
vae_appraisal_d_wasserstein_dict

{'accept_conseq': (1.044, 0),
 'predict_conseq': (1.059, 1),
 'predict_event': (1.089, 2),
 'effort': (1.11, 3),
 'familiarity': (1.124, 4),
 'urgency': (1.158, 5),
 'goal_relevance': (1.17, 6),
 'self_control': (1.187, 7),
 'attention': (1.2, 8),
 'suddenness': (1.252, 9),
 'not_consider': (1.311, 10),
 'chance_control': (1.34, 11),
 'self_responsblt': (1.352, 12),
 'goal_support': (1.36, 13),
 'other_control': (1.362, 14),
 'chance_responsblt': (1.363, 15),
 'standards': (1.382, 16),
 'other_responsblt': (1.389, 17),
 'unpleasantness': (1.524, 18),
 'social_norms': (1.533, 19),
 'pleasantness': (1.558, 20)}

In [61]:
# Average every dimension's rank position (second tuple element) over the
# unimodal-DeBERTa, bimodal-DeBERTa, llama8 and qwen7 rankings, then order the
# dimensions by ascending average rank.
model_rankings = (
    ud_appraisal_d_wasserstein_dict,
    bd_appraisal_d_wasserstein_dict,
    ll_appraisal_d_wasserstein_dict,
    qw_appraisal_d_wasserstein_dict,
)

aggregated_ranking_dict = {}
for name in ud_appraisal_d_wasserstein_dict:
    ranks = [ranking[name][1] for ranking in model_rankings]
    aggregated_ranking_dict[name] = round(np.mean(ranks), 3)

# sorted() is stable, so dimensions with equal averages keep their order
# from the unimodal ranking.
aggregated_ranking_dict = dict(
    sorted(aggregated_ranking_dict.items(), key=lambda item: item[1])
)

In [62]:
# Display the average rank per dimension, in ascending order; a lower value
# means a smaller model-vs-human Wasserstein distance relative to the other dimensions.
aggregated_ranking_dict

{'pleasantness': 5.0,
 'unpleasantness': 5.25,
 'social_norms': 5.5,
 'chance_responsblt': 6.25,
 'suddenness': 6.5,
 'goal_relevance': 7.0,
 'standards': 7.25,
 'effort': 7.5,
 'attention': 7.75,
 'chance_control': 8.0,
 'self_control': 8.25,
 'self_responsblt': 9.25,
 'familiarity': 10.75,
 'other_responsblt': 12.25,
 'predict_event': 12.75,
 'predict_conseq': 13.0,
 'accept_conseq': 13.5,
 'not_consider': 14.0,
 'urgency': 14.25,
 'other_control': 16.5,
 'goal_support': 19.5}