# Miscalibration Analysis
Use this notebook to analyze and statistically test the miscalibration between recommendations and user profiles. Adjust the variables below to apply the analysis to different datasets or recommendation algorithms (RAs).


In [None]:
# Recommendation models to analyse. Each name is used to build the column
# names '<model>_genre_distribution' / 'child_<model>_genre_distribution'
# and the output columns 'gmc_<model>'.
models = ['Random', 'MostPop', 'RP3beta', 'iALS']
# Dataset identifier: selects the '../Results/<dataset>/' folder and is passed
# to the genre and age helper functions.
dataset = 'bx'

In [81]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils.genre_processing as gp
import utils.file_handler as fh
import utils.age_processing as ap
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel
from scipy.spatial.distance import jensenshannon

In [None]:

# Age-group labels for the selected dataset, in the order returned by the helper.
ages_sort = ap.get_sorted_ages(dataset, 'defined_ages')
# The first label is treated as the child group in the cells below
# (presumably the youngest bracket — TODO confirm the helper sorts ascending).
child_group = ages_sort[0]

In [83]:
# Tab-separated file with one row per user, holding the genre distributions of
# the user's profile splits and of each model's recommendations.
results_path = f'../Results/{dataset}/user_and_recommendation_genre_distributions.tsv'
# Genres for the selected dataset, as provided by the genre-processing helper.
genres = gp.get_genres(dataset)

In [84]:
# Load the tab-separated results table into a DataFrame.
results = pd.read_csv(results_path, sep='\t')

In [85]:
# Run every '*genre_distribution*' column through fh.safe_literal_eval
# (presumably turning the stringified dicts read from the TSV back into
# Python objects — verify against the helper).
_distribution_columns = [
    label for label in results.columns if 'genre_distribution' in label
]
for column in _distribution_columns:
    results[column] = results[column].apply(fh.safe_literal_eval)

In [86]:
# Display the available columns as a quick sanity check of the loaded table.
results.columns

Index(['user_id', 'age', 'age_group', 'train_genre_distribution',
       'train_interactions', 'train_avg_popularity',
       'train_avg_normalized_popularity', 'train_avg_child_popularity',
       'train_avg_child_normalized_popularity',
       'validation_genre_distribution', 'validation_interactions',
       'validation_avg_popularity', 'validation_avg_normalized_popularity',
       'validation_avg_child_popularity',
       'validation_avg_child_normalized_popularity', 'test_genre_distribution',
       'test_interactions', 'test_avg_popularity',
       'test_avg_normalized_popularity', 'test_avg_child_popularity',
       'test_avg_child_normalized_popularity', 'RP3beta_genre_distribution',
       'RP3beta_interactions', 'RP3beta_avg_popularity',
       'RP3beta_avg_normalized_popularity', 'RP3beta_avg_child_popularity',
       'RP3beta_avg_child_normalized_popularity',
       'child_RP3beta_genre_distribution', 'child_RP3beta_interactions',
       'child_RP3beta_avg_popularity',
   

In [87]:
    
def genre_miscalibration(df, models, child=False):
    """Compute the genre miscalibration (GMC) of each model for every user.

    For each row, the user's training genre distribution is compared with the
    genre distribution of each model's recommendations using the squared
    Jensen-Shannon distance returned by ``scipy.spatial.distance.jensenshannon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``train_genre_distribution`` and, for every
        model, ``<model>_genre_distribution`` (or
        ``child_<model>_genre_distribution`` when ``child`` is true).
    models : list of str
        Model names used to build the column names.
    child : bool, default False
        If true, read the ``child_``-prefixed recommendation columns.

    Returns
    -------
    model_js : dict
        Mean score per model over all rows of ``df``; empty when ``df`` has
        no rows.
    js_df : pandas.DataFrame
        Copy of ``df[['user_id']]`` (same index) with one float column
        ``gmc_<model>`` per model.

    Notes
    -----
    Relies on the notebook-level ``dataset`` variable and on
    ``gp.genre_dict_to_list`` to turn a distribution into a vector.
    """
    prefix = 'child_' if child else ''
    n_users = len(df)
    js_df = df[['user_id']].copy()

    # Convert each user's profile once; it is reused for every model.
    user_distributions = [
        np.array(gp.genre_dict_to_list(profile, dataset), dtype=float)
        for profile in df['train_genre_distribution']
    ]

    model_js = {}
    for model in models:
        scores = []
        recommended_column = df[f'{prefix}{model}_genre_distribution']
        for user_distribution, recommended in zip(user_distributions, recommended_column):
            model_distribution = np.array(gp.genre_dict_to_list(recommended, dataset), dtype=float)

            # jensenshannon returns the JS *distance*; squaring gives the divergence.
            js = jensenshannon(user_distribution, model_distribution) ** 2

            # An undefined score (NaN, e.g. when a vector has no mass to
            # normalise) is counted as 0, i.e. as "no miscalibration".
            scores.append(0.0 if np.isnan(js) else float(js))

        # Assign positionally as a float array: keeps the column numeric and
        # stays correct even if the index of `df` contains duplicate labels.
        js_df[f'gmc_{model}'] = np.array(scores, dtype=float)
        if n_users:
            model_js[model] = sum(scores) / n_users
    return model_js, js_df

In [88]:
# Compute per-user GMC scores for every age group. For the child group the
# scores of the 'child_'-prefixed recommendation columns are added as well,
# labelled 'child_focused'. Frames are collected in a list and concatenated
# once, instead of growing a DataFrame inside the loop.
gmc_frames = []

for age_group, group in results.groupby('age_group'):
    mean_gmcs, gmc_df = genre_miscalibration(group, models, child=False)
    gmc_df['user_group'] = age_group
    gmc_df['age'] = group['age']  # aligned on the index shared with `group`
    gmc_frames.append(gmc_df)
    if age_group == child_group:
        child_mean_gmcs, child_gmc_df = genre_miscalibration(group, models, child=True)
        child_gmc_df['user_group'] = 'child_focused'
        child_gmc_df['age'] = group['age']
        gmc_frames.append(child_gmc_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
gmc_results_df = (
    pd.concat(gmc_frames, ignore_index=True, axis=0) if gmc_frames else pd.DataFrame()
)


In [89]:
# Show the columns of the combined per-user GMC table.
print(gmc_results_df.columns)

Index(['user_id', 'gmc_Random', 'gmc_MostPop', 'gmc_RP3beta', 'gmc_iALS',
       'user_group', 'age'],
      dtype='object')


In [90]:
# Testing the GMC across all groups
ANOVA_ALPHA = 0.05  # ANOVA p-value threshold that gates the post-hoc test
TUKEY_ALPHA = 0.01  # family-wise error rate handed to Tukey HSD
# NOTE(review): the two thresholds differ (0.05 vs 0.01) — confirm this is intended.

for model in models:
    gmc_column = f'gmc_{model}'
    print('==========================================================================')
    print(f"Testing GMC for model: {model}")

    # Per-user scores of the regular age groups. The 'child_focused' rows are
    # excluded here and only compared against the child group further below.
    js_df_clean = gmc_results_df[['user_group', gmc_column]].dropna()
    js_df_clean = js_df_clean[js_df_clean['user_group'] != 'child_focused']
    # Convert once so the ANOVA, the group means and Tukey HSD all operate on
    # the same float data, whatever dtype the score column was stored with.
    js_df_clean = js_df_clean.assign(**{gmc_column: pd.to_numeric(js_df_clean[gmc_column])})
    groups = [scores.to_numpy() for _, scores in js_df_clean.groupby('user_group')[gmc_column]]

    # Print average scores
    print(f"Average JS divergence for {model}:")
    gmc_grouped = js_df_clean.groupby('user_group')[gmc_column].mean()
    for age in ages_sort:
        if age in gmc_grouped.index:
            print(f"{age}: {gmc_grouped[age]:.3f}")
        else:
            print(f"{age}: {np.nan:.3f}")
    print()
    print('-----------------------------------------------')
    # One-way ANOVA test (f_oneway needs at least two samples)
    if len(groups) < 2:
        print(f"ANOVA skipped for {model}: need at least two age groups, found {len(groups)}")
    else:
        anova_result = f_oneway(*groups)
        print(f"ANOVA result for {model}: F-statistic = {anova_result.statistic:.4f}, p-value = {anova_result.pvalue:.4f}")

        # If ANOVA is significant, perform Tukey HSD for post-hoc analysis
        if anova_result.pvalue < ANOVA_ALPHA:
            print(f"ANOVA is significant for {model}, performing Tukey HSD test...")

            tukey_result = pairwise_tukeyhsd(js_df_clean[gmc_column], js_df_clean['user_group'], alpha=TUKEY_ALPHA)
            print(tukey_result)
    print()
    print('-----------------------------------------------')
    # Paired t-test between user_group = child_focused and user_group = child.
    # Pairing is positional: it relies on both row sets having been produced
    # from the same age-group frame, in the same user order.
    print('Paired t-test between child_focused and child')
    child_focused = gmc_results_df[gmc_results_df['user_group'] == 'child_focused'][gmc_column].astype(float)
    child = gmc_results_df[gmc_results_df['user_group'] == child_group][gmc_column].astype(float)
    ttest_result = ttest_rel(child_focused, child)
    print(f"Mean GMC for child_focused: {child_focused.mean():.3f}")
    print(f"Paired t-test result: t-statistic = {ttest_result.statistic:.3f}, p-value = {ttest_result.pvalue:.3f}")

    print()
    print()

Testing GMC for model: Random
Average JS divergence for Random:
12-17: 0.293
18-49: 0.212
50-65: 0.217

-----------------------------------------------
ANOVA result for Random: F-statistic = 67.6633, p-value = 0.0000
ANOVA is significant for Random, performing Tukey HSD test...
Multiple Comparison of Means - Tukey HSD, FWER=0.01 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
 12-17  18-49  -0.0815    0.0  -0.102 -0.0611   True
 12-17  50-65   -0.076    0.0 -0.0984 -0.0536   True
 18-49  50-65   0.0056 0.3056 -0.0055  0.0166  False
----------------------------------------------------

-----------------------------------------------
Paired t-test between child_focused and child
Mean GMC for child_focused: 0.261
Paired t-test result: t-statistic = -5.919, p-value = 0.000


Testing GMC for model: MostPop
Average JS divergence for MostPop:
12-17: 0.308
18-49: 0.199
50-65: 0.181

-----------------------------------------------
ANOVA