In [1]:
import os
import ast
import pandas as pd
import numpy as np
import utils.age_processing as ap
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel


In [2]:
# --- Experiment configuration -------------------------------------------------
# Dataset key; the path-setup cell further down branches on this value.
dataset = 'ml'
# Whether to read the "_filtered" variant of the processed recommendation data.
filtered = True
# Ranking cutoff k passed to every per-user metric.
cutoff = 50
# Short model names (the part of a model identifier before the first '_').
models_short = ['Random', 'MostPop', 'RP3beta', 'iALS']

# Age-group labels for this dataset; the first entry is treated as the child
# group (presumably the list is ordered youngest-first — TODO confirm in utils).
ages_sort = ap.get_sorted_ages(dataset, 'defined_ages')
child_group = ages_sort[0]

In [3]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Read settings from config.env located two directories above the working directory.
env_path = Path('../..') / 'config.env'
load_dotenv(dotenv_path=env_path)

# os.getenv returns None when the variable is unset. Fail here with an
# actionable message instead of a TypeError when paths are concatenated later.
dataset_dir = os.getenv("dataset_directory")
if not dataset_dir:
    raise RuntimeError(
        f"Environment variable 'dataset_directory' is not set (looked in {env_path})"
    )

In [4]:
import json

# best_models.json is indexed as best_models[dataset]['all' | 'child'] and each
# entry is a list of model identifiers of the form '<ShortName>_<...>'.
with open('../../utils/best_models.json', 'r') as f:
    best_models = json.load(f)

# Keep only the identifiers whose short name was requested in models_short.
models = [m for m in best_models[dataset]['all'] if m.split('_')[0] in models_short]
child_models = [m for m in best_models[dataset]['child'] if m.split('_')[0] in models_short]

# Short names are computed up front: nesting the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
model_names = [m.split('_')[0] for m in models]
child_model_names = [m.split('_')[0] for m in child_models]

# Both selections must cover exactly the requested models, otherwise the
# per-model comparisons further down would silently miss entries.
if not (set(child_model_names) == set(model_names) == set(models_short)):
    print(f'intended models: {models_short}')
    print(f'child models: {child_model_names}')
    print(f'full models: {model_names}')
    raise ValueError('Models do not match!')



In [None]:
# Supported dataset keys mapped to the folder name used for source_dir under
# <dataset_dir>/processed. All other paths are derived from the key itself.
source_folders = {
    'ml': 'movielens-1m',
    'mlhd': 'mlhd_sampled_filtered',
    'bx': 'Book-Crossing',
}
if dataset not in source_folders:
    # Previously an unknown key fell through every branch and surfaced later
    # as a NameError on data_dir.
    raise ValueError(f"Unknown dataset '{dataset}'; expected one of {sorted(source_folders)}")

filtered_suffix = '_filtered' if filtered else ''
data_dir = dataset_dir + f'/processed/{dataset}_rec{filtered_suffix}'
recommendations_path = f'../elliot/Results/{dataset}/recommendations'
child_recommendations_path = f'../elliot/Results/{dataset}_child/recommendations'
source_dir = dataset_dir + f'/processed/{source_folders[dataset]}'

results_dir = f'../Results/{dataset}'
results_path = f'{results_dir}/user_and_recommendation_genre_distributions.tsv'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(results_dir, exist_ok=True)

train_path = data_dir + '/train.tsv'
validation_path = data_dir + '/validation.tsv'
test_path = data_dir + '/test.tsv'
user_info_path = data_dir + '/user_info.tsv'

# Fail fast on missing inputs so later cells do not die half-way through.
if not all(os.path.exists(p) for p in (train_path, validation_path, test_path)):
    raise FileNotFoundError(f"One of the required files does not exist: {train_path}, {validation_path}, {test_path}")

for model in models:
    file_path = f'{recommendations_path}/{model}.tsv'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Recommendation file does not exist: {file_path}")

for model in child_models:
    file_path = f'{child_recommendations_path}/{model}.tsv'
    # Check the same path that is reported in the error message.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Child recommendation file does not exist: {file_path}")

if not os.path.exists(user_info_path):
    raise FileNotFoundError(f"User info file does not exist: {user_info_path}")

# The 'bx' interaction files are read without a timestamp column.
columns = ['user_id', 'item_id', 'rating'] if dataset == 'bx' else ['user_id', 'item_id', 'rating', 'timestamp']

In [12]:
# Load the data
def _load_user_items(path, list_column):
    """Read a headerless interaction TSV and return one row per user.

    Args:
        path: Path of the tab-separated interaction file (columns as in `columns`).
        list_column: Name to give the column holding each user's list of item ids.

    Returns:
        DataFrame with columns ['user_id', list_column]; item ids keep the
        file's row order within each user.
    """
    interactions = pd.read_csv(path, sep='\t', header=None, names=columns)
    per_user = interactions.groupby('user_id').agg({'item_id': list}).reset_index()
    per_user.columns = ['user_id', list_column]
    return per_user


# The earlier version left-merged each split with `users` and immediately kept
# only ['user_id', 'item_id'], which adds nothing when user_info has one row per
# user (assumed — TODO confirm), so the merge is dropped.
train = _load_user_items(train_path, 'train_ids')
validation = _load_user_items(validation_path, 'validation_ids')
test = _load_user_items(test_path, 'test_ids')

users = pd.read_csv(user_info_path, sep='\t')

# Keep only users that appear in the training split. The explicit .copy()
# makes the following column assignment operate on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
users = users[users['user_id'].isin(train['user_id'].unique())].copy()
users['age_group'] = users['age'].apply(lambda x: ap.age_group(x, dataset, 'defined_ages'))

In [13]:
def _load_recommendations(directory, model_ids):
    """Load per-user recommendation lists for several models.

    Args:
        directory: Folder containing one '<model_id>.tsv' file per model, each a
            headerless TSV with user id, item id and score columns.
        model_ids: Full model identifiers; the part before the first '_' is
            used as the dictionary key.

    Returns:
        dict mapping short model name -> DataFrame with columns
        ['user_id', 'rec_ids'], where rec_ids is the user's list of item ids
        in file row order.
    """
    per_model = {}
    for model_id in model_ids:
        # str.split returns the whole string when there is no '_', so no
        # separate membership test is needed.
        model_name = model_id.split('_')[0]
        recs = pd.read_csv(f'{directory}/{model_id}.tsv', sep='\t', header=None,
                           names=['user_id', 'item_id', 'score'])
        # groupby keeps row order within a user; the ranking is therefore the
        # file order (presumably already sorted by score — TODO confirm).
        recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()
        recs.columns = ['user_id', 'rec_ids']
        per_model[model_name] = recs
    return per_model


# Models trained on all users vs. models trained on the child subset.
recommendations = _load_recommendations(recommendations_path, models)
child_recommendations = _load_recommendations(child_recommendations_path, child_models)


In [163]:

# Compute the nDCG for a user and a model at a given cutoff
def user_nDCG(user_id, test_items, recommendations, k):
    """Binary-relevance nDCG@k for a single user.

    Args:
        user_id: Not used in the computation; kept so all metric functions
            share the same signature.
        test_items: Collection of the user's held-out item ids.
        recommendations: Ranked sequence of recommended item ids (position 0
            receives the largest discount weight).
        k: Ranking cutoff.

    Returns:
        DCG@k divided by the ideal DCG@k, where the ideal ranking places
        min(k, len(test_items)) relevant items first. Returns 0.0 when the
        ideal DCG is zero (no test items or k <= 0) instead of dividing by zero.
    """
    relevant = list(test_items)
    ideal_hits = min(k, len(relevant))
    if ideal_hits <= 0:
        # Nothing could have been hit, so the score is defined as 0.
        return 0.0

    # Set membership keeps the scan over the top-k linear.
    relevant_set = set(relevant)

    # Discounted cumulative gain over the first k recommendations.
    DCG = sum(
        1 / np.log2(rank + 2)
        for rank, item in enumerate(recommendations[:k])
        if item in relevant_set
    )
    # Ideal DCG: every one of the first `ideal_hits` positions is a hit.
    IDCG = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))
    return DCG / IDCG


def user_MRR(user_id, test_items, recommendations, k):
    """Reciprocal rank of the first relevant item within the top k.

    Args:
        user_id: Not used in the computation; kept for a uniform signature.
        test_items: Collection of the user's held-out item ids.
        recommendations: Ranked sequence of recommended item ids.
        k: Ranking cutoff; lists shorter than k are scanned to their end
            rather than raising IndexError.

    Returns:
        1 / rank (1-based) of the first hit, or 0 if no hit occurs in the top k.
    """
    relevant = set(test_items)
    # max(k, 0) keeps a non-positive cutoff equivalent to "look at nothing".
    for rank, item in enumerate(recommendations[:max(k, 0)], start=1):
        if item in relevant:
            return 1 / rank
    return 0

def user_MAP(user_id, test_items, recommendations, k):
    """Average precision within the top k for a single user.

    Precision is accumulated at every hit position and the sum is divided by
    the number of hits found in the top k (not by the number of test items).

    Args:
        user_id: Not used in the computation; kept for a uniform signature.
        test_items: Collection of the user's held-out item ids.
        recommendations: Ranked sequence of recommended item ids.
        k: Ranking cutoff; lists shorter than k are scanned to their end
            rather than raising IndexError.

    Returns:
        The average of precision@rank over hit positions, or 0 without hits.
    """
    relevant = set(test_items)
    precision_sum = 0
    num_hits = 0
    for rank, item in enumerate(recommendations[:max(k, 0)], start=1):
        if item in relevant:
            num_hits += 1
            precision_sum += num_hits / rank
    if num_hits == 0:
        return 0
    return precision_sum / num_hits

In [164]:
# Accumulators that the evaluation loop appends to: for every model one list
# of group-level means per metric, plus the visited age-group labels.
performance = {
    name: {metric: [] for metric in ('nDCG', 'MRR', 'MAP')}
    for name in models_short
}
ages = []


In [165]:
def _score_models(frame, recs_by_model, means_store=None):
    """Add per-user nDCG/MRR/MAP columns for every model in models_short.

    Args:
        frame: One row per user with at least 'user_id' and 'test_ids'.
        recs_by_model: dict short model name -> DataFrame with
            ['user_id', 'rec_ids'].
        means_store: Optional dict shaped like `performance`; when given, the
            mean of every metric is appended to means_store[model][metric].

    Returns:
        A new DataFrame with '<metric>_<model>' columns added (or overwritten
        when they already exist). The temporary 'rec_ids' column is removed.
    """
    metric_fns = (('nDCG', user_nDCG), ('MRR', user_MRR), ('MAP', user_MAP))
    for model in models_short:
        print(model)

        frame = pd.merge(frame, recs_by_model[model][['user_id', 'rec_ids']], on='user_id', how='left')
        for metric, metric_fn in metric_fns:
            column = f'{metric}_{model}'
            frame[column] = frame.apply(
                lambda row: metric_fn(row['user_id'], row['test_ids'], row['rec_ids'], cutoff),
                axis=1,
            )
            metric_mean = frame[column].mean()
            if means_store is not None:
                means_store[model][metric].append(metric_mean)
            print(f'{metric.lower()}_mean: {metric_mean:.4f}')

        frame = frame.drop(columns=['rec_ids'])
    return frame


# Per-group result frames are collected and concatenated once at the end
# instead of growing performance_df inside the loop.
group_frames = []
for age, group in users.groupby('age_group'):
    print('=============================================================================')
    print(age)
    print('-------------------------------------------------------')
    print('Recommendations on general Set')
    # NOTE: `ages` and `performance` are appended to; re-run the cell that
    # initialises them before re-running this one.
    ages.append(age)

    group = pd.merge(group, test[['user_id', 'test_ids']], on='user_id', how='left')

    # Models trained on all users: the means are stored in `performance`.
    group = _score_models(group, recommendations, means_store=performance)
    group['user_group'] = group['age_group']
    group_frames.append(group.drop(columns=['test_ids']))

    # Only the child age group is additionally scored with the child-trained models.
    group = group[group['age_group'] == child_group]
    if group.shape[0] == 0:
        continue
    print('Recommendations on child Set')
    # Means of the child-trained models are printed only, not stored.
    group = _score_models(group, child_recommendations)

    group['user_group'] = 'child_focused'
    group_frames.append(group.drop(columns=['test_ids']))

performance_df = pd.concat(group_frames, axis=0) if group_frames else pd.DataFrame()

12-16
-------------------------------------------------------
Recommendations on general Set
Random
ndcg_mean: 0.0004
mrr_mean: 0.0014
map_mean: 0.0013
MostPop
ndcg_mean: 0.0109
mrr_mean: 0.0264
map_mean: 0.0235
RP3beta
ndcg_mean: 0.0432
mrr_mean: 0.0832
map_mean: 0.0694
iALS
ndcg_mean: 0.0384
mrr_mean: 0.0822
map_mean: 0.0646
Recommendations on child Set
Random
ndcg_mean: 0.0003
mrr_mean: 0.0007
map_mean: 0.0007
MostPop
ndcg_mean: 0.0127
mrr_mean: 0.0339
map_mean: 0.0270
RP3beta
ndcg_mean: 0.0390
mrr_mean: 0.0848
map_mean: 0.0673
iALS
ndcg_mean: 0.0326
mrr_mean: 0.0759
map_mean: 0.0596
17-29
-------------------------------------------------------
Recommendations on general Set
Random
ndcg_mean: 0.0004
mrr_mean: 0.0011
map_mean: 0.0011
MostPop
ndcg_mean: 0.0082
mrr_mean: 0.0201
map_mean: 0.0180
RP3beta
ndcg_mean: 0.0332
mrr_mean: 0.0662
map_mean: 0.0533
iALS
ndcg_mean: 0.0299
mrr_mean: 0.0634
map_mean: 0.0497
30-65
-------------------------------------------------------
Recommendations

In [166]:
# Significance Testing
# For every model and metric:
#   1. one-way ANOVA across the age groups (child_focused rows excluded),
#      followed by Tukey HSD when the ANOVA is significant;
#   2. paired t-test, for the child group's users, between the scores of the
#      child-trained models ('child_focused') and the models trained on all users.
anova_alpha = 0.05
# NOTE(review): the post-hoc test uses a stricter level than the ANOVA gate —
# kept as in the original analysis; confirm this is intended.
tukey_alpha = 0.01

for model in models_short:
    print('=============================================================================')
    print(f"Model: {model}")
    for metric in ['nDCG', 'MRR', 'MAP']:
        print('-------------------------------------------------------')
        print(f"Testing {metric} for model: {model}")
        score_col = f'{metric}_{model}'

        # Per-user scores of this metric, split by age group
        perf_df_clean = performance_df[['user_group', score_col]].dropna()
        perf_df_clean = perf_df_clean[perf_df_clean['user_group'] != 'child_focused']
        groups = [scores[score_col].values for _, scores in perf_df_clean.groupby('user_group')]

        # Print average scores
        print(f"Average {metric} for {model}:")
        grouped_means = perf_df_clean.groupby('user_group')[score_col].mean()
        for age in ages_sort:
            if age in grouped_means.index:
                print(f"{age}: {grouped_means[age]:.3f}")
            else:
                print(f"{age}: NaN")
        print()

        # One-way ANOVA test (needs at least two groups)
        if len(groups) < 2:
            print(f"ANOVA skipped for {model}: fewer than two age groups available")
            print()
        else:
            anova_result = f_oneway(*groups)
            print(f"ANOVA result for {model}: F-statistic = {anova_result.statistic:.3f}, p-value = {anova_result.pvalue:.3f}")
            print()
            # If ANOVA is significant, perform Tukey HSD for post-hoc analysis
            if anova_result.pvalue < anova_alpha:
                print(f"ANOVA is significant for {model}, performing Tukey HSD test...")

                tukey_result = pairwise_tukeyhsd(pd.to_numeric(perf_df_clean[score_col]), perf_df_clean['user_group'], alpha=tukey_alpha)
                print(tukey_result)
                print()
                print('...................................................')

        # Paired t-test between user_group = child_focused and user_group = child.
        # Observations are paired explicitly on user_id rather than relying on
        # both subsets happening to be stored in the same row order.
        print('Paired t-test between child_focused and child')
        focused_rows = performance_df.loc[performance_df['user_group'] == 'child_focused', ['user_id', score_col]]
        general_rows = performance_df.loc[performance_df['user_group'] == child_group, ['user_id', score_col]]
        paired = pd.merge(focused_rows, general_rows, on='user_id', suffixes=('_focused', '_general'))

        child_focused = focused_rows[score_col]
        print(f"Mean child_focused: {child_focused.mean():.3f}")
        if paired.empty:
            print("Paired t-test skipped: no users with both child_focused and child scores")
        else:
            ttest_result = ttest_rel(paired[f'{score_col}_focused'], paired[f'{score_col}_general'])
            print(f"Paired t-test result: t-statistic = {ttest_result.statistic:.3f}, p-value = {ttest_result.pvalue:.3f}")
        print()
    print()


Model: Random
-------------------------------------------------------
Testing nDCG for model: Random
Average nDCG for Random:
12-16: 0.000
17-29: 0.000
30-65: 0.000

ANOVA result for Random: F-statistic = 0.816, p-value = 0.442

Paired t-test between child_focused and child
Mean child_focused: 0.000
Paired t-test result: t-statistic = -0.760, p-value = 0.447

-------------------------------------------------------
Testing MRR for model: Random
Average MRR for Random:
12-16: 0.001
17-29: 0.001
30-65: 0.000

ANOVA result for Random: F-statistic = 0.733, p-value = 0.481

Paired t-test between child_focused and child
Mean child_focused: 0.001
Paired t-test result: t-statistic = -1.033, p-value = 0.302

-------------------------------------------------------
Testing MAP for model: Random
Average MAP for Random:
12-16: 0.001
17-29: 0.001
30-65: 0.000

ANOVA result for Random: F-statistic = 0.702, p-value = 0.496

Paired t-test between child_focused and child
Mean child_focused: 0.001
Paired 