# Popularity Analysis
Run this notebook to compare the popularity of the items in user profiles with the popularity of the items in their recommendations. Change the variables below to analyse a different dataset or a different set of recommendation algorithms (RAs).

In [1]:
# Recommendation models to analyse; each name is used to build the
# '<model>_avg_normalized_popularity' column names looked up in the results table.
models = [
    'Random',
    'MostPop',
    'RP3beta',
    'iALS',
]

# Dataset key; it selects the results directory and the age-group definitions.
dataset = 'ml'

In [2]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils.genre_processing as gp
import utils.file_handler as fh
import utils.age_processing as ap
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel
from scipy.spatial.distance import jensenshannon

In [3]:

# Age-group labels for this dataset, as returned by the project's age_processing helper.
# NOTE(review): presumably ordered from youngest to oldest — confirm in utils.age_processing.
ages_sort = ap.get_sorted_ages(dataset, 'defined_ages')
# The first label is treated as the child group throughout the rest of the notebook.
child_group = ages_sort[0]

In [4]:
# Tab-separated results file for the selected dataset; it is loaded into `results` below
# and supplies the per-user popularity columns used in this notebook.
results_path = f'../Results/{dataset}/user_and_recommendation_genre_distributions.tsv'
# Genres for the dataset from the project's genre_processing helper.
# NOTE(review): not referenced by the popularity cells visible here — confirm it is still needed.
genres = gp.get_genres(dataset)

In [5]:
# Load the per-user results table (tab-separated file).
results = pd.read_table(results_path)

In [6]:
# Preview the Random model's popularity columns for the first few users.
results.loc[
    :,
    ['user_id', 'Random_avg_normalized_popularity', 'child_Random_avg_normalized_popularity'],
].head()

Unnamed: 0,user_id,Random_avg_normalized_popularity,child_Random_avg_normalized_popularity
0,1,0.051296,0.109302
1,2,0.08,
2,3,0.05856,
3,4,0.056567,
4,5,0.062193,


In [7]:
def popularity_lift(df, models, normalized=True, child=False):
    """Compute the per-user popularity lift of each model's recommendations.

    For every model the lift is

        (recommendation popularity - training-profile popularity) / training-profile popularity

    so a positive value means the recommendations are more popular than the
    items in the user's training profile.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', the baseline column
        'train_avg_[normalized_]popularity' and, for every model, the column
        '[child_]<model>_avg_[normalized_]popularity'.
    models : iterable of str
        Model names used to build the recommendation column names.
    normalized : bool, default True
        Use the '*_normalized_popularity' columns instead of the raw ones.
    child : bool, default False
        Use the 'child_'-prefixed recommendation columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` holding 'user_id' and one
        'pop_lift_<model>' column per model. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # The name fragments do not depend on the model, so build them once.
    norm_part = "normalized_" if normalized else ""
    child_part = "child_" if child else ""

    train_pop = df[f'train_avg_{norm_part}popularity']
    # Lift is undefined for a zero baseline; mask it to NaN so the division
    # cannot produce +/-inf, which dropna() would not remove downstream.
    train_pop = train_pop.where(train_pop != 0)

    pop_df = df[['user_id']].copy()
    for model in models:
        rec_pop = df[f'{child_part}{model}_avg_{norm_part}popularity']
        pop_df[f'pop_lift_{model}'] = (rec_pop - train_pop) / train_pop
    return pop_df
    

In [8]:
# Build one long table of per-user popularity lift, labelled by age group.
# The child group is added a second time under the label 'child_focused',
# using the lift computed from the 'child_'-prefixed recommendation columns.
lift_frames = []

for age_group, group in results.groupby('age_group'):
    pop_df = popularity_lift(group, models, normalized=True, child=False)
    pop_df['user_group'] = age_group
    pop_df['age'] = group['age']
    lift_frames.append(pop_df)
    if age_group == child_group:
        child_pop_df = popularity_lift(group, models, normalized=True, child=True)
        child_pop_df['user_group'] = 'child_focused'
        child_pop_df['age'] = group['age']
        lift_frames.append(child_pop_df)

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# is quadratic, and starting from an empty DataFrame triggers a pandas
# FutureWarning and can degrade the column dtypes.
pop_lift_results_df = (
    pd.concat(lift_frames, ignore_index=True, axis=0) if lift_frames else pd.DataFrame()
)


In [9]:
# Significance tests on popularity lift, per model:
#   1. one-way ANOVA across the age groups, followed by Tukey HSD when significant;
#   2. paired t-test between the child group's regular and child-focused recommendations.
for model in models:
    lift_col = f'pop_lift_{model}'
    print('==========================================================================')
    print(f"Testing Popularity Lift for model: {model}")
    
    # Popularity lift per age group for this model; the duplicated
    # 'child_focused' rows are excluded so every user is counted once.
    pop_lift_df_clean = pop_lift_results_df[['user_group', lift_col]].dropna()
    pop_lift_df_clean = pop_lift_df_clean[pop_lift_df_clean['user_group'] != 'child_focused']
    groups = [group[lift_col].values for name, group in pop_lift_df_clean.groupby('user_group')]
    
    # Print average scores in the order given by ages_sort
    print(f"Average Popularity Lift for {model}:")
    pop_lift_grouped = pop_lift_df_clean.groupby('user_group')[lift_col].mean()
    for age in ages_sort:
        if age in pop_lift_grouped.index:
            print(f"{age}: {pop_lift_grouped[age]:.3f}")
        else:
            print(f"{age}: {np.nan:.3f}")
    print()
    print('-----------------------------------------------')
    # One-way ANOVA test
    anova_result = f_oneway(*groups)
    print(f"ANOVA result for {model}: F-statistic = {anova_result.statistic:.4f}, p-value = {anova_result.pvalue:.4f}")
    # If ANOVA is significant, perform Tukey HSD for post-hoc analysis
    if anova_result.pvalue < 0.05:
        print(f"ANOVA is significant for {model}, performing Tukey HSD test...")
        
        # NOTE(review): the ANOVA gate uses 0.05 while Tukey HSD uses alpha=0.01 — confirm this is intended.
        tukey_result = pairwise_tukeyhsd(pd.to_numeric(pop_lift_df_clean[lift_col]), pop_lift_df_clean['user_group'], alpha=0.01)
        print(tukey_result)
    print()
    print('-----------------------------------------------')
    # paired t-test between user_group = child_focused and user_group = child
    print('Paired t-test between child_focused and child')
    child_focused = pop_lift_results_df[pop_lift_results_df['user_group'] == 'child_focused'][lift_col].astype(float)
    child = pop_lift_results_df[pop_lift_results_df['user_group'] == child_group][lift_col].astype(float)
    # Both selections come from the same users in the same row order, so they
    # are paired by position. Drop pairs with a missing value on either side;
    # otherwise a single NaN makes ttest_rel return NaN.
    paired_lift = pd.DataFrame({
        'child_focused': child_focused.to_numpy(),
        'child': child.to_numpy(),
    }).dropna()
    ttest_result = ttest_rel(paired_lift['child_focused'], paired_lift['child'])
    print(f"Mean Pop Lift for child_focused: {child_focused.mean():.3f}")
    print(f"Paired t-test result: t-statistic = {ttest_result.statistic:.3f}, p-value = {ttest_result.pvalue:.3f}")
        
    print()
    print()

Testing Popularity Lift for model: Random
Average Popularity Lift for Random:
Under 18: -0.685
18-49: -0.738
50+: -0.698

-----------------------------------------------
ANOVA result for Random: F-statistic = 56.9377, p-value = 0.0000
ANOVA is significant for Random, performing Tukey HSD test...
 Multiple Comparison of Means - Tukey HSD, FWER=0.01 
group1  group2  meandiff p-adj   lower  upper  reject
-----------------------------------------------------
 18-49      50+   0.0402    0.0  0.0272 0.0532   True
 18-49 Under 18   0.0536    0.0  0.0292  0.078   True
   50+ Under 18   0.0134 0.3105 -0.0133   0.04  False
-----------------------------------------------------

-----------------------------------------------
Paired t-test between child_focused and child
Mean Pop Lift for child_focused: -0.557
Paired t-test result: t-statistic = 10.977, p-value = 0.000


Testing Popularity Lift for model: MostPop
Average Popularity Lift for MostPop:
Under 18: 1.768
18-49: 1.263
50+: 1.594

-------