# Popularity Analysis
Run this notebook to compare the popularity of the items in user profiles with the popularity of the items in the recommendations. Adjust the variables below to analyse a different dataset or a different set of recommendation algorithms (RAs).

In [None]:
# Recommender models to analyse. Each name is used to build the
# '<model>_avg_normalized_popularity' column names read from the results table.
models = ['Random', 'MostPop', 'RP3beta', 'iALS']
# Dataset identifier: selects the '../Results/<dataset>/' folder and is passed
# to the project's age/genre helper functions.
dataset = 'bx'

In [17]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils.genre_processing as gp
import utils.file_handler as fh
import utils.age_processing as ap
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel
from scipy.spatial.distance import jensenshannon

In [None]:

# Age-group labels for this dataset as returned by the project helper
# (presumably ordered youngest to oldest — TODO confirm in utils.age_processing).
ages_sort = ap.get_sorted_ages(dataset, 'defined_ages')
# The first label is the one treated as the child group in the cells below.
child_group = ages_sort[0]

In [19]:
# Location of the per-user results table (TSV) for the selected dataset.
results_path = f'../Results/{dataset}/user_and_recommendation_genre_distributions.tsv'
# Genre list from the project helper; not referenced in this part of the notebook.
genres = gp.get_genres(dataset)

In [20]:
# Load the tab-separated results table. The cells below read its 'user_id',
# 'age', 'age_group' and '*_avg_normalized_popularity' columns.
results = pd.read_csv(results_path, sep='\t')

In [21]:
# Preview the Random model's popularity columns; the 'child_' variant can be
# missing (NaN) for some users.
results[['user_id', 'Random_avg_normalized_popularity', 'child_Random_avg_normalized_popularity']].head()

Unnamed: 0,user_id,Random_avg_normalized_popularity,child_Random_avg_normalized_popularity
0,99,0.028291,
1,114,0.026109,
2,254,0.036073,
3,387,0.040582,0.092145
4,388,0.024218,


In [22]:
def popularity_lift(df, models, normalized=True, child=False):
    """Compute the per-user popularity lift of each model's recommendations.

    The lift is the relative change of the average popularity of the
    recommended items with respect to the average popularity of the items in
    the user's training profile:

        (recommendation_popularity - profile_popularity) / profile_popularity

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', the profile column
        'train_avg_[normalized_]popularity' and, for every model, the column
        '[child_]<model>_avg_[normalized_]popularity'.
    models : iterable of str
        Model names used to build the recommendation column names.
    normalized : bool, default True
        Use the '..._normalized_popularity' columns instead of the raw ones.
    child : bool, default False
        Use the 'child_'-prefixed recommendation columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the 'user_id' column plus one 'pop_lift_<model>' column per
        model, sharing the index of ``df``. Rows whose profile popularity is
        zero get NaN, because the relative lift is undefined there.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    norm = 'normalized_' if normalized else ''
    prefix = 'child_' if child else ''

    pop_df = df[['user_id']].copy()

    baseline = df[f'train_avg_{norm}popularity']
    # A zero baseline would produce +/-inf (or NaN for 0/0). Infinite values
    # survive dropna() and distort group means and significance tests, so
    # they are masked to NaN up front.
    baseline = baseline.where(baseline != 0)

    for model in models:
        rec_popularity = df[f'{prefix}{model}_avg_{norm}popularity']
        pop_df[f'pop_lift_{model}'] = (rec_popularity - baseline) / baseline
    return pop_df
    

In [23]:
# Build one long table of per-user popularity lift, labelled by user group.
# The per-group frames are collected in a list and concatenated once:
# growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration, and seeding it with an empty DataFrame makes recent
# pandas versions warn about empty entries in concat.
pop_lift_frames = []

for age_group, group in results.groupby('age_group'):
    # Lift of the regular recommendations relative to the user's profile.
    pop_df = popularity_lift(group, models, normalized=True, child=False)
    pop_df['user_group'] = age_group
    pop_df['age'] = group['age']
    pop_lift_frames.append(pop_df)
    if age_group == child_group:
        # Same users, but computed from the 'child_'-prefixed recommendation
        # columns and stored under a separate 'child_focused' label.
        child_pop_df = popularity_lift(group, models, normalized=True, child=True)
        child_pop_df['user_group'] = 'child_focused'
        child_pop_df['age'] = group['age']
        pop_lift_frames.append(child_pop_df)

# pd.concat raises on an empty list, so keep an empty frame when there are no groups.
pop_lift_results_df = (
    pd.concat(pop_lift_frames, ignore_index=True, axis=0)
    if pop_lift_frames else pd.DataFrame()
)


In [24]:
# Significance tests on popularity lift, run separately for every model:
#   1. one-way ANOVA across the age groups, followed by a Tukey HSD post-hoc
#      test when the ANOVA is significant;
#   2. paired t-test between the child group's regular lift and its
#      'child_focused' lift.
ANOVA_ALPHA = 0.05  # significance threshold for the omnibus ANOVA
TUKEY_ALPHA = 0.01  # family-wise error rate passed to the Tukey HSD test
# NOTE(review): the post-hoc test is stricter than the ANOVA gate — confirm this is intended.

for model in models:
    lift_col = f'pop_lift_{model}'
    print('==========================================================================')
    print(f"Testing Popularity Lift for model: {model}")

    # Popularity lift per age group for this model. The 'child_focused' rows
    # duplicate the child group's users, so they are excluded from the
    # between-group comparison.
    pop_lift_df_clean = pop_lift_results_df[['user_group', lift_col]].dropna()
    pop_lift_df_clean = pop_lift_df_clean[pop_lift_df_clean['user_group'] != 'child_focused']
    groups = [group[lift_col].values for name, group in pop_lift_df_clean.groupby('user_group')]

    # Print average scores in the sorted age-group order
    print(f"Average Popularity Lift for {model}:")
    pop_lift_grouped = pop_lift_df_clean.groupby('user_group')[lift_col].mean()
    for age in ages_sort:
        if age in pop_lift_grouped.index:
            print(f"{age}: {pop_lift_grouped[age]:.3f}")
        else:
            # Age group without any valid lift value for this model.
            print(f"{age}: {np.nan:.3f}")
    print()
    print('-----------------------------------------------')
    # One-way ANOVA test
    anova_result = f_oneway(*groups)
    print(f"ANOVA result for {model}: F-statistic = {anova_result.statistic:.4f}, p-value = {anova_result.pvalue:.4f}")
    # If ANOVA is significant, perform Tukey HSD for post-hoc analysis
    if anova_result.pvalue < ANOVA_ALPHA:
        print(f"ANOVA is significant for {model}, performing Tukey HSD test...")

        tukey_result = pairwise_tukeyhsd(pd.to_numeric(pop_lift_df_clean[lift_col]), pop_lift_df_clean['user_group'], alpha=TUKEY_ALPHA)
        print(tukey_result)
    print()
    print('-----------------------------------------------')
    # Paired t-test between user_group = child_focused and user_group = child
    print('Paired t-test between child_focused and child')
    child_focused = pop_lift_results_df[pop_lift_results_df['user_group'] == 'child_focused'][lift_col].astype(float)
    child = pop_lift_results_df[pop_lift_results_df['user_group'] == child_group][lift_col].astype(float)
    # Both selections come from the same users in the same order, so they are
    # paired by position. ttest_rel propagates NaN by default, which would
    # turn the whole result into NaN if a single user lacks one of the two
    # values; incomplete pairs are therefore dropped before testing.
    paired = pd.DataFrame({'child_focused': child_focused.to_numpy(), 'child': child.to_numpy()})
    complete_pairs = paired.dropna()
    if len(complete_pairs) < len(paired):
        print(f"Dropped {len(paired) - len(complete_pairs)} incomplete pairs (missing popularity lift)")
    ttest_result = ttest_rel(complete_pairs['child_focused'], complete_pairs['child'])
    print(f"Mean Pop Lift for child_focused: {child_focused.mean():.3f}")
    print(f"Paired t-test result: t-statistic = {ttest_result.statistic:.3f}, p-value = {ttest_result.pvalue:.3f}")

    print()
    print()

Testing Popularity Lift for model: Random
Average Popularity Lift for Random:
12-17: -0.443
18-49: -0.547
50-65: -0.564

-----------------------------------------------
ANOVA result for Random: F-statistic = 10.7881, p-value = 0.0000
ANOVA is significant for Random, performing Tukey HSD test...
Multiple Comparison of Means - Tukey HSD, FWER=0.01 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
 12-17  18-49  -0.1038    0.0 -0.1737 -0.0339   True
 12-17  50-65  -0.1209    0.0 -0.1975 -0.0443   True
 18-49  50-65  -0.0172 0.3811 -0.0549  0.0206  False
----------------------------------------------------

-----------------------------------------------
Paired t-test between child_focused and child
Mean Pop Lift for child_focused: 0.165
Paired t-test result: t-statistic = 17.975, p-value = 0.000


Testing Popularity Lift for model: MostPop
Average Popularity Lift for MostPop:
12-17: 7.961
18-49: 6.220
50-65: 5.864

-----------------