In [None]:
import utils.age_processing as ap
import utils.genre_processing as gp
import os
import ast
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
import numpy as np
from scipy.spatial.distance import jensenshannon
matplotlib.use('TkAgg')

In [33]:
from dotenv import load_dotenv
from pathlib import Path
env_path = Path('..') / 'config.env'
load_dotenv(dotenv_path=env_path)
dataset_dir = os.getenv("dataset_directory")

In [39]:
dataset = 'bx' # 'ml', 'mlhd', or 'bx'
weighted = True
age_type = 'finegrained_age'

In [40]:
ages_sort = ap.get_sorted_ages(dataset, age_type)

In [41]:
genres = []

if dataset == 'ml':
    data_dir = dataset_dir + '/processed/movielens-1m'
    with open('../utils/ML_genres.txt', 'r') as f:
        for line in f:
            genres.append(line.strip())
elif dataset == 'mlhd':
    data_dir = dataset_dir + '/processed/mldh_sampled_filtered'
    with open('../utils/MLHD_genres.txt', 'r') as f:
        for line in f:
            genres.append(line.strip())
elif dataset == 'bx':
    data_dir = dataset_dir + '/processed/Book-Crossing'
    with open('../utils/BX_genres.txt', 'r') as f:
        for line in f:
            genres.append(line.strip())



In [42]:
user_profile_stats_path = data_dir + f'/user_profile_stats{"_weighted" if weighted else ""}.tsv'
user_path = data_dir + '/users.tsv'
interactions_path = data_dir + '/interactions.tsv.bz2'

users = pd.read_csv(user_path, sep='\t')
user_stats = pd.read_csv(user_profile_stats_path, sep='\t')
user_stats['normalized_genre_distribution'] = user_stats['normalized_genre_distribution'].apply(ast.literal_eval)

user_stats['age_group'] = user_stats['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_stats['age_group'] = pd.Categorical(user_stats['age_group'], categories=ages_sort, ordered=True)

grouped_user_stats = user_stats.groupby('age_group')

51.0
51.0
51.0
51.0
51.0
51.0
51.0
34.0
34.0
34.0
34.0
34.0
24.0
24.0
24.0
43.0
43.0
43.0
43.0
43.0
43.0
37.0
37.0
37.0
37.0
37.0
37.0
20.0
20.0
42.0
42.0
42.0
42.0
42.0
42.0
20.0
57.0
57.0
57.0
57.0
57.0
57.0
57.0
57.0
49.0
49.0
49.0
49.0
49.0
49.0
49.0
38.0
38.0
38.0
38.0
38.0
38.0
35.0
35.0
35.0
35.0
35.0
35.0
62.0
62.0
62.0
62.0
62.0
62.0
62.0
62.0
20.0
62.0
62.0
62.0
62.0
62.0
62.0
62.0
62.0
34.0
34.0
34.0
34.0
34.0
28.0
28.0
28.0
28.0
27.0
27.0
27.0
27.0
38.0
38.0
38.0
38.0
38.0
38.0
28.0
28.0
28.0
28.0
21.0
21.0
41.0
41.0
41.0
41.0
41.0
41.0
37.0
37.0
37.0
37.0
37.0
37.0
25.0
25.0
25.0
25.0
24.0
24.0
24.0
20.0
21.0
21.0
53.0
53.0
53.0
53.0
53.0
53.0
53.0
27.0
27.0
27.0
27.0
27.0
27.0
27.0
27.0
63.0
63.0
63.0
63.0
63.0
63.0
63.0
63.0
45.0
45.0
45.0
45.0
45.0
45.0
45.0
23.0
23.0
23.0
35.0
35.0
35.0
35.0
35.0
35.0
44.0
44.0
44.0
44.0
44.0
44.0
24.0
24.0
24.0
29.0
29.0
29.0
29.0
40.0
40.0
40.0
40.0
40.0
40.0
48.0
48.0
48.0
48.0
48.0
48.0
48.0
40.0
40.0
40.0
40.0
40.0
40.0
20.0
31.0


  grouped_user_stats = user_stats.groupby('age_group')


In [43]:
user_stats.columns

Index(['user_id', 'age', 'num_interactions', 'num_unique_items',
       'normalized_genre_distribution', 'age_group'],
      dtype='object')

In [None]:
genres = []

if dataset == 'mlhd':
    genres = gp.MLHD_genres
elif dataset == 'ml':
    genres = gp.ML_genres
elif dataset == 'bx':
    genres = gp.BX_genres

age_genre_profiles = pd.DataFrame(columns=('age_group', 'genre_distribution'))
print("Calculating genre distributions for each age group...")

for age, group in grouped_user_stats:
    genre_sum = {}
    num_users = len(group)
    for i,user in group.iterrows():

        for user_genre, value in user['normalized_genre_distribution'].items():
            if user_genre in genre_sum:
                genre_sum[user_genre] += value
            else:
                genre_sum[user_genre] = value
       
    genre_avg = {genre: g_sum for genre, g_sum in genre_sum.items()}
    if age in age_genre_profiles['age_group'].values:
        age_genre_profiles.loc[age_genre_profiles['age_group'] == age, 'genre_distribution'] = [genre_avg]
        print("this should not happen") # test statement
    else:
        age_genre_profiles = pd.concat([age_genre_profiles, pd.DataFrame({'age_group': [age], 'genre_distribution': [genre_avg]})], ignore_index=True)



age_genre_profiles['genre_distribution'] = age_genre_profiles['genre_distribution'].apply(lambda x: gp.genre_dict_to_list(x, dataset))

user_stats['genre_distribution'] = user_stats['normalized_genre_distribution'].apply(lambda x: gp.genre_dict_to_list(x, dataset))

grouped_user_stats = user_stats.groupby('age_group')

age_genre_profiles['age_group'] = pd.Categorical(age_genre_profiles['age_group'], 
                                            categories=ages_sort, 
                                            ordered=True)

    
age_genre_profiles.sort_values('age_group', inplace=True)

Calculating genre distributions for each age group...


  grouped_user_stats = user_stats.groupby('age_group')


In [49]:
genres = [genre.capitalize() for genre in genres]
# Prepare data for stacked bar plot
genre_data = pd.DataFrame(age_genre_profiles['genre_distribution'].tolist(), index=age_genre_profiles['age_group'], columns=genres)


# Plot stacked bar chart
plt.figure(figsize=(12, 8))
sns.set(style='whitegrid')
genre_data.plot(kind='bar', stacked=True, cmap='tab20', ax=plt.gca())
plt.xlabel('Age Group', fontsize=16)
plt.ylabel('Average Genre Distribution', fontsize=16)
handles, labels = plt.gca().get_legend_handles_labels() 
plt.legend(handles[::-1], labels[::-1], title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
plt.xticks(rotation=45, fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout()
plt.show()

In [None]:


age_genre_profiles['entropy'] = age_genre_profiles['genre_distribution'].apply(lambda x: entropy(x))
plt.figure(figsize=(12, 8))
sns.barplot(x='age_group', y='entropy', data=age_genre_profiles, palette='tab10')
plt.xlabel('Age Group', fontsize=16)
plt.ylabel('Entropy', fontsize=16)
plt.xticks(rotation=45, fontsize=16, ha='right')
plt.yticks(fontsize=16)
plt.tight_layout()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='age_group', y='entropy', data=age_genre_profiles, palette='tab10')


In [48]:
print("Calculating Jensen-Shannon Divergence between age groups...")
inter_group_js_divergences = {}
epsilon = 1e-10

for i, row1 in age_genre_profiles.iterrows():
    age1 = row1['age_group']
    genre_distribution1 = np.array(row1['genre_distribution']) + epsilon
    
    for j, row2 in age_genre_profiles.iterrows():
        age2 = row2['age_group']
        genre_distribution2 = np.array(row2['genre_distribution']) + epsilon

        # Jensen-Shannon Divergence
        js_div = jensenshannon(genre_distribution1, genre_distribution2)**2
        print(js_div)
        inter_group_js_divergences[(age1, age2)] = js_div

ages = age_genre_profiles['age_group'].unique()  

heatmap_data = pd.DataFrame(index=ages, columns=ages)
for (age1, age2), js_div in inter_group_js_divergences.items():
    #print(f"Jensen-Shannon Divergence between age {age1} and age {age2}: {js_divergence}")
    heatmap_data.at[age1, age2] = js_div
    heatmap_data.at[age2, age1] = js_div  # Ensure symmetry
    
heatmap_data = heatmap_data.astype(float)
# Plot the heatmap
plt.figure(figsize=(12, 8))
ax = sns.heatmap(heatmap_data, annot=True, cmap="coolwarm_r", linewidths=0.5, cbar=False, annot_kws={"size": 12})
#plt.title("Jensen-Shannon Divergence Between Age Groups")

plt.xlabel("Age Group", fontsize=16)
plt.ylabel("Age Group", fontsize=16)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=16)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='right', fontsize=16)  # Set rotation to 0 for horizontal

plt.tight_layout()
plt.show()

Calculating Jensen-Shannon Divergence between age groups...
0.0
0.03541104424586099
0.040706555648067744
0.07203743280236663
0.09911837759854641
0.11774886244047263
0.12998466555477803
0.15883465016334233
0.1748456694995193
0.18052255343915735
0.19817563773815658
0.2064777919136663
0.21387373975172766
0.2378982778981377
0.2770775367647315
0.03541104424586099
0.0
0.009274235916486599
0.02218019330432416
0.038502455713829216
0.05428331399006744
0.06962115054474846
0.09783749782330593
0.11239718346261127
0.12559489041128655
0.14689389450759036
0.1587699107882761
0.17267930033165363
0.20182556148473477
0.24708789240312268
0.040706555648067744
0.009274235916486599
0.0
0.0074739053237773325
0.01817084372452172
0.032097271990900726
0.043178790498569534
0.06827450622294756
0.0816716149796698
0.0917270344096617
0.10967585690429035
0.119929301799574
0.13059182610862052
0.15503292593967374
0.19646774988724436
0.07203743280236663
0.02218019330432416
0.0074739053237773325
0.0
0.005122990749769038
0

In [31]:
print("Performing ANOVA analysis on genre distributions...")

genres = [genre.replace("'", "").replace("-", "").replace(" ", "") for genre in genres]
genre_df = pd.DataFrame(user_stats['genre_distribution'].apply(np.array).tolist())
genre_df.columns = genres

# Combine with the original DataFrame
genre_df = pd.concat([user_stats['age_group'], genre_df], axis=1)

print(genre_df.head())
from statsmodels.multivariate.manova import MANOVA

# Fit the MANOVA model
manova = MANOVA.from_formula(' + '.join(genres) + ' ~ age_group', data=genre_df)
manova_results = manova.mv_test()

print(manova_results)

from statsmodels.stats.multicomp import pairwise_tukeyhsd

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Melt the DataFrame to long format for ANOVA
melted_df = genre_df.melt(id_vars=['age_group'], value_vars=genre_df.columns, var_name='genre', value_name='value')


import statsmodels.api as sm
from statsmodels.formula.api import ols

# Perform ANOVA for each genre
anova_results = {}
for genre in genre_df.drop('age_group', axis=1).columns:
    model = ols(f'{genre} ~ age_group', data=genre_df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_results[genre] = anova_table

# Print ANOVA results for each genre
for genre, result in anova_results.items():
    if result['PR(>F)'][0] < 0.05:
        print(f'ANOVA results for {genre}:')
        print(result)
        print('\n')

# Tukey's HSD test for each genre
tukey_results = {}
for genre in genres:
    tukey = pairwise_tukeyhsd(endog=melted_df[melted_df['genre'] == genre]['value'],
                            groups=melted_df[melted_df['genre'] == genre]['age_group'],
                            alpha=0.01)
    tukey_results[genre] = tukey

print('sigificant Genres')
# Print Tukey's HSD results for each genre
for genre, result in tukey_results.items():
    if result.reject.any():
        
        print(f'Tukey HSD results for {genre}:')
        print(result)
        print('\n')

print('non-significant Genres')
for genre, result in tukey_results.items():
    if not result.reject.any():
        print(f'Tukey HSD results for {genre}:')
        print(result)
        print('\n')
        

Performing ANOVA analysis on genre distributions...
  age_group  Animation  Childrens   Musical   Romance     Drama    Comedy  \
0  Under 18   0.135256   0.135256  0.111218  0.054487  0.278846  0.132051   
1       56+   0.000000   0.000000  0.000000  0.069380  0.382041  0.098191   
2     25-34   0.016993   0.016993  0.003922  0.029085  0.075817  0.355229   
3     45-49   0.000000   0.011905  0.000000  0.025397  0.132540  0.000000   
4     25-34   0.008418   0.011364  0.005471  0.072811  0.360690  0.188131   

     Action  Adventure   Fantasy     Scifi       War  Thriller     Crime  \
0  0.028846   0.024679  0.019231  0.019231  0.016026  0.032051  0.012821   
1  0.169121   0.043798  0.001938  0.043798  0.040568  0.093669  0.034884   
2  0.153268   0.176144  0.011438  0.033987  0.007843  0.040850  0.000000   
3  0.376190   0.086508  0.023810  0.126190  0.034921  0.067460  0.015873   
4  0.069024   0.015152  0.000000  0.031566  0.012626  0.092172  0.047138   

   Filmnoir   Mystery    Hor

  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:
  if result['PR(>F)'][0] < 0.05:


ANOVA results for Animation:
             sum_sq      df          F        PR(>F)
age_group  0.167818     6.0  48.801752  8.074273e-59
Residual   3.457685  6033.0        NaN           NaN


ANOVA results for Childrens:
             sum_sq      df          F        PR(>F)
age_group  0.430586     6.0  74.400415  6.641330e-90
Residual   5.819239  6033.0        NaN           NaN


ANOVA results for Musical:
             sum_sq      df          F        PR(>F)
age_group  0.057781     6.0  14.325227  2.762866e-16
Residual   4.055678  6033.0        NaN           NaN


ANOVA results for Romance:
              sum_sq      df         F    PR(>F)
age_group   0.075703     6.0  5.015673  0.000039
Residual   15.176239  6033.0       NaN       NaN


ANOVA results for Drama:
              sum_sq      df          F        PR(>F)
age_group   3.534204     6.0  42.375999  6.303694e-51
Residual   83.859792  6033.0        NaN           NaN


ANOVA results for Comedy:
              sum_sq      df          F  

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


sigificant Genres
Tukey HSD results for Animation:
 Multiple Comparison of Means - Tukey HSD, FWER=0.01  
group1  group2  meandiff p-adj   lower   upper  reject
------------------------------------------------------
 18-24    25-34   -0.005    0.0 -0.0081 -0.0019   True
 18-24    35-44  -0.0041 0.0008 -0.0076 -0.0007   True
 18-24    45-49  -0.0077    0.0  -0.012 -0.0034   True
 18-24    50-55  -0.0106    0.0 -0.0151 -0.0062   True
 18-24      56+  -0.0107    0.0 -0.0156 -0.0058   True
 18-24 Under 18   0.0174    0.0  0.0113  0.0235   True
 25-34    35-44   0.0009 0.9508 -0.0021  0.0039  False
 25-34    45-49  -0.0027 0.2178 -0.0067  0.0013  False
 25-34    50-55  -0.0056    0.0 -0.0098 -0.0015   True
 25-34      56+  -0.0057 0.0004 -0.0103 -0.0011   True
 25-34 Under 18   0.0224    0.0  0.0166  0.0282   True
 35-44    45-49  -0.0036 0.0569 -0.0078  0.0007  False
 35-44    50-55  -0.0065    0.0 -0.0109 -0.0021   True
 35-44      56+  -0.0066 0.0001 -0.0115 -0.0017   True
 35-44 Under 1