In [2]:
import utils.age_processing as ap
import os
import ast
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Select the TkAgg (Tk GUI) backend for all figures drawn in this notebook.
# NOTE(review): this runs after pyplot has already been imported, and a GUI backend
# presumably needs a display - confirm it is intended for notebook/headless runs.
matplotlib.use('TkAgg')

In [3]:
from dotenv import load_dotenv
from pathlib import Path

# Load the project configuration from ../config.env (relative to the working directory).
env_path = Path('..') / 'config.env'
load_dotenv(dotenv_path=env_path)

# Root directory of the datasets; the data paths in later cells are built from it.
dataset_dir = os.getenv("dataset_directory")
if dataset_dir is None:
    # os.getenv returns None for a missing variable; without this check the problem
    # only surfaces later as a TypeError when the value is concatenated with a string.
    raise RuntimeError(
        f"Environment variable 'dataset_directory' is not set; "
        f"define it in {env_path} or in the shell environment."
    )

In [85]:
# Dataset to analyse; later cells use it to pick the data directory, genre file and item file.
dataset = 'bx' # 'ml', 'mlhd', or 'bx'
# When True, the '_weighted' variant of the user profile statistics file is loaded.
weighted = True
# Age-grouping scheme; forwarded to ap.get_sorted_ages and ap.age_group.
age_type = 'all_ages'

In [86]:
# Age-group labels for this dataset / age_type as returned by ap.get_sorted_ages;
# later cells use them as the ordered categories of 'age_group' and to order per-age summaries.
ages_sort = ap.get_sorted_ages(dataset, age_type)

In [87]:
# Per-dataset locations: (sub-directory below dataset_dir, file listing one genre per line).
dataset_sources = {
    'ml': ('/processed/movielens-1m', '../utils/ML_genres.txt'),
    'mlhd': ('/processed/mlhd_sampled_filtered', '../utils/MLHD_genres.txt'),
    'bx': ('/processed/Book-Crossing', '../utils/BX_genres.txt'),
}

if dataset not in dataset_sources:
    # Fail here with a clear message instead of leaving data_dir undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_sources)}"
    )

data_subdir, genres_file = dataset_sources[dataset]

# Directory holding the processed files of the selected dataset.
data_dir = dataset_dir + data_subdir

# Genre names of the selected dataset, one per line of the genre file (whitespace stripped).
with open(genres_file, 'r') as f:
    genres = [line.strip() for line in f]



In [88]:
# Input files of the selected dataset.
user_profile_stats_path = data_dir + f'/user_profile_stats{"_weighted" if weighted else ""}.tsv'
user_path = data_dir + '/users.tsv'
interactions_path = data_dir + '/interactions.tsv.bz2'
if dataset == 'ml':
    items_path = data_dir + '/movies.tsv'
elif dataset == 'mlhd':
    items_path = data_dir + '/tracks.tsv'
    artists_path = data_dir + '/artists.tsv'
elif dataset == 'bx':
    items_path = data_dir + '/books.tsv'
else:
    # Without this branch items_path would stay undefined and fail later with a NameError.
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'ml', 'mlhd' or 'bx'")

users = pd.read_csv(user_path, sep='\t')
user_stats = pd.read_csv(user_profile_stats_path, sep='\t')

# The genre distribution is stored in the TSV as the text of a Python literal; parse it back.
user_stats['normalized_genre_distribution'] = user_stats['normalized_genre_distribution'].apply(ast.literal_eval)

# Map every age to its age group and make the column an ordered categorical
# so that groupings and plots follow the order given by ages_sort.
user_stats['age_group'] = user_stats['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_stats['age_group'] = pd.Categorical(user_stats['age_group'], categories=ages_sort, ordered=True)

# observed=False is spelled out: grouping by a categorical without it relies on a
# deprecated default and emits a FutureWarning; the explicit value keeps the old behaviour.
grouped_user_stats = user_stats.groupby('age_group', observed=False)

  grouped_user_stats = user_stats.groupby('age_group')


In [89]:
# Report, for every age group, its number of rows and the mean interaction count.
for age_label, members in grouped_user_stats:
    print(f"Age group: {age_label}")
    user_count = len(members)
    print(f"Number of users: {user_count}")
    mean_interactions = members['num_interactions'].mean()
    print(f"Average number of interactions: {mean_interactions}")

Age group: 12
Number of users: 66
Average number of interactions: 7.909090909090909
Age group: 13
Number of users: 229
Average number of interactions: 3.7860262008733625
Age group: 14
Number of users: 449
Average number of interactions: 5.276169265033408
Age group: 15
Number of users: 549
Average number of interactions: 3.9799635701275045
Age group: 16
Number of users: 573
Average number of interactions: 3.6317626527050613
Age group: 17
Number of users: 666
Average number of interactions: 5.087087087087087
Age group: 18
Number of users: 719
Average number of interactions: 7.411682892906815
Age group: 19
Number of users: 663
Average number of interactions: 4.549019607843137
Age group: 20
Number of users: 685
Average number of interactions: 5.395620437956205
Age group: 21
Number of users: 785
Average number of interactions: 7.23312101910828
Age group: 22
Number of users: 825
Average number of interactions: 7.774545454545454
Age group: 23
Number of users: 1004
Average number of interactio

## Profile Size Distribution

In [90]:
# Total number of interactions per age group, re-keyed so every group in ages_sort
# appears (with 0 when absent) and in that order.
# observed=False is explicit to avoid the FutureWarning about the changing default
# when grouping by a categorical column.
interactions_per_age = user_stats.groupby('age_group', observed=False)['num_interactions'].sum().to_dict()
interactions_per_age = {age: interactions_per_age.get(age, 0) for age in ages_sort}

# Print percentage of ratings per age group
total_ratings = sum(interactions_per_age.values())
for age, count in interactions_per_age.items():
    percentage = (count / total_ratings) * 100 if total_ratings > 0 else 0
    print(f"Age group: {age}, Number of ratings: {count}, Percentage: {percentage:.2f}%")

# Line plot: total ratings per age group.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=interactions_per_age, marker='o', color='b')
# Thousands separators on the y axis. A formatter is used instead of set_yticklabels
# so the labels stay attached to the tick locator (no fixed-label warning, and they
# remain correct if the ticks are recomputed); the label text is unchanged.
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,}'))
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')  # Centre the tick labels horizontally (no rotation is applied)
plt.show()

# Box plot: distribution of ratings per user within each age group.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=user_stats, x='age_group', y='num_interactions')
ax.set(ylim=(0,250))  # Clip the y axis; values above 250 are not shown
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings per User')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')
plt.show()

  interactions_per_age = user_stats.groupby('age_group')['num_interactions'].sum().to_dict()
  ax.set_yticklabels(ylabels)


Age group: 12, Number of ratings: 522, Percentage: 0.13%
Age group: 13, Number of ratings: 867, Percentage: 0.22%
Age group: 14, Number of ratings: 2369, Percentage: 0.60%
Age group: 15, Number of ratings: 2185, Percentage: 0.55%
Age group: 16, Number of ratings: 2081, Percentage: 0.52%
Age group: 17, Number of ratings: 3388, Percentage: 0.85%
Age group: 18, Number of ratings: 5329, Percentage: 1.34%
Age group: 19, Number of ratings: 3016, Percentage: 0.76%
Age group: 20, Number of ratings: 3696, Percentage: 0.93%
Age group: 21, Number of ratings: 5678, Percentage: 1.43%
Age group: 22, Number of ratings: 6414, Percentage: 1.62%
Age group: 23, Number of ratings: 9842, Percentage: 2.48%
Age group: 24, Number of ratings: 10185, Percentage: 2.57%
Age group: 25, Number of ratings: 11440, Percentage: 2.89%
Age group: 26, Number of ratings: 11771, Percentage: 2.97%
Age group: 27, Number of ratings: 11225, Percentage: 2.83%
Age group: 28, Number of ratings: 13483, Percentage: 3.40%
Age group: 

In [95]:
# Profile statistics (MLHD / BX only) and the profile-per-age bar chart, saved as PDF.
if dataset in ['mlhd', 'bx']:
    # For MLHD and BX: every row of user_stats is counted as one profile,
    # and the rows sharing a user_id are that user's profiles.
    stats_grouped = user_stats.groupby('user_id')
    num_users = len(stats_grouped)
    print(f"Number of users: {num_users}")
    print(f"Number of user profiles: {len(user_stats)}")

    # Number of profiles (rows) per user.
    num_profiles = {user_id: len(group) for user_id, group in stats_grouped}
    avg_num_profiles = sum(num_profiles.values()) / len(num_profiles)
    min_num_profiles = min(num_profiles.values())
    max_num_profiles = max(num_profiles.values())
    print(f"Average number of profiles: {avg_num_profiles}")
    print(f"Minimum number of profiles: {min_num_profiles}")
    print(f"Maximum number of profiles: {max_num_profiles}")
    print(f"Number of users with more than 1 profile: {sum(1 for num in num_profiles.values() if num > 1)}")
    print(f"Number of users with more than 5 profiles: {sum(1 for num in num_profiles.values() if num > 5)}")
    print(f"Number of users with more than 10 profiles: {sum(1 for num in num_profiles.values() if num > 10)}")

    # Interaction counts of each user's profiles.
    profile_sizes = {user_id: group['num_interactions'].tolist() for user_id, group in stats_grouped}
    # Mean over users of the user's mean profile size.
    avg_profile_size = sum(sum(sizes) / len(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
    # Mean over users of the user's total interactions (name kept for compatibility;
    # the value is dataset-agnostic, not specific to listening events).
    avg_listen_events_per_user = sum(sum(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
    print(f"Average profile size: {avg_profile_size}")
    print(f"Average interactions per user: {avg_listen_events_per_user}")
    print(f"Minimum profile size: {min(min(sizes) for sizes in profile_sizes.values())}")
    print(f"Maximum profile size: {max(max(sizes) for sizes in profile_sizes.values())}")
    # Wording aligned with the lines below ("interaction" rather than "listen event"),
    # since this branch also runs for the book dataset.
    print(f"Number of profiles with more than 1 interaction: {sum(sum(1 for size in sizes if size > 1) for sizes in profile_sizes.values())}")
    print(f"Number of profiles with more than 5 interactions: {sum(sum(1 for size in sizes if size > 5) for sizes in profile_sizes.values())}")
    print(f"Number of profiles with more than 10 interactions: {sum(sum(1 for size in sizes if size > 10) for sizes in profile_sizes.values())}")

    # Number of profiles per (integer) age.
    ages = user_stats['age'].astype(int).value_counts().to_dict()

    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(ages.keys()), y=list(ages.values()), color='b')

    # Separator lines and captions for the three user groups. The x positions and the
    # caption heights are hard-coded per dataset and need to be adapted when the data changes.
    if dataset == 'bx':
        plt.axvline(x=5.5, color='red', linestyle='--', linewidth=1)
        plt.axvline(x=37.5, color='green', linestyle='--', linewidth=1)
        plt.text(2.5, 1125, 'Children', color='red', fontsize=13, ha='center')
        plt.text(20, 1125, 'Mainstream', color='blue', fontsize=13, ha='center')
        plt.text(45, 1125, 'NMA', color='green', fontsize=13, ha='center')

    elif dataset == 'mlhd':
        plt.axvline(x=4.5, color='red', linestyle='--', linewidth=1)
        plt.axvline(x=17.5, color='green', linestyle='--', linewidth=1)
        plt.text(2, 19000, 'Children', color='red', fontsize=13, ha='center')
        plt.text(13, 19000, 'Mainstream', color='blue', fontsize=13, ha='center')
        plt.text(25, 19000, 'NMA', color='green', fontsize=13, ha='center')

    # Create a list of x-tick positions: start at 3 and add every 5th position.
    # NOTE(review): 3 appears twice in this list (the leading [3] plus range(3, ...));
    # left unchanged because the rendered tick labels may depend on it - confirm.
    ages_to_display = [3] + list(range(3, int(max(ages.keys()))-10, 5))

    # Ensure that the x-ticks only show the desired ages
    plt.xticks(ticks=ages_to_display, fontsize=24)

elif dataset == 'ml':
    # For ML: profiles are counted per age group, in the order given by ages_sort.
    ages = user_stats['age_group'].value_counts().to_dict()
    ages = {age: ages.get(age, 0) for age in ages_sort}
    # Shorten the first label; re-inserting it first keeps it at the left of the chart.
    ages = {'<18': ages.pop('Under 18'), **ages}
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(ages.keys()), y=list(ages.values()), color='b')
    plt.axvline(x=0.5, color='red', linestyle='--', linewidth=1)
    plt.axvline(x=4.5, color='green', linestyle='--', linewidth=1)
    plt.text(0, 1500, 'Children', color='red', fontsize=13, ha='center')
    plt.text(3, 1500, 'Mainstream', color='blue', fontsize=13, ha='center')
    plt.text(5.5, 1500, 'NMA', color='green', fontsize=13, ha='center')

    plt.xticks(fontsize=24)

# Axis labelling shared by both variants of the figure (previously duplicated in each branch).
plt.xlabel('Age', fontsize=24)
plt.ylabel('Number of Profiles', fontsize=24)
plt.yticks(fontsize=24)
plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs('Results', exist_ok=True)
plt.savefig(f'Results/Detailed_Profile_Age_Distribution_{dataset}.pdf', format='pdf')
plt.show()

Number of users: 35029
Number of user profiles: 35029
Average number of profiles: 1.0
Minimum number of profiles: 1
Maximum number of profiles: 1
Number of users with more than 1 profile: 0
Number of users with more than 5 profiles: 0
Number of users with more than 10 profiles: 0
Average profile size: 11.318050757943418
Average interactions per user: 11.318050757943418
Minimum profile size: 1
Maximum profile size: 3674
Number of profiles with more than 1 listen event: 17298
Number of profiles with more than 5 interactions: 7572
Number of profiles with more than 10 interactions: 4729


In [43]:
# Headline sizes of the loaded dataset: users, interactions, items (plus artists for MLHD).
n_users = len(users)
print(f"Number of users: {n_users}")

n_interactions = user_stats['num_interactions'].sum()
print(f"Number of interactions: {n_interactions}")

items = pd.read_csv(items_path, sep='\t')
n_items = len(items)
print(f"Number of items: {n_items}")

if dataset == 'mlhd':
    # Only the MLHD branch defines artists_path.
    artists = pd.read_csv(artists_path, sep='\t')
    n_artists = len(artists)
    print(f"Number of artists: {n_artists}")

Number of users: 6040
Number of interactions: 1000209
Number of items: 3706


In [12]:
# Display the per-profile statistics table for visual inspection (the rendered output is truncated).
user_stats

Unnamed: 0,user_id,age,num_interactions,num_unique_items,normalized_genre_distribution,age_group
0,2,18.0,1,1,{'history_biography': 1.0},18-49
1,42,17.0,1,1,{'mystery_thriller_crime': 1.0},12-17
2,44,51.0,2,2,"{'mystery_thriller_crime': 0.75, 'romance': 0.25}",50-65
3,51,34.0,1,1,{'mystery_thriller_crime': 1.0},18-49
4,56,24.0,3,3,"{'history_biography': 0.3333333333333333, 'you...",18-49
...,...,...,...,...,...,...
35024,278843,28.0,29,29,"{'fantasy_paranormal': 0.06896551724137931, 'm...",18-49
35025,278844,28.0,1,1,{'comics_graphic': 1.0},18-49
35026,278846,23.0,1,1,{'history_biography': 1.0},18-49
35027,278849,23.0,2,2,"{'children': 0.5, 'fantasy_paranormal': 0.5}",18-49
