In [1]:
import utils.age_processing as ap
import os
import ast
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Select the Tk-based interactive backend for every figure in this notebook.
# NOTE(review): with TkAgg, plt.show() presumably opens external windows instead of
# rendering inline and needs a display with Tk available — confirm this is intended.
matplotlib.use('TkAgg')

In [2]:
from dotenv import load_dotenv
from pathlib import Path
# Load machine-specific settings from ../config.env (relative to the notebook's
# working directory) into the process environment, then read the dataset root.
env_path = Path('..') / 'config.env'
load_dotenv(dotenv_path=env_path)
dataset_dir = os.getenv("dataset_directory")
if not dataset_dir:
    # Fail here with an actionable message: otherwise the later
    # `dataset_dir + '/processed/...'` concatenation raises an opaque TypeError
    # (or, for an empty value, silently points at '/processed/...').
    raise RuntimeError(
        f"'dataset_directory' is not set; define it in {env_path} or in the environment."
    )

In [3]:
# Run configuration — edit these values, then re-run the notebook from the top.
# Which dataset to analyse; selects the data directory and genre file below.
dataset = 'bx' # 'ml', 'mlhd', or 'bx'
# When True, the '_weighted' variant of the user profile stats file is loaded.
weighted = True
# Age bucketing scheme forwarded to ap.get_sorted_ages() and ap.age_group().
age_type = 'finegrained_age'

In [4]:
# Ordered age-group labels for this dataset / age scheme. Used below as the category
# order of 'age_group' and as the x-axis order of the plots.
# NOTE(review): presumably sorted youngest to oldest — confirm in utils.age_processing.
ages_sort = ap.get_sorted_ages(dataset, age_type)

In [5]:
# Per-dataset sources: (processed-data subdirectory appended to dataset_dir,
# genre vocabulary file with one genre per line).
# NOTE(review): the 'mlhd' directory is spelled 'mldh_sampled_filtered' — confirm this
# matches the folder name on disk.
dataset_sources = {
    'ml': ('/processed/movielens-1m', '../utils/ML_genres.txt'),
    'mlhd': ('/processed/mldh_sampled_filtered', '../utils/MLHD_genres.txt'),
    'bx': ('/processed/Book-Crossing', '../utils/BX_genres.txt'),
}

if dataset not in dataset_sources:
    # Previously an unknown name fell through silently, leaving `data_dir` undefined
    # (or stale from an earlier run of this cell) and `genres` empty.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_sources)}"
    )

processed_subdir, genres_file = dataset_sources[dataset]
data_dir = dataset_dir + processed_subdir

# Read the genre vocabulary, stripping surrounding whitespace/newlines from each line.
with open(genres_file, 'r') as f:
    genres = [line.strip() for line in f]



In [6]:
# Input files inside the dataset's processed directory.
user_profile_stats_path = data_dir + f'/user_profile_stats{"_weighted" if weighted else ""}.tsv'
user_path = data_dir + '/users.tsv'
interactions_path = data_dir + '/interactions.tsv.bz2'

users = pd.read_csv(user_path, sep='\t')
user_stats = pd.read_csv(user_profile_stats_path, sep='\t')
# The genre distribution is stored as the text of a Python literal; parse it back
# into a Python object (literal_eval only accepts literals, never arbitrary code).
user_stats['normalized_genre_distribution'] = user_stats['normalized_genre_distribution'].apply(ast.literal_eval)

# Map each profile's age to its age-group label, then make the column an ordered
# categorical so that grouping and plotting follow the order given by `ages_sort`.
# Labels that are not in `ages_sort` become NaN.
user_stats['age_group'] = user_stats['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_stats['age_group'] = pd.Categorical(user_stats['age_group'], categories=ages_sort, ordered=True)

# `observed` is passed explicitly: grouping on a categorical without it emits a pandas
# FutureWarning because the default is changing. False keeps the current behaviour.
grouped_user_stats = user_stats.groupby('age_group', observed=False)

  grouped_user_stats = user_stats.groupby('age_group')


In [7]:
# Report, for every age group, how many user profiles it contains and the mean
# number of interactions per profile.
for age, group in grouped_user_stats:
    mean_interactions = group['num_interactions'].mean()
    print(
        f"Age group: {age}",
        f"Number of users: {len(group)}",
        f"Average number of interactions: {mean_interactions}",
        sep="\n",
    )

Age group: 12
Number of users: 66
Average number of interactions: 7.909090909090909
Age group: 13
Number of users: 229
Average number of interactions: 3.7860262008733625
Age group: 14
Number of users: 449
Average number of interactions: 5.276169265033408
Age group: 15
Number of users: 549
Average number of interactions: 3.9799635701275045
Age group: 16
Number of users: 572
Average number of interactions: 3.6363636363636362
Age group: 17
Number of users: 666
Average number of interactions: 5.087087087087087
Age group: 18
Number of users: 719
Average number of interactions: 7.411682892906815
Age group: 19-20
Number of users: 1348
Average number of interactions: 4.9792284866468846
Age group: 21-22
Number of users: 1610
Average number of interactions: 7.51055900621118
Age group: 23-24
Number of users: 2034
Average number of interactions: 9.846116027531957
Age group: 25-29
Number of users: 5412
Average number of interactions: 11.87971175166297
Age group: 30-34
Number of users: 5328
Average 

## Profile Size Distribution

In [8]:
# Total number of ratings per age group, re-keyed so the groups follow `ages_sort`
# (groups without any rating get 0).
# `observed=False` pins the current behaviour for the categorical key and avoids the
# pandas FutureWarning about the changing default.
interactions_per_age = (
    user_stats.groupby('age_group', observed=False)['num_interactions'].sum().to_dict()
)
interactions_per_age = {age: interactions_per_age.get(age, 0) for age in ages_sort}

# Figure 1: total ratings per age group as a line plot.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=interactions_per_age, marker='o', color='b')
# Thousands separators on the y axis. A formatter is used instead of
# set_yticklabels(), which warns when the tick positions have not been fixed first
# and leaves stale labels if the ticks ever move. The label text is unchanged.
# (matplotlib.ticker is already loaded as a side effect of importing pyplot.)
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,}'))
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')  # Centre each label under its tick (no rotation)
plt.show()



# Figure 2: distribution of ratings per user within each age group.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=user_stats, x='age_group', y='num_interactions')
# Clip the y axis at 250 so the boxes stay readable; points above it are cut off.
ax.set(ylim=(0,250))
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings per User')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')
plt.show()

  interactions_per_age = user_stats.groupby('age_group')['num_interactions'].sum().to_dict()
  ax.set_yticklabels(ylabels)


In [9]:
    # Number of user profiles in each age group, laid out in the canonical
    # `ages_sort` order; groups that never occur are reported as 0.
    group_counts = user_stats['age_group'].value_counts().to_dict()
    ages = {age: group_counts.get(age, 0) for age in ages_sort}

    # Bar chart of profile counts per age group (intentionally untitled).
    plt.figure(figsize=(10, 8))
    sns.barplot(x=list(ages), y=list(ages.values()), color='b')
    plt.xlabel('Age', fontsize=16)
    plt.ylabel('Number of Profiles', fontsize=16)
    # Enlarge the tick labels on both axes to match the axis titles.
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.tight_layout()
    plt.show()

In [10]:
# --- Profiles per user -------------------------------------------------------
# Each row of `user_stats` is one user profile; rows sharing a user_id belong to the
# same user.
stats_grouped = user_stats.groupby('user_id')
num_users = len(stats_grouped)
print(f"Number of users: {num_users}")
print(f"Number of user profiles: {len(user_stats)}")

# user_id -> number of profiles (rows) for that user.
num_profiles = {user_id: len(group) for user_id, group in stats_grouped}
avg_num_profiles = sum(num_profiles.values()) / len(num_profiles)
min_num_profiles = min(num_profiles.values())
max_num_profiles = max(num_profiles.values())
print(f"Average number of profiles: {avg_num_profiles}")
print(f"Minimum number of profiles: {min_num_profiles}")
print(f"Maximum number of profiles: {max_num_profiles}")
print(f"Number of users with more than 1 profile: {sum(1 for num in num_profiles.values() if num > 1)}")
print(f"Number of users with more than 5 profiles: {sum(1 for num in num_profiles.values() if num > 5)}")
print(f"Number of users with more than 10 profiles: {sum(1 for num in num_profiles.values() if num > 10)}")

# --- Profile sizes -----------------------------------------------------------
# user_id -> list with the interaction count of each of the user's profiles.
profile_sizes = {user_id: group['num_interactions'].tolist() for user_id, group in stats_grouped}
# Mean over users of each user's mean profile size.
avg_profile_size = sum(sum(sizes) / len(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
# Mean over users of each user's total interactions across all their profiles.
avg_listen_events_per_user = sum(sum(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
print(f"Average profile size: {avg_profile_size}")
print(f"Average interactions per user: {avg_listen_events_per_user}")
print(f"Minimum profile size: {min(min(sizes) for sizes in profile_sizes.values())}")
print(f"Maximum profile size: {max(max(sizes) for sizes in profile_sizes.values())}")
# NOTE(review): "listen event" below is music-specific wording, while the neighbouring
# messages say "interactions"; the text is left unchanged so printed output stays
# comparable — consider unifying.
print(f"Number of profiles with more than 1 listen event: {sum(sum(1 for size in sizes if size > 1) for sizes in profile_sizes.values())}")
print(f"Number of profiles with more than 5 interactions: {sum(sum(1 for size in sizes if size > 5) for sizes in profile_sizes.values())}")
print(f"Number of profiles with more than 10 interactions: {sum(sum(1 for size in sizes if size > 10) for sizes in profile_sizes.values())}")


# --- Number of profiles per exact age ----------------------------------------
# age -> number of profiles with that age (value_counts ignores missing ages).
ages = user_stats['age'].value_counts().to_dict()
# Bars are drawn in ascending age order; bar i sits at x position i.
sorted_ages = sorted(ages)

plt.figure(figsize=(10, 8))
sns.barplot(
    x=sorted_ages,
    y=[ages[age] for age in sorted_ages],
    order=sorted_ages,
    color='b',
)
#plt.title('Number of Profiles per Age Group')
plt.xlabel('Age', fontsize=16)
plt.ylabel('Number of Profiles', fontsize=16)
# Label the youngest age and every age that is a multiple of 5.
# The bar axis is categorical, so tick positions are bar indices, not ages. The
# previous hard-coded positions ([0, 3] + every 5th index up to max_age - 10) only
# matched a dataset whose ages start at 12 without gaps, and contained a duplicate.
ages_to_display = [
    position for position, age in enumerate(sorted_ages)
    if position == 0 or age % 5 == 0
]
age_labels = [f'{sorted_ages[position]:g}' for position in ages_to_display]

# Pass the labels explicitly so each tick shows the age of the bar it sits under.
plt.xticks(ticks=ages_to_display, labels=age_labels, fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout()
plt.show()

Number of users: 35028
Number of user profiles: 35028
Average number of profiles: 1.0
Minimum number of profiles: 1
Maximum number of profiles: 1
Number of users with more than 1 profile: 0
Number of users with more than 5 profiles: 0
Number of users with more than 10 profiles: 0
Average profile size: 11.318345323741006
Average interactions per user: 11.318345323741006
Minimum profile size: 1
Maximum profile size: 3674
Number of profiles with more than 1 listen event: 17298
Number of profiles with more than 5 interactions: 7572
Number of profiles with more than 10 interactions: 4729


count    36547.000000
mean        35.746628
std         14.886077
min          0.000000
25%         25.000000
50%         34.000000
75%         45.000000
max        244.000000
Name: age, dtype: float64