In [1]:
import utils.age_processing as ap
import os
import ast
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to the "TkAgg" backend for every figure drawn below.
# NOTE(review): this runs after pyplot has already been imported and presumably
# makes figures open in separate Tk windows instead of inline - confirm intended.
matplotlib.use('TkAgg')

In [2]:
from dotenv import load_dotenv
from pathlib import Path
# Project settings are read from `config.env`, one directory above the
# notebook's working directory.
env_path = Path('..') / 'config.env'
load_dotenv(dotenv_path=env_path)

# Root directory of all datasets; later cells build every input path from it.
dataset_dir = os.getenv("dataset_directory")
if dataset_dir is None:
    # Fail here with an actionable message instead of a `NoneType + str`
    # TypeError in the first cell that concatenates a path.
    raise RuntimeError(
        f'Environment variable "dataset_directory" is not set; '
        f'define it in {env_path} or in the process environment.'
    )

In [None]:
# Dataset to analyse; the cells below branch on exactly these three keys.
dataset = 'bx' # 'ml', 'mlhd', or 'bx'
# When True, profile statistics are read from the file with the "_weighted" suffix.
weighted = True
# Forwarded to ap.get_sorted_ages() and ap.age_group(); presumably selects the
# age-bucketing scheme - confirm against utils.age_processing.
age_type = 'defined_ages'

In [4]:
# Age-group labels for this dataset/age_type. Used below as the ordered
# categories of the `age_group` column and to order the per-group summaries.
ages_sort = ap.get_sorted_ages(dataset, age_type)

In [5]:
# Per-dataset inputs: the processed-data subdirectory (appended to
# `dataset_dir`) and the text file that lists one genre per line.
dataset_sources = {
    'ml': ('/processed/movielens-1m', '../utils/ML_genres.txt'),
    'mlhd': ('/processed/mlhd_sampled_filtered', '../utils/MLHD_genres.txt'),
    'bx': ('/processed/Book-Crossing', '../utils/BX_genres.txt'),
}

if dataset not in dataset_sources:
    # An unrecognised key used to fall through silently, leaving `data_dir`
    # undefined and failing with a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_sources)}"
    )

processed_subdir, genres_file = dataset_sources[dataset]
data_dir = dataset_dir + processed_subdir

# Genre names with surrounding whitespace (including the newline) stripped.
with open(genres_file, 'r') as f:
    genres = [line.strip() for line in f]



In [6]:
# Input files under `data_dir`. The statistics file name carries a "_weighted"
# suffix when `weighted` is true.
user_profile_stats_path = data_dir + f'/user_profile_stats{"_weighted" if weighted else ""}.tsv'
user_path = data_dir + '/users.tsv'
interactions_path = data_dir + '/interactions.tsv.bz2'
if dataset == 'ml':
    items_path = data_dir + '/movies.tsv'
elif dataset == 'mlhd':
    items_path = data_dir + '/tracks.tsv'
    # Only defined for 'mlhd'; later cells guard their use with the same check.
    artists_path = data_dir + '/artists.tsv'
elif dataset == 'bx':
    items_path = data_dir + '/books.tsv'

users = pd.read_csv(user_path, sep='\t')
user_stats = pd.read_csv(user_profile_stats_path, sep='\t')
# The genre distribution is stored as the text of a Python literal; parse it
# back into an object with the safe literal evaluator.
user_stats['normalized_genre_distribution'] = user_stats['normalized_genre_distribution'].apply(ast.literal_eval)

# Map each row's age to its age-group label, then make the column an ordered
# categorical so groups always appear in `ages_sort` order.
user_stats['age_group'] = user_stats['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_stats['age_group'] = pd.Categorical(user_stats['age_group'], categories=ages_sort, ordered=True)

# `observed=False` keeps every category of `ages_sort` as a group (the behaviour
# this notebook has always had). Passing it explicitly avoids the pandas
# FutureWarning about the implicit default for categorical groupers.
grouped_user_stats = user_stats.groupby('age_group', observed=False)

  grouped_user_stats = user_stats.groupby('age_group')


In [7]:
# One three-line summary per age group: the group label, the number of rows in
# the group (printed as "Number of users") and the mean of `num_interactions`.
for age, group in grouped_user_stats:
    summary_lines = (
        f"Age group: {age}",
        f"Number of users: {len(group)}",
        f"Average number of interactions: {group['num_interactions'].mean()}",
    )
    print("\n".join(summary_lines))

Age group: 12-16
Number of users: 17985
Average number of interactions: 6341.105476786211
Age group: 17-29
Number of users: 174996
Average number of interactions: 4758.394820453039
Age group: 30-65
Number of users: 26073
Average number of interactions: 4174.0210946189545


## Profile Size Distribution

In [8]:
# Total interactions per age group, re-keyed in `ages_sort` order with 0 for any
# group that is missing. `observed=False` is passed explicitly because
# `age_group` is categorical and the implicit default raises a pandas
# FutureWarning.
interactions_per_age = user_stats.groupby('age_group', observed=False)['num_interactions'].sum().to_dict()
interactions_per_age = {age: interactions_per_age.get(age, 0) for age in ages_sort}


# Print each age group's share of all ratings (guarding against an empty total)
total_ratings = sum(interactions_per_age.values())
for age, count in interactions_per_age.items():
    percentage = (count / total_ratings) * 100 if total_ratings > 0 else 0
    print(f"Age group: {age}, Number of ratings: {count}, Percentage: {percentage:.2f}%")

# Line plot: total ratings per age group.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=interactions_per_age, marker='o', color='b')
# Thousands separators on the y axis. A formatter is used instead of
# set_yticklabels(): fixed label strings on non-fixed tick positions trigger a
# matplotlib UserWarning and go stale if the ticks are recomputed.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda value, _pos: f'{value:,}'))
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')  # Centre each tick label under its tick (no rotation)
plt.show()



# Box plot: distribution of per-row `num_interactions` within each age group.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=user_stats, x='age_group', y='num_interactions')
# Clip the view to 0-250 so the boxes stay readable; larger values are cut off.
ax.set(ylim=(0,250))
plt.title('Number of Ratings per Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Ratings per User')
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
plt.xticks(ha='center')
plt.show()

  interactions_per_age = user_stats.groupby('age_group')['num_interactions'].sum().to_dict()
  ax.set_yticklabels(ylabels)


Age group: 12-16, Number of ratings: 114044782, Percentage: 10.80%
Age group: 17-29, Number of ratings: 832700060, Percentage: 78.89%
Age group: 30-65, Number of ratings: 108829252, Percentage: 10.31%


In [9]:
    # Number of profile rows per age group, re-keyed in `ages_sort` order with
    # 0 for any group that has no rows.
    age_group_counts = user_stats['age_group'].value_counts().to_dict()
    ages = {age: age_group_counts.get(age, 0) for age in ages_sort}

    label_size = 16
    group_labels = list(ages.keys())
    profile_counts = list(ages.values())

    plt.figure(figsize=(10, 8))
    sns.barplot(x=group_labels, y=profile_counts, color='b')
    plt.xlabel('Age', fontsize=label_size)
    plt.ylabel('Number of Profiles', fontsize=label_size)
    # Same font size on both axes' tick labels; tick positions are left as is.
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=label_size)
    plt.tight_layout()
    plt.show()

In [10]:
# Group the statistics rows by user. The prints below treat each row of
# `user_stats` as one "profile", so a user can own several profiles.
stats_grouped = user_stats.groupby('user_id')
num_users = len(stats_grouped)
print(f"Number of users: {num_users}")
print(f"Number of user profiles: {len(user_stats)}")

# user_id -> number of rows (profiles) belonging to that user
num_profiles = {user_id: len(group) for user_id, group in stats_grouped}
avg_num_profiles = sum(num_profiles.values()) / len(num_profiles)
min_num_profiles = min(num_profiles.values())
max_num_profiles = max(num_profiles.values())
print(f"Average number of profiles: {avg_num_profiles}")
print(f"Minimum number of profiles: {min_num_profiles}")
print(f"Maximum number of profiles: {max_num_profiles}")
print(f"Number of users with more than 1 profile: {sum(1 for num in num_profiles.values() if num > 1)}")
print(f"Number of users with more than 5 profiles: {sum(1 for num in num_profiles.values() if num > 5)}")
print(f"Number of users with more than 10 profiles: {sum(1 for num in num_profiles.values() if num > 10)}")

# user_id -> list of `num_interactions`, one entry per profile of that user
profile_sizes = {user_id: group['num_interactions'].tolist() for user_id, group in stats_grouped}
# Mean over users of each user's own mean profile size (every user weighs the
# same, regardless of how many profiles they have).
avg_profile_size = sum(sum(sizes) / len(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
# Mean over users of the user's total interactions across all their profiles.
# The variable name says "listen events"; the value is computed for any dataset.
avg_listen_events_per_user = sum(sum(sizes) for sizes in profile_sizes.values()) / len(profile_sizes)
print(f"Average profile size: {avg_profile_size}")
print(f"Average interactions per user: {avg_listen_events_per_user}")
print(f"Minimum profile size: {min(min(sizes) for sizes in profile_sizes.values())}")
print(f"Maximum profile size: {max(max(sizes) for sizes in profile_sizes.values())}")
# The three counts below are over individual profiles, not over users.
print(f"Number of profiles with more than 1 listen event: {sum(sum(1 for size in sizes if size > 1) for sizes in profile_sizes.values())}")
print(f"Number of profiles with more than 5 interactions: {sum(sum(1 for size in sizes if size > 5) for sizes in profile_sizes.values())}")
print(f"Number of profiles with more than 10 interactions: {sum(sum(1 for size in sizes if size > 10) for sizes in profile_sizes.values())}")


# Number of profile rows per value of the raw `age` column. This rebinds
# `ages`, which is now keyed by age rather than by age group.
ages = user_stats['age'].value_counts().to_dict()

plt.figure(figsize=(10, 8))
sns.barplot(x=list(ages.keys()), y=list(ages.values()), color='b')
#plt.title('Number of Profiles per Age Group')
plt.xlabel('Age', fontsize=16)
plt.ylabel('Number of Profiles', fontsize=16)
# Hard-coded x-tick positions: 0, then 3, then every 5th value from 3 up to
# (largest age - 10), exclusive.
# NOTE(review): 3 appears twice in this list, and the values look like bar
# positions tuned for one dataset rather than ages - confirm that the ticks
# land on the intended ages for every dataset.
ages_to_display = [0, 3] + list(range(3, int(max(ages.keys()))-10, 5))

# Apply the tick positions chosen above and enlarge the tick labels
plt.xticks(ticks=ages_to_display, fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout()
plt.show()

Number of users: 44349
Number of user profiles: 219054
Average number of profiles: 4.939322194412501
Minimum number of profiles: 1
Maximum number of profiles: 6
Number of users with more than 1 profile: 44345
Number of users with more than 5 profiles: 7
Number of users with more than 10 profiles: 0
Average profile size: 4808.3177448570805
Average interactions per user: 23801.530902613362
Minimum profile size: 1
Maximum profile size: 95749
Number of profiles with more than 1 listen event: 218632
Number of profiles with more than 5 interactions: 217818
Number of profiles with more than 10 interactions: 217057


In [11]:
# Headline sizes: rows in the users table and the sum of `num_interactions`
# over all statistics rows.
print(
    f"Number of users: {len(users)}",
    f"Number of interactions: {user_stats['num_interactions'].sum()}",
    sep="\n",
)

# The item catalogue is loaded here, on first use.
items = pd.read_csv(items_path, sep='\t')
print(f"Number of items: {len(items)}")

# The artists table exists only for the 'mlhd' dataset.
if dataset == 'mlhd':
    artists = pd.read_csv(artists_path, sep='\t')
    print(f"Number of artists: {len(artists)}")

Number of users: 44349
Number of interactions: 1055574094
Number of items: 1918414
Number of artists: 45451


In [12]:
# Display the profile statistics table, including the derived `age_group` column.
user_stats

Unnamed: 0,user_id,age,num_interactions,num_unique_items,normalized_genre_distribution,age_group
0,15,12.0,4117,672,"{'pop': 0.20456643186786935, 'punk': 0.0351226...",12-16
1,15,13.0,6671,903,"{'rock': 0.4229800629590814, 'punk': 0.0415030...",12-16
2,15,14.0,5661,1021,"{'rock': 0.3957840193134275, 'pop': 0.26923688...",12-16
3,15,15.0,4166,1096,"{'rock': 0.3408665386461778, 'pop': 0.26416626...",12-16
4,15,16.0,390,203,"{'punk': 0.2271367521367523, 'alternative': 0....",12-16
...,...,...,...,...,...,...
219049,44321,57.0,1356,1034,"{'pop': 0.15083052394999366, 'classical': 0.01...",30-65
219050,44321,58.0,211,166,"{'jazz': 0.655608214849921, 'rock': 0.05924170...",30-65
219051,44321,59.0,93,71,"{'blues': 0.01881720430107527, 'rock': 0.04301...",30-65
219052,44321,60.0,551,489,"{'easy listening': 0.0070175438596491255, 'voc...",30-65
