In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from pathlib import Path
env_path = Path('../..') / 'config.env'
load_dotenv(dotenv_path=env_path)
dataset_dir = os.getenv("dataset_directory")

In [2]:
dataset = 'bx' # ml, mlhd, or bx
year = 2013 # Only for mlhd dataset
filtered = True 

In [None]:
if dataset == 'ml':
    data_dir = dataset_dir + f'/processed/ml_rec{"_filtered" if filtered else ""}'
elif dataset == 'mlhd':
    data_dir = dataset_dir + f'/processed/mlhd_rec{"_filtered" if filtered else ""}_{year}'
elif dataset == 'bx':
    data_dir = dataset_dir + f'/processed/bx_rec{"_filtered" if filtered else ""}'

In [4]:
train_path = data_dir + f'/train.tsv'
validation_path = data_dir + f'/validation.tsv'
test_path = data_dir + f'/test.tsv'
user_info_path = data_dir + f'/user_info.tsv'

column_names = ['user_id', 'track_id', 'count'] if dataset == 'bx' else ['user_id', 'track_id', 'count', 'timestamp']

train = pd.read_csv(train_path, sep='\t', header=None, names=column_names)
validation = pd.read_csv(validation_path, sep='\t', header=None, names=column_names)
test = pd.read_csv(test_path, sep='\t', header=None, names=column_names)

users = pd.read_csv(user_info_path, sep='\t')


train = pd.merge(train, users, on='user_id', how='inner')
validation = pd.merge(validation, users, on='user_id', how='inner')
test = pd.merge(test, users, on='user_id', how='inner')

In [5]:
len_interactions = len(train) + len(validation) + len(test)

print(f'Train set: {len(train)} rows; {len(train) / len_interactions * 100:.2f}% of total interactions')
print(f'Validation set: {len(validation)} rows; {len(validation) / len_interactions * 100:.2f}% of total interactions')
print(f'Test set: {len(test)} rows; {len(test) / len_interactions * 100:.2f}% of total interactions')

print(f'Train set: {len(train["user_id"].unique())} unique users')
print(f'Validation set: {len(validation["user_id"].unique())} unique users')
print(f'Test set: {len(test["user_id"].unique())} unique users')

train_interactions_per_user = train.groupby('user_id').size()
print(f'Train set: {train_interactions_per_user.mean()} average interactions per user')

validation_interactions_per_user = validation.groupby('user_id').size()
print(f'Validation set: {validation_interactions_per_user.mean()} average interactions per user')

test_interactions_per_user = test.groupby('user_id').size()
print(f'Test set: {test_interactions_per_user.mean()} average interactions per user')

Train set: 144854 rows; 58.96% of total interactions
Validation set: 46770 rows; 19.04% of total interactions
Test set: 54037 rows; 22.00% of total interactions
Train set: 6950 unique users
Validation set: 6950 unique users
Test set: 6950 unique users
Train set: 20.84230215827338 average interactions per user
Validation set: 6.729496402877698 average interactions per user
Test set: 7.775107913669065 average interactions per user


In [6]:
age_identifier = 'age_at_listen' if dataset == 'mlhd' else 'age'
train_interactions_per_age_group = train.groupby(age_identifier)

validation_interactions_per_age_group = validation.groupby(age_identifier)

test_interactions_per_age_group = test.groupby(age_identifier)

In [7]:
empty_train_profiles = users[users['user_id'].isin(train['user_id']) == False]
empty_validation_profiles = users[users['user_id'].isin(validation['user_id']) == False]
empty_test_profiles = users[users['user_id'].isin(test['user_id']) == False]

print(f'Train set: {len(empty_train_profiles)} users with empty profiles')
print(f'Validation set: {len(empty_validation_profiles)} users with empty profiles')
print(f'Test set: {len(empty_test_profiles)} users with empty profiles')

small_user_profiles = train_interactions_per_user[train_interactions_per_user < 10]
print(f'Train set: {len(small_user_profiles)} users with less than 10 interactions')
small_user_profiles = validation_interactions_per_user[validation_interactions_per_user < 10]
print(f'Validation set: {len(small_user_profiles)} users with less than 10 interactions')
small_user_profiles = test_interactions_per_user[test_interactions_per_user < 10]
print(f'Test set: {len(small_user_profiles)} users with less than 10 interactions')

Train set: 37 users with empty profiles
Validation set: 37 users with empty profiles
Test set: 37 users with empty profiles
Train set: 4401 users with less than 10 interactions
Validation set: 5951 users with less than 10 interactions
Test set: 5826 users with less than 10 interactions


In [8]:
print("Train set")
for age, group in train_interactions_per_age_group:
    print(f'Age: {age}')
    print(f'Number of user profiles: {len(group["user_id"].unique())}')
    print(f'Average items in user profile: {group.groupby("user_id").size().mean()}')
    small_user_profiles = group.groupby("user_id").size()[group.groupby("user_id").size() < 10]
    print(f'Users with less than 10 interactions: {len(small_user_profiles)}')
print()
print()

print("Validation set")
for age, group in validation_interactions_per_age_group:
    print(f'Age: {age}')
    print(f'Number of user profiles: {len(group["user_id"].unique())}')
    print(f'Average items in user profile: {group.groupby("user_id").size().mean()}')
    small_user_profiles = group.groupby("user_id").size()[group.groupby("user_id").size() < 10]
    print(f'Users with less than 10 interactions: {len(small_user_profiles)}')
print()
print()   

print("Test set")
for age, group in test_interactions_per_age_group:
    print(f'Age: {age}')
    print(f'Number of user profiles: {len(group["user_id"].unique())}')
    print(f'Average items in user profile: {group.groupby("user_id").size().mean()}')
    small_user_profiles = group.groupby("user_id").size()[group.groupby("user_id").size() < 10]
    print(f'Users with less than 10 listening events: {len(small_user_profiles)}')
print()
print()

Train set
Age: 12.0
Number of user profiles: 15
Average items in user profile: 10.6
Users with less than 10 interactions: 10
Age: 13.0
Number of user profiles: 23
Average items in user profile: 8.347826086956522
Users with less than 10 interactions: 15
Age: 14.0
Number of user profiles: 44
Average items in user profile: 15.75
Users with less than 10 interactions: 32
Age: 15.0
Number of user profiles: 47
Average items in user profile: 11.106382978723405
Users with less than 10 interactions: 33
Age: 16.0
Number of user profiles: 53
Average items in user profile: 9.09433962264151
Users with less than 10 interactions: 39
Age: 17.0
Number of user profiles: 84
Average items in user profile: 11.94047619047619
Users with less than 10 interactions: 62
Age: 18.0
Number of user profiles: 109
Average items in user profile: 16.376146788990827
Users with less than 10 interactions: 75
Age: 19.0
Number of user profiles: 73
Average items in user profile: 12.0
Users with less than 10 interactions: 52
Ag

In [9]:
sparsity = 1 - (len_interactions / (len(users) * len(users)))
print(sparsity)

0.9949678366675325
