# Compare user stats to filtered set
As the filtering for years and splitting of the data removes certain user profiles, this script can be used to gauge whether the train set properly reflects age distributions of the original data.


In [8]:
import utils.age_processing as ap

In [9]:
import pandas as pd
import os
from dotenv import load_dotenv
from pathlib import Path

env_path = Path('../..') / 'config.env'
load_dotenv(dotenv_path=env_path)
dataset_dir = os.getenv("dataset_directory")



In [10]:
dataset = 'ml' # ml, mlhd, or bx
filtered = True 
weighted = True
age_type = 'finegrained_age'

In [11]:
if dataset == 'ml':    
    data_dir = dataset_dir + f'/processed/ml_rec{"_filtered" if filtered else ""}'
    user_stats_path = dataset_dir + f'/processed/movielens-1m'
elif dataset == 'mlhd':
    data_dir = dataset_dir + f'/processed/mlhd_rec{"_filtered" if filtered else ""}_{year}'
    user_stats_path = dataset_dir + f'/processed/mlhd_sampled_filtered'
elif dataset == 'bx':
    data_dir = dataset_dir + f'/processed/bx_rec{"_filtered" if filtered else ""}'
    user_stats_path = dataset_dir + f'/processed/Book-Crossing'
    

In [12]:
user_info_path = data_dir + '/user_info.tsv'
user_stats = pd.read_csv(user_stats_path + f'/user_profile_stats{"_weighted" if weighted else ""}.tsv', sep='\t')
user_info = pd.read_csv(user_info_path, sep='\t')

ages_sort = ap.get_sorted_ages(dataset, age_type)

In [13]:
user_stats['age_group'] = user_stats['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_stats['age_group'] = pd.Categorical(user_stats['age_group'], categories=ages_sort, ordered=True)

user_info['age_group'] = user_info['age'].apply(lambda x: ap.age_group(x, dataset, age_type))
user_info['age_group'] = pd.Categorical(user_info['age_group'], categories=ages_sort, ordered=True)


In [17]:
filtered_stats_ratio = user_stats['age_group'].value_counts(normalize=True)
original_stats_ratio = user_info['age_group'].value_counts(normalize=True)
print(f"Age group ratios for {dataset} dataset:")
for age_group in ages_sort:
    filtered_ratio = filtered_stats_ratio.get(age_group, 0)
    original_ratio = original_stats_ratio.get(age_group, 0)
    print(f"{age_group}: Filtered: {filtered_ratio:.4f}, Original: {original_ratio:.4f}")

Age group ratios for ml dataset:
Under 18: Filtered: 0.0368, Original: 0.0366
18-24: Filtered: 0.1826, Original: 0.1808
25-34: Filtered: 0.3470, Original: 0.3488
35-44: Filtered: 0.1975, Original: 0.1976
45-49: Filtered: 0.0911, Original: 0.0907
50-55: Filtered: 0.0821, Original: 0.0820
56+: Filtered: 0.0629, Original: 0.0634
