In [1]:
# Notebook-wide IPython setup.
# The watermark extension is intentionally left disabled; uncomment to load it.
#%load_ext watermark
# Load the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules automatically before each cell is executed.
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import json
from functools import partial
import matplotlib.pyplot as plt

In [3]:
from pybpr import *
from ml_test import *

In [4]:
# Downloader that keeps its files under the local ./output directory.
ml = MovieLensDownloader(
    cache_dir="./output",
)

# Load the ml-20m release including the tag / genome tables.  The result is
# indexed by table name (for example mldata['ratings']).
mldata = ml.load_dataset_with_tags(
    dataset="ml-20m",
)

MovieLensDownloader-INFO-Cache directory: output/movielens
MovieLensDownloader-DEBUG-Loading dataset with tags: ml-20m
MovieLensDownloader-DEBUG-Downloading dataset: ml-20m
MovieLensDownloader-INFO-Dataset ml-20m already exists at output/movielens/ml-20m
MovieLensDownloader-DEBUG-Data path: output/movielens/ml-20m
MovieLensDownloader-DEBUG-Data directory contents: ['genome-scores.csv', 'ratings.csv', 'tags.csv', 'genome-tags.csv', 'links.csv', 'movies.csv', 'README.txt']
MovieLensDownloader-INFO-Loaded 20,000,263 ratings
MovieLensDownloader-INFO-Loaded 27,278 movies
MovieLensDownloader-INFO-Loaded 465,564 tags
MovieLensDownloader-INFO-Loaded 27,278 links
MovieLensDownloader-INFO-Loaded 11,709,768 genome_scores
MovieLensDownloader-INFO-Loaded 1,128 genome_tags
MovieLensDownloader-INFO-Loaded 6 datasets: ['ratings', 'movies', 'tags', 'links', 'genome_scores', 'genome_tags']


In [5]:
# Log row counts and unique-ID counts for every table that was just loaded.
ml.print_dataset_summary(mldata)

MovieLensDownloader-INFO-Dataset summary:
RATINGS--------------------
Total rows: 20,000,263
Unique Users: 138,493
Unique Movies: 26,744
MOVIES--------------------
Total rows: 27,278
Unique Movies: 27,278
Unique Genres: 1,342
Unique Titles: 27,262
TAGS--------------------
Total rows: 465,564
Unique Users: 7,801
Unique Movies: 19,545
Unique Tags: 38,643
LINKS--------------------
Total rows: 27,278
Unique Movies: 27,278
GENOME_SCORES--------------------
Total rows: 11,709,768
Unique Movies: 10,381
Unique Tag IDs: 1,128
GENOME_TAGS--------------------
Total rows: 1,128
Unique Tags: 1,128
Unique Tag IDs: 1,128


In [6]:
# Narrow the ratings table using the tag-genome relevance scores, then log the
# per-table summary again so the effect of the filter can be inspected.
# NOTE(review): `mldata` is rebound to the filtered result, so re-running this
# cell filters already-filtered data -- confirm that is acceptable.
mldata = filter_ratings_by_genome_scores(
    mldata=mldata, min_relevance=0.8, min_interactions=2
)
ml.print_dataset_summary(mldata)

MovieLensDownloader-INFO-Dataset summary:
RATINGS--------------------
Total rows: 19,738,489
Unique Users: 138,493
Unique Movies: 10,150
MOVIES--------------------
Total rows: 27,278
Unique Movies: 27,278
Unique Genres: 1,342
Unique Titles: 27,262
TAGS--------------------
Total rows: 465,564
Unique Users: 7,801
Unique Movies: 19,545
Unique Tags: 38,643
LINKS--------------------
Total rows: 27,278
Unique Movies: 27,278
GENOME_SCORES--------------------
Total rows: 11,709,768
Unique Movies: 10,381
Unique Tag IDs: 1,128
GENOME_TAGS--------------------
Total rows: 1,128
Unique Tags: 1,128
Unique Tag IDs: 1,128


In [13]:
# Build the interaction dataset from explicit ratings: a rating of 4 or more
# is treated as positive feedback, anything below 4 as negative feedback.
ui = UserItemData(name='User-Metadata-only')
idf = mldata['ratings']

liked = idf['rating'] >= 4
# Kept as its own comparison (not ~liked) so missing ratings land in neither set.
disliked = idf['rating'] < 4

ui.add_positive_interactions(
    user_ids=idf.loc[liked, 'userId'],
    item_ids=idf.loc[liked, 'movieId'],
)
ui.add_negative_interactions(
    user_ids=idf.loc[disliked, 'userId'],
    item_ids=idf.loc[disliked, 'movieId'],
)

# Identity features: every user (item) gets exactly one feature whose id is
# the user's (item's) own id.
user_index = idf['userId'].unique()
ui.add_user_features(user_ids=user_index, feature_ids=user_index)

movie_index = idf['movieId'].unique()
ui.add_item_features(item_ids=movie_index, feature_ids=movie_index)

# Hold out 20% of the positive and of the negative interactions for testing.
# NOTE(review): no seed is passed, so the split differs between runs --
# consider fixing one if the API supports it (TODO confirm parameter name).
ui.train_test_split(
    train_ratio_pos=0.8, train_ratio_neg=0.8, show_progress=True
)

UserItemData.User-Metadata-only - INFO - Initialized UserItemData 'User-Metadata-only' with dtype <class 'numpy.float32'>
UserItemData.User-Metadata-only - INFO - Adding 9,908,333 positive interactions
UserItemData.User-Metadata-only - INFO - Successfully added positive interactions. New dimensions: 138287 users × 10130 items
UserItemData.User-Metadata-only - INFO - Adding 9,830,156 negative interactions
UserItemData.User-Metadata-only - INFO - Successfully added negative interactions. New dimensions: 138493 users × 10150 items
UserItemData.User-Metadata-only - INFO - Adding 138,493 user features
UserItemData.User-Metadata-only - INFO - Successfully added user features
UserItemData.User-Metadata-only - INFO - 138493 users × 138493 user features
UserItemData.User-Metadata-only - INFO - Adding 10,150 item features
UserItemData.User-Metadata-only - INFO - Successfully added item features
UserItemData.User-Metadata-only - INFO - New dimensions: 10150 items × 10150 item features


In [17]:
# Display the dataset repr: shape, nnz and empty row/column counts for the
# feature matrices and the positive / negative interaction matrices.
ui

UserItemData(User-Metadata-only)
  Fuser     :(138493×138493) nnz=   138,493 (0.001%), empty rows/cols=     0/     0
  Fitem     :( 10150× 10150) nnz=    10,150 (0.010%), empty rows/cols=     0/     0
  Rpos      :(138493× 10150) nnz= 9,908,333 (0.705%), empty rows/cols=   206/    20
    └─ users: min=1, max=3028 | items: min=1, max=55807
  Rneg      :(138493× 10150) nnz= 9,830,156 (0.699%), empty rows/cols=   239/     2
    └─ users: min=1, max=4461 | items: min=1, max=27737
  Rpos_train:(138493× 10150) nnz= 7,926,734 (0.564%), empty rows/cols=   206/    26
    └─ users: min=1, max=2450 | items: min=1, max=44676
  Rpos_test :(138493× 10150) nnz= 1,981,599 (0.141%), empty rows/cols=  4022/   234
    └─ users: min=1, max=578 | items: min=1, max=11131
  Rneg_train:(138493× 10150) nnz= 7,864,219 (0.559%), empty rows/cols=   239/     4
    └─ users: min=1, max=3574 | items: min=1, max=22135
  Rneg_test :(138493× 10150) nnz= 1,965,937 (0.140%), empty rows/cols=  6880/    39
    └─ users: mi

In [15]:
# NOTE(review): n_latent, learning_rate, weight_decay, loss_function,
# output_dir, n_iter, batch_size, eval_every, eval_user_size and `torch` are
# not defined or imported in any cell visible in this notebook.  Presumably
# they come from the star imports or an earlier kernel session -- confirm
# before relying on Restart & Run All.

# Model sized from the dataset's user / item feature counts.
hybrid_model = HybridMF(
    ui.n_user_features, ui.n_item_features, n_latent=n_latent
)

# Adam with learning rate and weight decay pre-bound via functools.partial.
adam_factory = partial(
    torch.optim.Adam, lr=learning_rate, weight_decay=weight_decay
)

# Results are written to a sub-directory named after the dataset.
recommender = RecSys(
    data=ui,
    model=hybrid_model,
    optimizer=adam_factory,
    loss_function=loss_function,
    output_dir=os.path.join(output_dir, ui.name),
    log_level=1,
)

# Train the model
recommender.fit(
    n_iter=n_iter,
    batch_size=batch_size,
    eval_every=eval_every,
    eval_user_size=eval_user_size,
    early_stopping_patience=100,
)


UserItemData.User-Metadata-only - INFO - Splitting interactions: train_pos=0.80, train_neg=0.80, random_state=None
Ensuring user representation: 100%|██████████| 68/68 [00:00<00:00, 102.80it/s]
Ensuring user representation: 100%|██████████| 95/95 [00:00<00:00, 135.17it/s]
UserItemData.User-Metadata-only - INFO - Split 9,830,156 negative interactions
UserItemData.User-Metadata-only - INFO - Train/test split completed. Pos train: 7,926,734, Pos test: 1,981,599


In [16]:
# Returns a one-line summary string (shape, nnz, density, empty rows/cols)
# for the positive training interaction matrix.
print_sparse_matrix_stats(ui.Rpos_train)

'(138493× 10150) nnz= 7,926,734 (0.564%), empty rows/cols=   206/    26'

In [None]:

# Step-by-step rating filter: keep ratings of movies that have at least one
# genome score at or above `min_relevance`, then keep only users with at
# least `min_interactions` such ratings.

# These thresholds were not defined in any visible cell, so this cell raised a
# NameError on a fresh kernel.  Use the same values that are passed to
# filter_ratings_by_genome_scores earlier in the notebook.
min_relevance = 0.8
min_interactions = 2

genome_scores_df = mldata['genome_scores']
ratings_df = mldata['ratings']

# Get unique movie IDs with genome scores above relevance threshold
is_relevant_score = genome_scores_df['relevance'] >= min_relevance
relevant_movie_ids = genome_scores_df.loc[is_relevant_score, 'movieId'].unique()

# Filter ratings to only include movies with relevant genome scores
relevant_movie_ratings = ratings_df[
    ratings_df['movieId'].isin(relevant_movie_ids)
]

# Count interactions per user with relevant movies
# (one row per user: columns 'userId' and 'interaction_count')
user_interaction_counts = (
    relevant_movie_ratings
    .groupby('userId')
    .size()
    .reset_index(name='interaction_count')
)

# Users that meet the minimum interaction count
is_active_user = user_interaction_counts['interaction_count'] >= min_interactions
qualifying_users = user_interaction_counts.loc[is_active_user, 'userId']

# Filter ratings for qualifying users only
filtered_ratings = relevant_movie_ratings[
    relevant_movie_ratings['userId'].isin(qualifying_users)
]

In [None]:
# Apply the downloader's own genome / active-user filter, then report tag
# statistics for the filtered data.
# NOTE(review): `mldata` is rebound in place, so this cell is not idempotent
# when re-run -- confirm that is intended.
mldata = ml.filter_genome_and_active_users(
    data=mldata, min_user_interactions=2
)
ml.get_tag_statistics(mldata)


In [None]:
# Rebuild the interaction dataset from the (re)filtered ratings: a rating of 4
# or more is positive feedback, anything below 4 is negative feedback.
ui = UserItemData(name='User-Metadata-only')
idf = mldata['ratings']
ui.add_positive_interactions(
    user_ids=idf['userId'][idf['rating'] >= 4],
    item_ids=idf['movieId'][idf['rating'] >= 4]
)
ui.add_negative_interactions(
    user_ids=idf['userId'][idf['rating'] < 4],
    item_ids=idf['movieId'][idf['rating'] < 4]
)
# Identity features: one feature per user / item, keyed by its own id.
# The ids are de-duplicated first.  The ratings table has one row per rating,
# so passing the raw columns would register the same (id, feature) pair once
# per rating instead of once per user / item.
ui.add_user_features(
    user_ids=idf['userId'].unique(),
    feature_ids=idf['userId'].unique()
)
ui.add_item_features(
    item_ids=idf['movieId'].unique(),
    feature_ids=idf['movieId'].unique()
)
#ui.validate_dataset()

In [None]:
# Display the rebuilt dataset's repr to inspect it before splitting.
ui

In [None]:
# Hold out 20% of the positive and of the negative interactions for testing.
# NOTE(review): no seed is passed, so the split is not reproducible across
# runs -- consider fixing one if the API supports it (TODO confirm).
ui.train_test_split(
    train_ratio_pos=0.8, train_ratio_neg=0.8, show_progress=True
)