In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from app.models.baseline import Baseline
from app.models.collaborative import CollaborativeFilteringUserUser
from app.models import ContentBasedModel

# Metric Functions

In [2]:
def hit_rate(y_rec, y_rel, k=10):
    """Hit-rate@k indicator for a single recommendation list.

    Args:
        y_rec: Ranked recommendations; only the first ``k`` entries are used.
        y_rel: Relevant (ground-truth) items to look for.
        k: Cut-off applied to ``y_rec``.

    Returns:
        1 if at least one of the first ``k`` recommended items is present in
        ``y_rel``, otherwise 0.
    """
    relevant = set(y_rel)
    top_k = y_rec[:k]
    hits = relevant.intersection(top_k)
    return 1 if hits else 0

# Datasets

In [3]:
# Train/test splits; the evaluation loops read the `user_id` and `anime_id`
# columns of test_set.
train_set = pd.read_parquet(path='../data/train.parquet')
test_set = pd.read_parquet(path='../data/test.parquet')

# Feature table that the content-based model is trained on.
anime_with_features = pd.read_parquet(path='../data/anime_with_features.parquet')

# Models evaluation

## Baseline by average rating 

In [14]:
# Baseline recommender in its 'avg' mode (average-rating baseline, per the section heading).
model = Baseline(kind='avg')

In [15]:
# Fit the baseline on the training split read from ../data/train.parquet.
model.train(train_set)

In [16]:
# Hit-rate@10 of the average-rating baseline over every user in the test split.
# One cut-off shared by the recommender and the metric so the two cannot drift apart.
TOP_K = 10

# Collect each user's held-out items once, instead of re-filtering the whole
# test frame with a boolean mask on every loop iteration.
actual_items_by_user = test_set.groupby('user_id')['anime_id'].agg(list).to_dict()

test_hit_rates = []
cold_user_count = 0
for user_id in tqdm(test_set['user_id'].unique()):
    actual_items = actual_items_by_user.get(user_id, [])
    try:
        # The baseline's recommend() takes no user id, so every user is scored
        # against the same list. Only this call is guarded: a ValueError here is
        # counted as a cold user (presumably how the models signal one — confirm),
        # while errors raised elsewhere are no longer silently swallowed.
        recommendations = model.recommend(top_k=TOP_K)
    except ValueError:
        cold_user_count += 1
        continue
    test_hit_rates.append(hit_rate(recommendations, actual_items, k=TOP_K))

# Skipped (cold) users are excluded from the average. The guard avoids the
# RuntimeWarning np.mean emits on an empty list; the result is NaN either way.
test_hit_rate = np.mean(test_hit_rates) if test_hit_rates else float('nan')
print(f"Hit Rate on Test Set: {test_hit_rate}")
print(f"Cold User Count: {cold_user_count}")

100%|██████████| 26851/26851 [00:08<00:00, 3107.78it/s]

Hit Rate on Test Set: 7.4485121596961e-05
Cold User Count: 0





## Baseline by number of '10' ratings

In [17]:
# Baseline recommender in its 'max10' mode (the "max '10' marks" baseline, per the section heading).
model = Baseline(kind='max10')

In [18]:
# Fit the baseline on the training split read from ../data/train.parquet.
model.train(train_set)

In [19]:
# Hit-rate@10 of the 'max10' baseline over every user in the test split.
# One cut-off shared by the recommender and the metric so the two cannot drift apart.
TOP_K = 10

# Collect each user's held-out items once, instead of re-filtering the whole
# test frame with a boolean mask on every loop iteration.
actual_items_by_user = test_set.groupby('user_id')['anime_id'].agg(list).to_dict()

test_hit_rates = []
cold_user_count = 0
for user_id in tqdm(test_set['user_id'].unique()):
    actual_items = actual_items_by_user.get(user_id, [])
    try:
        # The baseline's recommend() takes no user id, so every user is scored
        # against the same list. Only this call is guarded: a ValueError here is
        # counted as a cold user (presumably how the models signal one — confirm),
        # while errors raised elsewhere are no longer silently swallowed.
        recommendations = model.recommend(top_k=TOP_K)
    except ValueError:
        cold_user_count += 1
        continue
    test_hit_rates.append(hit_rate(recommendations, actual_items, k=TOP_K))

# Skipped (cold) users are excluded from the average. The guard avoids the
# RuntimeWarning np.mean emits on an empty list; the result is NaN either way.
test_hit_rate = np.mean(test_hit_rates) if test_hit_rates else float('nan')
print(f"Hit Rate on Test Set: {test_hit_rate}")
print(f"Cold User Count: {cold_user_count}")

100%|██████████| 26851/26851 [00:08<00:00, 3038.17it/s]

Hit Rate on Test Set: 0.5217310342259134
Cold User Count: 0





## Collaborative filtering User-User

In [21]:
# User-user collaborative filtering model, built with its default constructor arguments.
model = CollaborativeFilteringUserUser()

In [22]:
# Fit the collaborative-filtering model on the training split read from ../data/train.parquet.
model.train(train_set)

In [23]:
# Evaluate on the test set: hit-rate@10 of the user-user collaborative filter.
# One cut-off shared by the recommender and the metric so the two cannot drift apart.
TOP_K = 10

# Collect each user's held-out items once, instead of re-filtering the whole
# test frame with a boolean mask on every loop iteration.
actual_items_by_user = test_set.groupby('user_id')['anime_id'].agg(list).to_dict()

test_hit_rates = []
cold_user_count = 0
for user_id in tqdm(test_set['user_id'].unique()):
    actual_items = actual_items_by_user.get(user_id, [])
    try:
        # Only this call is guarded: a ValueError here is counted as a cold user
        # (presumably raised for users the model cannot serve — confirm), while
        # errors raised elsewhere are no longer silently swallowed.
        recommendations = model.recommend(user_id, top_k=TOP_K)
    except ValueError:
        cold_user_count += 1
        continue
    test_hit_rates.append(hit_rate(recommendations, actual_items, k=TOP_K))

# Skipped (cold) users are excluded from the average. The guard avoids the
# RuntimeWarning np.mean emits on an empty list; the result is NaN either way.
test_hit_rate = np.mean(test_hit_rates) if test_hit_rates else float('nan')
print(f"Hit Rate on Test Set: {test_hit_rate}")
print(f"Cold User Count: {cold_user_count}")

100%|██████████| 26851/26851 [04:06<00:00, 109.04it/s]

Hit Rate on Test Set: 0.04051220608057511
Cold User Count: 143





## Content-based

In [4]:
# Content-based model, built with its default constructor arguments.
model = ContentBasedModel()

In [6]:
# Fit on the anime feature table rather than on the train split; the recorded run took ~36 s.
model.train(anime_with_features)

100%|██████████| 9/9 [00:36<00:00,  4.03s/it]


In [7]:
# Hit-rate@10 of the content-based model over every user in the test split.
# One cut-off shared by the recommender and the metric so the two cannot drift apart.
TOP_K = 10

# Collect each user's held-out items once, instead of re-filtering the whole
# test frame with a boolean mask on every loop iteration.
actual_items_by_user = test_set.groupby('user_id')['anime_id'].agg(list).to_dict()

test_hit_rates = []
cold_user_count = 0
for user_id in tqdm(test_set['user_id'].unique()):
    actual_items = actual_items_by_user.get(user_id, [])
    try:
        # recommend() is given the training split alongside the user id.
        # Only this call is guarded: a ValueError here is counted as a cold user
        # (presumably raised for users the model cannot serve — confirm), while
        # errors raised elsewhere are no longer silently swallowed.
        recommendations = model.recommend(user_id, train_set, top_k=TOP_K)
    except ValueError:
        cold_user_count += 1
        continue
    test_hit_rates.append(hit_rate(recommendations, actual_items, k=TOP_K))

# Skipped (cold) users are excluded from the average. The guard avoids the
# RuntimeWarning np.mean emits on an empty list; the result is NaN either way.
test_hit_rate = np.mean(test_hit_rates) if test_hit_rates else float('nan')
print(f"Hit Rate on Test Set: {test_hit_rate}")
print(f"Cold User Count: {cold_user_count}")

100%|██████████| 26851/26851 [06:37<00:00, 67.63it/s]

Hit Rate on Test Set: 0.20368428935150518
Cold User Count: 143





## Conclusions

Baseline по средней оценке (без учёта популярности) показал самый низкий хитрейт@10 ≈ 7.4e-05 (менее 0.01%)

Baseline по наибольшему количеству оценок «10» показал самый высокий хитрейт@10 ≈ 52%
   - скорее всего, это связано с большими фандомами таких аниме, как «Наруто», «Ван Пис», «Драгонбол Z»

Коллаборативная фильтрация (User-User) показала хитрейт@10 ≈ 4% (0.0405)

Content-based рекомендации показали относительно хороший результат:
хитрейт@10 ≈ 20%