In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix

In [2]:
def hit_rate(y_rec, y_rel, k=10):
    return int(len(set(y_rel).intersection(y_rec[:k])) > 0)

In [3]:
train_set = pd.read_parquet('../data/train.parquet')
test_set = pd.read_parquet('../data/test.parquet')

anime_with_features = pd.read_parquet('../data/anime_with_features.parquet')

In [4]:
anime_ids = anime_with_features['MAL_ID'].to_list()

In [5]:
class ContentBasedModel:
    def __init__(self):
        self.similarity_matrix = None
        self.anime_ids = None

    def train(self, data):
        # id to index map
        self.anime_ids = data['MAL_ID'].to_list()
        
        similarities_by_col = {}
        feature_columns = [
            'Genres_f',
            'Type_f',
            'Source_f',
            'Studios_f',
            'Favorites_f',
            'Popularity_f',
            'Members_f',
            'Rating_f',
            'Synopsis_f'
        ]
        
        for col in tqdm(feature_columns):
            if col == 'Synopsis_f':
                continue
            # закодируем категориальные признаки
            mapping = {k: v for v, k in enumerate(data[col].explode().unique().tolist())}
        
            # построим разреженную матрицу признаков для каждого тайтла
            rows = []
            cols = []
            values = []
            for row_ind, value in enumerate(data[col]):
                value = [] if value is None else value
        
                # колонки – значения признака
                col_inds = [mapping[x] for x in value]
                rows.extend([row_ind] * len(col_inds))
                # ставим 1, если такой признак относится к тайтлу
                values.extend([1] * len(col_inds))
                cols.extend(col_inds)
        
            # построим матрицу похожести для каждого признака с помощью матрики косинусного расстояния
            sparse_data = csr_matrix((values, (rows, cols)))
            sparse_data = normalize(sparse_data, norm="l2", axis=1)
            similarities_by_col[col] = (sparse_data @ sparse_data.T).A
            
            
        vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), norm='l2')
        data['synopsis'] = data['synopsis'].fillna('')
        anime_tfidf_matrix = vectorizer.fit_transform(data['synopsis'].tolist())
        similarities_by_col['Synopsis_f'] = (anime_tfidf_matrix @ anime_tfidf_matrix.T).A
        
        
        N = len(data)
        self.similarity_matrix = np.zeros((N, N))
        for k, v in tqdm(similarities_by_col.items()):
            weight = 1  # каждый признак имеет одинаковую важность
            # итоговая похожесть -- усреднение похожести по каждому признаку
            self.similarity_matrix += weight * (v - np.eye(N))  # вычитаем
        

    def recommend(self, user_id, data, top_k=10):
        if user_id not in data['user_id'].unique():
            raise ValueError('cold user 🥶')
        
        # random item from user rating history
        user_anime_ids = data[data['user_id'] == user_id]['anime_id'].tolist()
        anime_id = np.random.choice(user_anime_ids, 1)
        anime_ind = self.anime_ids.index(anime_id)
        
        # find top_k similar animes
        similarity_scores = self.similarity_matrix[anime_ind]
        nearest_animes = np.argsort(similarity_scores)[::-1][1:top_k + 1]
        recs = [self.anime_ids[i] for i in nearest_animes]

        return recs

In [6]:
cb_model = ContentBasedModel()

In [7]:
anime_with_features.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1', 'synopsis',
       'Source_f', 'Studios_f', 'Type_f', 'Rating_f', 'Popularity_f',
       'Members_f', 'Favorites_f', 'Genres_f'],
      dtype='object')

In [8]:
cb_model.train(anime_with_features)

100%|██████████| 9/9 [00:06<00:00,  1.41it/s]
100%|██████████| 9/9 [00:32<00:00,  3.59s/it]


In [9]:
train_set.dtypes

user_id             int64
anime_id            int64
rating              int64
watching_status     int64
watched_episodes    int64
dtype: object

In [11]:
# Evaluate on the test set
test_hit_rates = []
cold_user_count = 0
for user_id in tqdm(test_set['user_id'].unique()):
    try:
        actual_items = test_set[test_set['user_id'] == user_id]['anime_id'].tolist()
        recommendations = cb_model.recommend(user_id, train_set, top_k=10)
        test_hit_rates.append(hit_rate(recommendations, actual_items))
    except ValueError as e:
        cold_user_count += 1

test_hit_rate = np.mean(test_hit_rates)
print(f"Hit Rate on Test Set: {test_hit_rate}")

100%|██████████| 26851/26851 [08:27<00:00, 52.94it/s]

Hit Rate on Test Set: 0.1913284409165793





In [None]:
train_set

In [68]:
train_set[train_set['user_id'] == 28645].sort_values(by='rating', ascending=False)['anime_id'].to_list()[0]

19