In [24]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Anime metadata tables (CSV files).
anime, anime_with_synopsis = (
    pd.read_csv(csv_path)
    for csv_path in (
        './anime_dataset_1/anime.csv',
        './anime_dataset_1/anime_with_synopsis.csv',
    )
)

# Train / test tables (parquet files).
train_set, test_set = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('./data/train.parquet', './data/test.parquet')
)

## Признаки пользователей

In [4]:
# Working table built from the training set; the existing index is discarded
# and replaced with a fresh 0..n-1 row index.
data = train_set.reset_index(drop=True)

In [5]:
# Inspect which columns the working table provides.
data.columns

Index(['user_id', 'anime_id', 'rating', 'watching_status', 'watched_episodes'], dtype='object')

In [6]:
# Attach anime metadata to every interaction row (anime_id -> MAL_ID).
# The frame is rebuilt from train_set rather than from `data` itself so that
# re-running this cell is idempotent: merging an already-merged `data` would
# add the anime columns a second time with `_x` / `_y` suffixes.
# validate='many_to_one' raises if MAL_ID is not unique in `anime`, which
# would otherwise silently duplicate interaction rows.
data = train_set.reset_index(drop=True).merge(
    anime,
    left_on='anime_id',
    right_on='MAL_ID',
    how='left',
    validate='many_to_one',
)

In [14]:
# Distinct user and anime ids as plain Python lists, in order of first
# appearance in `data`.
user_ids, item_ids = (
    data[id_column].unique().tolist()
    for id_column in ('user_id', 'anime_id')
)

In [16]:
# Encode each id as its position in user_ids / item_ids.
# Passing `categories=` explicitly is essential: by default pd.Categorical
# numbers the values in *sorted* order, whereas user_ids / item_ids are in
# first-appearance order. Without it, row/column k of the rating matrix would
# not correspond to user_ids[k] / item_ids[k], and lookups through those lists
# would return the wrong users and items.
data['user_idx'] = pd.Categorical(data['user_id'], categories=user_ids).codes
data['item_idx'] = pd.Categorical(data['anime_id'], categories=item_ids).codes

In [18]:
# Number of distinct users and distinct anime found in `data`.
len(user_ids), len(item_ids)

(29214, 15695)

In [19]:
# Extract the two index columns and the ratings as independent 1-D NumPy
# arrays (copies of the DataFrame columns).
user_idx, item_idx, rating = (
    data[column].to_numpy(copy=True)
    for column in ('user_idx', 'item_idx', 'rating')
)

In [23]:
# Sparse rating matrix: user_idx selects the row, item_idx the column, and
# the stored value is the rating.
n_users, n_items = len(user_ids), len(item_ids)
sparse_matrix = csr_matrix((rating, (user_idx, item_idx)), shape=(n_users, n_items))

In [25]:
# Cosine similarity between every pair of rows (users) of the rating matrix.
# Y=None compares the rows of X with each other; dense_output=True is the
# default and is spelled out here for clarity.
similarity_matrix = cosine_similarity(X=sparse_matrix, Y=None, dense_output=True)

In [26]:
# Sanity check: the similarity matrix should be square, one row/column per user.
similarity_matrix.shape

(29214, 29214)

In [29]:
def recommend(user_id, top_k=10, exclude_seen=False):
    """Return the ids of the ``top_k`` highest-scoring items for ``user_id``.

    Each item's score is the similarity-weighted sum of all users' ratings for
    it: the user's row of ``similarity_matrix`` multiplied by ``sparse_matrix``.

    Parameters
    ----------
    user_id
        An id present in ``user_ids``.
    top_k : int, default 10
        Maximum number of item ids to return. Non-positive values yield ``[]``.
    exclude_seen : bool, default False
        When True, items that already have a stored entry in the user's own row
        of ``sparse_matrix`` are left out, so fewer than ``top_k`` ids may come
        back. The default keeps the original behaviour (seen items may be
        recommended).

    Returns
    -------
    list
        Item ids taken from ``item_ids``, best score first.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``user_ids``.
    """
    try:
        # `row` rather than `user_idx`, to avoid shadowing the module-level array.
        row = user_ids.index(user_id)
    except ValueError:
        raise ValueError(
            f"user_id {user_id!r} is not present in user_ids"
        ) from None

    # A negative top_k would otherwise slice "all but the last k" items.
    if top_k <= 0:
        return []

    # Flatten to a 1-D float array so it can be masked and argsorted.
    scores = np.asarray(similarity_matrix[row] @ sparse_matrix, dtype=float).ravel()

    if exclude_seen:
        # Column indices of every entry stored in this user's row.
        seen_columns = sparse_matrix[row].indices
        scores[seen_columns] = -np.inf

    ranked = np.argsort(scores)[::-1][:top_k]

    if exclude_seen:
        # Drop masked items that slipped in when too few unseen items exist.
        ranked = ranked[np.isfinite(scores[ranked])]

    return [item_ids[i] for i in ranked]

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
58,79,Shuffle!,7.09,"Harem, Comedy, Drama, Magic, Romance, Ecchi, F...",Shuffle!,SHUFFLE! (シャッフル!),TV,24,"Jul 8, 2005 to Jan 6, 2006",Summer 2005,...,8852.0,14660.0,26443.0,32343.0,18951.0,10530.0,5019.0,2184.0,1323.0,1088.0
2218,2421,Super Kuma-san,6.13,"Comedy, Fantasy",Unknown,スーパークマさん,Special,1,"Jun 1, 2003",Unknown,...,15.0,17.0,33.0,67.0,93.0,64.0,22.0,10.0,9.0,5.0
3285,3799,Speed,5.18,Comedy,Unknown,スピード,Movie,1,"Jan 1, 1980",Unknown,...,5.0,9.0,24.0,70.0,98.0,120.0,74.0,43.0,31.0,35.0
3928,5031,Cobra The Animation: Time Drive,6.92,"Action, Adventure, Space, Sci-Fi",Unknown,COBRA THE ANIMATION タイム・ドライブ,OVA,2,"Apr 24, 2009 to Jun 26, 2009",Unknown,...,82.0,128.0,357.0,532.0,298.0,133.0,45.0,17.0,9.0,12.0
7473,16586,Michitekuru Toki no Mukou ni,Unknown,"Adventure, Fantasy, Romance",Unknown,満ちてくる　時のむこうに,Special,1,"Jun 16, 1991",Unknown,...,5.0,6.0,4.0,9.0,9.0,6.0,2.0,2.0,1.0,2.0
9368,25641,Monotonous Purgatory,6.18,Music,Unknown,MONOTONOUS PURGATORY,Music,1,"Oct 26, 2012",Unknown,...,20.0,32.0,95.0,169.0,175.0,125.0,48.0,12.0,11.0,21.0
9437,26035,Sweet Spot,Unknown,Comedy,Unknown,スイートスポット,OVA,1,"Apr 21, 1991",Unknown,...,6.0,2.0,4.0,16.0,23.0,25.0,12.0,5.0,3.0,3.0
11823,33281,Mori no Andou,5.92,Comedy,Unknown,森の安藤,ONA,1,"Jan, 2006",Unknown,...,274.0,67.0,114.0,170.0,201.0,148.0,104.0,103.0,104.0,144.0
13841,36754,Kakuriyo no Yadomeshi,7.5,"Demons, Drama, Romance, Supernatural",Kakuriyo:Bed and Breakfast for Spirits,かくりよの宿飯,TV,26,"Apr 2, 2018 to Sep 24, 2018",Spring 2018,...,3539.0,5112.0,9551.0,10033.0,4754.0,2368.0,872.0,311.0,138.0,109.0
17334,44059,Puparia,7.48,Dementia,Puparia,PUPARIA,ONA,1,"Nov 20, 2020",Unknown,...,490.0,615.0,1092.0,1036.0,517.0,238.0,71.0,39.0,18.0,19.0


In [8]:
# Per-user feature: number of rows in `data` for each user_id.
user_interactions_count = data.groupby('user_id').size()

In [9]:
# Per-user feature: mean of the `rating` column over each user's rows.
user_avg_rating = data['rating'].groupby(data['user_id']).mean()

In [10]:
def _collect_user_genres(genre_entries):
    """Join the distinct genres found in a user's rows into one string.

    Each entry is a comma-separated genre list. Names are stripped of
    surrounding whitespace so that "Comedy" and " Comedy" are the same genre,
    missing entries (NaN, e.g. anime with no metadata match) are skipped, and
    the result is sorted so the output is identical from run to run.
    """
    genres = {
        genre.strip()
        for entry in genre_entries.dropna()
        for genre in entry.split(',')
    }
    genres.discard('')
    return ','.join(sorted(genres))


# Per-user feature: comma-separated set of genres across the user's rows.
user_genres = data.groupby('user_id')['Genres'].apply(_collect_user_genres)

In [None]:
# Per-user feature: total of `watched_episodes` over each user's rows.
user_episodes_watched = data['watched_episodes'].groupby(data['user_id']).sum()