In [101]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from typing import List
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [9]:
# Load the two source tables: the full metadata table and the smaller
# table that carries the synopsis text.
anime = pd.read_csv('./anime_dataset_1/anime.csv')
anime_with_synopsis = pd.read_csv('./anime_dataset_1/anime_with_synopsis.csv')

In [32]:
# Sanity check: a left join on MAL_ID should keep exactly one row per anime,
# i.e. the merged length must equal len(anime).
merged_row_count = len(anime.merge(anime_with_synopsis, on='MAL_ID', how='left'))
len(anime), len(anime_with_synopsis), merged_row_count

(17562, 16214, 17562)

In [33]:
# Name, Score and Genres already exist in `anime`, so drop them from the
# synopsis table before joining to avoid duplicated (_x/_y) columns.
synopsis_only = anime_with_synopsis.drop(columns=['Name', 'Score', 'Genres'])

# Left join keeps every anime; titles without a synopsis row get '' instead of NaN.
anime_with_features = (
    anime
    .merge(synopsis_only, on='MAL_ID', how='left')
    .fillna('')
)

In [34]:
# The source CSV misspells the column as 'sypnopsis'; normalise the name.
synopsis_column_fix = {'sypnopsis': 'synopsis'}
anime_with_features = anime_with_features.rename(columns=synopsis_column_fix)

In [35]:
# Inspect the column labels of the full metadata table.
anime.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [36]:
# Inspect the column labels of the synopsis table (note the misspelled 'sypnopsis').
anime_with_synopsis.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'sypnopsis'], dtype='object')

In [37]:
# Confirm the merged frame has all `anime` columns plus the renamed 'synopsis'.
anime_with_features.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1', 'synopsis'],
      dtype='object')

In [38]:
# Preview the merged frame (the notebook repr shows the first and last rows).
# NOTE(review): consider `.head()` to keep the saved output small.
anime_with_features

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,...,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,No synopsis information has been added to this...
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,ko is a typical high school student whose life...
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Sequel to Higurashi no Naku Koro ni Gou .
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,New Yama no Susume anime.


In [98]:
# Make sure every synopsis is a string before vectorising.
anime_with_features['synopsis'] = anime_with_features['synopsis'].fillna('')

# Unigrams + bigrams, log-scaled term frequency, rows scaled to unit L2 norm
# (so a dot product between two rows is their cosine similarity).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    sublinear_tf=True,
    norm='l2',
)

synopsis_texts = anime_with_features['synopsis'].tolist()
anime_tfidf_matrix = vectorizer.fit_transform(synopsis_texts)

In [40]:
def get_recommendations(anime_name: str, n_recommendations: int = 10) -> List[str]:
    """Return the titles whose synopsis TF-IDF vectors are closest to ``anime_name``.

    Args:
        anime_name: Exact value of the ``Name`` column to look up. If several
            rows share the name, the first one is used.
        n_recommendations: Maximum number of titles to return.

    Returns:
        Up to ``n_recommendations`` names ordered from most to least similar.
        The queried title itself is never part of the result.

    Raises:
        IndexError: If no row has ``Name == anime_name``.
    """
    # Work with positional row numbers: the TF-IDF matrix is addressed by
    # position, which matches index labels only for a default RangeIndex.
    matches = np.flatnonzero((anime_with_features['Name'] == anime_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"anime {anime_name!r} not found in anime_with_features['Name']")
    if n_recommendations <= 0:
        return []
    query_pos = int(matches[0])

    # TF-IDF rows are L2-normalised (norm='l2'), so the dot product with the
    # query row is the cosine similarity to every title.
    cosine_similarities = anime_tfidf_matrix.dot(anime_tfidf_matrix[query_pos].T).toarray().ravel()

    # Rank best-first and drop the query row explicitly. Simply cutting off the
    # top-ranked entry is wrong when the query vector is all zeros (empty
    # synopsis) or ties with another title, because the query is then not
    # guaranteed to be ranked first.
    ranked = np.argsort(-cosine_similarities, kind='stable')
    ranked = ranked[ranked != query_pos][:n_recommendations]
    return anime_with_features['Name'].iloc[ranked].tolist()

def get_anime_synopsis(anime_name: str) -> str:
    """Return the synopsis of the first row whose ``Name`` equals ``anime_name``.

    Raises:
        IndexError: If no row matches.
    """
    name_mask = anime_with_features['Name'] == anime_name
    matching_synopses = anime_with_features.loc[name_mask, 'synopsis']
    return matching_synopses.values[0]

In [66]:
# Inspect a single title by index label (2608 is 'Ani*Kuri15' in this dataset).
anime_with_features.loc[2608]

MAL_ID                                                        2832
Name                                                    Ani*Kuri15
Score                                                         6.79
Genres           Slice of Life, Adventure, Fantasy, Magic, Game...
English name                                               Unknown
Japanese name                                              アニ＊クリ15
Type                                                       Special
Episodes                                                        15
Aired                                  Jun 7, 2007 to Jun 27, 2007
Premiered                                                  Unknown
Producers                                                  Unknown
Licensors                                                  Unknown
Studios          Gonzo, Gainax, Production I.G, Madhouse, Studi...
Source                                                    Original
Duration                                            1 min. per

In [99]:
# Per-feature similarity matrices keyed by feature name; filled in by the cells below.
similarities_by_col = {}

# Derived list-valued columns (suffix '_f') that are one-hot encoded and
# compared with cosine similarity.
# 'Synopsis_f' is deliberately NOT listed: no such column is ever created, so
# iterating over it raises KeyError. Synopsis similarity is computed from the
# TF-IDF matrix instead and stored under the 'Synopsis' key.
feature_columns = [
    'Genres_f',
    'Type_f',
    'Source_f',
    'Studios_f',
    'Favorites_f',
    'Popularity_f',
    'Members_f',
    'Rating_f',
]

In [97]:
# Text features that may hold several comma-separated values:
# turn "A, B, C" into ['A', 'B', 'C'] and store it in the matching '<name>_f' column.
for derived_col in ['Source_f', 'Studios_f', 'Type_f', 'Rating_f', 'Genres_f']:
    source_col = derived_col[:-2]  # strip the '_f' suffix to get the raw column
    anime_with_features[derived_col] = anime_with_features[source_col].map(
        lambda text: text.split(', ')
    )

In [94]:
# Single-valued features: wrap each value in a one-element list so that every
# '<name>_f' column has the same list-of-values layout.
for derived_col in ['Favorites_f', 'Members_f', 'Popularity_f']:
    source_col = derived_col[:-2]  # strip the '_f' suffix to get the raw column
    anime_with_features[derived_col] = anime_with_features[source_col].map(
        lambda scalar: [scalar]
    )

In [95]:
for col in tqdm(feature_columns):
    # Encode the categorical feature: every distinct value gets its own column id.
    mapping = {k: v for v, k in enumerate(anime_with_features[col].explode().unique().tolist())}

    # Build a sparse title-by-value indicator matrix in coordinate form.
    rows = []
    cols = []
    values = []
    for row_ind, value in enumerate(anime_with_features[col]):
        value = [] if value is None else value

        # Columns are the feature values attached to this title.
        col_inds = [mapping[x] for x in value]
        rows.extend([row_ind] * len(col_inds))
        # Put a 1 wherever the value belongs to the title.
        values.extend([1] * len(col_inds))
        cols.extend(col_inds)

    # Pass the shape explicitly: without it the number of rows is inferred from
    # the largest populated row index, so trailing titles with no values would
    # be dropped and the similarity matrix would no longer be N x N.
    sparse_data = csr_matrix(
        (values, (rows, cols)),
        shape=(len(anime_with_features), len(mapping)),
    )

    # After L2-normalising the rows, the Gram matrix is the pairwise cosine similarity.
    sparse_data = normalize(sparse_data, norm="l2", axis=1)
    # NOTE: this materialises a dense N x N array per feature.
    # `.toarray()` is used instead of the `.A` shorthand, which is not
    # available on SciPy sparse arrays.
    similarities_by_col[col] = (sparse_data @ sparse_data.T).toarray()

100%|██████████| 7/7 [00:04<00:00,  1.64it/s]


In [100]:
# Pairwise cosine similarity between synopses: the TF-IDF rows are L2-normalised
# (norm='l2'), so the Gram matrix already holds cosine similarities.
# `.toarray()` is used instead of the `.A` shorthand, which is not available on
# SciPy sparse arrays; it also matches the call used in get_recommendations.
similarities_by_col['Synopsis'] = (anime_tfidf_matrix @ anime_tfidf_matrix.T).toarray()

## Ансамбль посчитанных схожестей

In [102]:
N = len(anime_with_features)
diagonal = np.diag_indices(N)

similarities = np.zeros((N, N))
for k, v in tqdm(similarities_by_col.items()):
    weight = 1  # every feature is equally important
    # The combined similarity is the weighted SUM of the per-feature
    # similarities. It is not divided by the number of features, which does
    # not change the ranking compared with the mean.
    similarities += weight * v
    # Subtract the identity so that a title does not count as similar to
    # itself. Only the diagonal is touched, rather than building a dense
    # N x N np.eye(N) and an N x N temporary on every iteration.
    similarities[diagonal] -= weight

similarities.shape

100%|██████████| 1/1 [00:03<00:00,  3.89s/it]


(17562, 17562)

In [103]:
# Persist the feature table (including the derived '_f' columns) for later use.
features_path = './data/anime_with_features.parquet'
anime_with_features.to_parquet(features_path)