In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## はじめに
今回はKNN（k近傍法）を使ってアニメのレコメンドエンジンを作っていきたいと思います。
学習内容は[こちら](https://www.codexa.net/collaborative-filtering-k-nearest-neighbor/)を参考にしました。


In [None]:
# import Machine-Learning library
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the user-ratings table and the anime table from the Kaggle input directory.
ratings = pd.read_csv(
    '/kaggle/input/anime-recommendations-database-vol2/ratings.csv'
)
animes = pd.read_csv(
    '/kaggle/input/anime-recommendations-database-vol2/animes.csv'
)

まず、データセットの中身を確認していきます

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Summarise the ratings table (columns, dtypes, non-null counts).
ratings.info()

In [None]:
# Preview the first rows of the anime table.
animes.head()

In [None]:
# Summarise the anime table (columns, dtypes, non-null counts).
animes.info()

`animes`データを`members`によってソートします。`members`は当該アニメをmylistに含めた人の総数となり、簡単に言うとフォロワー数のようなものとなります。
`sort_values`は標準では昇順になるので、`ascending=False`を指定して降順（人気順）にします。

In [None]:
# Show the 20 anime with the largest `members` count, largest first.
animes.sort_values(by='members', ascending=False).head(20)

各データの基本統計量を確認したあとに、ratingsのヒストグラムを作成します

In [None]:
# Summary statistics of the anime table, rounded to two decimals.
animes.describe().round(2)

In [None]:
# Summary statistics of the ratings table, rounded to two decimals.
ratings.describe().round(2)

In [None]:
# A rating of 0 means "not rated", not a score of zero.
# Per the original author's note, one tick on the count axis corresponds to 500,000 votes.
# NOTE(review): with bins=10, if ratings are integers from 0 to 10 the two highest
# scores fall into the same (last) bin — confirm this is intended.
ratings['rating'].hist(bins=10, figsize=(10,10))

## 前処理
animesとratingsから有効なデータだけを抽出してanime_idをキーにしてマージします

In [None]:
# Keep only anime whose `members` value is greater than 10,000.
animes = animes.loc[animes['members'].gt(10000)]

In [None]:
# Drop unrated rows, i.e. keep only rows whose `rating` is greater than 0.
ratings = ratings.loc[ratings['rating'].gt(0)]

In [None]:
# Join every rating row to its anime row on `anime_id`. Column names present in
# both tables get the `_user` suffix on the ratings side and stay unchanged on
# the anime side.
merged_df = pd.merge(ratings, animes, on='anime_id', suffixes=('_user', ''))
merged_df.head()

In [None]:
# Keep only the columns used for modelling, then drop repeated
# (user_id, title) pairs, keeping the first occurrence of each.
merged_df = (
    merged_df
    .loc[:, ['user_id', 'title', 'rating_user']]
    .drop_duplicates(subset=['user_id', 'title'])
)

merged_df.head()

In [None]:
# Reshape into a title-by-user table of ratings; (title, user) pairs without a
# rating are filled with 0.
animes_pivot = (
    merged_df
    .pivot(index='title', columns='user_id', values='rating_user')
    .fillna(0)
)
# Sparse (CSR) copy of the same table, used to fit the nearest-neighbour model.
animes_pivot_sparse = csr_matrix(animes_pivot.to_numpy())

animes_pivot.head()

## KNNでレコメンドエンジンを構築

In [None]:
# Nearest-neighbour search: default k=9, brute-force algorithm, cosine distance.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=9)
# Fit on the sparse title-by-user matrix and keep the fitted estimator.
model_knn = knn.fit(animes_pivot_sparse)

In [None]:
# Print the titles most similar to a given anime (10 by default).
def anime_recommend(anime, n_recommendations=10):
    """Print the anime most similar to ``anime`` according to the fitted KNN model.

    Looks up the row for ``anime`` in the notebook-level ``animes_pivot`` table,
    asks the notebook-level ``model_knn`` for its nearest neighbours and prints
    them in the order returned by ``kneighbors``, each with its distance.

    Args:
        anime: Title to look up; must be a label of ``animes_pivot.index``.
        n_recommendations: Maximum number of similar titles to print
            (default 10, the number that was previously hard-coded).

    Returns:
        None. The result is written to standard output.

    Raises:
        ValueError: If ``anime`` is not a label of ``animes_pivot.index`` or
            ``n_recommendations`` is smaller than 1.
    """
    titles = animes_pivot.index
    if anime not in titles:
        # Fail with an explicit message instead of an opaque reshape error.
        raise ValueError('Anime title not found in animes_pivot: {0!r}'.format(anime))
    if n_recommendations < 1:
        raise ValueError('n_recommendations must be at least 1')

    query_pos = titles.get_loc(anime)
    # Double brackets keep the row two-dimensional, as kneighbors expects.
    query_vector = animes_pivot.iloc[[query_pos]].values

    # The queried title is itself one of the fitted rows, so request one extra
    # neighbour, but never more than the number of rows in the table.
    n_query = min(n_recommendations + 1, len(titles))
    distance, indice = model_knn.kneighbors(query_vector, n_neighbors=n_query)

    print('Recommendations if you like the anime {0}:\n'.format(titles[query_pos]))

    rank = 0
    for dist, pos in zip(distance.flatten(), indice.flatten()):
        # Skip the queried title by position rather than assuming it is always
        # returned first: rows at the same distance may come back in any order.
        if pos == query_pos:
            continue
        rank += 1
        if rank > n_recommendations:
            break
        print('{0}: {1} with distance: {2}'.format(rank, titles[pos], dist))


In [None]:
# Example query: titles similar to 'K-On!'.
anime_recommend('K-On!')

In [None]:
# Example query: titles similar to 'Chihayafuru'.
anime_recommend('Chihayafuru')

In [None]:
# Example query: titles similar to 'Koi wa Ameagari no You ni'.
anime_recommend('Koi wa Ameagari no You ni')

In [None]:
# Example query: titles similar to 'Mahoutsukai no Yome'.
anime_recommend('Mahoutsukai no Yome')

In [None]:
# Example query: titles similar to 'Kuzu no Honkai'.
anime_recommend('Kuzu no Honkai')

In [None]:
# Example query: titles similar to 'Kakegurui'.
anime_recommend('Kakegurui')