In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import numpy as np # linear algebra
from scipy.sparse import csr_matrix

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the main anime table from the dataset's anime.csv.
anime =  pd.read_csv('../input/anime-recommendation-database-2020/anime.csv')

In [None]:
# Read only the id, title and synopsis columns.
# NOTE(review): 'sypnopsis' presumably matches the CSV header spelling — verify
# against the file before "correcting" it.
usecols = ['MAL_ID', 'Name', 'sypnopsis']
anime_with_synopsis = pd.read_csv('../input/anime-recommendation-database-2020/anime_with_synopsis.csv', usecols=usecols)

In [None]:
# Load the user rating table from rating_complete.csv.
rating_complete = pd.read_csv('../input/anime-recommendation-database-2020/rating_complete.csv')

# Pre-processing

In [None]:
# Column names, dtypes and non-null counts of the anime table.
anime.info()

In [None]:
# Let pandas display up to 50 columns, then preview the first rows.
pd.set_option("display.max_columns", 50)
anime.head()

In [None]:
def split_labels(series):
    """Split a comma-separated label string into a clean list of labels.

    Each label is stripped of surrounding whitespace so that "Action" and
    " Action" (which a plain split on "," produces for "Comedy, Action")
    are treated as the same label. Empty labels and the placeholder
    "Unknown" are dropped.

    Args:
        series: One cell value. Normally a string such as
            "Action, Adventure". A list/tuple of labels is also accepted
            (it is cleaned the same way), which makes the function safe to
            apply twice. Any other value (e.g. NaN) is treated as
            "no labels".

    Returns:
        list[str]: The cleaned labels, in their original order.
    """
    if isinstance(series, str):
        parts = series.split(",")
    elif isinstance(series, (list, tuple)):
        parts = list(series)
    else:
        # Missing values (NaN/None) carry no labels.
        return []

    labels = [str(part).strip() for part in parts]
    return [label for label in labels if label and label != "Unknown"]

# Turn the comma-separated Genres and Studios strings into lists of labels.
# The columns are overwritten in place on `anime`.
anime["Genres"] = anime["Genres"].map(split_labels)
anime["Studios"] = anime["Studios"].map(split_labels)

In [None]:
# Preview the table again now that Genres and Studios hold lists.
anime.head()

In [None]:
# Replace the 'Unknown' placeholder in Score with 0 so the column can be cast to float.
# NOTE(review): 0 then behaves like a real score in later sorting/statistics — confirm
# that this is intended rather than using NaN.
anime['Score'] = anime['Score'].replace('Unknown', 0).astype(float)

In [None]:
# Column names, dtypes and non-null counts of the synopsis table.
anime_with_synopsis.info()

In [None]:
# Preview the first rows of the synopsis table.
anime_with_synopsis.head()

In [None]:
# Column names, dtypes and memory footprint of the rating table.
rating_complete.info()

In [None]:
# Preview the first rows of the rating table.
rating_complete.head()

# Search Anime by Name

In [None]:
def search_anime(keyword):
    """Find anime whose Japanese or English name contains ``keyword``.

    The match is case-insensitive. ``keyword`` is passed to
    ``Series.str.contains`` with its default settings, so it is interpreted
    as a regular expression.

    Prints the 'Name' and 'Japanese name' of up to 20 matches.

    Args:
        keyword: Text (regular expression) to look for in the
            'Japanese name' and 'English name' columns of ``anime``.

    Returns:
        pandas.Index: Index holding the label of the first matching row,
        or an empty index when nothing matches.
    """
    # na=False: a missing name counts as "no match". Without it the mask
    # would contain NaN and boolean indexing with it raises.
    in_japanese_name = anime['Japanese name'].str.contains(keyword, case=False, na=False)
    in_english_name = anime['English name'].str.contains(keyword, case=False, na=False)

    # Filter once and reuse the result for both the printout and the return value.
    matches = anime[in_japanese_name | in_english_name]
    print(matches[['Name', 'Japanese name']].head(20))
    return matches.head(1).index

In [None]:
# Example lookup: print the titles matching 'STEINS' (case-insensitive) and
# show the index of the first hit.
search_anime('STEINS')

# Top 100 Anime

In [None]:
# The 100 highest-scored titles (sorted by Score, descending); preview the top five.
anime100 = anime.sort_values('Score', ascending=False).head(100)
anime100.head(5)

# Content based filtering

## Metadata

In [None]:
def to_category(df, column, is_multilabel=False):
    """Replace ``column`` with one 0/1 indicator column per distinct label.

    Args:
        df: Source DataFrame. It is not modified; a new frame is returned.
        column: Name of the column to encode.
        is_multilabel: False when each cell holds a single label
            (encoded with ``LabelBinarizer``); True when each cell holds
            an iterable of labels (encoded with ``MultiLabelBinarizer``).

    Returns:
        pandas.DataFrame: ``df`` without ``column``, followed by the
        indicator columns, which are named after the labels.
    """
    lb = MultiLabelBinarizer() if is_multilabel else LabelBinarizer()

    expandedLabelData = lb.fit_transform(df[column])
    labelClasses = lb.classes_

    # LabelBinarizer collapses a two-class column into a single 0/1 column
    # (1 == classes_[1]). Expand it back to one column per class so the data
    # lines up with `labelClasses`.
    if not is_multilabel and expandedLabelData.shape[1] == 1 and len(labelClasses) == 2:
        expandedLabelData = np.hstack([1 - expandedLabelData, expandedLabelData])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is filtered or re-indexed.
    category_df = pd.DataFrame(expandedLabelData, columns=labelClasses, index=df.index)
    return pd.concat([df.drop(columns=column), category_df], axis=1)

In [None]:
# Work on a copy holding the identifying columns plus every attribute to encode.
anime_metadata = anime[['MAL_ID', 'Name', 'Genres', 'Japanese name', 'Type', 'Studios', 'Source', 'Rating']].copy()

# Encode the single-valued attributes first, then the list-valued ones.
for single_label_column in ('Type', 'Source', 'Rating'):
    anime_metadata = to_category(anime_metadata, single_label_column)
for multi_label_column in ('Genres', 'Studios'):
    anime_metadata = to_category(anime_metadata, multi_label_column, is_multilabel=True)

# Keep the identifying columns aside before they are removed from the features.
anime_id = anime_metadata[['MAL_ID', 'Name', 'Japanese name']]

# Drop the 'Unknown' indicator and the identifying columns so only features remain.
anime_metadata = anime_metadata.drop(columns=['Unknown', 'MAL_ID', 'Name', 'Japanese name'])

print(anime_metadata.shape)
anime_metadata.head()

In [None]:
# Left-join the synopsis table onto every MAL_ID of `anime`, so titles without a
# synopsis row are kept; their missing fields become '' and every value is cast to str.
# Note: `anime_id` is rebound here to the MAL_ID Series.
anime_id = anime['MAL_ID'].copy()
anime_metadata2 = pd.merge(anime_id, anime_with_synopsis, on='MAL_ID', how='left').fillna('').astype(str)
anime_metadata2.head()

In [None]:
# TF-IDF over the synopsis text: word tokens of one or more \w characters,
# unigrams through trigrams, accents stripped, English stop words removed,
# and terms found in fewer than 3 documents discarded.
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3),
                      stop_words = 'english')

# One row per title in anime_metadata2; the last expression shows (rows, vocabulary size).
synopsis_original = anime_metadata2['sypnopsis']
synopsis_vector_tf_idf = tfv.fit_transform(synopsis_original)
synopsis_vector_tf_idf.shape

In [None]:
# Stack the indicator features and the TF-IDF synopsis features column-wise.
# toarray() is used instead of todense(): todense() returns an np.matrix, which
# turns the concatenated result into an np.matrix as well, and recent
# scikit-learn versions reject np.matrix as estimator input.
# NOTE: this materialises the full TF-IDF matrix densely, which is memory-hungry.
all_meta_data = np.concatenate((anime_metadata.values, synopsis_vector_tf_idf.toarray()), axis=1)
all_meta_data.shape

In [None]:
# Nearest-neighbour index over the combined features, using cosine distance.
# n_neighbors=9 is only the default; kneighbors() calls may pass their own value.
# The feature matrix is converted to CSR sparse form for fitting.
cb_model_knn = NearestNeighbors(metric='cosine', n_neighbors=9)
cb_model_knn.fit(csr_matrix(all_meta_data))

In [None]:
def cb_recommend_anime(query_index, n_recommendations=10):
    """Recommend anime whose content features are closest to a query title.

    Neighbours are looked up in ``cb_model_knn`` using the query title's row
    of ``all_meta_data``.

    Args:
        query_index: Row position of the query title in ``anime`` /
            ``all_meta_data``. Either a plain integer or an index/array
            such as the value returned by ``search_anime``; only its first
            element is used.
        n_recommendations: Maximum number of titles to return (default 10).

    Returns:
        pandas.DataFrame: Rows of ``anime`` for the nearest neighbours,
        closest first, with the query title itself left out. Empty (with
        the columns of ``anime``) when ``query_index`` is empty.

    Raises:
        ValueError: If ``n_recommendations`` is negative.
    """
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")

    # Accept scalars as well as pandas Index / array inputs.
    query_positions = np.atleast_1d(np.asarray(query_index)).ravel()
    if query_positions.size == 0:
        # Nothing to query (e.g. the search found no title).
        return anime.iloc[0:0]
    query_row = int(query_positions[0])

    # np.asarray guards against np.matrix rows, which scikit-learn rejects.
    query_vector = np.asarray(all_meta_data[query_row, :]).reshape(1, -1)

    # Ask for one extra neighbour because the query title is normally its own
    # nearest neighbour; never ask for more rows than exist.
    n_neighbors = min(n_recommendations + 1, all_meta_data.shape[0])
    _, neighbor_rows = cb_model_knn.kneighbors(query_vector, n_neighbors=n_neighbors)

    recommended_rows = [int(row) for row in neighbor_rows.ravel() if int(row) != query_row]
    return anime.iloc[recommended_rows[:n_recommendations]]

In [None]:
# Example: content-based recommendations for the first title matching 'この音'.
cb_recommend_anime(search_anime('この音'))

# Collaborative Filtering

In [None]:
# Number of ratings submitted by each user, as a two-column frame.
user_anime = (
    rating_complete
    .groupby('user_id')
    .size()
    .reset_index(name='anime_count')
)
user_anime.head()

In [None]:
# 80th percentile of ratings-per-user, followed by summary statistics of the table.
print(user_anime['anime_count'].quantile(0.8))
user_anime.describe()

In [None]:
# Empirical CDF of ratings-per-user: cumulative number of users at each
# rating count, normalised by the total number of users.
cdf = user_anime['anime_count'].value_counts().sort_index().cumsum()
ecdf_values = cdf / cdf.max()

fig, ax = plt.subplots()
ax.plot(list(cdf.index), ecdf_values)
ax.set_xlabel('Value')
ax.set_ylabel('ECDF')
ax.set_ylim([-0.05, 1.05])
plt.show()

In [None]:
# Keep only users with strictly more than 280 ratings.
# NOTE(review): 280 is hard-coded; it looks hand-picked from the distribution
# explored in the preceding cells — confirm and consider naming it as a constant.
filtered_users = user_anime[user_anime['anime_count'] > 280]
users = set(filtered_users['user_id'])
len(users)

In [None]:
# Restrict the ratings to those made by the selected users.
rating_data = rating_complete[rating_complete['user_id'].isin(users)]
rating_data.shape

In [None]:
# Map raw ids to dense matrix positions: one column per retained user and one
# row per anime, in order of first appearance.
unique_users = {int(x): i for i,x in enumerate(rating_data['user_id'].unique())}
unique_items = {int(x): i for i,x in enumerate(anime['MAL_ID'].unique())}

print(len(unique_items), len(unique_users))
# Item-by-user rating matrix; cells without a rating stay 0.
anime_collabolative_filter = np.zeros((len(unique_items), len(unique_users)))

# Columns of rating_data are read by position: user id, anime id, rating.
rated_item_ids = rating_data.iloc[:, 1]
user_cols = rating_data.iloc[:, 0].map(unique_users)
item_rows = rated_item_ids.map(unique_items)

# A rating that refers to an anime id missing from `anime` is an error.
unknown_items = item_rows.isna()
if unknown_items.any():
    raise KeyError(rated_item_ids[unknown_items].iloc[0])

# Fill every rating in a single vectorised assignment instead of a Python-level
# loop over each row.
# NOTE(review): assumes each (user, anime) pair occurs at most once; with
# duplicates, which rating ends up in the cell is not guaranteed.
anime_collabolative_filter[
    item_rows.to_numpy(dtype=np.intp),
    user_cols.to_numpy(dtype=np.intp),
] = rating_data.iloc[:, 2].to_numpy()

anime_collabolative_filter.shape

In [None]:
# Cosine-distance nearest-neighbour index over the item-by-user rating matrix
# (one row per anime), converted to CSR sparse form for fitting.
# n_neighbors=9 is only the default; kneighbors() calls may pass their own value.
cf_model_knn = NearestNeighbors(metric='cosine', n_neighbors=9)
cf_model_knn.fit(csr_matrix(anime_collabolative_filter))

In [None]:
def cf_recommend_anime(query_index, n_recommendations=10):
    """Recommend anime rated similarly to a query title (collaborative filtering).

    Neighbours are looked up in ``cf_model_knn`` using the query title's row
    of ``anime_collabolative_filter``.

    Args:
        query_index: Row position of the query title in ``anime`` /
            ``anime_collabolative_filter``. Either a plain integer or an
            index/array such as the value returned by ``search_anime``;
            only its first element is used.
        n_recommendations: Maximum number of titles to return (default 10).

    Returns:
        pandas.DataFrame: Rows of ``anime`` for the nearest neighbours,
        closest first, with the query title itself left out. Empty (with
        the columns of ``anime``) when ``query_index`` is empty.

    Raises:
        ValueError: If ``n_recommendations`` is negative.
    """
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")

    # Accept scalars as well as pandas Index / array inputs.
    query_positions = np.atleast_1d(np.asarray(query_index)).ravel()
    if query_positions.size == 0:
        # Nothing to query (e.g. the search found no title).
        return anime.iloc[0:0]
    query_row = int(query_positions[0])

    query_vector = np.asarray(anime_collabolative_filter[query_row, :]).reshape(1, -1)

    # Ask for one extra neighbour because the query title is normally its own
    # nearest neighbour; never ask for more rows than exist.
    n_neighbors = min(n_recommendations + 1, anime_collabolative_filter.shape[0])
    _, neighbor_rows = cf_model_knn.kneighbors(query_vector, n_neighbors=n_neighbors)

    recommended_rows = [int(row) for row in neighbor_rows.ravel() if int(row) != query_row]
    return anime.iloc[recommended_rows[:n_recommendations]]

In [None]:
# Example: collaborative-filtering recommendations for the first title matching 'この音'.
cf_recommend_anime(search_anime('この音'))