 # Memory-based User-User Collaborative filtering demo
 ## on Anime Recommendations Database

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the anime metadata table and the user rating table, in that order.
anime, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/anime-recommendations-database/anime.csv',
        '/kaggle/input/anime-recommendations-database/rating.csv',
    )
)

In [None]:
# Preview the first ten rows of the anime metadata.
anime.head(n=10)

In [None]:
# Print the column names, non-null counts and dtypes of the anime metadata frame.
anime.info()

In [None]:
# Preview the first ten rows of the rating table.
ratings.head(n=10)

In [None]:
# Print the column names, non-null counts and dtypes of the rating table.
ratings.info()

In [None]:
# Build the user x anime rating matrix (rows: user_id, columns: anime_id).
#
# In the Anime Recommendations Database a rating of -1 is a sentinel meaning
# "watched but not rated", not a score. It is masked to NaN here so that it is
# treated like any other missing rating instead of entering distance and sum
# computations as a negative value.
#
# pivot() requires unique (user_id, anime_id) pairs. After the ascending sort
# NaN values are placed last, so keeping the first duplicate retains the lowest
# explicit rating and falls back to NaN only when no explicit rating exists.
modified_df = (
    ratings
    .assign(rating=ratings['rating'].where(ratings['rating'] >= 0))
    .sort_values('rating')
    .drop_duplicates(subset=['user_id', 'anime_id'])
)
reshaped_df = modified_df.pivot(index='user_id', columns='anime_id', values='rating')

In [None]:
# Encode "no rating" as 0 so the matrix is fully numeric for the neighbour search.
# Done in place on the existing frame rather than binding a new one.
reshaped_df.fillna(value=0, inplace=True)

In [None]:
# Preview the first ten users of the user x anime matrix.
reshaped_df.head(n=10)

In [None]:
# Histogram of rating rows over anime_id, using 1000 bins.
ratings.hist(bins=1000, column='anime_id')

In [None]:
# Histogram of rating rows over user_id, using 1000 bins.
ratings.hist(bins=1000, column='user_id')

In [None]:
# Scatter plot with one point per rating row: user_id on x, anime_id on y.
ratings.plot(kind='scatter', x='user_id', y='anime_id', c='DarkBlue')

## User-user recommender system with smaller subset of data

In [None]:
# Take the first 5000 users (rows) and the first 5000 anime titles (columns) by position.
# A label-based .loc[:5000, :5000] slice would instead keep user_id <= 5000 and
# anime_id <= 5000, which is not a 5000x5000 block whenever the ids have gaps.
subset_5000_df = reshaped_df.iloc[:5000, :5000]
subset_5000_df.head()

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index every user row of the subset in a ball tree; each query returns 4 neighbours.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=4)
nbrs.fit(subset_5000_df)

In [None]:
# Query the neighbours of the users whose user_id label is at most 3
# (.loc slices by label and includes the end point).
distances, indices = nbrs.kneighbors(subset_5000_df.loc[:3], return_distance=True)
indices

In [None]:
# Display the neighbour distances returned by the kneighbors call in the previous cell.
distances

Suppose we want to recommend titles to the user with `user_id` 35 (the row whose index label is 35). The process is as follows.

In [None]:
# The user to recommend for, identified by user_id label (the matrix is indexed by user_id).
current_user = 35

# The double brackets keep the selection two-dimensional (a one-row frame),
# which is the shape kneighbors takes for its queries.
distances, indices = nbrs.kneighbors(
    subset_5000_df.loc[[current_user]],
    return_distance=True,
)
distances

In [None]:
# Display the neighbours found for the current user. The values are 0-based row
# positions in the matrix the model was fitted on, not user_id labels.
indices

In [None]:
print(indices[0][1])
result = subset_5000_df.loc[indices[0][1:], :]
result

In [None]:
result = result.transpose()

result['rating_sum'] = result[indices[0][1]] + result[indices[0][2]] + result[indices[0][3]]

result

In [None]:
# Copy the anime_id index into a regular column, rank titles by the neighbours'
# summed ratings (highest first), and renumber the rows 0..n-1.
result = (
    result
    .assign(anime_id=result.index)
    .sort_values(by='rating_sum', ascending=False)
    .reset_index(drop=True)
)
result

In [None]:
# Keep the anime_id of the five best-ranked titles. The rows were renumbered from 0
# in the previous cell, so the first five rows are exactly the labels 0 through 4.
recommendations = result['anime_id'].head(5)
recommendations

In [None]:
# Look up the title of each recommended anime_id in the metadata table.
# to_string() renders the matching row(s) as text, including the row index.
[
    anime.loc[anime['anime_id'] == anime_id, 'name'].to_string()
    for anime_id in recommendations.to_numpy()
]

## Comparison of execution time performance for various sizes of the dataframe

### 5000 users and 5000 anime titles

In [None]:
import time

Here we will document the time taken to fit the nearest neighbors algorithm and retrieve the identities and distances of four nearest neighbors for ten users.

In [None]:
# Time fitting the ball tree on the 5000 subset plus one 4-neighbour query for the
# users whose user_id label is at most 10.
elapsed = -time.perf_counter()
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=4)
nbrs.fit(subset_5000_df)
distances, indices = nbrs.kneighbors(subset_5000_df.loc[:10])
elapsed += time.perf_counter()
print('Elapsed time is: {}.'.format(elapsed))

It takes about 2.9 seconds to run the 4-nearest-neighbors search on a 5000x5000 matrix. Let us see how this algorithm scales.

### 10000 users and 10000 anime titles

In [None]:
# Take the first 10000 users (rows) and the first 10000 anime titles (columns) by position.
# A label-based .loc[:10000, :10000] slice would instead keep user_id <= 10000 and
# anime_id <= 10000, which is not a 10000x10000 block whenever the ids have gaps.
subset_10000_df = reshaped_df.iloc[:10000, :10000]
subset_10000_df.head()

In [None]:
# Time fitting the ball tree on the 10000 subset plus one 4-neighbour query for the
# users whose user_id label is at most 10.
elapsed = -time.perf_counter()
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=4)
nbrs.fit(subset_10000_df)
distances, indices = nbrs.kneighbors(subset_10000_df.loc[:10])
elapsed += time.perf_counter()
print('Elapsed time is: {}.'.format(elapsed))

It takes about 9.07 seconds for the 10000x10000 dataset.

### 100000x100000 dataset (full dataset)

In [None]:
# Take up to the first 100000 users and 100000 anime titles by position, consistent
# with the smaller subsets. Positional slicing stops at the end of each axis, so
# this selects the whole matrix whenever it is smaller than 100000 x 100000.
subset_100000_df = reshaped_df.iloc[:100000, :100000]
subset_100000_df.head()

In [None]:
# Time fitting the ball tree on the largest subset plus one 4-neighbour query for the
# users whose user_id label is at most 10.
elapsed = -time.perf_counter()
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=4)
nbrs.fit(subset_100000_df)
distances, indices = nbrs.kneighbors(subset_100000_df.loc[:10])
elapsed += time.perf_counter()
print('Elapsed time is: {}.'.format(elapsed))

It takes about 208 seconds for the computation to run on the whole dataset. The order of complexity is O(UIK), where U, I and K are the number of users, the number of items and the number of neighbors, respectively.