# Anime Recommendation System using Pearson Correlation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Load the datasets

* Two datasets will be loaded into dataframes.
* The dataset can be downloaded from https://www.kaggle.com/CooperUnion/anime-recommendations-database

In [None]:
# Locations of the two CSV files that make up the dataset.
ANIME_CSV = '../input/anime-recommendations-database/anime.csv'
RATING_CSV = '../input/anime-recommendations-database/rating.csv'

# anime: one row per title; rating: one row per (user, anime) rating.
anime = pd.read_csv(ANIME_CSV)
rating = pd.read_csv(RATING_CSV)

### anime.csv

* anime_id - myanimelist.net's unique id identifying an anime.
* name - full name of anime.
* genre - comma separated list of genres for this anime.
* type - movie, TV, OVA, etc.
* episodes - how many episodes in this show. (1 if movie).
* rating - average rating out of 10 for this anime.
* members - number of community members that are in this anime's "group".

### rating.csv

* user_id - non identifiable randomly generated user id.
* anime_id - the anime that this user has rated.
* rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

In [None]:
# Summarize the anime metadata frame: column dtypes and non-null counts.
anime.info()

In [None]:
# Summarize the user ratings frame: column dtypes and non-null counts.
rating.info()

In [None]:
# Report (rows, columns) for both frames.
for shape_label, frame in (('anime.csv (shape):', anime), ('rating.csv (shape):', rating)):
    print(shape_label, frame.shape)

In [None]:
# Preview the first rows of the anime metadata.
anime.head()

In [None]:
# Preview the first rows of the user ratings.
rating.head()

In [None]:
# Count the missing values in each column of the anime metadata.

anime.isna().sum()

In [None]:
# Anime without an average rating get a placeholder rating of 0.
# The frame is rebound rather than mutated in place.

anime = anime.fillna({'rating': 0})

# Exploratory Data Analysis

* Distribution of anime type
* Anime with highest rating counts
* Movie and TV ratings
* Number of episodes and rating
* Number of ratings given by the user
* Anime rating count

### Distribution of anime type

In [None]:
# Bar chart of how many titles fall into each anime type.
type_ax = sns.countplot(data=anime, x='type')
type_ax.set_title('Distribution of anime type')
plt.show()

We can see from the chart the different types of anime present in the dataset.

### Anime with highest rating counts

In [None]:
# The ten anime with the most rows in rating.csv, as an
# (anime_id, rating_count) frame.
top_anime_rating_count = (
    rating.groupby('anime_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .rename('rating_count')
    .reset_index()
)
# Attach the anime metadata (notably the name) for plotting.
top_anime_rating = top_anime_rating_count.merge(anime, on='anime_id')

sns.barplot(data=top_anime_rating, x='rating_count', y='name', orient='h')
plt.title("Top anime with highest rating count")
plt.show()

The anime 'Death Note' has the highest rating count with almost 40000 ratings.

### Movie and TV ratings

In [None]:
# Compare the average-rating distributions of movies and TV series:
# the left panel overlays the two density estimates, the right panel
# shows one box plot per type.
fig, ax = plt.subplots(ncols=2, figsize=(14, 6))

movie_ratings = anime.loc[anime['type'] == 'Movie', 'rating']
tv_ratings = anime.loc[anime['type'] == 'TV', 'rating']

# `fill=True` replaces `shade=True`, which seaborn has deprecated. The
# deprecation warning is easy to miss here because the notebook silences
# FutureWarning at the top.
sns.kdeplot(data=movie_ratings, color='b', fill=True, label='Movies', ax=ax[0])
sns.kdeplot(data=tv_ratings, color='r', fill=True, label='TV', ax=ax[0])
sns.boxplot(x='type', y='rating', data=anime[anime['type'].isin(['Movie', 'TV'])], ax=ax[1])
ax[0].legend()
plt.show()

We can see that the ratings in movies are more spread out than ratings in TV

### Number of episodes(TV) and rating

In [None]:
# Restrict to TV series whose episode count is known, then cast that
# count to int.

has_known_episodes = anime['episodes'] != 'Unknown'
is_tv_series = anime['type'] == 'TV'
episodes_rating = anime[has_known_episodes & is_tv_series].astype({'episodes': int})

# The summary statistics below reveal outliers in the episode count.
# For the sake of visualization, only anime with fewer than 100 episodes
# are considered afterwards.

episodes_rating['episodes'].describe()

In [None]:
# Keep the two plotted columns for series with fewer than 100 episodes.
episodes_rating_filtered = episodes_rating.loc[
    episodes_rating['episodes'] < 100, ['episodes', 'rating']
]

In [None]:
# Joint density of episode count versus rating for TV anime with fewer
# than 100 episodes.
# x and y are named explicitly to request a bivariate KDE: current seaborn
# treats a two-column frame passed only as `data` as wide-form input and
# draws one separate univariate curve per column, which cannot show how
# rating varies with episode count.
# `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(data=episodes_rating_filtered, x='episodes', y='rating', fill=True)
plt.title('Number of episodes (TV) and rating')
plt.show()

We can see that most anime with 10-20, 20-30 and 45-55 episodes receive a rating of around 7.

### Number of ratings given by the user

In [None]:
# Number of rating rows submitted by each user (Series indexed by user_id).
user_rating_count = rating.groupby('user_id')['anime_id'].count()
user_rating_count.describe()

In [None]:
# Density of the number of ratings submitted per user.
# `fill=True` replaces `shade=True`, which seaborn has deprecated.
sns.kdeplot(data=user_rating_count, fill=True)
plt.title('Number of ratings given by the user')
plt.xlabel("Rating count")
plt.show()

We can see that most users have given fewer than 500 ratings.

# Collaborative Filtering using Pearson Correlation

<br>

```
* In this recommendation system, we will be utilizing the collaborative filtering technique.
* By using this technique, the system will recommend anime based on the correlation between the ratings of 
  the user's anime and the ratings of other anime.
* For example, I watched 10 anime and gave each of them a rating. Now, my friend watched an anime from my 
  anime list and now asks me to recommend three anime. With that, I will recommend three anime with closest 
  rating to the rating I gave for the anime that my friend watched.
```

### Process

<br>

```
* Remove anime with low count of ratings and users who gave low count of ratings
* Construct Rating Matrix
* Correlate user's anime with other anime based on ratings
* Output ten recommended anime
```

### Remove anime with low count of ratings and users who gave low count of ratings

* We will only consider popular anime (rating count over 250) and users who gave lots of rating on different anime (>100)

In [None]:
# Number of rating rows per anime, as an (anime_id, rating_count) frame.
anime_rating_count = (
    rating.groupby('anime_id')['rating']
    .count()
    .reset_index(name='rating_count')
)
anime_rating_count['rating_count'].describe()

In [None]:
# Popular anime: strictly more than 250 rating rows.
is_popular = anime_rating_count['rating_count'] > 250
filtered_anime = anime_rating_count[is_popular]

In [None]:
# Preview the anime whose rating count is over 250.

filtered_anime.head()

In [None]:
# Number of rating rows per user, as a (user_id, rating_count) frame.
# NOTE: this rebinds `user_rating_count`, which the EDA section defined
# as a Series; re-running the EDA plot after this cell would see the frame.
user_rating_count = (
    rating.groupby('user_id')['rating']
    .count()
    .reset_index(name='rating_count')
)
user_rating_count['rating_count'].describe()

In [None]:
# Active users: strictly more than 100 rating rows.

is_active = user_rating_count['rating_count'] > 100
filtered_user = user_rating_count[is_active]

In [None]:
# Preview the users whose rating count is over 100.
filtered_user.head()

In [None]:
# Keep only explicit ratings of popular anime made by active users.
# In rating.csv a rating of -1 means "watched but not rated". Those rows
# are dropped here so the sentinel is never treated as a real score when
# the rating matrix is averaged and correlated.
is_explicit = rating['rating'] != -1
is_popular_anime = rating['anime_id'].isin(filtered_anime['anime_id'])
filtered_rating_anime = rating[is_explicit & is_popular_anime]

is_active_user = filtered_rating_anime['user_id'].isin(filtered_user['user_id'])
filtered_rating = filtered_rating_anime[is_active_user]

In [None]:
# This dataset now contains only popular anime and users with high rating counts.

filtered_rating.head()

### Construct Rating Matrix

* We will construct a matrix using a pivot table in which users are the rows (index) and anime are the columns

In [None]:
# Build the user x anime matrix: one row per user, one column per anime.
# pivot_table aggregates with its default (mean) when a (user, anime) pair
# occurs more than once. Most cells end up as zero, because most users
# have not rated most anime and the missing cells are filled with 0.

user_by_anime = filtered_rating.pivot_table(
    values='rating', index='user_id', columns='anime_id'
)
rating_matrix = user_by_anime.fillna(0)
print(rating_matrix.shape)
rating_matrix.head()

### Correlate user's anime with other anime based on ratings

In [None]:
# Look up the metadata row of the anime used as the recommendation query.
is_query_title = anime['name'] == 'Bleach'
user_anime = anime.loc[is_query_title]
user_anime

In [None]:
# Fetch the rating-matrix column (all users' ratings) of the query anime.
# Fail with a clear message when the title lookup returned nothing.
if user_anime.empty:
    raise ValueError('The requested anime was not found in anime.csv')

# `.iloc[0]` extracts the scalar id explicitly: calling int() directly on a
# one-element Series is deprecated in recent pandas.
target_anime_id = int(user_anime['anime_id'].iloc[0])

# The matrix only holds anime that passed the popularity filter.
if target_anime_id not in rating_matrix.columns:
    raise KeyError(
        f'anime_id {target_anime_id} is not in the rating matrix '
        '(it did not pass the rating-count filter)'
    )

user_anime_ratings = rating_matrix[target_anime_id]

In [None]:
# Pearson correlation of every anime column with the query anime's column,
# reshaped into an (anime_id, Correlation) frame.
correlated_anime = (
    rating_matrix.corrwith(user_anime_ratings)
    .rename('Correlation')
    .reset_index()
)
correlated_anime.head()

### Output ten recommended anime

In [None]:
# Ten anime most correlated with the query anime.
# The query anime is excluded by id instead of by dropping the first sorted
# row: the positional drop silently assumes the query always sorts first,
# which need not hold if another column ties with it.
# `user_anime_ratings.name` is the column label it was selected by, i.e.
# the query anime's id.
target_anime_id = user_anime_ratings.name
recommended_anime = (
    correlated_anime[correlated_anime['anime_id'] != target_anime_id]
    .sort_values(by='Correlation', ascending=False)
    .head(10)
)

In [None]:
# Attach anime metadata to the recommendations and hide the helper columns.
recommendations = recommended_anime.merge(anime, how='left', on='anime_id')
recommendations.drop(columns=['anime_id', 'Correlation'])