# Content Based Recommender System

In [None]:
import numpy as np 
#for arrays 
import pandas as pd
#for csv files
import matplotlib.pyplot as plt
#for plots
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

# Read the data

In [None]:
# Load the anime table from the relative input directory and preview the first rows.
df = pd.read_csv("../input/anime.csv")
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We only want to build a recommender based on the genre and the type, so let's focus only on those features.

# Handling Missing Value

## Handling missing rating

In [None]:
# Replace every missing rating with 0.0, then confirm that no null rating is left.
df['rating'] = df['rating'].fillna(0.0)
df['rating'].isna().any()

## Handling missing type

In [None]:
# Bar chart of the number of anime per type.
df['type'].value_counts().plot.bar()

My approach is to fill the missing type based on the number of episodes. But after checking the data, some of the anime have both 'Unknown' episodes and a missing type.

In [None]:
# Preview rows where episodes is "Unknown" and the type is missing at the same time.
df.loc[(df['episodes']=="Unknown") & (df['type'].isnull())].head()

In [None]:
# Independent copy of the rows that lack both an episode count and a type.
no_episode_count = df['episodes'] == "Unknown"
no_type = df['type'].isnull()
missing = df.loc[no_episode_count & no_type].copy()
missing.shape

It seems the data is not up to date with the current season. There is no other way: we have to fill in the type and the episodes manually.

In [None]:
# Show up to 25 of the incomplete rows so their titles can be looked up by hand.
missing.head(25)

In [None]:
# Hand-entered (type, episodes) values for titles whose data was incomplete.
# Episode counts are kept as strings, like the values this column is compared
# against elsewhere in the notebook ("Unknown").
MANUAL_FIXES = {
    "Steins;Gate 0": ('TV', '23'),
    "Violet Evergarden": ('TV', '13'),
    "Code Geass: Fukkatsu no Lelouch": ('TV', '25'),
    "K: Seven Stories": ('Movie', '6'),
    "Free! (Shinsaku)": ('TV', '12'),
    "Busou Shoujo Machiavellianism": ('TV', '12'),
    "Code:Realize: Sousei no Himegimi": ('TV', '12'),
    "Gamers!": ('TV', '12'),
    "Ganko-chan": ('TV', '10'),
    "Ginga Eiyuu Densetsu (2017)": ('OVA', '110'),
    "Grancrest Senki": ('TV', '24'),
    "IDOLiSH7": ('TV', '17'),
    "Isekai Shokudou": ('TV', '12'),
    "Oushitsu Kyoushi Haine": ('TV', '12'),
    "Peace Maker Kurogane (Shinsaku)": ('TV', '24'),
    "Seikaisuru Kado": ('TV', '12'),
    "UQ Holder!": ('TV', '12'),
    "Citrus": ('TV', '12'),
    "Hitorijime My Hero": ('TV', '12'),
}

for title, (anime_type, episode_count) in MANUAL_FIXES.items():
    # Build the row mask once per title and reuse it for both columns.
    is_title = df['name'] == title
    df.loc[is_title, 'type'] = anime_type
    df.loc[is_title, 'episodes'] = episode_count

# Remaining missing values per column after the manual fixes.
df.isnull().sum()

There are still some anime with a missing 'type' because they have not aired yet. I will drop those anime.

In [None]:
# Remove, in place, every row whose type is still missing, then show the new size.
df.dropna(axis=0, how='any', subset=['type'], inplace=True)
df.shape

## Handling Unknown Genre

In [None]:
# Missing values per column after the untyped rows were dropped.
df.isnull().sum()

In [None]:
# Rows that have no genre value.
df[df['genre'].isnull()]

For now, let's just fill it with 'Unknown'.

In [None]:
# Label anime without a genre as 'Unknown'. The column is reassigned instead of
# calling fillna(..., inplace=True) on `df['genre']`: that chained form works on
# an intermediate object and leaves `df` unchanged under pandas Copy-on-Write.
df['genre'] = df['genre'].fillna('Unknown')
df.isnull().any()

# Build the Recommender System

In [None]:
# Preview the frame after the missing-value handling.
df.head()

## Calculate Weighted Rating

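Maybe it is also a good idea to show the weighted rating to the user whenever they query for similar anime. We can improve the recommendations by sorting them by their respective weighted rating.

The weighted rating of an anime is $WR = \frac{v}{v + m} R + \frac{m}{v + m} C$, where $v$ is its number of members, $R$ its rating, $m$ the 75th percentile of the member counts and $C$ the mean rating over all anime.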

In [None]:
# m: 75th percentile of the member counts; C: mean rating across all rows.
m = df['members'].quantile(0.75)
C = df['rating'].mean()
print(m, C)

In [None]:
def weighted_rating(df, m, C):
    """Blend each rating with the prior ``C`` according to the member count.

    ``df`` is anything that can be indexed with ``'members'`` and ``'rating'``
    (a single row or a whole frame). The weight given to the item's own rating
    is ``members / (m + members)``; the remainder goes to ``C``.
    """
    votes = df['members']
    confidence = votes / (m + votes)
    own_share = df['rating'] * confidence
    prior_share = (1 - confidence) * C
    return own_share + prior_share

In [None]:
# weighted_rating only performs element-wise arithmetic on the 'members' and
# 'rating' columns, so it is applied to the whole frame in one vectorised call
# instead of row by row through DataFrame.apply(axis=1).
df['community_rating'] = weighted_rating(df, m, C)
df.head()

## Dropping some unused columns

These are the features that will be dropped:
* anime_id -> just an identifier of the anime; it is easier to use the pandas index
* rating -> we have the weighted rating (community_rating)
* members -> we have the weighted rating (community_rating)
* episodes -> the data is not up to date for currently airing anime, and some anime are still airing (One Piece, etc.)

In [None]:
# Remove, in place, the columns that are no longer needed and preview the result.
unused_columns = ['anime_id', 'rating', 'members', 'episodes']
df.drop(columns=unused_columns, inplace=True)
df.head()

## Breakdown the genre and type

We want our algorithm to treat every 'type' and every 'genre' of anime as equal. If we used label encoding, the algorithm might treat some categories as more important than others, so we one-hot encode both columns instead.

In [None]:
# One-hot encode the type and the comma-separated genre list.
# Whitespace around the commas is removed before splitting: with sep=',' alone,
# a list written as "A, B" would produce the column " B" (leading space), which
# is distinct from the column "B" produced when B is the first entry.
type_dummies = df['type'].str.get_dummies()
genre_dummies = (
    df['genre']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(sep=',')
)
df = pd.concat([df, type_dummies, genre_dummies], axis=1)
df.head()

In [None]:
# Copy every column from "Movie" through the last one into the feature frame.
# NOTE(review): presumably "Movie" is the first one-hot column; verify against df.columns.
first_feature = "Movie"
anime_features = df.loc[:, first_feature:].copy()
anime_features.head()

## Calculate the similarity matrix

In [None]:
# Pairwise cosine similarity between the feature rows of all anime.
feature_matrix = anime_features.values
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

In [None]:
# Display the similarity matrix.
cosine_sim

In [None]:
# Dimensions of the similarity matrix.
cosine_sim.shape

Now we must create an index for each anime name; it will be used when a user queries for a recommendation.

In [None]:
# Map each anime name to its row label in df.
# Duplicates are removed on the Series index (the names), keeping the first
# occurrence, so a lookup by name returns a single label. drop_duplicates()
# compares the values instead, which here are the row labels of df.
anime_index = pd.Series(df.index, index=df['name'])
anime_index = anime_index[~anime_index.index.duplicated(keep='first')]

In [None]:
def get_recommendation(anime_name, similarity=cosine_sim):
    """Return the 10 anime most similar to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Name of the query anime; it is looked up in ``anime_index``.
    similarity : array-like, optional
        Pairwise similarity matrix indexed by row position, in the same row
        order as ``df``. Defaults to ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        The ``name``, ``genre`` and ``community_rating`` columns of the most
        similar anime, best match first, excluding the query anime itself.

    Raises
    ------
    KeyError
        If ``anime_name`` is not present in ``anime_index``.
    """
    if anime_name not in anime_index.index:
        raise KeyError("Unknown anime name: {!r}".format(anime_name))

    label = anime_index[anime_name]
    # A name that occurs more than once yields several labels; use the first.
    if isinstance(label, pd.Series):
        label = label.iloc[0]

    # anime_index stores row labels, while the similarity matrix and .iloc are
    # positional; the two differ once rows have been removed from df.
    position = df.index.get_loc(label)

    # Pairwise similarity scores of every anime with the query anime.
    scores = similarity[position]

    # Rank all other anime, best first. The query is excluded before the list
    # is truncated, so it neither occupies a slot nor has to be dropped later.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != position),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    top_positions = ranked[:10]

    # Return the top 10 most similar anime.
    return df[['name', 'genre', 'community_rating']].iloc[top_positions]

In [None]:
# Example query: anime most similar to "Steins;Gate".
get_recommendation("Steins;Gate")

We see that it is very reasonable that __Steins;Gate 0__ is the most similar anime, because it is the alternate ending of __Steins;Gate__. The rating is very low because __Steins;Gate 0__ was a new anime when the data was collected.

In [None]:
# Example query: anime most similar to "Kimi no Na wa.".
get_recommendation("Kimi no Na wa.")

In [None]:
# Example query: anime most similar to "Kokoro ga Sakebitagatterunda.".
get_recommendation("Kokoro ga Sakebitagatterunda.")

In [None]:
# Example query: anime most similar to "Naruto".
get_recommendation("Naruto")

In [None]:
# Example query: anime most similar to "Noragami".
get_recommendation("Noragami")

In [None]:
# Example query: anime most similar to "Plastic Memories".
get_recommendation("Plastic Memories")