# 1. Importing Libraries

In [1]:
import os 
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA




* #### Loading Datasets

In [2]:
usecols = ["MAL_ID", "Name", "Score", "Genres", "Type", "Episodes", "Premiered",
           "Studios", "Source", "Rating", "Members"]
anime_recom =pd.read_csv('../input/anime-recommendation-database-2020/anime.csv',usecols=usecols)
rating_data = pd.read_csv('../input/anime-recommendation-database-2020/rating_complete.csv')
anime = pd.read_csv('../input/anime-recommendation-database-2020/anime.csv',low_memory=True)

In [3]:
anime.rename(columns = {'MAL_ID':'anime_id'},inplace=True)
rating_data.rename(columns = {'MAL_ID':'anime_id'},inplace=True)

# 2. Recommendation

* #### Function to remove Unknown values from Columns

In [4]:
def process_multilabel(series):
    series = series.split(",")
    if "Unknown" in series:
        series.remove("Unknown")
    return series

anime_recom["Genres"] = anime_recom["Genres"].map(process_multilabel)
anime_recom["Studios"] = anime_recom["Studios"].map(process_multilabel)
anime_recom["Score"] = anime_recom["Score"].replace("Unknown", 0).astype(float)
anime_recom["Episodes"] = anime_recom["Episodes"].replace("Unknown", 0).astype(int)
anime_recom.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
0,1,Cowboy Bebop,8.78,"[Action, Adventure, Comedy, Drama, Sci-Fi,...",TV,26,Spring 1998,[Sunrise],Original,R - 17+ (violence & profanity),1251960
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"[Action, Drama, Mystery, Sci-Fi, Space]",Movie,1,Unknown,[Bones],Original,R - 17+ (violence & profanity),273145
2,6,Trigun,8.24,"[Action, Sci-Fi, Adventure, Comedy, Drama,...",TV,26,Spring 1998,[Madhouse],Manga,PG-13 - Teens 13 or older,558913
3,7,Witch Hunter Robin,7.27,"[Action, Mystery, Police, Supernatural, Dr...",TV,26,Summer 2002,[Sunrise],Original,PG-13 - Teens 13 or older,94683
4,8,Bouken Ou Beet,6.98,"[Adventure, Fantasy, Shounen, Supernatural]",TV,52,Fall 2004,[Toei Animation],Manga,PG - Children,13224


### 2.1 Feature Extraction and Feature Engineering

In [5]:
def preprocessing_category(df, column, is_multilabel=False):
    
    # Binarise labels
    lb = LabelBinarizer()
    if is_multilabel:
        lb = MultiLabelBinarizer()

    expandedLabelData = lb.fit_transform(df[column])
    labelClasses = lb.classes_

    # Create a pandas.DataFrame from our output
    category_df = pd.DataFrame(expandedLabelData, columns=labelClasses)
    del df[column]
    return pd.concat([df, category_df], axis=1)

anime_metadata = anime_recom.copy()
anime_metadata = preprocessing_category(anime_metadata, "Type")
anime_metadata = preprocessing_category(anime_metadata, "Premiered")
anime_metadata = preprocessing_category(anime_metadata, "Studios", is_multilabel=True)
anime_metadata = preprocessing_category(anime_metadata, "Source")
anime_metadata = preprocessing_category(anime_metadata, "Rating")

Genres = anime_metadata["Genres"]
ID_NAME = anime_metadata[["MAL_ID", "Name"]]

#Deleting Repeated Columns from the Dataframe
del anime_metadata["Genres"]
del anime_metadata["MAL_ID"]
del anime_metadata["Name"]
del anime_metadata["Unknown"]

#### 2.2 Normalization


In [6]:
anime_metadata[["Score", "Episodes", "Members"]] = MinMaxScaler().fit_transform(anime_metadata[["Score", "Episodes", "Members"]])
anime_metadata = anime_metadata.values

* #### Extracting  Keywords from the Dataframe

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

# Filling NaNs with empty string
genres_original = anime_recom['Genres'].fillna('').astype(str)
genres_vector_tf_idf = tfv.fit_transform(genres_original)

genres_vector_one_hot = preprocessing_category(pd.DataFrame(Genres), "Genres", True).values

In [8]:
print("anime_metadata.shape:", anime_metadata.shape)
print("genres_vector_tf_idf.shape:", genres_vector_tf_idf.shape)
print("genres_vector_one_hot.shape:", genres_vector_one_hot.shape)

anime_metadata.shape: (17562, 1208)
genres_vector_tf_idf.shape: (17562, 2282)
genres_vector_one_hot.shape: (17562, 83)


# 3.1.  Content Based Recommendation using KNN

* #### Creating a Recommendation function using KNN Model

In [9]:
def get_recommended(vector, query_index, n_neighbors=10):
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(csr_matrix(vector))

    distances, indices = model_knn.kneighbors(vector[query_index,:].reshape(1, -1), n_neighbors = n_neighbors)
    result = []
    for i in range(0, len(distances.flatten())):
        index = indices.flatten()[i]
        if index == query_index:
            continue
        result.append(anime_recom.iloc[index])
        
    return pd.DataFrame(result)

* #### Select Anime Id on the basis of which recommendations will be done

In [10]:
query_index = ID_NAME[ID_NAME.MAL_ID == 5231].index[0]
anime_recom.iloc[[query_index]]

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
4034,5231,Inazuma Eleven,7.59,"[Sports, Super Power, Shounen]",TV,127,Fall 2008,[OLM],Game,G - All Ages,138185


* ####  **Based on Type and Studios**

In [11]:
get_recommended(anime_metadata, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
7775,17917,Danball Senki Wars,7.08,"[Action, Kids, Mecha]",TV,37,Spring 2013,[OLM],Game,G - All Ages,3680
6795,12651,Danball Senki W,7.05,"[Action, Kids, Mecha]",TV,58,Winter 2012,[OLM],Game,G - All Ages,4559
4893,7081,Danball Senki,7.02,"[Action, Kids, Mecha]",TV,44,Spring 2011,[OLM],Game,G - All Ages,7206
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
14184,37324,Youkai Watch: Shadow Side,6.58,"[Comedy, Demons, Kids, Supernatural]",TV,49,Spring 2018,[OLM],Game,G - All Ages,1447


* ####  **Based on Keywords of the anime**

In [12]:
get_recommended(genres_vector_tf_idf, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
12043,33740,Katekyo Hitman Reborn! x ēlDLIVE Special,6.76,"[Super Power, Shounen]",Special,1,Unknown,[Artland],Original,PG-13 - Teens 13 or older,8366
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
5617,9032,Inazuma Eleven: Saikyou Gundan Ogre Shuurai,7.32,"[Shounen, Sports, Super Power]",Movie,1,Unknown,[OLM],Unknown,G - All Ages,19711
12921,35230,Zannen Onna Kanbu Black General-san,5.83,"[Comedy, Super Power, Shounen]",ONA,10,Unknown,[Oddjob],Manga,PG-13 - Teens 13 or older,1733


* ####  **Based on Genres**

In [13]:
get_recommended(genres_vector_one_hot, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
6644,11857,Judo Sanka,0.0,"[Sports, Shounen]",TV,27,Spring 1974,[],Unknown,G - All Ages,368
7810,18061,Tiger Mask (Movie),6.13,"[Sports, Shounen]",Movie,1,Unknown,[],Manga,PG-13 - Teens 13 or older,703
7811,18063,Tiger Mask Fuku Men League Sen,6.12,"[Sports, Shounen]",Movie,1,Unknown,[Toei Animation],Manga,PG-13 - Teens 13 or older,705
7541,16824,Hwang-geum-ui Pal,0.0,"[Sports, Shounen]",Movie,1,Unknown,[],Unknown,Unknown,161
8626,22205,Be Blues! Ao ni Nare,5.46,"[Sports, Shounen]",ONA,1,Unknown,[],Manga,G - All Ages,457
6276,10573,Tennis no Ouji-sama: Another Story II - Ano To...,7.54,"[Sports, Shounen]",OVA,4,Unknown,[Production I.G],Manga,PG-13 - Teens 13 or older,9028


* ####  **Based on all Aspects**

In [14]:
all_data = np.concatenate((anime_metadata, genres_vector_tf_idf.todense(), genres_vector_one_hot), axis=1)
all_data.shape

(17562, 3573)

In [15]:
get_recommended(all_data, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
14501,37814,Inazuma Eleven: Reloaded - Soccer no Henkaku,6.99,[Sports],Special,1,Unknown,[OLM],Game,G - All Ages,4106


* ####  **Based on Top Features**

In [16]:
reduced_all_data = PCA(n_components=250).fit_transform(all_data)
get_recommended(reduced_all_data, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
14501,37814,Inazuma Eleven: Reloaded - Soccer no Henkaku,6.99,[Sports],Special,1,Unknown,[OLM],Game,G - All Ages,4106


# 3.2. Recommendations using Collaborative Filtering

* #### Preparing Data for Collaborating Filtering

In [17]:
users_count = rating_data.groupby("user_id").size().reset_index()
users_count.columns = ["user_id", "anime_count"]

print(users_count.shape)

filtered_users = users_count[users_count.anime_count >= 300]
users = set(filtered_users.user_id)

print(len(users))

(310059, 2)
56515


In [18]:
rating_data = rating_data[rating_data.user_id.isin(users)]
print ("rating_data.shape:", rating_data.shape)
print (rating_data.info())

rating_data.shape: (31885564, 3)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31885564 entries, 189 to 57633032
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 973.1 MB
None


In [19]:
unique_users = {int(x): i for i,x in enumerate(rating_data.user_id.unique())}
unique_items = {int(x): i for i,x in enumerate(anime_recom.MAL_ID.unique())}
print(len(unique_items), len(unique_users))

anime_collabolative_filter = np.zeros((len(unique_items), len(unique_users)))

for user_id, anime_id, rating in rating_data.values:
    anime_collabolative_filter[unique_items[anime_id], unique_users[user_id]] = rating

17562 56515


* #### Recommendation based on Colaborative Filtering

In [20]:
get_recommended(anime_collabolative_filter, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
5617,9032,Inazuma Eleven: Saikyou Gundan Ogre Shuurai,7.32,"[Shounen, Sports, Super Power]",Movie,1,Unknown,[OLM],Unknown,G - All Ages,19711
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
7318,15785,Inazuma Eleven Go vs. Danball Senki W Movie,7.04,"[Action, Kids, Mecha, Sports]",Movie,1,Unknown,[OLM],Unknown,G - All Ages,9338
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200


# 3.3. Recommendation based on Similarity

In [21]:
df = pd.merge(rating_data,anime[["anime_id","Name"]], left_on = "anime_id", right_on = "anime_id").drop("anime_id", axis = 1)
df.head()

Unnamed: 0,user_id,rating,Name
0,3,8,Shirobako
1,17,8,Shirobako
2,44,9,Shirobako
3,46,8,Shirobako
4,110,10,Shirobako


In [22]:
count_rating = df.groupby("Name")["rating"].count().sort_values(ascending = False)
count_rating

Name
Shingeki no Kyojin                     47232
Sword Art Online                       47160
Angel Beats!                           45900
Death Note                             44885
Toradora!                              44265
                                       ...  
Nendjuugyouji Animation Series             1
Oshiete Hokusai! The Animation             1
Baolie Feiche II: Xing Neng Juexing        1
Doraon Yeongung Hong Gildong               1
Black Clover                               1
Name: rating, Length: 16846, dtype: int64

* #### Keeping only the animes with at least r ratings in the DataFrame

In [23]:
r = 10000
more_than_r_ratings = count_rating[count_rating.apply(lambda x: x >= r)].index

df_r = df[df['Name'].apply(lambda x: x in more_than_r_ratings)]

In [24]:
df_r.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15996633 entries, 0 to 31094881
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  int64 
 1   rating   int64 
 2   Name     object
dtypes: int64(2), object(1)
memory usage: 488.2+ MB


* #### Creating a pivot table for recommendation

In [25]:
df_recom = df_r.pivot_table(index='user_id',columns='Name',values='rating')
df_recom.iloc[:5,:5]

Name,11eyes,3-gatsu no Lion,3-gatsu no Lion 2nd Season,5-toubun no Hanayome,91 Days
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,,9.0,,,8.0
6,7.0,,,,
17,10.0,,,,9.0
19,,7.0,7.0,,
21,8.0,,,,


In [26]:
df_r.Name.value_counts().head(10)

Shingeki no Kyojin                  47232
Sword Art Online                    47160
Angel Beats!                        45900
Death Note                          44885
Toradora!                           44265
One Punch Man                       43896
No Game No Life                     43786
Code Geass: Hangyaku no Lelouch     42826
Steins;Gate                         41425
Fullmetal Alchemist: Brotherhood    40803
Name: Name, dtype: int64


* #### Creating a function to get the correlation of one anime with others.
    


In [27]:
def find_corr(df, name):

    similar_to_movie = df.corrwith(df[name])
    similar_to_movie = pd.DataFrame(similar_to_movie,columns=['Correlation'])
    similar_to_movie = similar_to_movie.sort_values(by = 'Correlation', ascending = False)
    return similar_to_movie

* #### Arguments to the function are :
    1.   ####   df (DataFrame):  with user_id as rows and movie titles as column and ratings as values
    1.   ####   name (str): Name of the anime
    
* #### And it Returns a DataFrame with the correlation of the anime with all others
        

 ## Choose an Anime 

In [28]:
# Let's choose an anime
anime1 = 'Naruto'

# Let's try with "Death Note"

# Recommendations
find_corr(df_recom, anime1).head(20)

Unnamed: 0_level_0,Correlation
Name,Unnamed: 1_level_1
Naruto,1.0
Naruto: Shippuuden,0.689854
Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!,0.606644
Naruto Movie 3: Dai Koufun! Mikazuki Jima no Animaru Panikku Dattebayo!,0.590134
Naruto: Shippuuden Movie 1,0.589966
Naruto Movie 2: Dai Gekitotsu! Maboroshi no Chiteiiseki Dattebayo!,0.584434
Bleach,0.578348
Naruto: Shippuuden Movie 2 - Kizuna,0.575799
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono,0.572307
Naruto: Shippuuden Movie 5 - Blood Prison,0.546552


* ### Not Recommended

In [29]:
find_corr(df_recom, anime1).tail(10)

Unnamed: 0_level_0,Correlation
Name,Unnamed: 1_level_1
Koukaku Kidoutai,0.075269
Kino no Tabi: The Beautiful World,0.069755
Neon Genesis Evangelion: The End of Evangelion,0.067229
FLCL,0.06672
Mousou Dairinin,0.063287
Perfect Blue,0.056507
Ping Pong the Animation,0.04429
Haibane Renmei,0.018723
Serial Experiments Lain,-0.0044
Yojouhan Shinwa Taikei,-0.044903
