In [1]:
# Mount Google Drive into the Colab runtime; the CSV files loaded below are
# read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the anime catalogue and the user-rating table from the mounted Drive.

ANIME_CSV = "/content/drive/MyDrive/anime.csv"
RATING_CSV = "/content/drive/MyDrive/rating.csv"

anime_df = pd.read_csv(ANIME_CSV)
rating_df = pd.read_csv(RATING_CSV)

In [5]:
# Keep only the rows whose type is 'TV'; the smaller item set keeps the
# later computation fast.

is_tv = anime_df['type'].eq('TV')
anime_less = anime_df.loc[is_tv]

In [6]:
# Sanity check: (rows, columns) of the TV-only subset.
anime_less.shape

(3787, 7)

In [7]:
# Treat the -1 placeholder in `rating` as a missing value (NaN).
#
# `Series.mask` replaces the entries where the condition is True with NaN and
# returns a new column. It is used instead of `.loc[...] = np.NaN` because the
# `np.NaN` alias no longer exists in NumPy 2.0, and because writing NaN into
# an integer column through `.loc` relies on a silent upcast that recent
# pandas versions deprecate (assuming `rating` was parsed as integers).

rating_df['rating'] = rating_df['rating'].mask(rating_df['rating'] == -1)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [8]:
# Lookup table: anime title -> row label of that title in the TV-only frame.

anime_index = pd.Series(data=anime_less.index, index=anime_less['name'])
anime_index.head()

name
Fullmetal Alchemist: Brotherhood                             1
Gintama°                                                     2
Steins;Gate                                                  3
Gintama&#039;                                                4
Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou    5
dtype: int64

In [9]:
# Attach every user rating to its TV anime via an inner join on anime_id.
# Both tables carry a `rating` column, so pandas suffixes them:
# rating_x comes from anime_less, rating_y from rating_df.
joined = pd.merge(anime_less, rating_df, on='anime_id', how='inner')
joined.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,3,10.0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,10,10.0
2,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,11,8.0
3,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,12,9.0
4,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,17,10.0


In [10]:
# Build the item-by-user rating matrix: one row per anime title, one column
# per user, cells holding that user's rating (NaN where there is none).

joined = joined.loc[:, ['user_id', 'name', 'rating_y']]

pivot = joined.pivot_table(values='rating_y', index='name', columns='user_id')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
009-1,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Remove user columns that do not contain a single rating.

pivot = pivot.dropna(axis='columns', how='all')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
009-1,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Center every anime (row) on its own mean rating, ignoring missing entries.
#
# This is the vectorised form of `pivot.apply(lambda x: x - np.nanmean(x), axis=1)`:
# `DataFrame.mean` skips NaN by default, and `sub(..., axis=0)` aligns the
# per-anime means with the row index. It avoids calling a Python lambda once
# per row and the RuntimeWarning `np.nanmean` emits for rows with no ratings.

anime_mean = pivot.mean(axis=1)
normalized_data = pivot.sub(anime_mean, axis=0)
normalized_data.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
009-1,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Replace the remaining NaN cells (no rating) with 0 so the matrix is fully
# numeric for the similarity computation.
normalized_data = normalized_data.fillna(0)
normalized_data.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Pairwise cosine similarity between the anime rows of the centered matrix,
# labelled with the anime titles on both axes.

from sklearn.metrics.pairwise import cosine_similarity

anime_names = normalized_data.index
# With a single argument, cosine_similarity compares the rows with themselves.
similarity_matrix = cosine_similarity(normalized_data)
similar_item_df = pd.DataFrame(similarity_matrix, index=anime_names, columns=anime_names)
similar_item_df.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,2020 Nyeon Ujuui Wonder Kiddy,21 Emon,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.289985,0.315053,0.027211,0.074391,0.061119,0.002056,0.0,0.0,0.0,...,0.019498,0.025963,0.038025,0.030098,0.005009,0.00571,0.03935,0.049731,0.038284,0.040426
.hack//Sign,0.289985,1.0,0.269825,0.022038,0.05885,0.050163,0.002424,0.0,0.0,0.0,...,0.012434,0.009209,0.049085,0.050667,-0.001771,0.000688,0.023337,0.067435,0.055709,0.031301
.hack//Tasogare no Udewa Densetsu,0.315053,0.269825,1.0,0.034943,0.047875,0.053741,-0.003796,0.0,0.0,0.0,...,0.027972,-0.008486,0.04459,0.036658,0.003887,0.01359,0.018079,0.072843,0.048902,0.04161
009-1,0.027211,0.022038,0.034943,1.0,0.023267,0.015021,-0.002677,0.0,0.0,0.0,...,0.019066,0.0,0.015646,0.013202,0.02428,0.034099,0.043376,0.015547,0.030451,0.023259
07-Ghost,0.074391,0.05885,0.047875,0.023267,1.0,0.122489,0.005785,0.0,0.0,0.0,...,0.02188,0.023478,0.042737,0.036837,-0.009816,-0.000592,0.022911,0.041351,0.054222,0.066368


In [15]:
def similar_anime_func(anime_name):
    """Rank every other anime by its similarity to ``anime_name``.

    Reads the module-level ``similar_item_df`` (square anime-by-anime
    similarity matrix labelled with titles on both axes).

    Parameters
    ----------
    anime_name : str
        Title to look up; it must match a label of ``similar_item_df`` exactly.

    Returns
    -------
    tuple
        ``(names, scores)``: ``names`` is a ``pandas.Index`` with all other
        titles ordered from most to least similar, ``scores`` is the list of
        the matching similarity values. ``(None, None)`` is returned when the
        title is unknown.
    """
    if anime_name not in similar_item_df.columns:
        return None, None

    # Remove the query title by label rather than by "first position after
    # sorting": a title whose centered rating vector is all zeros has a
    # self-similarity of 0, and ties at the top are possible, so the query is
    # not guaranteed to sort first. Sorting one column once also avoids
    # sorting the whole matrix twice.
    ranked = (
        similar_item_df[anime_name]
        .drop(labels=anime_name, errors='ignore')
        .sort_values(ascending=False)
    )
    return ranked.index, ranked.tolist()

In [16]:
animes, score = similar_anime_func("Naruto")
print("Anime most similar to Naruto")
# Show the ten closest titles, numbered from 1.
for rank, (title, _similarity) in enumerate(zip(animes[:10], score[:10]), start=1):
    print(rank, ": {} ".format(title))

Anime most similar to Naruto
1 : Bleach 
2 : Dragon Ball Z 
3 : Sword Art Online 
4 : Fairy Tail 
5 : Ao no Exorcist 
6 : Dragon Ball GT 
7 : Death Note 
8 : Dragon Ball 
9 : Soul Eater 
10 : Shingeki no Kyojin 


In [17]:
def predict_rating(user_id, anime_name, max_neighbor=10):
    """Predict a user's rating for an anime from the similar anime they rated.

    The prediction is the similarity-weighted average of the user's own
    ratings over the ``max_neighbor`` most similar anime that the user has
    rated. Reads the module-level ``pivot`` (anime-by-user rating matrix with
    NaN for missing ratings) and calls ``similar_anime_func``.

    Parameters
    ----------
    user_id
        Column label of the user in ``pivot``.
    anime_name : str
        Title of the anime to predict a rating for.
    max_neighbor : int, default 10
        Maximum number of rated neighbours used in the weighted average.

    Returns
    -------
    float
        The predicted rating, or NaN when no prediction can be made (unknown
        title, no rated neighbour, or the neighbour similarities sum to 0).

    Raises
    ------
    KeyError
        If ``user_id`` is not a column of ``pivot``.
    """
    animes, scores = similar_anime_func(anime_name)
    if animes is None:
        # Unknown title: similar_anime_func signals this with (None, None).
        return np.nan

    anime_arr = np.asarray(animes)
    sim_arr = np.asarray(scores, dtype=float)

    # The user's raw ratings, ordered like the similarity ranking.
    user_ratings = pivot[user_id].loc[anime_arr]

    # Keep only the anime the user actually rated. This tests for a present
    # rating in `pivot` rather than for a non-zero centered value: a rating
    # equal to the anime's mean centers to exactly 0 and would otherwise be
    # mistaken for "not rated".
    rated = user_ratings.notna().to_numpy()

    neighbor_sims = sim_arr[rated][:max_neighbor]
    neighbor_ratings = user_ratings.to_numpy()[rated][:max_neighbor]

    weight_total = neighbor_sims.sum()
    if neighbor_sims.size == 0 or weight_total == 0:
        # Nothing to average over; avoid a 0/0 division.
        return np.nan

    return np.dot(neighbor_sims, neighbor_ratings) / weight_total

In [18]:
# Predicting rating for Naruto by user 3

predict_rating(3, "Naruto")

8.473102443511113