In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [13]:
anime_data = pd.read_csv('anime.csv')
user_rating = pd.read_csv('rating.csv')

In [14]:
anime_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [15]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
anime_id    12294 non-null int64
name        12294 non-null object
genre       12232 non-null object
type        12269 non-null object
episodes    12294 non-null object
rating      12064 non-null float64
members     12294 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 672.4+ KB


In [16]:
user_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


I'll drop all the rows that don't have genre information as I'm planning to use genre to create anime profile.

In [17]:
anime_data.dropna(inplace=True)

Now we'll create a lookup table for the anime id and the name.

In [18]:
lookup_anime = anime_data[['anime_id','name',]].sort_values(by='anime_id')

Now we'll create the anime profile using the genre and the type variable. We are not using the members as a feature for now so we'll drop that column

In [19]:
anime_data.drop(['members','rating','name'],axis=1,inplace=True)

In [20]:
# Creating new columns from comma separated genre column
df = pd.DataFrame(anime_data['genre'].str.split(',', expand=True).stack(),columns=['genre']).reset_index().drop('level_1',axis=1)
df['genre']=df['genre'].str.strip()
anime_genre_profile = pd.get_dummies(df).groupby('level_0').sum()
anime_data.drop(['genre'],axis=1,inplace=True)

In [21]:
# Creating Anime profile
anime_profile = pd.concat([pd.concat([anime_data,anime_genre_profile],axis=1),pd.get_dummies(anime_data['type'])],axis=1)
anime_profile.drop(['type'],axis=1,inplace=True)
anime_profile.set_index('anime_id',inplace=True)
anime_profile.drop(['episodes'],axis=1,inplace=True)

In [22]:
# Selecting the anime that are in anime profile dataframe 
user_rating = user_rating[user_rating['anime_id'].isin(anime_profile.index)]

In [27]:
user_rating.shape

(7813611, 3)

In [23]:
# Sample of the data
nusers = 10000
users = np.random.choice(user_rating['user_id'].unique(),size=nusers)
users = np.append(users,19)
sample = user_rating[user_rating['user_id'].isin(users)]

In [24]:
# creating a utility matrix between user and anime with ratings as the values
utility_matrix = sample.pivot(values='rating',columns='anime_id',index='user_id').fillna(0)

In [25]:
utility_matrix.columns = utility_matrix.columns.astype(str)

In [26]:
anime_profile.index = anime_profile.index.astype(str)

In [27]:
utility_matrix=utility_matrix.loc[(utility_matrix!=0).any(1)]

# User User Collaborative Filtering

In [28]:
utility_matrix = utility_matrix.replace(-1,0)

In [29]:
utility_matrix_norm = utility_matrix.apply(lambda x: x[x>0] - np.mean(x[x>0]),axis=1).fillna(0)

In [30]:
def top_sim_users(utility_matrix_norm,user_id,n=5):
    user = utility_matrix_norm.loc[user_id]
    target = user.values.reshape((user.shape[0],1))
    cos_sim = np.matmul(utility_matrix_norm,target).ravel()/(np.linalg.norm(target)*np.sqrt(np.square(utility_matrix_norm).sum(axis=1)))
    return cos_sim.sort_values(ascending=False).head(n+1)[1:]

In [31]:
def rating_pred(utility_matrix,utility_matrix_norm,user_id,anime_id,k=5):
    
    relev_users = utility_matrix_norm[utility_matrix_norm[str(anime_id)]!=0] # All relevant users who've watched this anime
    relev_users.loc[user_id,:] = utility_matrix_norm.loc[user_id,:]
    
    sim_users = top_sim_users(relev_users,user_id,k) # Top k most similar users who've watched this anime
    
    pred = 0
    sim = 0

    for sim_user in sim_users.index:
        similarity = sim_users[sim_user]
        if similarity>0:
            sim+=similarity
            rating = utility_matrix.loc[sim_user,str(anime_id)] 
            pred+=similarity*rating
        else:
#             print('no similar users, need to define a new metric')
            return 0
    return pred/sim

In [32]:
def reco(utility_matrix,utility_matrix_norm,user_id,how_many=10):
    ratings={}
    i=1
    similar_users = top_sim_users(utility_matrix_norm,user_id,10)
    anime_ids = []
    for elem in similar_users.index:
        anime_ids = np.append(anime_ids,utility_matrix.loc[elem,utility_matrix.loc[elem]!=0].index)
    for anime_id in set(anime_ids):
        ratings[anime_id] = rating_pred(utility_matrix,utility_matrix_norm,user_id,anime_id)
        i+=1
        if i%10==0:
            break
    
    ratings=pd.Series(ratings).sort_index()
    ratings.index = lookup_anime.loc[lookup_anime['anime_id'].isin(ratings.index),'name'].values
    
    return ratings.sort_values(ascending=False).head(how_many)

--------------------------Seen anime-------------------------
Cowboy Bebop
Cowboy Bebop: Tengoku no Tobira
Fullmetal Alchemist
Sen to Chihiro no Kamikakushi
Hotaru no Haka
Black Lagoon
Sennen Joyuu
Black Lagoon: The Second Barrage
Death Note
Code Geass: Hangyaku no Lelouch


In [39]:
print('----------------Recommended Anime------------------')
print(reco(utility_matrix,utility_matrix_norm,19))

seen_anime = utility_matrix.loc[19,utility_matrix.loc[19]!=0].index[:-1]
print('--------------------------Seen anime-------------------------')
i=1
for elem in seen_anime:
    print(lookup_anime.loc[lookup_anime['anime_id']==int(elem),'name'].values[0])
    if i%10==0:
        break
    i+=1




----------------Recommended Anime------------------
Mahou Shoujo Madoka★Magica                                             9.391655
Code Geass: Hangyaku no Lelouch                                        9.161348
Devil May Cry                                                          9.026813
Serial Experiments Lain                                                8.718679
Naruto: Akaki Yotsuba no Clover wo Sagase                              8.565496
Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!    8.471915
Death Note                                                             8.421401
FLCL                                                                   7.790119
Bokurano                                                               7.658658
dtype: float64
--------------------------Seen anime-------------------------
Cowboy Bebop
Cowboy Bebop: Tengoku no Tobira
Fullmetal Alchemist
Sen to Chihiro no Kamikakushi
Hotaru no Haka
Black Lagoon
Sennen Joyuu
Black Lagoon: T

# Item Item Collaborative Filtering

In [40]:
utility_matrix_item = utility_matrix.T
utility_matrix_item.columns = utility_matrix_item.columns.astype(str)
utility_matrix_item.index = utility_matrix_item.index.astype(int)
# utility_matrix_item_norm = utility_matrix_norm.T
# utility_matrix_item_norm.columns = utility_matrix_item_norm.columns.astype(str)
# utility_matrix_item_norm.index = utility_matrix_item_norm.index.astype(int)

In [41]:
utility_matrix_item_norm = utility_matrix_item.apply(lambda x: x[x>0] - np.mean(x[x>0]),axis=1).fillna(0)

In [42]:
# Now we should be able to use the same functions as in user-user collaborative filtering

In [43]:
top_sim_users(utility_matrix_item_norm,1,10)

anime_id
5       0.318404
205     0.225506
6       0.170149
227     0.157177
4037    0.139454
2164    0.138966
30      0.137720
2251    0.130999
467     0.126635
339     0.124744
dtype: float64

In [44]:
def reco_item(utility_matrix_item,utility_matrix_item_norm,user_id,how_many=10):
    ratings={}
    i=1
    for anime_id in utility_matrix_item_norm[utility_matrix_item_norm[str(user_id)]==0].index:
        ratings[anime_id] = rating_pred(utility_matrix_item,utility_matrix_item_norm,anime_id,user_id)
        i+=1
        if i%50==0:
            break
    ratings=pd.Series(ratings).sort_index()
    ratings.index = lookup_anime.loc[lookup_anime['anime_id'].isin(ratings.index),'name'].values
    
    return ratings.sort_values(ascending=False).head(how_many)

In [46]:
print('----------------Recommended Anime------------------')
print(reco_item(utility_matrix_item,utility_matrix_item_norm,19))

seen_anime = utility_matrix.loc[19,utility_matrix.loc[19]!=0].index[:-1]
print('--------------------------Seen anime-------------------------')
i=1
for elem in seen_anime:
    print(lookup_anime.loc[lookup_anime['anime_id']==int(elem),'name'].values[0])
    if i%10==0:
        break
    i+=1



----------------Recommended Anime------------------
Akira                                                    9.344397
Ghost in the Shell                                       9.298843
Azumanga Daioh                                           9.233281
Rozen Maiden: Träumend                                   9.124461
Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen    9.087763
Neon Genesis Evangelion                                  9.080146
Monster                                                  9.037122
Zipang                                                   8.914690
Ring ni Kakero 1                                         8.901915
Rozen Maiden                                             8.767891
dtype: float64
--------------------------Seen anime-------------------------
Cowboy Bebop
Cowboy Bebop: Tengoku no Tobira
Fullmetal Alchemist
Sen to Chihiro no Kamikakushi
Hotaru no Haka
Black Lagoon
Sennen Joyuu
Black Lagoon: The Second Barrage
Death Note
Code Geass: Hangyaku no Lelouch