In [1]:
import pandas as pd
import re
import numpy as np
import scipy as sp
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import gc
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the two raw tables in one pass: the anime catalogue first, then the
# per-user ratings.
anime, user = (
    pd.read_csv(csv_path)
    for csv_path in ('data1/anime.csv', 'data1/rating.csv')
)

In [3]:
def text_cleaning(text):
    """Remove HTML-entity debris and the ".hack//" prefix from an anime title.

    The substitutions are applied in order, most specific first, so that the
    two special-cased apostrophe patterns are handled before the catch-all
    apostrophe-entity removal gets a chance to consume them.

    Parameters
    ----------
    text : str
        Raw title, possibly containing ``&quot;``, ``&#039;`` or ``&amp;``.

    Returns
    -------
    str
        The cleaned title.
    """
    text = re.sub(r'&quot;', '', text)
    # The dot is escaped so that only the literal ".hack//" is removed; an
    # unescaped "." would match any character in front of "hack//".
    text = re.sub(r'\.hack//', '', text)
    # These two rules must run before the generic "&#039;" removal below,
    # otherwise the entity is already gone and they can never match.
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    # Any apostrophe entity that is left is simply dropped.
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'&amp;', 'and', text)

    return text

# Scrub HTML-entity debris from every title.
anime['name'] = anime['name'].map(text_cleaning)

# Recode the -1 sentinel as 0; every other rating is left untouched.
# NOTE(review): presumably -1 marks "no score given" -- confirm against the
# dataset description, since a 0 takes part in the per-user mean later on.
user['rating'] = user['rating'].replace(-1, 0)

# Keep only users whose user_id is below 20000 and store each column in the
# smallest integer dtype that can hold its values.
user_sub = (
    user
    .loc[user['user_id'] < 20000]
    .apply(pd.to_numeric, downcast='integer')
)

# Summary of the full ratings table (not of the subset built above).
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [4]:
# Run an explicit garbage-collection pass before the merge in the next cell;
# the number echoed below is gc.collect()'s return value.
gc.collect()

24

In [5]:
# Attach the catalogue columns to every rating row. Both tables carry a
# "rating" column: the user's own score receives the "_user" suffix during the
# merge and is then renamed to "user_rating", while the catalogue's "rating"
# keeps its name.
merged = (
    user_sub
    .merge(anime, on='anime_id', suffixes=('_user', ''))
    .rename(columns={'rating_user': 'user_rating'})
)

In [6]:
# Explicit garbage-collection pass between the merge and the dtype
# downcasting; the echoed number is gc.collect()'s return value.
gc.collect()

40

In [7]:
# Shrink merged's numeric columns to the smallest dtype that holds their values.
# NOTE(review): 'int' / 'float' presumably select only the default-width dtypes
# (int64 / float64 here), so the columns already downcast in user_sub would not
# be re-selected -- confirm on the target platform.
ints = merged.select_dtypes(include=['int'])
floats = merged.select_dtypes(include=['float'])
converted_int = ints.apply(pd.to_numeric,downcast='integer')
# members is cast to int32 explicitly and written back last (see the final
# line), so it ends up int32 regardless of what the generic downcast chose.
convMem = merged.members.astype('int32')
converted_flt = floats.apply(pd.to_numeric,downcast='float')
# Write the downcast columns back over the originals.
merged[converted_int.columns] = converted_int
merged[converted_flt.columns] = converted_flt
merged['members'] = convMem

In [8]:
# Release the downcasting temporaries and the full ratings table; the later
# cells shown here work from `merged`, not from `user`.
del ints,floats,converted_int,convMem,converted_flt,user

In [9]:
# Inspect the column dtypes and memory footprint after the downcasting step.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065569 entries, 0 to 2065568
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int16  
 1   anime_id     int32  
 2   user_rating  int8   
 3   name         object 
 4   genre        object 
 5   type         object 
 6   episodes     object 
 7   rating       float32
 8   members      int32  
dtypes: float32(1), int16(1), int32(2), int8(1), object(4)
memory usage: 108.3+ MB


In [10]:
# Explicit garbage-collection pass before building the user x title pivot
# table; the echoed number is gc.collect()'s return value.
gc.collect()

60

In [11]:
# Wide user x title matrix: one row per user_id, one column per cleaned title,
# cells hold user_rating (NaN where the user has no rating for that title).
# pivot_table's default aggregation averages any rows that land in the same
# (user, title) cell. The result is then downcast column by column.
piv = (
    merged
    .pivot_table(values='user_rating', index='user_id', columns='name')
    .apply(pd.to_numeric, downcast='float')
)
piv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19999 entries, 1 to 19999
Columns: 9272 entries, 0 to ◯
dtypes: float32(9272)
memory usage: 707.5 MB


In [12]:
def _center_and_scale(user_row):
    """Mean-centre one user's ratings and divide by that user's rating range."""
    spread = np.max(user_row) - np.min(user_row)
    return (user_row - np.mean(user_row)) / spread


# Normalise every user (row) independently. A user with a single rating, or
# with identical ratings everywhere, has a zero range and therefore ends up
# with NaN in every rated cell.
#
# NaNs (unrated titles and the zero-range users above) then become 0, and the
# matrix is flipped so that rows are titles and columns are users.
piv_norm = piv.apply(_center_and_scale, axis=1).fillna(0).T

# Keep only the users (columns) that still carry at least one non-zero value;
# this is where the zero-range users are dropped.
has_signal = (piv_norm != 0).any(axis=0)
piv_norm = piv_norm.loc[:, has_signal]

In [13]:
# Explicit garbage-collection pass before converting the normalised matrix to
# sparse form; the echoed number is gc.collect()'s return value.
gc.collect()

38

In [14]:
# CSR copy of the normalised title x user matrix; the zeros filled in for
# unrated titles are not stored explicitly in this format.
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())

In [15]:
# piv_norm has one row per title and one column per user, so the rows of
# piv_sparse are title vectors: this yields title-to-title similarity.
item_similarity = cosine_similarity(piv_sparse)
# Transposing makes users the rows, yielding user-to-user similarity.
# NOTE(review): this is a dense users x users matrix, and nothing in the cells
# shown here reads it apart from wrapping it in user_sim_df -- confirm it is
# needed before paying for it.
user_similarity = cosine_similarity(piv_sparse.T)

In [16]:
# Label the raw similarity arrays so they can be indexed by title / user_id.
title_labels = piv_norm.index    # rows of piv_norm: anime names
user_labels = piv_norm.columns   # columns of piv_norm: user_ids

item_sim_df = pd.DataFrame(item_similarity, index=title_labels, columns=title_labels)
user_sim_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)

In [31]:
# Preview the first rows of the title-to-title similarity table.
item_sim_df.head()

name,0,001,009 Re:Cyborg,009-1,009-1: RandB,00:08,07-Ghost,1+2=Paradise,100%,100-man-nen Chikyuu no Tabi: Bander Book,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.057501,-0.042011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.412652,-0.338727,0.0,0.412652,0.004715,0.0,0.0,0.0,0.0,0.262414
001,0.057501,1.0,-0.014186,0.0,0.0,0.479604,0.0,0.0,0.0,0.0,...,-0.139345,-0.114382,0.0,0.139345,0.001592,0.0,0.0,0.0,0.0,0.088613
009 Re:Cyborg,-0.042011,-0.014186,1.0,0.00461,7.7e-05,0.0,-0.005485,0.109641,0.0,0.0,...,0.101808,0.083569,-0.011581,-0.101808,-0.011307,-0.008718,-0.005874,-0.000834,-0.005329,-0.064742
009-1,0.0,0.0,0.00461,1.0,0.439418,0.0,0.011028,0.098094,0.119294,0.0,...,0.0,0.0,-0.018829,0.0,0.02183,0.013546,0.021746,-0.012171,0.007128,0.0
009-1: RandB,0.0,0.0,7.7e-05,0.439418,1.0,0.0,0.017293,0.128962,0.152821,0.0,...,0.0,0.0,-0.006543,0.0,-0.006992,-0.004108,0.038099,-0.036029,0.007506,0.0


In [32]:
# Persist the full title-to-title similarity table as CSV.
item_sim_df.to_csv('data1/items.csv')

In [26]:
def top_animes1(n, query=None):
    """Print the ``n`` titles most similar to an anime picked by name search.

    The search is a case-insensitive, literal substring match against
    ``anime['name']``. Among the matches that are present in ``item_sim_df``,
    the one with the most ``members`` is used as the reference title.

    Parameters
    ----------
    n : int or int-like
        Number of recommendations to print.
    query : str, optional
        Title fragment to search for. When omitted, the user is prompted with
        ``input()``.

    Returns
    -------
    None
        Results are printed; a short message is printed instead when no
        matching title is available in the similarity matrix.
    """
    if query is None:
        query = input('Enter Anime Name:')

    # regex=False: the query is user text, not a pattern, so titles such as
    # "1+2=Paradise" or "lilac (bombs Jun Togawa)" can be found literally.
    # na=False: rows with a missing name count as non-matches.
    name_hits = anime['name'].str.contains(query, case=False, regex=False, na=False)
    # Only titles that have a column in the similarity matrix can be looked up.
    in_matrix = anime['name'].isin(item_sim_df.columns)
    candidates = anime[name_hits & in_matrix]
    if candidates.empty:
        print('Sorry, no anime matching {!r} is available in the similarity matrix.'.format(query))
        return

    anime_name = candidates.sort_values(by='members', ascending=False)['name'].iloc[0]
    print('If you like {}, you may also like:\n'.format(anime_name))

    # Rank a single similarity column instead of re-sorting the whole matrix,
    # and remove the reference title by label rather than assuming it sorts first.
    most_similar = item_sim_df[anime_name].drop(labels=anime_name).nlargest(int(n))
    for count, item in enumerate(most_similar.index, start=1):
        print('No. {}: {}'.format(count, item))

In [28]:
# Interactive demo: prompts for a title fragment, then prints up to 11 similar titles.
top_animes1(11)

Enter Anime Name:death
If you like Death Note, you may also like:

No. 1: Code Geass: Hangyaku no Lelouch R2
No. 2: Code Geass: Hangyaku no Lelouch
No. 3: Fullmetal Alchemist: Brotherhood
No. 4: Steins;Gate
No. 5: Shingeki no Kyojin
No. 6: Clannad: After Story
No. 7: Fullmetal Alchemist
No. 8: Tengen Toppa Gurren Lagann
No. 9: Great Teacher Onizuka
No. 10: Sen to Chihiro no Kamikakushi
No. 11: Durarara!!
