In [1]:
import pandas as pd
import re
import numpy as np
import scipy as sp
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import gc
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
anime = pd.read_csv('data1/anime.csv')
user = pd.read_csv('data1/rating.csv')

In [3]:
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    
    return text

anime['name'] = anime['name'].apply(text_cleaning)
user['rating'] = user['rating'].apply(lambda x: 0 if x == -1 else x)
user_sub = user[user['user_id'] < 20000]
user_sub = user_sub.apply(pd.to_numeric,downcast='integer')
user_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065572 entries, 0 to 2065571
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int16
 1   anime_id  int32
 2   rating    int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 29.5 MB


In [4]:
gc.collect()

24

In [5]:
merged = user_sub.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)

In [6]:
gc.collect()

40

In [7]:
ints = merged.select_dtypes(include=['int'])
floats = merged.select_dtypes(include=['float'])
converted_int = ints.apply(pd.to_numeric,downcast='integer')
convMem = merged.members.astype('int32')
converted_flt = floats.apply(pd.to_numeric,downcast='float')
merged[converted_int.columns] = converted_int
merged[converted_flt.columns] = converted_flt
merged['members'] = convMem

In [8]:
del ints,floats,converted_int,convMem,converted_flt,user,user_sub

In [9]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065569 entries, 0 to 2065568
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int16  
 1   anime_id     int32  
 2   user_rating  int8   
 3   name         object 
 4   genre        object 
 5   type         object 
 6   episodes     object 
 7   rating       float32
 8   members      int32  
dtypes: float32(1), int16(1), int32(2), int8(1), object(4)
memory usage: 108.3+ MB


In [10]:
gc.collect()

60

In [11]:
piv = merged.pivot_table(index=['user_id'], columns=['name'], values='user_rating').apply(pd.to_numeric,downcast='float')
piv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19999 entries, 1 to 19999
Columns: 9272 entries, 0 to ◯
dtypes: float32(9272)
memory usage: 707.5 MB


In [12]:
# Note: As we are subtracting the mean from each rating to standardize
# all users with only one rating or who had rated everything the same will be dropped

# Normalize the values
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)


# Drop all columns containing only zeros representing users who did not rate
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [13]:
gc.collect()

38

In [14]:
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

In [15]:
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [16]:
item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)

In [17]:
del user_sim_df, user_similarity

NameError: name 'user_sim_df' is not defined

In [None]:
from sys import getsizeof

In [None]:
# item_sim_df.to_csv('data1/items.csv')

In [18]:
def top_animes1(n):
    x = input('Enter Anime Name:')
    anime_name = anime[anime['name'].str.contains(x, case=False)].sort_values(by='members', ascending=False).reset_index()['name'][0]
    count = 1
    print('If you like {}, you may also like:\n'.format(anime_name))
    for item in item_sim_df.sort_values(by = anime_name, ascending = False).index[1:int(n)+1]:
        print('No. {}: {}'.format(count, item))
        count +=1

In [19]:
top_animes1(10)

Enter Anime Name:Death Note
If you like Death Note, you may also like:

No. 1: Code Geass: Hangyaku no Lelouch R2
No. 2: Code Geass: Hangyaku no Lelouch
No. 3: Fullmetal Alchemist: Brotherhood
No. 4: Steins;Gate
No. 5: Shingeki no Kyojin
No. 6: Clannad: After Story
No. 7: Fullmetal Alchemist
No. 8: Tengen Toppa Gurren Lagann
No. 9: Great Teacher Onizuka
No. 10: Sen to Chihiro no Kamikakushi


In [35]:
 item_sim_df.index[1]

'001'

In [50]:
def df_maker(x,n):
    return item_sim_df.nlargest(n+1,x).index[n]

In [51]:
dfm('Kimi no Na wa.',1)

'Shigatsu wa Kimi no Uso'

In [52]:
def dfm(x,n):
    try:
        return df_maker(x,n)
    except:
        return np.nan

In [57]:
df = anime[['anime_id','name']]

In [None]:
df['1'] = df.apply(lambda x: dfm(x['name'],1), axis=1)
# df['2'] = df['name']
# df['3'] = df['name']
# df['4'] = df['name']
# df['5'] = df['name']
# df['6'] = df['name']
# df['7'] = df['name']
# df['8'] = df['name']
# df['9'] = df['name']
# df['10'] = df['name']
df.info()

In [None]:
df['1']

In [55]:
df['1'] = df.apply(lambda x: dfm(x['name'],1), axis=1)

In [56]:
df

Unnamed: 0,anime_id,name,1,2,3,4,5,6,7,8,9,10
0,32281,Kimi no Na wa.,Shigatsu wa Kimi no Uso,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood,Steins;Gate,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°,Gintama: Enchousen,Gintama°,Gintama°,Gintama°,Gintama°,Gintama°,Gintama°,Gintama°,Gintama°,Gintama°
3,9253,Steins;Gate,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate,Steins;Gate
4,9969,Gintama,Gintama: Enchousen,Gintama,Gintama,Gintama,Gintama,Gintama,Gintama,Gintama,Gintama,Gintama


In [None]:
df1['one']

In [None]:
x = input('Enter Anime Name:')
anime_name = anime[anime['name'].str.contains(x, case=False)].sort_values(by='members', ascending=False).reset_index()['name'][0]

In [None]:
items = pd.read_parquet('data1/itemsnew.parquet',columns=['name',anime_name])
items.head()

In [None]:
def top_animes1(n):
    anime_name = anime[anime['name'].str.contains(x, case=False)].sort_values(by='members', ascending=False).reset_index()['name'][0]
    count = 1
    print('If you like {}, you may also like:\n'.format(anime_name))
    for item in items.sort_values(by = anime_name, ascending = False).name[1:int(n)+1]:
        print('No. {}: {}'.format(count, item))
        count +=1

In [None]:
top_animes1(11)