In [1]:
import boto3
import pandas as pd
from sagemaker import get_execution_role
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# NOTE(review): `role` is a hard-coded execution-role name and is not used by any
# cell visible in this notebook; `get_execution_role` is imported but never called.
# Confirm whether the literal is still needed.
role = 'AmazonSageMaker-ExecutionRole-20200524T114773'

# Catalogue and metadata tables.
anime_df = pd.read_csv('s3://animerec/Anime_Recommender/data/anime.csv')
anime_meta = pd.read_csv('s3://animerec/Anime_Recommender/data/AnimeList_Meta.csv')
users_meta = pd.read_csv('s3://animerec/Anime_Recommender/data/UserList_Meta.csv')

# User ratings. A rating of -1 is a placeholder for "no rating given", so those
# rows are dropped right after loading.
rating_df = pd.read_csv('s3://animerec/Anime_Recommender/data/rating.csv')
rating_df = rating_df.loc[rating_df['rating'].ne(-1)]

In [2]:
def weighted_rating(x, rating_count_col, avg_rating_col):
    """Return a popularity-weighted score for every row of ``x``.

    Each row's average rating is shrunk toward the overall mean rating; rows
    with few ratings are pulled harder toward the mean than rows with many.

    INPUT -
    x: dataframe holding both columns named below
    rating_count_col: name of the column with the number of ratings per row
    avg_rating_col: name of the column with the average rating per row

    OUTPUT -
    Series aligned with ``x``: (v / (v + m)) * R + (m / (m + v)) * C, where
    v is the row's count, R the row's average, m the 80th percentile of the
    counts and C the mean of the averages.
    """
    votes = x[rating_count_col]
    ratings = x[avg_rating_col]

    # Threshold and prior are computed from the frame that was passed in.
    vote_threshold = votes.quantile(0.80)
    overall_mean = ratings.mean()

    own_term = votes / (votes + vote_threshold) * ratings
    prior_term = vote_threshold / (vote_threshold + votes) * overall_mean
    return own_term + prior_term


def full_anime_df(rating_df, anime_df, anime_meta):
    """Combine the anime table, its metadata and rating counts into one cleaned frame.

    INPUT -
    rating_df: user ratings with at least 'anime_id' and 'rating' columns
    anime_df: base anime table, joined on 'anime_id'
    anime_meta: metadata table, joined on 'anime_id'

    OUTPUT -
    dataframe with one row per row of the left-joined anime table, containing
    the renamed base/meta columns, 'num_ratings' (rows of rating_df per anime)
    and 'weighted_rating' (see weighted_rating; computed from 'members' and
    'avg_rating'). Missing genre/studio/producer/rating_type become 'Unknown'
    and the 'name' / 'title_english' columns are title-cased.
    """
    # Number of ratings each anime received, as a Series named 'num_ratings'.
    count_ratings = rating_df.groupby('anime_id')['rating'].count().rename('num_ratings')

    # Combine the metadata with the anime data and the rating counts.
    # Columns present in both anime_df and anime_meta get pandas' default
    # '_x' (anime_df) / '_y' (anime_meta) suffixes, which the drop/rename below rely on.
    anime_full = anime_df.merge(right=anime_meta, how='left', on='anime_id')
    anime_full = anime_full.merge(right=count_ratings, how='left', on='anime_id')
    anime_full = anime_full.drop(columns=['title', 'title_japanese', 'title_synonyms', 'type_x',
                                          'episodes_y', 'airing', 'score', 'scored_by', 'members_y',
                                          'background', 'licensor', 'premiered', 'broadcast',
                                          'related', 'genre_x', 'aired_string'])
    anime_full = anime_full.rename(columns={'rating_x': 'avg_rating', 'rating_y': 'rating_type',
                                            'genre_y': 'genre', 'members_x': 'members',
                                            'episodes_x': 'episodes', 'type_y': 'type'})

    # Attach the weighted score under its final name directly, instead of
    # concatenating an unnamed Series and renaming the implicit column 0.
    anime_full['weighted_rating'] = weighted_rating(anime_full, 'members', 'avg_rating')

    # Shorten the rating type categories; anything not listed becomes 'Unknown'.
    rating_type_dict = {'PG-13 - Teens 13 or older': 'PG-13', 'R - 17+ (violence & profanity)': 'R',
                        'PG - Children': 'PG', 'G - All Ages': 'G', 'R+ - Mild Nudity': 'R+',
                        'Rx - Hentai': 'RX', 'None': 'Unknown'}
    anime_full['rating_type'] = anime_full['rating_type'].map(rating_type_dict).fillna('Unknown')

    # Fill NaNs in the text columns used for content features.
    for text_col in ('genre', 'studio', 'producer'):
        anime_full[text_col] = anime_full[text_col].fillna('Unknown')

    # Format the anime titles.
    for title_col in ('name', 'title_english'):
        anime_full[title_col] = anime_full[title_col].str.title()
    return anime_full


def _genre_count_frame(anime_full):
    '''Token counts of the 'genre' text, one row per row of anime_full, indexed by anime_id.'''
    vect = CountVectorizer()
    counts = vect.fit_transform(anime_full['genre'])
    # get_feature_names() was removed in scikit-learn 1.2; use the replacement when it exists.
    if hasattr(vect, 'get_feature_names_out'):
        features = vect.get_feature_names_out()
    else:
        features = vect.get_feature_names()
    return pd.DataFrame(counts.toarray(), columns=features, index=anime_full['anime_id'])


def _cosine_similarity_frame(features):
    '''Pairwise cosine similarity between the rows of features, labelled by its index.'''
    similarities = cosine_similarity(features)
    return pd.DataFrame(similarities, index=features.index, columns=features.index)


def sim_mat(anime_full, ver='basic'):
    '''
    Returns similarity matrix with cosine similarity based on the anime dataframe provided

    INPUT -
    anime_full: Formatted anime dataframe using the full_anime_df function, size (nxm)
    ver: version of similarity matrix available, select from:
        basic: one-hot encoded type, source and rating_type
        genre: The above + weighted_rating + genre token counts
        adv: genre + 0/1 flags for a fixed list of studios & producers

    OUTPUT -
    dataframe containing the similarities between each anime, indexed and
    labelled by anime_id. For 'genre' and 'adv', rows without a weighted_rating
    are dropped, so the result can be smaller than (nxn).
    If ver is not one of the three options, the message string
    'Please Select basic, genre, or adv for ver' is returned instead
    (kept for backward compatibility; callers should check for it).
    '''
    categorical_cols = ['type', 'source', 'rating_type']

    if ver not in ('basic', 'genre', 'adv'):
        return 'Please Select basic, genre, or adv for ver'

    if ver == 'basic':
        features = anime_full[['anime_id'] + categorical_cols]
        # dtype=float keeps the dummy columns numeric regardless of pandas version.
        features = pd.get_dummies(features, columns=categorical_cols, dtype=float).set_index('anime_id')
        return _cosine_similarity_frame(features)

    base_cols = ['anime_id'] + categorical_cols + ['weighted_rating']
    if ver == 'adv':
        base_cols = base_cols + ['studio', 'producer']
    # Work on a copy: new columns are added below and must not write into a slice of anime_full.
    features = anime_full[base_cols].copy()

    if ver == 'adv':
        top_studios = ['Studio Chizu', 'Marvy Jack', 'Bandai Namco Pictures', 'White Fox', 'Purple Cow Studio Japan',
                       'Shuka', 'Egg Firm', 'Square Enix', 'Wit Studio', 'Lay-duce', 'Studio Ghibli', 'Graphinica',
                       'David Production', 'Bridge', 'Animation Do', 'P.A. Works', 'Kyoto Animation', 'Manglobe',
                       'Artland', 'Hoods Drifters Studio']
        # NOTE(review): 'Top-Insight International Co.' and 'LTD.' look like one name split
        # on a comma, and some names contain '&amp;' — confirm against the raw producer strings.
        top_producers = ['Studio Moriken', 'Quaras', 'Seikaisha', 'Mad Box', 'Forecast Communications',
                         'StudioRF Inc.', 'CIC', 'TAP', 'Miracle Robo', 'Madoka Partners', 'Animation Do',
                         'Studio Wombat', 'GYAO!', 'Shingeki no Kyojin Team', 'C &amp; I entertainment',
                         'Top-Insight International Co.', 'LTD.', 'East Japan Marketing &amp; Communications',
                         'Audio Highs', 'Banpresto']
        # Substring flags. Column names are prefixed because 'Animation Do' is in both
        # lists and would otherwise have its studio flag overwritten by the producer flag.
        for studio in top_studios:
            features['studio_' + studio] = (
                features['studio'].str.contains(studio, regex=False, na=False).astype(int)
            )
        # Each producer flag tests its own producer name (the earlier version tested the
        # leftover `studio` loop variable here).
        for producer in top_producers:
            features['producer_' + producer] = (
                features['producer'].str.contains(producer, regex=False, na=False).astype(int)
            )
        features = features.drop(columns=['studio', 'producer'])

    features = features.merge(right=_genre_count_frame(anime_full), how='inner', on='anime_id')
    features = pd.get_dummies(features, columns=categorical_cols, dtype=float).set_index('anime_id')
    # cosine_similarity cannot handle NaN, so titles without a weighted rating are excluded.
    features = features.dropna(axis=0, subset=['weighted_rating'])
    return _cosine_similarity_frame(features)

In [3]:
# Build the cleaned anime table once, then derive the two similarity matrices from it.
anime_full = full_anime_df(rating_df=rating_df, anime_df=anime_df, anime_meta=anime_meta)
sim_mat_basic, sim_mat_genre = (
    sim_mat(anime_full, ver=version) for version in ('basic', 'genre')
)

In [4]:
# Only users with more than this many ratings are split into train/test below;
# the rest are kept aside in remaining_df.
MIN_RATINGS_PER_USER = 50

# Ratings per user.
filt = rating_df.groupby('user_id')['rating'].count()
user_ids = filt[filt > MIN_RATINGS_PER_USER].index.values

# Compute the membership mask once and use it for both partitions.
is_active_user = rating_df['user_id'].isin(user_ids)
over_df = rating_df[is_active_user]
remaining_df = rating_df[~is_active_user]

# The split is stratified on user_id, so user_id is the "label" (y) and the
# remaining columns (anime_id, rating) are the "features" (X).
y = over_df['user_id']
X = over_df.drop(columns=['user_id'])

In [5]:
# Hold out 20% of the rows, stratified by user_id so every user in `y`
# contributes proportionally to both parts. Fixed seed for repeatable splits.
anime_train, anime_test, user_train, user_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [6]:
# Re-attach user_id to each half of the split (rows share the same index labels).
train_over_split = pd.concat(objs=[anime_train, user_train], axis='columns')
test_df2 = pd.concat(objs=[anime_test, user_test], axis='columns')

# Training data = training half of the split users + all users that were not split.
train = pd.concat(objs=[train_over_split, remaining_df], axis='index')

test_df2

Unnamed: 0,anime_id,rating,user_id
974670,1313,7,9020
2778851,10389,9,26106
1510424,18153,9,14557
286914,34240,9,2974
4842812,11433,9,46210
...,...,...,...
3003712,18679,8,27858
5471119,4081,8,51597
1941011,182,8,18848
4053948,902,8,38466


In [19]:
# NOTE(review): user 42653 is excluded from the evaluation set; the reason is not
# recorded in this notebook — TODO document it.
# .copy() makes test_df an independent frame: prediction columns are added to it
# later, and assigning into a filtered slice triggers SettingWithCopyWarning.
test_df = test_df2[test_df2['user_id'] != 42653].copy()

In [20]:
# Sanity check: rows per (user_id, anime_id) pair, largest first.
# A top value of 1 means no user rated the same anime twice in the test set.
(
    test_df
    .groupby(['user_id', 'anime_id'])['rating']
    .count()
    .sort_values(ascending=False)
)

user_id  anime_id
73515    12445       1
24563    3455        1
         121         1
         442         1
         594         1
                    ..
49748    2904        1
         2336        1
         2305        1
         2303        1
3        170         1
Name: rating, Length: 1135604, dtype: int64

In [24]:
# Users and titles that appear in the evaluation set.
test_users = test_df['user_id'].unique()
test_users = test_users[test_users != 42653]
test_anime = test_df['anime_id'].unique()

# Restrict the ratings table to those users and titles.
# NOTE(review): this filters the full rating_df (which still contains the held-out
# test rows) rather than `train`, which is built earlier but not used in the cells
# shown here. Confirm this is intended, otherwise held-out ratings inform predictions.
rated_test_anime = rating_df['anime_id'].isin(test_anime)
rated_by_test_user = rating_df['user_id'].isin(test_users)
rating_df_red = rating_df[rated_test_anime & rated_by_test_user]

# Restrict both similarity matrices to the titles in the evaluation set.
sim_mat_red_basic = sim_mat_basic.loc[
    sim_mat_basic.index.isin(test_anime), sim_mat_basic.columns.isin(test_anime)
]
sim_mat_red_genre = sim_mat_genre.loc[
    sim_mat_genre.index.isin(test_anime), sim_mat_genre.columns.isin(test_anime)
]

rating_df.shape, rating_df_red.shape

((6337241, 3), (5674303, 3))

In [22]:
def pred_user_rating(rating_df, sim_mat, user_id, anime_id, k=10):
    """Predict a user's rating for one anime as a similarity-weighted average.

    The prediction uses the user's ratings of the (at most) ``k`` titles most
    similar to ``anime_id``, weighted by their similarity to ``anime_id``.
    The user's own rating of ``anime_id`` (if present) is never used.

    INPUT -
    rating_df: dataframe with 'user_id', 'anime_id' and 'rating' columns
    sim_mat: square similarity dataframe with anime ids as index and columns
    user_id: user to predict for
    anime_id: title to predict the rating of
    k: maximum number of most-similar rated titles to use (default 10)

    OUTPUT -
    float prediction, or NaN when no prediction can be made: ``anime_id`` is
    not in ``sim_mat``, the user has rated no other title that is in
    ``sim_mat``, or the selected similarities sum to zero.
    """
    if anime_id not in sim_mat.index:
        return np.nan

    # Single pass over the ratings table: this user's ratings, target title excluded.
    by_user = rating_df['user_id'] == user_id
    other_title = rating_df['anime_id'] != anime_id
    user_rows = rating_df.loc[by_user & other_title, ['anime_id', 'rating']]
    # Key the ratings by anime_id (averaging any duplicate rows) so they can be
    # matched to similarities by label instead of by row position.
    user_ratings = user_rows.groupby('anime_id')['rating'].mean()

    target_sims = sim_mat.loc[anime_id]
    neighbour_sims = target_sims[target_sims.index.isin(user_ratings.index)]
    if neighbour_sims.empty:
        return np.nan

    # Stable sort so ties at the cut-off are broken deterministically.
    top_sims = neighbour_sims.sort_values(ascending=False, kind='mergesort').iloc[:k]
    weight_total = top_sims.sum()
    if weight_total == 0:
        return np.nan

    # Put the ratings in exactly the same order as the selected similarities.
    neighbour_ratings = user_ratings.reindex(top_sims.index)
    weighted_sum = np.dot(neighbour_ratings.values, top_sims.values)
    return float(weighted_sum / weight_total)

In [None]:
# Evaluate the 'basic' similarity matrix: predict every held-out rating from the
# user's most similar rated titles (top 10, see pred_user_rating) and report RMSE.
# To time this cell, put %%time on its first line.
def _predict_basic(row):
    return pred_user_rating(rating_df_red, sim_mat_red_basic, row['user_id'], row['anime_id'])


test_df['pred_basic'] = test_df.apply(_predict_basic, axis=1)
rmse_basic = np.sqrt(mean_squared_error(test_df['rating'], test_df['pred_basic']))
print(rmse_basic)



In [None]:
# Evaluate the 'genre' similarity matrix: predict every held-out rating from the
# user's most similar rated titles (top 10, see pred_user_rating) and report RMSE.
# To time this cell, put %%time on its first line.
def _predict_genre(row):
    return pred_user_rating(rating_df_red, sim_mat_red_genre, row['user_id'], row['anime_id'])


test_df['pred_genre'] = test_df.apply(_predict_genre, axis=1)
rmse_genre = np.sqrt(mean_squared_error(test_df['rating'], test_df['pred_genre']))
print(rmse_genre)