In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import scipy
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import surprise as sp
import time
import pickle
import os

# Data

In [19]:
#Importing the CSVs to Dataframe format
UsersDF = pd.read_csv('../raw_data/users_cleaned.csv')
AnimesDF = pd.read_csv('../raw_data/anime_cleaned.csv')
ScoresDF = pd.read_csv('../raw_data/animelists_cleaned.csv')

In [20]:
#Since ScoresDF is a huge DF (2GB of data) I`ll only take the columns that are important for the recommendation system
ScoresDF = ScoresDF[['username', 'anime_id', 'my_score', 'my_status']]

In [21]:
#Analysing all the possible values for the score, this will be used as a parameter later on
lower_rating = ScoresDF['my_score'].min()
upper_rating = ScoresDF['my_score'].max()
print('Range of ratings vary between: {0} to {1}'.format(lower_rating, upper_rating))

Range of ratings vary between: 0 to 10


In [22]:
#Counting how many relevant scores each user have done, resetting the index (so the series could become a DF again) and changing the column names
UsersAndScores = ScoresDF['username'].value_counts().reset_index().rename(columns={"username": "animes_rated", "index": "username"})

In [23]:
UsersSampled = UsersDF.sample(frac = 0.07)

In [24]:
UsersAndScoresSampled = pd.merge(UsersAndScores, UsersSampled, left_on = 'username', right_on = 'username', how = 'inner')

In [25]:
# Assuming you want to keep 'animes_rated_x' and drop 'animes_rated_y'
UsersAndScoresSampled = UsersAndScoresSampled.rename(columns={'animes_rated_x': 'animes_rated'})

# Drop the 'animes_rated_y' column
UsersAndScoresSampled = UsersAndScoresSampled.drop(columns=['animes_rated_y'])


In [26]:
#Grouping users whom had the same amount of animes ratedUserRatedsAggregated = UsersAndScoresSampled['animes_rated_x'].value_counts().reset_index().rename(columns={"animes_rated_x": "group_size", "index": "animes_rated"}).sort_values(by=['animes_rated'])
UserRatedsAggregated = UsersAndScoresSampled['animes_rated'].value_counts().reset_index().rename(columns={"animes_rated": "group_size", "index": "animes_rated"}).sort_values(by=['animes_rated'])

In [27]:

print(UsersAndScoresSampled.columns)  # This will print all column names of the DataFrame


Index(['username', 'animes_rated', 'anime_id', 'my_score', 'my_status',
       'user_id', 'user_watching', 'user_completed', 'user_onhold',
       'user_dropped', 'user_plantowatch', 'user_days_spent_watching',
       'gender', 'location', 'birth_date', 'access_rank', 'join_date',
       'last_online', 'stats_mean_score', 'stats_rewatched', 'stats_episodes',
       'number_of_users'],
      dtype='object')


In [28]:
#Counting how many relevant scores each anime has, resetting the index (so the series could become a DF again) and changing the column names
RatedsPerAnime = ScoresDF['anime_id'].value_counts().reset_index().rename(columns={"anime_id": "number_of_users", "index": "anime_id"})

In [29]:
#Grouping users whom had the same amount of animes rated
AnimeRatedsAggregated = RatedsPerAnime['number_of_users'].value_counts().reset_index().rename(columns={"number_of_users": "group_size", "index": "number_of_users"}).sort_values(by=['number_of_users'])

In [30]:
UsersAndScoresSampled[UsersAndScoresSampled['animes_rated'] >= 20]

Unnamed: 0,username,animes_rated,anime_id,my_score,my_status,user_id,user_watching,user_completed,user_onhold,user_dropped,...,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,number_of_users
0,Exxorn,6536,28617,7,2,3979333,15,948,53,0,...,Male,Germany,1998-07-21 00:00:00,,2014-07-31 00:00:00,2018-05-16 01:07:09,3.43,170.0,7735,10898
1,Exxorn,6536,18097,0,6,3979333,15,948,53,0,...,Male,Germany,1998-07-21 00:00:00,,2014-07-31 00:00:00,2018-05-16 01:07:09,3.43,170.0,7735,978
2,Exxorn,6536,1126,0,6,3979333,15,948,53,0,...,Male,Germany,1998-07-21 00:00:00,,2014-07-31 00:00:00,2018-05-16 01:07:09,3.43,170.0,7735,1539
3,Exxorn,6536,33026,0,6,3979333,15,948,53,0,...,Male,Germany,1998-07-21 00:00:00,,2014-07-31 00:00:00,2018-05-16 01:07:09,3.43,170.0,7735,430
4,Exxorn,6536,24531,0,6,3979333,15,948,53,0,...,Male,Germany,1998-07-21 00:00:00,,2014-07-31 00:00:00,2018-05-16 01:07:09,3.43,170.0,7735,1688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2184438,schiiz0phrenia,20,4898,0,2,483569,0,15,1,0,...,Female,Washington~,1989-07-02 00:00:00,,2011-05-06 00:00:00,2011-11-19 17:16:00,9.00,5.0,446,43956
2184439,schiiz0phrenia,20,12049,0,6,483569,0,15,1,0,...,Female,Washington~,1989-07-02 00:00:00,,2011-05-06 00:00:00,2011-11-19 17:16:00,9.00,5.0,446,10476
2184440,schiiz0phrenia,20,849,0,6,483569,0,15,1,0,...,Female,Washington~,1989-07-02 00:00:00,,2011-05-06 00:00:00,2011-11-19 17:16:00,9.00,5.0,446,56787
2184441,schiiz0phrenia,20,5680,0,2,483569,0,15,1,0,...,Female,Washington~,1989-07-02 00:00:00,,2011-05-06 00:00:00,2011-11-19 17:16:00,9.00,5.0,446,47743


In [44]:

# Filter users and animes with more than 10 interactions before merging
UsersAndScoresSampled = UsersAndScoresSampled[UsersAndScoresSampled['animes_rated'] >= 1000]
RatedsPerAnime = RatedsPerAnime[RatedsPerAnime['number_of_users'] >= 10000]

# Create a list of valid usernames and anime_ids
valid_usernames = UsersAndScoresSampled['username'].unique()
valid_anime_ids = RatedsPerAnime['anime_id'].unique()

# Filter the ScoresDF before merging
ScoresDF = ScoresDF[ScoresDF['username'].isin(valid_usernames)]
ScoresDF = ScoresDF[ScoresDF['anime_id'].isin(valid_anime_ids)]

# No need to merge anymore, as we've already filtered the ScoresDF
ScoresDFHotStart = ScoresDF


In [40]:
# #Creating a dataframe of users  and animes with more than 10 interactions
# UserRatedsCutten = UsersAndScoresSampled[UsersAndScoresSampled['animes_rated'] >= 20]
# AnimeRatedsCutten = RatedsPerAnime[RatedsPerAnime['number_of_users'] >= 20]
# #Joining (merging) our new dataframes with the interactions one (this will already deal with the sample problem,
# #as it is an inner join). The "HotStart" name comes from a pun about solving the "Cold Start" issue
# ScoresDFHotStart = pd.merge(ScoresDF, UserRatedsCutten, left_on = 'username', right_on = 'username', how = 'inner')
# ScoresDFHotStart = pd.merge(ScoresDFHotStart, AnimeRatedsCutten, left_on = 'anime_id', right_on = 'anime_id', how = 'inner')

In [45]:
ScoresDFHotStart.shape

(2125486, 4)

In [34]:
ScoresDFHotStart.to_csv("ScoresDFHotStart-small.csv", index = False)
print("Data processing completed.")

Data processing completed.


In [46]:
ScoresDFHotStart

Unnamed: 0,username,anime_id,my_score,my_status
1200,MistButterfly,21,0,1
1201,MistButterfly,59,6,2
1203,MistButterfly,120,8,2
1205,MistButterfly,210,7,2
1206,MistButterfly,232,9,2
...,...,...,...,...
31055152,Montinihabato13,33988,0,1
31055159,Montinihabato13,34240,8,2
31055171,Montinihabato13,34561,0,1
31055174,Montinihabato13,34618,7,2


# Training, testing and results structure


In [48]:
#random_state = 42
reader = sp.Reader(rating_scale=(0, 10))
data = sp.Dataset.load_from_df(ScoresDFHotStart[['username', 'anime_id', 'my_score']], reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=.25)
analysis = defaultdict(list)

In [44]:
# als_param_grid = {'bsl_options': {'method': ['als'],
#                               'reg_i': [3,5],
#                               'reg_u': [10],
#                               'n_epochs': [20,25,30]
#                               }
#               }

# sgd_param_grid = {'bsl_options': {'method': ['sgd'],
#                               'reg': [0.005,0.01,0.015],
#                               'n_epochs': [20,25,30],
#                               'learning_rate' : [0.005]
#                               }
#               }

# als_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, als_param_grid, measures=['rmse'], cv = 3, joblib_verbose = 0)

# sgd_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, sgd_param_grid, measures=['rmse'], cv = 3, joblib_verbose = 0)

In [45]:
# als_gs.fit(data)
# print("Best RMSE score for ALS: ", als_gs.best_score['rmse'])
# print("Best parameters for ALS: ", als_gs.best_params['rmse'])

In [46]:
# sgd_gs.fit(data)
# # For SGD
# print("Best RMSE score for SGD: ", sgd_gs.best_score['rmse'])
# print("Best parameters for SGD: ", sgd_gs.best_params['rmse'])

In [49]:
%%time
trainset = data.build_full_trainset()
algo = sp.BaselineOnly()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

last_predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
last_predictions.drop('rui', inplace = True, axis = 1)

Estimating biases using als...
CPU times: user 15.5 s, sys: 182 ms, total: 15.7 s
Wall time: 15.8 s


In [50]:
%%time
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo_items = sp.KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 27 s, sys: 127 ms, total: 27.2 s
Wall time: 27.2 s


<surprise.prediction_algorithms.knns.KNNBaseline at 0x31b10bac0>

In [51]:
def get_item_recommendations(algo ,algo_items,anime_title, anime_id=100000, k=10):
    if anime_id == 100000:     
        anime_id = AnimesDF[AnimesDF['title'] == anime_title]['anime_id'].iloc[0]
        
    iid = algo_items.trainset.to_inner_iid(anime_id)
    neighbors = algo_items.get_neighbors(iid, k=k)
    raw_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in neighbors)
    df = pd.DataFrame(raw_neighbors, columns = ['Anime_ID'])
    df = pd.merge(df, AnimesDF, left_on = 'Anime_ID', right_on = 'anime_id', how = 'left')
    return df[['Anime_ID', 'title', 'genre']]

In [52]:
get_item_recommendations('Pokemon', k=10)

TypeError: get_item_recommendations() missing 1 required positional argument: 'anime_title'

In [None]:
# Save the model to a file
with open('model.pickle', 'wb') as f:
    pickle.dump(algo, f)

# Save the KNNBaseline model
with open('knn_model.pickle', 'wb') as f:
    pickle.dump(algo_items, f)


In [None]:
# Load the model from a file
with open('model.pickle', 'rb') as f:
    loaded_model = pickle.load(f)

# Load the KNNBaseline model
with open('knn_model.pickle', 'rb') as f:
    loaded_knn_model = pickle.load(f)


In [None]:
algo_items = loaded_knn_model



In [None]:
get_item_recommendations(loaded_knn_model,'Pokemon', k=10)