In [2]:
from collections import defaultdict
import pandas as pd
import numpy as np
import scipy
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import surprise as sp
import time
import pickle

# Data

In [3]:
#Importing the CSVs to Dataframe format
UsersDF = pd.read_csv('../raw_data/users_cleaned.csv')
AnimesDF = pd.read_csv('../raw_data/anime_cleaned.csv')
ScoresDF = pd.read_csv('../raw_data/animelists_cleaned.csv')

In [4]:
#Since ScoresDF is a huge DF (2GB of data) I`ll only take the columns that are important for the recommendation system
ScoresDF = ScoresDF[['username', 'anime_id', 'my_score', 'my_status']]

In [5]:
ScoresDF

Unnamed: 0,username,anime_id,my_score,my_status
0,karthiga,21,9,1
1,karthiga,59,7,2
2,karthiga,74,7,2
3,karthiga,120,7,2
4,karthiga,178,7,2
...,...,...,...,...
31284025,Yokonightcore,15611,9,1
31284026,Yokonightcore,27815,9,1
31284027,wargod,5945,8,2
31284028,JMc_SetoKai_LoVe,1316,9,2


In [6]:
#Analysing all the possible values for the score, this will be used as a parameter later on
lower_rating = ScoresDF['my_score'].min()
upper_rating = ScoresDF['my_score'].max()
print('Range of ratings vary between: {0} to {1}'.format(lower_rating, upper_rating))

Range of ratings vary between: 0 to 10


In [7]:
#Counting how many relevant scores each user have done, resetting the index (so the series could become a DF again) and changing the column names
UsersAndScores = ScoresDF['username'].value_counts().reset_index().rename(columns={"username": "animes_rated", "index": "username"})

In [8]:
UsersSampled = UsersDF.sample(frac = .10) #, random_state = 2)

In [9]:
UsersAndScoresSampled = pd.merge(UsersAndScores, UsersSampled, left_on = 'username', right_on = 'username', how = 'inner')

In [10]:
#Grouping users whom had the same amount of animes rated
UserRatedsAggregated = UsersAndScoresSampled['animes_rated'].value_counts().reset_index().rename(columns={"animes_rated": "group_size", "index": "animes_rated"}).sort_values(by=['animes_rated'])

KeyError: 'animes_rated'

In [None]:
#Counting how many relevant scores each anime has, resetting the index (so the series could become a DF again) and changing the column names
RatedsPerAnime = ScoresDF['anime_id'].value_counts().reset_index().rename(columns={"anime_id": "number_of_users", "index": "anime_id"})

In [None]:
#Grouping users whom had the same amount of animes rated
AnimeRatedsAggregated = RatedsPerAnime['number_of_users'].value_counts().reset_index().rename(columns={"number_of_users": "group_size", "index": "number_of_users"}).sort_values(by=['number_of_users'])

In [None]:
#Creating a dataframe of users  and animes with more than 10 interactions
UserRatedsCutten = UsersAndScoresSampled[UsersAndScoresSampled['animes_rated'] >= 10]
AnimeRatedsCutten = RatedsPerAnime[RatedsPerAnime['number_of_users'] >= 10]
#Joining (merging) our new dataframes with the interactions one (this will already deal with the sample problem,
#as it is an inner join). The "HotStart" name comes from a pun about solving the "Cold Start" issue
ScoresDFHotStart = pd.merge(ScoresDF, UserRatedsCutten, left_on = 'username', right_on = 'username', how = 'inner')
ScoresDFHotStart = pd.merge(ScoresDFHotStart, AnimeRatedsCutten, left_on = 'anime_id', right_on = 'anime_id', how = 'inner')

In [None]:
ScoresDFHotStart

In [None]:
ScoresDFHotStart.to_csv(f"{os.getcwd()}/processed_data/ScoresDFHotStart.csv", index = False)

# Training, testing and results structure


In [43]:
#random_state = 42
reader = sp.Reader(rating_scale=(0, 10))
data = sp.Dataset.load_from_df(ScoresDFHotStart[['username', 'anime_id', 'my_score']], reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=.25, random_state = random_state)
analysis = defaultdict(list)

In [44]:
# als_param_grid = {'bsl_options': {'method': ['als'],
#                               'reg_i': [3,5],
#                               'reg_u': [10],
#                               'n_epochs': [20,25,30]
#                               }
#               }

# sgd_param_grid = {'bsl_options': {'method': ['sgd'],
#                               'reg': [0.005,0.01,0.015],
#                               'n_epochs': [20,25,30],
#                               'learning_rate' : [0.005]
#                               }
#               }

# als_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, als_param_grid, measures=['rmse'], cv = 3, joblib_verbose = 0)

# sgd_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, sgd_param_grid, measures=['rmse'], cv = 3, joblib_verbose = 0)

In [45]:
# als_gs.fit(data)
# print("Best RMSE score for ALS: ", als_gs.best_score['rmse'])
# print("Best parameters for ALS: ", als_gs.best_params['rmse'])

In [46]:
# sgd_gs.fit(data)
# # For SGD
# print("Best RMSE score for SGD: ", sgd_gs.best_score['rmse'])
# print("Best parameters for SGD: ", sgd_gs.best_params['rmse'])

In [None]:
%%time
trainset = data.build_full_trainset()
algo = sp.BaselineOnly()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

last_predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
last_predictions.drop('rui', inplace = True, axis = 1)

Estimating biases using als...


In [None]:
%%time
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo_items = sp.KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)

In [None]:
def get_item_recommendations(algo_items,anime_title, anime_id=100000, k=10):
    if anime_id == 100000:     
        anime_id = AnimesDF[AnimesDF['title'] == anime_title]['anime_id'].iloc[0]
        
    iid = algo_items.trainset.to_inner_iid(anime_id)
    neighbors = algo_items.get_neighbors(iid, k=k)
    raw_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in neighbors)
    df = pd.DataFrame(raw_neighbors, columns = ['Anime_ID'])
    df = pd.merge(df, AnimesDF, left_on = 'Anime_ID', right_on = 'anime_id', how = 'left')
    return df[['Anime_ID', 'title', 'genre']]

In [None]:
get_item_recommendations('Pokemon', k=10)

In [None]:
# Save the model to a file
with open('model.pickle', 'wb') as f:
    pickle.dump(algo, f)

# Save the KNNBaseline model
with open('knn_model.pickle', 'wb') as f:
    pickle.dump(algo_items, f)


In [None]:
# Load the model from a file
with open('model.pickle', 'rb') as f:
    loaded_model = pickle.load(f)

# Load the KNNBaseline model
with open('knn_model.pickle', 'rb') as f:
    loaded_knn_model = pickle.load(f)


In [None]:
algo_items = loaded_knn_model



In [None]:
get_item_recommendations(loaded_knn_model,'Pokemon', k=10)