In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install scikit-surprise
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise import SVD
import os
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 12.7 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626511 sha256=82db5d519a900e13a22bba1e45ab4859a2afaf8d0005fb32da42b3562998fa1e
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
anime_df = pd.read_csv("/content/drive/MyDrive/anime.csv")
rating_df = pd.read_csv("/content/drive/MyDrive/rating.csv")

In [4]:
rating_df.loc[rating_df.rating == -1, 'rating'] = np.NaN
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [5]:
merge_df = pd.merge(anime_df,rating_df,on=['anime_id','anime_id'])
merge_df= merge_df[merge_df.user_id <= 20000]

In [6]:
df_merged = merge_df[merge_df['rating_x'] != -1]

In [7]:
df_merged.shape

(2065588, 9)

In [8]:
df_merged_new = df_merged[(df_merged['type'] == 'TV')]
df_merged_new.shape

(1412670, 9)

In [9]:
df_merged_new = df_merged_new[['user_id', 'name', 'rating_x']]
df_merged_new.shape

(1412670, 3)

In [10]:
# Reading the dataset
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_merged_new,reader)

In [11]:
trainset, testset = train_test_split(data, test_size=0.25,random_state=10)

In [12]:
# Algorithm 1 - KNNWithMeans

sim_options = {'name': ['cosine','pearson_baseline'],
               'user_based': [False]}
param_grid = {'sim_options': sim_options,
              'k': [10, 50]}

gs_knn = GridSearchCV(KNNWithMeans, param_grid, measures = ['rmse'], cv = 3, return_train_measures = True)
gs_knn.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity mat

In [13]:
gs_knn.cv_results

{'split0_test_rmse': array([0.02930437, 0.02930437, 0.02930437, 0.02930437]),
 'split0_train_rmse': array([1.67412070e-15, 1.58227821e-15, 1.44665411e-15, 1.53033091e-15]),
 'split1_test_rmse': array([0.02879989, 0.02879989, 0.02879989, 0.02879989]),
 'split1_train_rmse': array([1.69487877e-15, 1.61350139e-15, 1.46290003e-15, 1.55682743e-15]),
 'split2_test_rmse': array([0.02806065, 0.02806065, 0.02806065, 0.02806065]),
 'split2_train_rmse': array([1.72162704e-15, 1.63596810e-15, 1.49345622e-15, 1.58197928e-15]),
 'mean_test_rmse': array([0.02872164, 0.02872164, 0.02872164, 0.02872164]),
 'std_test_rmse': array([0.00051075, 0.00051075, 0.00051075, 0.00051075]),
 'mean_train_rmse': array([1.69687550e-15, 1.61058257e-15, 1.46767012e-15, 1.55637921e-15]),
 'std_train_rmse': array([1.94457054e-17, 2.20157618e-17, 1.94023142e-17, 2.10877420e-17]),
 'rank_test_rmse': array([1, 2, 3, 4]),
 'mean_fit_time': array([ 9.38624843,  9.68170929, 12.68529884, 12.45902753]),
 'std_fit_time': array([0.

In [14]:
# Getting best RMSE for training data

print("Best RMSE for train set:")
gs_knn.best_score['rmse']

Best RMSE for train set:


0.028721637947821774

In [15]:
# Getting best parameters for algorithm

gs_knn.best_params['rmse']

{'sim_options': {'name': 'cosine', 'user_based': False}, 'k': 10}

In [16]:
# Using best parameters to find RMSE by fitting on trainset

algo_knn = KNNWithMeans(k=10, sim_options={'name': 'cosine', 'user_based': False})
algo_knn.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f0c6f11df70>

In [17]:
# run the trained model against the testset
test_pred_knn = algo_knn.test(testset)

In [18]:
# get RMSE for test data
print("Item-based Model : Test Set")
accuracy.rmse(test_pred_knn, verbose=True)

Item-based Model : Test Set
RMSE: 0.0257


0.02566448129351555

In [19]:
test_pred_knn[:10]

[Prediction(uid=12588, iid='Ai no Wakakusa Yama Monogatari', r_ui=6.45, est=6.45, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=11956, iid='Kamigami no Asobi', r_ui=7.32, est=7.3199999999999985, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=1788, iid='Kantai Collection: KanColle', r_ui=7.04, est=7.040000000000003, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=4453, iid='Oniichan dakedo Ai Sae Areba Kankeinai yo ne!', r_ui=6.78, est=6.78, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=10049, iid='Senyuu.', r_ui=7.49, est=7.49, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=7543, iid='Shakugan no Shana II (Second)', r_ui=7.79, est=7.790000000000002, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=19994, iid='Mahoraba: Heartful days', r_ui=7.45, est=7.450000000000001, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=3391, iid='Kotoura-san', r_u

In [20]:
algo_knn.predict(3,"Naruto").est

7.8100000000000005

In [21]:
#SVD
from surprise import SVD
param_grid = {
    'n_epochs': [2, 4],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.2, 0.4]
}

gs3= GridSearchCV(SVD , param_grid, measures = ['rmse'], cv = 3)
gs3.fit(data)

In [22]:
print(gs3.best_score['rmse'])
print(gs3.best_params['rmse'])

0.15026639566962297
{'n_epochs': 4, 'lr_all': 0.005, 'reg_all': 0.2}


In [23]:
svd_algo = SVD( n_epochs = 4, lr_all = 0.005, reg_all = 0.2)
pred_svd = svd_algo.fit(trainset).test(testset)

In [24]:
print('RMSE on test set:')
accuracy.rmse(pred_svd, verbose=True)

RMSE on test set:
RMSE: 0.1468


0.14678720174569504

In [25]:
svd_algo.predict(3, 'Naruto').est

7.799398257027378