# Recommendation System

Building recommendation systemusing scikit-surprise (surprise library)

## Installing required libraries

In [None]:
!pip3 install numpy
!pip3 install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163503 sha256=63e549390d8c2fa525c7aa2d0fe1e6eb4812d72986161ba1d81d66021873c4d7
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


## Importing required libraries

In [None]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from collections import defaultdict

## Importing data

In [None]:
df = pd.read_csv ("./BookReviews.csv")

In [None]:
df.head()

Unnamed: 0,user_id,book_id,rating
0,4980305f36ab8c2ab831e401a185f28a,13573616,5
1,4980305f36ab8c2ab831e401a185f28a,12712367,5
2,4980305f36ab8c2ab831e401a185f28a,12309315,5
3,4980305f36ab8c2ab831e401a185f28a,11297580,5
4,4980305f36ab8c2ab831e401a185f28a,21412000,5


In [None]:
df.columns = ['user', 'item', 'rating']

In [None]:
df.shape

(1495009, 3)

To load a dataset from a pandas dataframe the load_from_df() method is used. The pandas dataframe must have three columns, corresponding to the user, item, and rating in the same order.

In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)



### Matrix Factorization-based algorithms

In [None]:
#Splitting the dataset into train and test

trainset, testset = train_test_split(data, test_size=0.20)

#Singular Value Decomposition

In [None]:
# SVD algorithm.
algo = SVD(biased = True)

# Train SVD on the trainset, and predicting ratings
algo.fit(trainset)
predictions = algo.test(testset)

# Computing RMSE
print("Singular Value Decomposition RMSE: ", accuracy.rmse(predictions))

RMSE: 0.7048
Singular Value Decomposition RMSE:  0.7047975482934246


#Singular Value Decomposition++

In [None]:
# SVD++ algorithm.
algo = SVDpp()

# Training SVD++ on the trainset, and predicting ratings
algo.fit(trainset)
predictions = algo.test(testset)

# Computing RMSE
print("Singular Value Decomposition++ RMSE: ", accuracy.rmse(predictions))

RMSE: 0.7015
Singular Value Decomposition++ RMSE:  0.7014596660301454


#Non Negative Matrix Factorization

In [None]:
# NMF algorithm
algo = NMF()

# Training NMF on the trainset, and predicting ratings
algo.fit(trainset)
predictions = algo.test(testset)

# Computing RMSE
print("Non Matrix Factorization RMSE: ", accuracy.rmse(predictions))

RMSE: 0.7552
Non Matrix Factorization RMSE:  0.755196463662705


#Probabilistic Matrix Factorization

In [None]:
# PMF algorithm
algo = SVD(biased = False)

# Train SVD on the trainset, and predicting ratings
algo.fit(trainset)
predictions = algo.test(testset)

# Computing RMSE
print("Probabilistic Matrix Factorization RMSE: ", accuracy.rmse(predictions))

RMSE: 0.6972
Probabilistic Matrix Factorization RMSE:  0.6971668109923449


#Getting Prediction

In [None]:
#Getting a rating for a particular user and item

uid = str("8842281e1d1347389f2ab93d60773d4d ")  # raw user id (as in the ratings file).
iid = str(24815)  # raw item id (as in the ratings file).
r_ui = 5
# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui, verbose=True)

user: 8842281e1d1347389f2ab93d60773d4d  item: 24815      r_ui = 5.00   est = 4.01   {'was_impossible': True, 'reason': 'User and item are unknown.'}


In [None]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print("User ID: {} Top 10 Book Recommendations: {}".format(uid, [iid for (iid, _) in user_ratings]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User ID: 6580486947afcb6614fd820795ad4b15 Top 10 Book Recommendations: [5989573, 18594409, 20578515, 29802, 13602241, 12191040, 29534432, 12213010, 16057353, 471899]
User ID: 022b51adae01601d384e5b61a07c994f Top 10 Book Recommendations: [22464457, 23012877, 12033364, 23017947, 17264824, 16126395, 8450597, 13228239, 2111326, 17727303]
User ID: 990baf554c33c40bbe24f12935076dfb Top 10 Book Recommendations: [122410, 24816, 122404, 5805, 1593072, 9721811, 28862528, 96358, 133017, 23519505]
User ID: a977436397448d238d4f3ae9b917ac9b Top 10 Book Recommendations: [784222, 6867949, 18691081, 5970382, 16158179, 263145, 6599093, 400636, 4888736, 1087204]
User ID: 3f2d33eb7acaada413398269aeab1b31 Top 10 Book Recommendations: [96358, 154798, 296901, 167010, 106586, 2473642, 306608, 15752115, 21330, 209968]
User ID: d6f533a2f49b0dcd6b4f70f37ad424be Top 10 Book Recommendations: [17131869, 15704307, 25451555, 25066780, 9341409, 920607, 29

In [None]:
param_grid = {
    "n_epochs": [10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], refit=True, cv=5)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 0.7057321431444235
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
