# References

- https://towardsdatascience.com/svd-where-model-tuning-goes-wrong-61c269402919
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
- https://auto-surprise.readthedocs.io/en/stable/

- https://surprise.readthedocs.io/en/stable/index.html
- https://surprise.readthedocs.io/en/stable/prediction_algorithms.html
- https://surprise.readthedocs.io/en/stable/matrix_factorization.html
- https://surprise.readthedocs.io/en/stable/model_selection.html

- https://github.com/pandas-profiling/pandas-profiling

- https://medium.com/datadriveninvestorhow-to-built-a-recommender-system-rs-616c988d64b2

- https://medium.com/@james_aka_yale/the-4-recommendation-engines-that-can-predict-your-movie-tastes-bbec857b8223

- https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

- https://github.com/nikunjlad/Movie-Recommendation-System-Using-Surprise/blob/master/Movie%20Recommender%20System.ipynb

# Import libraries

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 7.9MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617545 sha256=0d776090bec974504963f74f567c3e916742f95bcf1e85129ad404723fc36add
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import pandas_profiling as eda_pandas

import surprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold, RepeatedKFold
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

from surprise import SVD
from surprise import SVDpp

# Data setup

## google colab settings

(Below code lines can be commented if running using 'jupyter notebook')


In [3]:
# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# cd '/content/gdrive/MyDrive/CMPE_256-Prg_Ass1_014749488'

/content/gdrive/MyDrive/CMPE_256-Prg_Ass1_014749488


## Load  data

### Train data

In [5]:
df = pd.read_csv('input/train.csv','\t')
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)
trainingSet = data.build_full_trainset()

In [6]:
dataset = pd.read_csv('input/train.csv','\t')

In [7]:
n_users = len(dataset.user_id.unique())
n_books = len(dataset.book_id.unique())

### Test data

In [8]:
dt = pd.read_csv('input/test.csv','\t')

# Preprocessing(Data transformation) for surprise

Once again load the training dataset

In [9]:
df = pd.read_csv('input/train.csv','\t')
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)
trainingSet = data.build_full_trainset()

In [10]:
df.sort_values(['rating'],ascending=False).head()

Unnamed: 0,user_id,book_id,rating
0,12726,7784,5
359885,30537,759611,5
151869,33101,375901,5
151871,13492,197084,5
648308,490,37190,5


# Training & Tuning Model

## Hyperparameter tuning

## Train model 

In [12]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.6581370228603654
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [24]:
# # algo=SVD(n_epochs=50,lr_all=0.01,reg_all =0.01,n_factors =250) #initial param
# algo=SVD(n_epochs=75,lr_all=0.01,reg_all =0.1,n_factors =150) #best identified param from hyperparameter tuning

algo=SVD(n_epochs=10,lr_all=0.005, reg_all=0.4,n_factors =150)
kf = KFold(n_splits=5)
# kf = RepeatedKFold(n_splits=5)

for trainset, testset in kf.split(data):
    print('in-progress')
    # train and test algorithm.
    algo.fit(trainingSet)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

in-progress
RMSE: 1.5490
in-progress
RMSE: 1.5509
in-progress
RMSE: 1.5515
in-progress
RMSE: 1.5527
in-progress
RMSE: 1.5509


## Evaluate training(model)

**Reference:** Adapted from surprise code shared in class by Prof. Magdalini  Eirinaki

In [35]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: 
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [36]:
df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,20990,39963,0.0,1.860158,{'was_impossible': False},20,247,1.860158
1,36846,3636,4.0,2.729677,{'was_impossible': False},2,6298,1.270323
2,22122,6310,0.0,2.583284,{'was_impossible': False},5,2538,2.583284
3,15926,2647293,0.0,2.106736,{'was_impossible': False},4,59,2.106736
4,24960,23310699,0.0,2.333902,{'was_impossible': False},125,214,2.333902


In [37]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [38]:
df

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,20990,39963,0.0,1.860158,{'was_impossible': False},20,247,1.860158
1,36846,3636,4.0,2.729677,{'was_impossible': False},2,6298,1.270323
2,22122,6310,0.0,2.583284,{'was_impossible': False},5,2538,2.583284
3,15926,2647293,0.0,2.106736,{'was_impossible': False},4,59,2.106736
4,24960,23310699,0.0,2.333902,{'was_impossible': False},125,214,2.333902
...,...,...,...,...,...,...,...,...
139995,21470,631097,5.0,3.404168,{'was_impossible': False},211,98,1.595832
139996,20989,13429583,0.0,1.030356,{'was_impossible': False},407,3,1.030356
139997,25107,1900124,0.0,1.448457,{'was_impossible': False},1,97,1.448457
139998,23409,78418,4.0,3.376592,{'was_impossible': False},21,1073,0.623408


### Good predictions

In [39]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
18993,36367,663318,0.0,0.0,{'was_impossible': False},154,23,0.0
68837,16445,22676088,0.0,0.0,{'was_impossible': False},1162,20,0.0
77859,4772,15865376,0.0,0.0,{'was_impossible': False},115,26,0.0
85673,9089,13228487,0.0,0.0,{'was_impossible': False},740,71,0.0
65658,25901,15843645,0.0,0.0,{'was_impossible': False},9,57,0.0
57030,4827,17160901,0.0,0.0,{'was_impossible': False},56,27,0.0
72399,30763,1582795,0.0,0.0,{'was_impossible': False},160,30,0.0
99306,1030,79088,0.0,0.0,{'was_impossible': False},126,32,0.0
121,13645,33158525,0.0,0.0,{'was_impossible': False},19,128,0.0
125839,3427,16161561,0.0,0.0,{'was_impossible': False},38,27,0.0


### Bad predictions

In [40]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
125976,272,22675931,5.0,0.499248,{'was_impossible': False},43,34,4.500752
48436,12027,674749,5.0,0.497782,{'was_impossible': False},66,187,4.502218
106961,5355,707243,5.0,0.49626,{'was_impossible': False},349,22,4.50374
51071,4576,6455548,5.0,0.482457,{'was_impossible': False},177,48,4.517543
93524,31306,30312547,5.0,0.448596,{'was_impossible': False},603,69,4.551404
54065,22533,17262584,5.0,0.444796,{'was_impossible': False},133,22,4.555204
50052,3883,25817074,5.0,0.377173,{'was_impossible': False},155,85,4.622827
129958,20989,420180,5.0,0.29972,{'was_impossible': False},407,117,4.70028
50693,30907,18225037,5.0,0.262807,{'was_impossible': False},731,83,4.737193
101116,15795,22731743,5.0,0.256861,{'was_impossible': False},79,59,4.743139


# Rating Prediction


In [41]:
result=[]
result1=[]
id=[]

for index, row in dt.iterrows():
    id.append(str((row['user_id']))+'-'+str((row['book_id'])))
    result1.append(algo.predict(row['user_id'], row['book_id']).est)

In [42]:
result=pd.DataFrame({'user_id-book_id':pd.Series(id),'rating':pd.Series(result1) }) #as per o/p format

In [43]:
print(result)

       user_id-book_id    rating
0        20989-1832332  0.772055
1         37040-191139  3.031891
2       36167-28449164  1.963437
3        9398-24693869  0.856013
4           29848-8127  3.008135
...                ...       ...
299601     15976-38709  2.932356
299602     24853-11312  1.765252
299603  29982-10697427  2.960274
299604     6324-157993  2.100539
299605      27262-6310  2.730144

[299606 rows x 2 columns]


#### Store predictions

In [44]:
result.to_csv('output/predictions-svd-75.csv',index=False)

#### Evaluate Prediction results
https://www.kaggle.com/c/cmpe256-s21-book-recommendations/leaderboard 

**Team Name:** Sudha Vijayakumar