In [None]:
# Install Surprise into the environment of the running kernel.
# %pip (rather than !pip) targets the kernel's own interpreter, and
# scikit-surprise is the distribution name the Surprise project publishes under.
%pip install scikit-surprise



In [None]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Fetch the built-in MovieLens 1M ratings; on first use Surprise prompts
# before downloading the archive into its local data directory.
data = Dataset.load_builtin(name='ml-1m')

Dataset ml-1m could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...
Done! Dataset ml-1m has been saved to /root/.surprise_data/ml-1m


In [None]:
# Hold out a random 20% of the ratings as the test set.
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): when the notebook is run top to bottom, trainset/testset are
# rebound from the CSV files further down, so the models below do not use this split.
trainset, testset = train_test_split(data, test_size=.20, random_state=42)

In [None]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe,
                                            rating_scale=(0, 5),
                                            columns=('userId', 'movieId', 'rating')):
    """Turn a pair of rating dataframes into a Surprise trainset and testset.

    Parameters
    ----------
    training_dataframe, testing_dataframe : pandas.DataFrame
        Frames holding at least the three columns named in ``columns``.
    rating_scale : tuple, default (0, 5)
        ``(min, max)`` rating bounds handed to ``Reader``.
    columns : sequence of str, default ('userId', 'movieId', 'rating')
        User, item and rating column names, in that order (the order
        ``Dataset.load_from_df`` expects).

    Returns
    -------
    (trainset, testset)
        ``trainset`` is the result of ``construct_trainset`` on the training
        ratings; ``testset`` is the result of ``construct_testset`` on the
        testing ratings.

    Raises
    ------
    KeyError
        If either dataframe lacks one of the requested columns.
    """
    columns = list(columns)

    # Fail early with a message that says which frame is at fault, instead of
    # the bare pandas KeyError raised by the column selection below.
    for label, frame in (('training', training_dataframe), ('testing', testing_dataframe)):
        missing = [col for col in columns if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} dataframe is missing required column(s): {missing}")

    reader = Reader(rating_scale=rating_scale)
    train_data = Dataset.load_from_df(training_dataframe[columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[columns], reader)

    # Every row of each frame goes wholesale into its own set; no folds or
    # resampling happen here.
    trainset = train_data.construct_trainset(train_data.raw_ratings)
    testset = test_data.construct_testset(test_data.raw_ratings)
    return trainset, testset

In [None]:
# Read the pre-made train/test CSV files and convert them for Surprise.
# This rebinds trainset/testset, which is what every model below uses.
file_path_train, file_path_test = 'training_data.csv', 'testing_data.csv'
traindf, testdf = (pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test))
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [None]:
def recommendation(algo, trainset, testset):
  """Fit ``algo`` on ``trainset`` and score its predictions for ``testset``.

  Returns a ``(rmse, mae, predictions)`` tuple: ``predictions`` is what
  ``algo.test(testset)`` returned, and the two metrics are the values of
  ``accuracy.rmse`` and ``accuracy.mae`` computed on those predictions.
  """
  algo.fit(trainset)

  # Score the held-out ratings; RMSE is computed first, then MAE.
  predictions = algo.test(testset)
  rmse_score, mae_score = accuracy.rmse(predictions), accuracy.mae(predictions)

  return rmse_score, mae_score, predictions

In [None]:
# results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

#### Experimenting

In [None]:
print('Using ALS')
# Baseline-only model whose biases are estimated with ALS, using explicit
# epoch count and user/item regularisation settings.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
algo = BaselineOnly(bsl_options=bsl_options)
test_rmse, test_mae, test_pred = recommendation(algo, trainset=trainset, testset=testset)

Using ALS
Estimating biases using als...
RMSE: 0.8677
MAE:  0.6659


In [None]:
print('Using SGD')
# The SGD options must actually be passed to the model: a bare BaselineOnly()
# estimates biases with ALS (its log line reads "Estimating biases using als..."),
# which made this cell a duplicate of the default baseline run.
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
algo = BaselineOnly(bsl_options=bsl_options)
test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)

Using SGD
Estimating biases using als...
RMSE: 0.8735
MAE:  0.6718


##### Calculating predictions for the top methods:

In [None]:
# KNNBaseline with its default settings
algo = KNNBaseline()
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(
    algo, trainset=trainset, testset=testset
)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8763
MAE:  0.6660


In [None]:
# SlopeOne with its default settings
algo = SlopeOne()
test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(
    algo, trainset=trainset, testset=testset
)

RMSE: 0.9070
MAE:  0.7145


In [None]:
# SVD
# The factor initialisation is random, so an unseeded SVD() reports slightly
# different RMSE/MAE on every run; a fixed random_state makes the numbers reproducible.
algo = SVD(random_state=42)
test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)

RMSE: 0.8743
MAE:  0.6858


In [None]:
# SVDpp
# As with SVD, the initialisation is random; fixing random_state keeps the
# reported RMSE/MAE stable across runs.
algo = SVDpp(random_state=42)
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)

RMSE: 0.8697
MAE:  0.6643


In [None]:
# BaselineOnly with its default bias-estimation settings
algo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred = recommendation(
    algo, trainset=trainset, testset=testset
)

Estimating biases using als...
RMSE: 0.8735
MAE:  0.6718


In [None]:
# Empty table with one column per model's estimate, plus the user id,
# item id and the original rating.
prediction_columns = [
    'uid', 'iid', 'og_rating',
    'svd_rating', 'knn_rating', 'svdpp_rating',
    'slopeone_rating', 'baseline_rating',
]
test_pred_df = pd.DataFrame(columns=prediction_columns)

In [None]:
# Number of test-set predictions, taken from the baseline run.
num_test = len(test_base_pred)
print(f'{num_test}')

200042


##### Storing testing set predictions:

In [None]:
# Assemble the combined prediction table in a single pass.
# Growing a DataFrame with pd.concat once per row copies the whole frame on
# every iteration (quadratic in the number of predictions), and re-running
# such a cell appends duplicate rows. Collecting plain tuples and building
# the frame once avoids both.
prediction_lists = (test_svd_pred, test_knn_pred, test_svdpp_pred,
                    test_slopeone_pred, test_base_pred)
assert len({len(preds) for preds in prediction_lists}) == 1, \
    "prediction lists differ in length"

rows = []
for svd, knn, svdpp, slopeone, baseline in zip(*prediction_lists):
  # All five models must be describing the same (user, item) pair in this row.
  if any((other.uid, other.iid) != (svd.uid, svd.iid)
         for other in (knn, svdpp, slopeone, baseline)):
    raise ValueError(f"prediction lists are misaligned at uid={svd.uid!r}, iid={svd.iid!r}")
  rows.append((svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, baseline.est))

# The previous loop prepended each new row, so the saved table ran from the
# last prediction to the first; reversing here keeps that row order.
test_pred_df = pd.DataFrame(
    rows[::-1],
    columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating'],
)

In [None]:
# Show the combined prediction table; the notebook renders only the first and
# last few rows of a long frame.
test_pred_df

Unnamed: 0,uid,iid,og_rating,svd_rating,knn_rating,svdpp_rating,slopeone_rating,baseline_rating
0,695,2791,4.0,3.507685,3.815840,3.936685,4.240711,4.146045
1,6016,3668,3.0,3.404877,3.557922,3.658979,3.390132,3.442060
2,5482,1221,5.0,4.622452,4.491665,4.451363,4.669042,4.554867
3,3389,2959,4.0,3.899992,3.217574,4.235280,3.559392,3.450094
4,4303,608,4.0,4.093749,4.250497,4.757454,4.282707,4.180708
...,...,...,...,...,...,...,...,...
200037,1447,3412,4.0,2.678937,3.412608,3.309891,3.192129,3.238168
200038,301,3396,4.0,4.292583,4.228340,4.594647,4.128157,4.114891
200039,984,3927,3.0,3.537646,3.446079,3.486974,3.514210,3.475889
200040,4672,2369,4.0,2.638634,2.882440,2.676785,2.742415,2.817915


In [None]:
# Persist the combined predictions; the row index is written as the first,
# unnamed column.
test_pred_df.to_csv(path_or_buf='test_prediction.csv')