In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import random
import scipy.spatial.distance
import surprise as sur

sns.set(font_scale=1.5)
plt.style.use('fivethirtyeight')


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Load in the data

In [2]:
df= pd.read_csv('/Volumes/external/Sangeetha-Project/df_sub.csv.gz', 
                       compression='gzip').astype({'rating':'int8', 'total_votes':'int32'})

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273661 entries, 0 to 273660
Data columns (total 5 columns):
reviewerId     273661 non-null object
asin           273661 non-null object
rating         273661 non-null int8
summary        273661 non-null object
total_votes    273661 non-null int32
dtypes: int32(1), int8(1), object(3)
memory usage: 7.6+ MB


In [4]:
len(np.unique(df.asin))

10982

In [5]:
len(np.unique(df.reviewerId))

2647

In [6]:
df.head()

Unnamed: 0,reviewerId,asin,rating,summary,total_votes
0,A2S166WSCFIFP5,000100039X,5,close to god,2
1,AWLFVCT9128JV,000100039X,5,The Lessons Of Life,10
2,A2NHD7LUXVGTD3,000100039X,5,a beautiful poetic commentary on what it is to...,1
3,A1K1JW1C5CUSUZ,000100039X,5,Transcend Human Subjectivity to Find Godlike O...,4
4,A38AAPXSJN4C5G,000100039X,5,Inspired and Inspiring,0


## Read in the data as a DataSet

In [7]:
reader = sur.Reader(rating_scale=(1,5))
data = sur.Dataset.load_from_df(df[['reviewerId', 'asin','rating']], reader)

## Fitting the model & testing the model - 0.8578705671177028

In [8]:
#Tested this algo with gridsearch, I enter the best parameters, rmse on gridsearch = 0.8578705671177028

algo = sur.SVD(random_state=1,
        biased=True,  # isolate sdasbiases
        reg_all=0.2,  # use regularisation (the same for all)
        n_epochs=20,  # number of epochs for stochastic gradient descent search
        n_factors=100,  # number of factors to retain in SVD
        lr_all=0.01
        )


In [9]:
raw_ratings = data.raw_ratings

#shuffle ratings if you want
np.random.seed(1)
random.shuffle(raw_ratings)


#section the data into training set and test set
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

print(len(A_raw_ratings))
print(len(B_raw_ratings))

#make the raw ratings contain only the training set
data.raw_ratings = A_raw_ratings

246294
27367


In [10]:
#Built a trainset out the training set
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a24775cd0>

In [11]:
# Compute score on training set
trainset_build = trainset.build_testset()
predictions_train = algo.test(trainset_build)

print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_train))

Training score    RMSE: 0.7874
0.7873797340548762


In [12]:
# Compute score on rated test set
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_test = algo.test(testset)
print('Test score (rated items) ', end=' ')
print(sur.accuracy.rmse(predictions_test))

Test score (rated items)  RMSE: 0.8423
0.8423239670513349


## Calculating the user item matrix

We will need to the train the algo on all of the available ratings to get the most accurate readings

In [17]:
data.raw_ratings = raw_ratings

#Built a trainset using the full data
trainset_full = data.build_full_trainset()
algo.fit(trainset_full)

# Compute score on training set
trainset_full_build = trainset_full.build_testset()
predictions_full_train = algo.test(trainset_full_build)

print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_full_train))

Training score    RMSE: 0.7930
0.7930445072914099


In [18]:
pu = algo.pu
qi = algo.qi
puqi = pu.dot(qi.T)

In [19]:
print(len(pu[0]))
print(len(pu))
print(len(qi[0]))
print(len(qi))

100
2647
100
10982


In [20]:
#Calculating the user-item matrix
mu = algo.default_prediction()
print(mu)
full_pred = mu + algo.bu.reshape(-1, 1) + algo.bi.reshape(1, -1) + puqi

4.089157753571024


In [21]:
#Matching the raw ids to the inner ids
user_baselines=[]

for user in np.unique(df.reviewerId):
    user_baselines.append((user, trainset_full.to_inner_uid(user), algo.bu[trainset_full.to_inner_uid(user)]))

user_baselines[:5]

#The pu tags appear in the same order as this as the users

[('A100NGGXRQF0AQ', 2146, -0.0027465579955411085),
 ('A102Z3T7NSM5KC', 322, 0.06780723622441669),
 ('A106016KSI0YQ', 2609, -0.3932609971069783),
 ('A106E1N0ZQ4D9W', 451, 0.16551405735308525),
 ('A10BZSGALQPS0V', 1125, -0.20571085125389585)]

In [22]:
len(user_baselines)

2647

In [23]:
item_baselines=[]

for item in np.unique(df.asin):
    item_baselines.append((item, trainset_full.to_inner_iid(item), algo.bi[trainset_full.to_inner_iid(item)]))

item_baselines[:5]

[('000100039X', 8630, 0.3066740979538097),
 ('0002007770', 226, 0.32823697601367985),
 ('0002051850', 3940, 0.2881340682331206),
 ('0002219417', 2859, 0.4929340216048414),
 ('000222383X', 8086, 0.4170779150833703)]

In [24]:
len(item_baselines)

10982

In [27]:
full_pred_df = pd.DataFrame(full_pred, index = [x for x,y,z in sorted(user_baselines, key=lambda x:x[1])], 
                         columns = [x for x,y,z in sorted(item_baselines, key=lambda x:x[1])])

In [29]:
full_pred_df

Unnamed: 0,0060850523,0385494238,0307388522,0505527847,0061836923,0099740915,0441008534,031603326X,0330535382,0385341679,...,0345518446,0373831366,0312676816,0375813616,0439877563,0061085189,0307729982,006082767X,0553444867,0439845009
AELSTEJJO90M7,4.036300,3.603112,3.831126,3.874736,3.774498,4.195686,3.890036,3.530156,4.078279,4.408721,...,3.486412,4.078169,4.127698,4.021172,3.961518,4.145382,4.029335,3.746988,3.822963,4.030790
A126KX6FVI4T66,3.765427,3.352257,3.583021,3.628474,3.522730,3.951597,3.638080,3.290707,3.789471,4.156648,...,3.293556,3.853606,3.922472,3.803840,3.756798,3.888793,3.752809,3.513516,3.524143,3.824007
A2UYGOYMV73826,4.374586,3.943061,4.163610,4.281162,4.128790,4.549766,4.248808,3.880898,4.394556,4.764575,...,3.930474,4.452787,4.574799,4.418840,4.367859,4.524611,4.325662,4.085989,4.199329,4.397622
A1XCSAX4BRT89Y,3.884194,3.468248,3.683034,3.753948,3.642610,4.079316,3.756652,3.438741,3.944300,4.269853,...,3.443043,3.954529,3.977731,3.941821,3.856035,3.979830,3.875260,3.605149,3.650247,3.905538
A1XRQ6YJ7HXQFQ,4.217301,3.800595,4.010178,4.099800,3.972840,4.401161,4.086311,3.749592,4.264938,4.608794,...,3.730378,4.289272,4.356955,4.251098,4.198507,4.338258,4.225089,3.940510,3.975060,4.250414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A22ZJ9Y723DO3R,4.080288,3.653857,3.875250,3.947834,3.843550,4.265254,3.952177,3.617317,4.121598,4.468312,...,3.604369,4.143372,4.212930,4.126953,4.062115,4.184525,4.066486,3.778184,3.828208,4.088037
A33TU21N460QIS,3.936996,3.511304,3.724410,3.792150,3.685020,4.125682,3.795920,3.441553,3.972460,4.321172,...,3.464547,4.030966,4.066935,3.950922,3.905624,4.055549,3.921071,3.667737,3.730302,3.965299
A2THU56YB588IA,4.008588,3.582712,3.783283,3.881585,3.757285,4.191622,3.865694,3.503102,4.003224,4.391590,...,3.606695,4.083995,4.101850,4.026687,3.935344,4.193947,4.014738,3.704745,3.811203,4.024941
A39QEKNEFSJ34Q,3.666510,3.247615,3.462915,3.528863,3.424047,3.859672,3.534011,3.201329,3.729217,4.060807,...,3.235957,3.726256,3.758660,3.712621,3.667230,3.845424,3.691245,3.398289,3.505796,3.685553


In [30]:
full_pred_df.to_csv('/Volumes/external/Sangeetha-Project/svd_est.csv.gz', 
                    index = True, header=True, compression='gzip')

In [37]:
sur.dump.dump('/Volumes/external/Sangeetha-Project/svd_dump_file', algo=algo)

In [39]:
_, test_algo = sur.dump.load('/Volumes/external/Sangeetha-Project/svd_dump_file')

In [40]:
_

In [41]:
test_algo.predict('A2NHD7LUXVGTD3', '0060515198')

Prediction(uid='A2NHD7LUXVGTD3', iid='0060515198', r_ui=None, est=4.326183695127225, details={'was_impossible': False})

### Checking the user-item matrix

In [31]:
item = '0060515198'
user = 'A2NHD7LUXVGTD3'

In [32]:
algo.predict('A2NHD7LUXVGTD3', '0060515198')

Prediction(uid='A2NHD7LUXVGTD3', iid='0060515198', r_ui=None, est=4.326183695127225, details={'was_impossible': False})

In [33]:
full_pred[trainset_full.to_inner_uid(user), trainset_full.to_inner_iid(item)]

4.326183695127225

In [34]:
full_pred_df.loc[user,item]

4.326183695127225

## Precision@K and Recall@K

In [36]:
predictions_full_train

[Prediction(uid='AELSTEJJO90M7', iid='0060850523', r_ui=5.0, est=4.036300359319754, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0439284031', r_ui=5.0, est=4.2970804450610425, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0307278867', r_ui=3.0, est=3.928658224134154, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0099452243', r_ui=2.0, est=3.9903773225755477, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='030728364X', r_ui=4.0, est=4.378396315479712, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0345476972', r_ui=4.0, est=4.108185851714784, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0385511809', r_ui=5.0, est=3.863590627150929, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='0099459051', r_ui=5.0, est=4.121726835655795, details={'was_impossible': False}),
 Prediction(uid='AELSTEJJO90M7', iid='

## Getting top N recommendations

## Getting an example working

## Cross Validating all files to check which one results the best mse score

In [46]:
algo = BaselineOnly(bsl_options=bsl_options)
results = cross_validate(algo, data, measures=['rmse', 'mae'], cv=3, verbose=False)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...


In [57]:
temp = pd.DataFrame.from_dict(results).mean(axis=0)
temp

fit_time     2.109601
test_mae     0.659034
test_rmse    0.853200
test_time    1.028923
dtype: float64

In [64]:
temp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index = ['Algorithm']))

fit_time           2.1096
test_mae         0.659034
test_rmse          0.8532
test_time         1.02892
Algorithm    BaselineOnly
dtype: object

In [51]:
pd.Series([str(algo).split(' ')[0].split('.')[-1]])

0    BaselineOnly
dtype: object

In [None]:
benchmark = []
 
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), 
                  KNNWithZScore(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

final_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
final_results

In [69]:
results = pd.read_csv('/Volumes/external/Sangeetha-Project/results_cv_recommnder.csv', index_col=0)

In [70]:
results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BaselineOnly,0.855957,0.666846,1.05112,1.013781
SVDpp,0.859079,0.661183,353.472611,18.518866
SVD,0.862076,0.666452,12.785394,1.107035
KNNBaseline,0.868869,0.666314,1.774138,7.893475
KNNWithMeans,0.871516,0.666048,0.896398,7.516442
KNNWithZScore,0.875444,0.665424,1.186114,7.462321
CoClustering,0.891596,0.681144,6.491434,1.251335
SlopeOne,0.891981,0.682352,7.632689,13.265806
NMF,0.924378,0.716844,17.285174,0.939333
KNNBasic,0.94435,0.735248,0.768897,5.848725


I used this method to quickly obtain the top 4 algos that I should focus on. Using these four algos, I gridsearched them but spliting them into train and test set of split size 0.9 and 0.1.