# Collaborative Filtering Methods

In [5]:
import pandas as pd
from surprise import SVD, KNNWithZScore, NormalPredictor, BaselineOnly, CoClustering
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

### Using User-Restaurant-Rating dataframe

In [2]:
df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\vegas_review_reduced.pickle')
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'stars']], reader)

In [3]:
df.head()

Unnamed: 0,user_id,business_id,stars
3,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,1
16,2hRe26HSCAWbFRn5WChK-Q,d4qwVw4PcN-_2mK2o1Ro1g,1
21,RR-2nouBn408e3djxC470g,d4qwVw4PcN-_2mK2o1Ro1g,5
24,-Co-ReNx_lXT1xL_Rr0B2g,XZbuPXdyA0ZtTu3AzqtQhg,4
30,K5MSNpwRf0xKQSaC6gK9Cw,PL3cimEUfNHlenOGSOAdJg,4


#### SVD

In [32]:
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2717  1.2695  1.2714  1.2700  1.2723  1.2710  0.0010  
MAE (testset)     1.0133  1.0131  1.0144  1.0137  1.0147  1.0139  0.0006  
Fit time          83.10   92.00   102.96  92.55   90.97   92.31   6.33    
Test time         3.66    4.60    3.77    4.32    4.18    4.11    0.35    


{'test_rmse': array([1.27166144, 1.26949466, 1.27136272, 1.27004675, 1.27232614]),
 'test_mae': array([1.01327735, 1.01313134, 1.01440172, 1.01371879, 1.01473925]),
 'fit_time': (83.09863138198853,
  91.99651432037354,
  102.95522284507751,
  92.55313611030579,
  90.97034454345703),
 'test_time': (3.6630194187164307,
  4.602813243865967,
  3.765085220336914,
  4.3225624561309814,
  4.180122137069702)}

#### KNN

In [48]:
small_df = df.sample(n=10000, random_state=1)
small_data = Dataset.load_from_df(small_df[['user_id', 'business_id', 'stars']], reader)

In [49]:
algo = KNNWithZScore()
cross_validate(algo, small_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4422  1.4454  1.4194  1.4262  1.4322  1.4331  0.0097  
MAE (testset)     1.2080  1.2162  1.1802  1.1850  1.1937  1.1966  0.0136  
Fit time          1.63    1.67    1.76    2.00    1.84    1.78    0.13    
Test time         0.03    0.02    0.02    0.02    0.03    0.03    0.00    


{'test_rmse': array([1.44224555, 1.44536477, 1.41941864, 1.42616244, 1.43221419]),
 'test_mae': array([1.20803149, 1.21621846, 1.18021512, 1.18496074, 1.19370262]),
 'fit_time': (1.6308791637420654,
  1.6726994514465332,
  1.7642850875854492,
  2.002148151397705,
  1.8371763229370117),
 'test_time': (0.028003931045532227,
  0.0189974308013916,
  0.02425551414489746,
  0.02450728416442871,
  0.029996156692504883)}

#### Normal Predictor

In [4]:
algo = NormalPredictor()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8130  1.8104  1.8061  1.8023  1.8137  1.8091  0.0043  
MAE (testset)     1.4305  1.4292  1.4249  1.4205  1.4302  1.4271  0.0039  
Fit time          3.84    5.54    5.14    4.77    5.05    4.87    0.57    
Test time         5.58    5.64    4.37    5.39    4.93    5.18    0.48    


{'test_rmse': array([1.8129916 , 1.8104468 , 1.8061148 , 1.80228167, 1.81365983]),
 'test_mae': array([1.43046559, 1.42924764, 1.42492354, 1.42046196, 1.4302036 ]),
 'fit_time': (3.838209629058838,
  5.543445110321045,
  5.140373468399048,
  4.773275136947632,
  5.050962448120117),
 'test_time': (5.576751947402954,
  5.64225435256958,
  4.3715009689331055,
  5.390634298324585,
  4.930987119674683)}

In [5]:
algo = BaselineOnly()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2544  1.2507  1.2484  1.2481  1.2494  1.2502  0.0023  
MAE (testset)     1.0114  1.0092  1.0068  1.0069  1.0083  1.0085  0.0017  
Fit time          13.30   13.91   13.22   12.09   12.93   13.09   0.59    
Test time         7.97    5.23    6.11    3.23    4.53    5.41    1.59    


{'test_rmse': array([1.25441523, 1.25072836, 1.24835691, 1.24814139, 1.24944199]),
 'test_mae': array([1.0114234 , 1.00917126, 1.00683778, 1.00689251, 1.00827499]),
 'fit_time': (13.297805070877075,
  13.91073226928711,
  13.217430353164673,
  12.087652921676636,
  12.93212604522705),
 'test_time': (7.972185850143433,
  5.227759122848511,
  6.10561728477478,
  3.233851194381714,
  4.5279905796051025)}

In [6]:
algo = CoClustering()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3999  1.3958  1.3946  1.3990  1.3975  1.3974  0.0020  
MAE (testset)     1.0641  1.0631  1.0630  1.0641  1.0678  1.0644  0.0018  
Fit time          160.78  123.57  114.30  115.70  118.51  126.57  17.40   
Test time         3.25    2.70    2.73    3.19    3.36    3.05    0.27    


{'test_rmse': array([1.39989932, 1.39576749, 1.39463115, 1.39902667, 1.39752897]),
 'test_mae': array([1.06408132, 1.06308142, 1.06299298, 1.06406754, 1.06782673]),
 'fit_time': (160.78143739700317,
  123.56881475448608,
  114.29606199264526,
  115.69868016242981,
  118.50581479072571),
 'test_time': (3.24951171875,
  2.7047390937805176,
  2.7345917224884033,
  3.186108350753784,
  3.362250328063965)}

### Using Category Ratings

In [4]:
categories_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\vegas_category_review.pickle')
reader = Reader(rating_scale=(1, 5))
c_data = Dataset.load_from_df(categories_df[['user_id', 'category', 'stars']], reader)

In [5]:
categories_df.head()

Unnamed: 0,user_id,stars,category
0,ofKDkJKXSKZXu5xJNGiiBQ,1,Restaurants
1,ofKDkJKXSKZXu5xJNGiiBQ,1,Mexican
2,m-BZLIIh5PCAKnzH0qj_0Q,3,Restaurants
3,m-BZLIIh5PCAKnzH0qj_0Q,3,Mexican
4,PKEzKWv_FktMm2mGPjwd0Q,4,Restaurants


In [10]:
small_c_df = df.sample(n=1000000, random_state=1)
small_c_data = Dataset.load_from_df(small_c_df[['user_id', 'category', 'stars']], reader)

In [9]:
algo = SVD()
cross_validate(algo, small_c_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3966  1.3899  1.3857  1.3847  1.3862  1.3886  0.0044  
MAE (testset)     1.1547  1.1444  1.1431  1.1429  1.1515  1.1473  0.0048  
Fit time          54.96   46.94   46.08   45.57   58.52   50.41   5.31    
Test time         2.26    1.40    1.73    1.90    2.35    1.93    0.35    


{'test_rmse': array([1.39663219, 1.38993431, 1.38572685, 1.38471795, 1.38615792]),
 'test_mae': array([1.15466075, 1.14435177, 1.14313368, 1.14291414, 1.15148105]),
 'fit_time': (54.96292185783386,
  46.942763566970825,
  46.07562756538391,
  45.566784620285034,
  58.51905369758606),
 'test_time': (2.2613441944122314,
  1.4007010459899902,
  1.7283918857574463,
  1.8999834060668945,
  2.3548383712768555)}

In [6]:
algo = KNNWithZScore()
cross_validate(algo, small_c_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3886  1.4189  1.3798  1.3928  1.4130  1.3986  0.0149  
MAE (testset)     1.1334  1.1736  1.1357  1.1476  1.1693  1.1519  0.0167  
Fit time          9.70    21.96   18.99   19.69   14.56   16.98   4.36    
Test time         0.47    0.68    0.60    0.43    0.48    0.53    0.09    


{'test_rmse': array([1.38859161, 1.41891015, 1.37983716, 1.39275633, 1.41304067]),
 'test_mae': array([1.13340233, 1.17359308, 1.13573035, 1.14757868, 1.16933245]),
 'fit_time': (9.695146322250366,
  21.962867498397827,
  18.993683099746704,
  19.688709497451782,
  14.561994791030884),
 'test_time': (0.4661130905151367,
  0.6772603988647461,
  0.6006078720092773,
  0.42693328857421875,
  0.4814774990081787)}

In [9]:
algo = NormalPredictor()
cross_validate(algo, c_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8016  1.8014  1.8030  1.8019  1.8031  1.8022  0.0007  
MAE (testset)     1.4192  1.4194  1.4208  1.4195  1.4209  1.4200  0.0007  
Fit time          15.84   17.42   16.95   17.91   16.81   16.99   0.69    
Test time         26.07   28.12   22.47   25.71   25.80   25.63   1.81    


{'test_rmse': array([1.80156075, 1.80143873, 1.80296683, 1.80192088, 1.80305664]),
 'test_mae': array([1.41923833, 1.41939777, 1.42079211, 1.41953588, 1.42086442]),
 'fit_time': (15.841813564300537,
  17.42082452774048,
  16.953715562820435,
  17.905550241470337,
  16.80893325805664),
 'test_time': (26.068778038024902,
  28.115633010864258,
  22.471868753433228,
  25.71026635169983,
  25.80151605606079)}

In [10]:
algo = BaselineOnly()
cross_validate(algo, c_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1653  1.1641  1.1652  1.1644  1.1651  1.1648  0.0005  
MAE (testset)     0.9529  0.9524  0.9532  0.9525  0.9532  0.9529  0.0003  
Fit time          44.91   48.66   51.31   48.29   49.70   48.57   2.11    
Test time         22.22   23.55   24.34   21.54   21.67   22.66   1.10    


{'test_rmse': array([1.16533905, 1.1640681 , 1.16515564, 1.16435088, 1.1650631 ]),
 'test_mae': array([0.95286283, 0.95242327, 0.95324807, 0.9525471 , 0.95324157]),
 'fit_time': (44.90551519393921,
  48.655609130859375,
  51.306873083114624,
  48.28827357292175,
  49.7005341053009),
 'test_time': (22.215725660324097,
  23.550837755203247,
  24.339679718017578,
  21.54015803337097,
  21.672481775283813)}

In [11]:
algo = CoClustering()
cross_validate(algo, c_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9805  0.9806  0.9782  0.9818  0.9832  0.9809  0.0016  
MAE (testset)     0.6596  0.6611  0.6598  0.6593  0.6612  0.6602  0.0008  
Fit time          439.13  466.70  461.79  452.33  446.92  453.37  9.94    
Test time         23.64   23.77   20.76   21.12   22.61   22.38   1.24    


{'test_rmse': array([0.98054281, 0.98056658, 0.97823815, 0.98179145, 0.98315279]),
 'test_mae': array([0.65958742, 0.66113369, 0.65983226, 0.65934298, 0.66123727]),
 'fit_time': (439.12625193595886,
  466.7014870643616,
  461.79219245910645,
  452.3313980102539,
  446.9171643257141),
 'test_time': (23.636879920959473,
  23.766121864318848,
  20.763012647628784,
  21.12356686592102,
  22.607990264892578)}

In [6]:
trainset, testset = train_test_split(data, test_size=0.25)
algo =  KNNWithZScore()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...


MemoryError: Unable to allocate 1.47 TiB for an array with shape (449202, 449202) and data type float64

In [20]:
trainset, testset = train_test_split(data, test_size=0.20)
algo = CoClustering()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.3951


1.3951261044134244

#### Comparing restaurant vs category approaches

In [2]:
rest_review_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\vegas_review_reduced.pickle')

In [12]:
rest_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\restaurant_in_vegas.pickle')

In [13]:
rest_df = rest_df[['business_id', 'categories']]

In [11]:
from sklearn.model_selection import train_test_split
rest_review_df = rest_review_df.sample(n=100000, random_state=1)
trainset, testset = train_test_split(rest_review_df, test_size=0.25)

In [14]:
cat_trainset = pd.DataFrame(columns=['user_id', 'category', 'stars'])
temp = trainset.merge(rest_df, how='inner', on='business_id')

In [15]:
i=0
for index, row in temp.iterrows():
    categories = row['categories'].split(", ")
    for c in categories:
        cat_trainset.loc[i] = [row['user_id'], c, row['stars']]
        i+=1

In [16]:
cat_trainset.head()

Unnamed: 0,user_id,category,stars
0,lJGYe2Jzxff5hxQAXFLohw,Seafood,4
1,lJGYe2Jzxff5hxQAXFLohw,American (Traditional),4
2,lJGYe2Jzxff5hxQAXFLohw,Food,4
3,lJGYe2Jzxff5hxQAXFLohw,Restaurants,4
4,lJGYe2Jzxff5hxQAXFLohw,Chicken Wings,4


In [2]:
trainset = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\cfsamples\\rest_trainset.pickle')
testset = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\cfsamples\\rest_testset.pickle')
cat_trainset = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\cfsamples\\cat_trainset.pickle')

In [6]:
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(trainset[['user_id', 'business_id', 'stars']], reader)
train_set = train_data.build_full_trainset()

test_data = Dataset.load_from_df(testset[['user_id', 'business_id', 'stars']], reader)
test_set = test_data.build_full_trainset()
test_set = test_set.build_testset()

In [7]:
algo = CoClustering()
predictions = algo.fit(train_set).test(test_set)
accuracy.rmse(predictions)

RMSE: 1.4665


1.4665094888875774

In [8]:
cat_train_data = Dataset.load_from_df(cat_trainset[['user_id', 'category', 'stars']], reader)
cat_train_set = cat_train_data.build_full_trainset()

In [9]:
cat_algo = CoClustering()
cat_algo.fit(cat_train_set)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x17599e5b4a8>

In [14]:
cat_test_set = testset.merge(rest_df, how='inner', on='business_id')

In [15]:
cat_test_set.head()

Unnamed: 0,user_id,business_id,stars,categories
0,n07_7NVp-V-cHBABNhOX5A,pHJu8tj3sI8eC5aIHLFEfQ,4,"Bars, Wine Bars, Nightlife, Venues & Event Spa..."
1,3CoNALzkmcfPKG0JO-lxlw,pHJu8tj3sI8eC5aIHLFEfQ,5,"Bars, Wine Bars, Nightlife, Venues & Event Spa..."
2,1DLvCQ0vQ87BPp3iqYgRZQ,pHJu8tj3sI8eC5aIHLFEfQ,4,"Bars, Wine Bars, Nightlife, Venues & Event Spa..."
3,wy5hMyNYXMPwiy_gPLbVcw,pHJu8tj3sI8eC5aIHLFEfQ,5,"Bars, Wine Bars, Nightlife, Venues & Event Spa..."
4,kbWxWF1a3jvjvfeUWqPJ8w,pHJu8tj3sI8eC5aIHLFEfQ,1,"Bars, Wine Bars, Nightlife, Venues & Event Spa..."


In [18]:
import math
sqr_errors = []
for index, row in cat_test_set.iterrows():
    categories = row['categories'].split(", ")
    predictions = []
    for c in categories:
        predictions.append(algo.predict(row['user_id'], c, r_ui=row['stars']).est)
    predicted_rating = sum(predictions)/len(predictions)
    sqr_errors.append((row['stars'] - predicted_rating)**2)

rmse = math.sqrt(sum(sqr_errors)/len(sqr_errors))
print("RMSE: ", rmse)

RMSE:  1.3992205169703416
