# Latent Factor Model

- point : 복잡한 사용자의 모델을 몇 개의 벡터로 간소화!

In [1]:
import pandas as pd
import numpy as np

## 1. Import Dataset

In [2]:
from surprise import Dataset

ModuleNotFoundError: No module named 'surprise'

In [3]:
data = Dataset.load_builtin('ml-100k')

In [4]:
df = pd.DataFrame(data.raw_ratings, columns = ['user', 'item', 'rate', 'id'])

In [5]:
df.head(10)

Unnamed: 0,user,item,rate,id
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
5,298,474,4.0,884182806
6,115,265,2.0,881171488
7,253,465,5.0,891628467
8,305,451,3.0,886324817
9,6,86,3.0,883603013


In [6]:
df_table = df.set_index(['user', 'item']).unstack()

In [7]:
df_table.iloc[200:230, 800:830].fillna('')

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,204,205,206,207,208,209,21,210,211,212,...,222,223,224,225,226,227,228,229,23,230
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
28,,,,,,4.0,,,,,...,5.0,5.0,,,,4.0,5.0,2.0,,4.0
280,3.0,,,,,,,2.0,,,...,3.0,,,4.0,3.0,3.0,3.0,3.0,,3.0
281,,,,,,,,,,,...,,,,,,,,,,
282,,,,,,,,,,,...,,,,,,,,,,
283,4.0,,,,5.0,4.0,3.0,5.0,4.0,,...,,,,,,,,,,
284,,,,,,,,,,,...,,,,,,,,,,
285,,4.0,,,,,,,,,...,4.0,,,,,,,,,
286,3.0,,,,4.0,4.0,,5.0,4.0,1.0,...,,,5.0,,,,3.0,1.0,,
287,,,,,4.0,,,,,,...,5.0,,,,,,,,,
288,,5.0,,,,,,3.0,5.0,,...,,3.0,,,,,,,,2.0


**Notice that the data is very sparse**

### import data from other sources

In [8]:
from surprise import Reader

In [9]:
sample = {'item' : [1, 1, 1, 2, 2],
                   'user' : [9, 32, 2, 45, 70],
                             'rating' : [3, 2, 4, 3, 1]}
df_sample = pd.DataFrame(sample)

In [10]:
reader = Reader(rating_scale = (1, 5))

data_sample = Dataset.load_from_df(df_sample, reader)

## Prediction Methods

In [14]:
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from surprise import accuracy

In [11]:
trainset = data.build_full_trainset()

In [18]:
from surprise import SVDpp
from surprise import BaselineOnly

### Compare various prediction methods

In [16]:
benchmark = []
for algo in [SVD(biased = False), SVD(), SVDpp(), BaselineOnly()]:
    results = cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis = 0)
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index = ['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.926138,124.052997,5.401475
SVD,0.943141,4.051576,0.400544
BaselineOnly,0.94744,0.321562,0.27696
SVD,0.964939,4.093576,0.35992


In [26]:
from surprise.model_selection import train_test_split

In [17]:
from surprise import accuracy

#### Baseline model

In [53]:
bsl_options = {'method' : 'als',
              'nepochs' : 10,
              'reg_u' : 12,
              'reg_i' : 5}
algo = BaselineOnly(bsl_options = bsl_options)
cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.94323623, 0.94784255, 0.93920308]),
 'fit_time': (0.22130393981933594, 0.2473447322845459, 0.26272010803222656),
 'test_time': (0.18218350410461426, 0.18238544464111328, 0.6595797538757324)}

In [55]:
bsl_options = {'method' : 'sgd'}
algo = BaselineOnly(bsl_options = bsl_options)
cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...


{'test_rmse': array([0.94752304, 0.94237412, 0.9425633 ]),
 'fit_time': (0.499814510345459, 0.5208573341369629, 0.5305609703063965),
 'test_time': (0.1849346160888672, 0.18204498291015625, 0.18099594116210938)}

In [56]:
trainset, testset = train_test_split(data, test_size = 0.25)
algo = BaselineOnly(bsl_options = bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using sgd...
RMSE: 0.9468


0.9467806818010733

#### Funk-SVD model

In [40]:
transet = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [41]:
algo = SVD()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.5808


0.5808169316832961

In [42]:
pred_svd = pd.DataFrame(predictions)[['uid', 'iid', 'est']]

In [43]:
pred_svd_table = pred_svd.set_index(['uid', 'iid']).unstack()

**Prediction Result**

In [46]:
pred_svd_table.iloc[100:110, 100:110]

Unnamed: 0_level_0,est,est,est,est,est,est,est,est,est,est
iid,109,1090,1091,1092,1093,1094,1095,1096,1097,1098
uid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
19,3.145465,2.190086,2.854907,2.576532,2.986824,2.699053,2.568544,3.148369,3.423198,3.663579
190,2.967978,2.45172,2.745754,2.640921,3.029309,3.117871,2.531797,3.277453,3.497734,3.711869
191,3.446898,2.723432,2.927922,2.709914,3.305534,3.159788,2.866357,3.506339,3.740043,3.821798
192,3.656644,2.770982,2.669919,2.828578,3.425741,3.242299,2.891787,3.557015,3.631798,3.790804
193,3.150363,,2.456303,2.726295,3.259173,2.982732,2.700464,3.201041,3.332904,3.464171
194,2.796348,1.866077,,1.939223,2.550226,2.112925,2.021425,2.775934,3.007095,2.840994
195,,2.162508,2.810278,2.569431,2.810725,2.904533,2.434193,2.94008,3.331812,3.464917
196,3.371088,2.341964,2.802474,2.561321,3.058201,2.901485,2.660626,3.265452,3.77245,3.63933
197,3.73046,2.395874,2.571041,2.420362,2.919739,2.74903,2.666544,3.221336,3.125577,3.564515
198,2.882596,2.216491,2.499707,2.21312,2.899478,,2.407084,3.018912,3.394116,3.358498
