# Latent Factor Model

- point : 복잡한 사용자의 모델을 몇 개의 벡터로 간소화! -> Latent Factor

In [1]:
import pandas as pd
import numpy as np

## 1. Import Dataset

In [4]:
from surprise import Dataset

In [5]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\samsung/.surprise_data/ml-100k


### user & item & rating 

In [6]:
df = pd.DataFrame(data.raw_ratings, columns = ['user', 'item', 'rate', 'id'])

In [8]:
df.head()

Unnamed: 0,user,item,rate,id
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [9]:
df_table = df.set_index(['user', 'item']).unstack()

**data is very sparse**

In [10]:
df_table.iloc[200:230, 800:830].fillna('')

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,204,205,206,207,208,209,21,210,211,212,...,222,223,224,225,226,227,228,229,23,230
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
28,,,,,,4.0,,,,,...,5.0,5.0,,,,4.0,5.0,2.0,,4.0
280,3.0,,,,,,,2.0,,,...,3.0,,,4.0,3.0,3.0,3.0,3.0,,3.0
281,,,,,,,,,,,...,,,,,,,,,,
282,,,,,,,,,,,...,,,,,,,,,,
283,4.0,,,,5.0,4.0,3.0,5.0,4.0,,...,,,,,,,,,,
284,,,,,,,,,,,...,,,,,,,,,,
285,,4.0,,,,,,,,,...,4.0,,,,,,,,,
286,3.0,,,,4.0,4.0,,5.0,4.0,1.0,...,,,5.0,,,,3.0,1.0,,
287,,,,,4.0,,,,,,...,5.0,,,,,,,,,
288,,5.0,,,,,,3.0,5.0,,...,,3.0,,,,,,,,2.0


### import data from other sources

In [11]:
from surprise import Reader

In [12]:
sample = {'item' : [1, 1, 1, 2, 2],
                   'user' : [9, 32, 2, 45, 70],
                             'rating' : [3, 2, 4, 3, 1]}
df_sample = pd.DataFrame(sample)

In [13]:
reader = Reader(rating_scale = (1, 5))
data_sample = Dataset.load_from_df(df_sample, reader)

## 2. Prediction Methods

In [16]:
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from surprise import accuracy

In [17]:
trainset = data.build_full_trainset()

In [18]:
from surprise import SVDpp
from surprise import BaselineOnly

### Compare various prediction methods

In [19]:
benchmark = []

for algo in [SVD(biased = False), SVD(), SVDpp(), BaselineOnly()]:
    results = cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis = 0)
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index = ['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,fit_time,test_rmse,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,286.586181,0.928348,12.301625
SVD,9.441259,0.945863,0.516038
BaselineOnly,0.285504,0.947325,0.419094
SVD,9.063482,0.965173,0.444749


In [None]:
from surprise.model_selection import train_test_split

In [None]:
from surprise import accuracy

### (1) Baseline model

In [None]:
bsl_options = {'method' : 'als',
              'nepochs' : 10,
              'reg_u' : 12,
              'reg_i' : 5}
algo = BaselineOnly(bsl_options = bsl_options)
cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)

In [None]:
bsl_options = {'method' : 'sgd'}
algo = BaselineOnly(bsl_options = bsl_options)
cross_validate(algo, data, measures = ['RMSE'], cv = 3, verbose = False)

In [None]:
trainset, testset = train_test_split(data, test_size = 0.25)
algo = BaselineOnly(bsl_options = bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

### (2) Funk-SVD model

In [None]:
transet = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [None]:
algo = SVD()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
pred_svd = pd.DataFrame(predictions)[['uid', 'iid', 'est']]

In [None]:
pred_svd_table = pred_svd.set_index(['uid', 'iid']).unstack()

### RESULT

In [None]:
pred_svd_table.iloc[100:110, 100:110]