### ПАКЕТ SURPRISE

* используйте данные MovieLens 1M
* можно использовать любые модели из пакета
* получите RMSE на тестовом сете 0.87 и ниже

#### Комментарий преподавателя :
В ДЗ на датасет 1М может не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [1]:
import numpy as np
import pandas as pd

from surprise import KNNWithMeans, SVD, SVDpp, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
users = pd.read_csv(
    'users.dat',
    delimiter='::',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    index_col=False
                   )
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [3]:
movies = pd.read_csv(
    'movies.dat',
    delimiter='::',
    names=['MovieID', 'Title', 'Genres'],
    index_col=False,
    encoding='latin-1'
                   )
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
ratings = pd.read_csv(
    'ratings.dat',
    delimiter='::',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    index_col=False
                   )
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
ratings['Rating'].mean()

3.581564453029317

In [6]:
#Таблица ratings соотвествует форме, необходимой для reader, поэтому передаю ее как есть
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['UserID', 'MovieID', 'Rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x2290f059fd0>

In [7]:
trainset, testset = train_test_split(data, test_size=.3)

In [56]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1b04e8e65e0>

In [57]:
test_pred = algo.test(testset)

In [58]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8954


0.8953987365625685

In [59]:
algo_2 = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': True})
algo_2.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1b030933340>

In [60]:
test_pred = algo_2.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9393


0.9393061984521548

In [61]:
algo_3 = KNNWithMeans(k=50, sim_options={'name': 'msd', 'user_based': True})
algo_3.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1b0307e6100>

In [62]:
test_pred = algo_3.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9305


0.9304768800094663

In [63]:
algo_4 = KNNWithMeans(k=50, sim_options={'name': 'pearson', 'user_based': True})
algo_4.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1b0307e6e20>

In [64]:
test_pred = algo_4.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9206


0.9205776044629125

In [65]:
algo_5 = KNNWithMeans(k=100, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_5.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1b030933ee0>

In [66]:
test_pred = algo_5.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8965


0.8965016080388305

In [51]:
algo_6 = SVD(n_epochs=25)
algo_6.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0307e60a0>

In [52]:
test_pred = algo_6.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8880


0.8879619239294865

In [67]:
algo_7 = SVD(n_epochs=20)
algo_7.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0307e6ca0>

In [68]:
test_pred = algo_7.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8845


0.8845490008079826

In [75]:
algo_8 = SVD(n_epochs=19)
algo_8.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b03032bd90>

In [76]:
test_pred = algo_8.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8836


0.8836274440370191

In [77]:
algo_9 = SVD(n_epochs=21)
algo_9.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b030926f10>

In [78]:
test_pred = algo_9.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8847


0.8846501323667194

In [102]:
algo_10 = SVD(n_factors=50, n_epochs=19)
algo_10.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0303985b0>

In [103]:
test_pred = algo_10.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8812


0.8811724300966487

In [112]:
algo_11 = SVD(n_factors=50, n_epochs=19, lr_all=0.0055)
algo_11.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0307e64c0>

In [113]:
test_pred = algo_11.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8807


0.8807177693684866

In [120]:
algo_12 = SVD(n_factors=50, n_epochs=19, lr_all=0.0055, reg_all=0.025)
algo_12.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0307e6880>

In [121]:
test_pred = algo_12.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8777


0.8776595245486871

In [122]:
algo_13 = SVD(n_factors=50, n_epochs=19, lr_all=0.0055, reg_all=0.03)
algo_13.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0307e6580>

In [123]:
test_pred = algo_13.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8762


0.8761971651019036

In [127]:
algo_14 = SVDpp()
algo_14.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1b0307e6d30>

In [128]:
test_pred = algo_14.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8704


0.8703979466526405

In [8]:
algo_15 = SVDpp(n_epochs=25)
algo_15.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2290f2be6a0>

In [9]:
test_pred = algo_15.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8683


0.8682511884967036

In [14]:
algo_16 = SVDpp(n_epochs=19)
algo_16.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2290f291d90>

In [15]:
test_pred = algo_16.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8660


0.8659753712758512

Таким образом, единственный алгоритм, который позволяет получить RMSE < 0.87, это SVDpp