In [1]:
import pandas as pd
import numpy as np
from surprise import KNNBasic, KNNWithMeans, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [4]:
movies = pd.read_csv('../input/movies.csv')
ratings = pd.read_csv('../input/ratings.csv')

In [5]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [6]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [7]:
movies_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_ratings.dropna(inplace=True)
movies_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [8]:
dataset = pd.DataFrame({
    'uid': movies_ratings.userId,
    'iid': movies_ratings.title,
    'rating': movies_ratings.rating
})

In [9]:
dataset.head() 

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [10]:
ratings.rating.min()

0.5

In [11]:
ratings.rating.max()

5.0

In [12]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [13]:
params = {'k':np.arange(10, 101, 10),
          'sim_options': {'name': ['pearson_baseline'], 'user_based': [True]}
         }
grid_algo = GridSearchCV(KNNBasic, params, measures=['rmse', 'mae'], cv=5, n_jobs=-1)
grid_algo.fit(data)

In [14]:
grid_algo.best_params

{'rmse': {'k': 30,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True}},
 'mae': {'k': 50,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True}}}

In [15]:
trainset, testset = train_test_split(data, test_size=.15)

In [16]:
algo = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fecdd7b6e10>

In [17]:
predict = algo.test(testset)

In [18]:
accuracy.rmse(predict, verbose=True)

RMSE: 0.9634


0.9634474887663318

Если же делаем item-based, получаекм:

In [19]:
algo = KNNWithMeans(k=30, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fecdcaa1f60>

In [20]:
test_pred = algo.test(testset)

In [21]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8746


0.8745539845495566