In [1]:
import numpy as np
import pandas as pd

In [21]:
from __future__ import (absolute_import, division, print_function, unicode_literals)

In [2]:
def read_data(path):
    df = pd.read_csv(path)
#    df.columns = ['USER', 'ITEM', 'RATING', 'TIMESTAMP']
    return df

In [3]:
ratings = read_data('ratings.csv')

In [4]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [165]:
ratings['rating'].value_counts()

4    2139018
5    1983093
3    1370916
2     359257
1     124195
Name: rating, dtype: int64

In [72]:
print(len(ratings.user_id.unique()), len(ratings.book_id.unique()))

53424 10000


In [79]:
ratings_10k = ratings.sample(frac = 0.001)

In [80]:
print(len(ratings_10k.user_id.unique()), len(ratings_10k.book_id.unique()))

5646 3098


In [81]:
len(ratings_10k)/(len(ratings_10k.user_id.unique())*len(ratings_10k.book_id.unique()))

0.0003416554096468943

In [71]:
len(ratings)/(len(ratings.user_id.unique())*len(ratings.book_id.unique()))

0.011186880428271938

In [115]:
user_counts = pd.DataFrame(ratings.user_id.value_counts())

In [116]:
user_counts.user_id.quantile([0.3, 0.8])

0.3     99.0
0.8    133.0
Name: user_id, dtype: float64

In [117]:
user_counts.head()

Unnamed: 0,user_id
30944,200
12874,200
52036,199
28158,199
12381,199


In [142]:
users = list(user_counts[user_counts['user_id']>100].index)

In [147]:
top_users = users[0:10000]

In [158]:
# Function to pick a new dataset with the X most prolific reviewers and Y most reviewed books
#adam edited
def pick_users_books(df, num_users, num_books):
    user_counts = pd.DataFrame(df.user_id.value_counts()).sort_values('user_id', ascending=False)
    top_10K_users = list(user_counts[0:num_users].index)
    user_filtered_df = df[df.user_id.isin(top_10K_users)]
    filtered_book_counts = pd.DataFrame(user_filtered_df.book_id.value_counts()).sort_values('book_id', ascending = False)
    top_100_filtered_books = list(filtered_book_counts[0:num_books].index)
    filtered_df = user_filtered_df[user_filtered_df.book_id.isin(top_100_filtered_books)]
    print("New dataframe has {} users, {} items, and a sparsity of {}".format(len(filtered_df.user_id.unique()),len(filtered_df.book_id.unique()),len(filtered_df)/(len(filtered_df.user_id.unique())*len(filtered_df.book_id.unique()))))
    train, test = train_test_split(filtered_df, test_size = 0.2, random_state=42)
    return train, test
    
def get_all_subsets(df):
    train_500_20, test_500_20 = pick_users_books(df, 500, 20)
    train_2000_50, test_2000_50 = pick_users_books(df, 2000, 50)
    train_10000_100, test_10000_100 = pick_users_books(df, 10000, 100)
    return train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100

In [159]:
train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100 = get_all_subsets(ratings)

New dataframe has 487 users, 20 items, and a sparsity of 0.442607802875
New dataframe has 1981 users, 50 items, and a sparsity of 0.374558303887
New dataframe has 9980 users, 100 items, and a sparsity of 0.271965931864


In [28]:
new_ratings = pick_users_books(ratings, 2000, 30)

New dataframe has 1972 users, 30 items, and a sparsity of 0


In [40]:
from sklearn.model_selection import train_test_split

In [7]:
import surprise

In [85]:
import random
random.seed(42)

In [167]:
def grid_search_data(df, algorithm, parameter_grid, metrics):
    reader = surprise.Reader(rating_scale=(1,5))
    data = surprise.Dataset.load_from_df(df, reader)
    data.split(n_folds = 3)
    grid_search = surprise.GridSearch(algorithm, parameter_grid, measures = metrics)
    grid_search.evaluate(data)
    results_df = pd.DataFrame.from_dict(grid_search.cv_results)
    return results_df

In [169]:
# TESTING WITH THE WHOLE 10k USERS DATA JUST USER-BASED AND JUST RMSE
algorithm1 = surprise.prediction_algorithms.knns.KNNWithMeans
param_grid = {'k': [10, 50, 200], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 3], 'user_based': [True]}}
KNN_means_results_full = grid_search_data(train_10000_100, algorithm1, param_grid, ['RMSE'])

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 3, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 3, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u

In [172]:
KNN_means_results_full_df_user = convert_grid_results(KNN_means_results_full)
KNN_means_results_full_df_user

Unnamed: 0,RMSE,min_support,name,user_based
0,0.927773,1,msd,True
1,0.89882,1,msd,True
2,0.889167,1,msd,True
3,0.910945,3,msd,True
4,0.883578,3,msd,True
5,0.881968,3,msd,True
6,0.926526,1,pearson,True
7,0.896781,1,pearson,True
8,0.886311,1,pearson,True
9,0.91825,3,pearson,True


In [173]:
algorithm1 = surprise.prediction_algorithms.knns.KNNWithMeans
param_grid = {'k': [10, 50, 200], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 3], 'user_based': [False]}}
KNN_means_results_full_items = grid_search_data(train_10000_100, algorithm1, param_grid, ['RMSE'])

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': False}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': False}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': False}, u'k': 200}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': False}, u'k': 10}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': False}, u'k': 50}, {u'sim_options': {u'min_support': 3, u'name': u'msd', u'user_based': False}, u'k': 200}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': False}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': False}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': False}, u'k': 200}, {u'sim_options': {u'min_support': 3, u'name': u'pearson', u'user_based': False}, u'k': 10}, {u'sim_options': {u'min_support': 3, u'name': u'pearson', u'user_based': False}, u'k': 50}, {u'sim_o

In [174]:
KNN_means_results_full_df_items = convert_grid_results(KNN_means_results_full_items)
KNN_means_results_full_df_items

Unnamed: 0,RMSE,min_support,name,user_based
0,0.878344,1,msd,False
1,0.888194,1,msd,False
2,0.888193,1,msd,False
3,0.878344,3,msd,False
4,0.888194,3,msd,False
5,0.888193,3,msd,False
6,0.865087,1,pearson,False
7,0.868604,1,pearson,False
8,0.868604,1,pearson,False
9,0.865087,3,pearson,False


In [180]:
KNN_means_results_full_df = pd.concat([KNN_means_results_full_df_user, KNN_means_results_full_df_items], ignore_index=True)
KNN_means_results_full_df.iloc[KNN_means_results_full_df['RMSE'].idxmin()]

RMSE           0.865087
min_support           1
name            pearson
user_based        False
Name: 24, dtype: object

In [181]:
KNN_means_results_full_df.to_csv('KNN_means_grid_results.csv')

In [144]:
KNN_basic_results_full = convert_grid_results(KNN_basic_results_full)

In [145]:
KNN_basic_results_full

Unnamed: 0,MAE,RMSE,k,min_support,name,user_based
0,0.724231,0.940356,10,1,msd,True
1,0.699783,0.906091,50,1,msd,True
2,0.720742,0.918262,200,1,msd,True
3,0.756389,0.98776,10,5,msd,True
4,0.769269,0.991301,50,5,msd,True
5,0.776959,0.99743,200,5,msd,True
6,0.790849,0.991834,10,1,pearson,True
7,0.768264,0.960135,50,1,pearson,True
8,0.768413,0.959607,200,1,pearson,True
9,0.798235,1.027444,10,5,pearson,True


In [150]:
algorithm = surprise.prediction_algorithms.knns.KNNWithMeans
param_grid = {'k': [10, 50, 200], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 5], 'user_based': [True, False]}}
KNN_with_means_full = convert_grid_results(grid_search_data(train_10000_100, algorithm, param_grid, ['RMSE', 'MAE']))

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u

In [151]:
KNN_with_means_full

Unnamed: 0,MAE,RMSE,k,min_support,name,user_based
0,0.707187,0.92201,10,1,msd,True
1,0.686809,0.895225,50,1,msd,True
2,0.682618,0.891339,200,1,msd,True
3,0.689805,0.909207,10,5,msd,True
4,0.685781,0.903113,50,5,msd,True
5,0.691583,0.909969,200,5,msd,True
6,0.684972,0.919079,10,1,pearson,True
7,0.665134,0.89393,50,1,pearson,True
8,0.664192,0.891576,200,1,pearson,True
9,0.684776,0.919688,10,5,pearson,True


In [152]:
algorithm = surprise.prediction_algorithms.knns.KNNWithZScore
param_grid = {'k': [10, 50, 200], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 5], 'user_based': [True, False]}}
KNN_with_zscore_full = convert_grid_results(grid_search_data(train_10000_100, algorithm, param_grid, ['RMSE', 'MAE']))

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 200}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u

ValueError: I/O operation on closed file

In [140]:
#Need to do results_df = pd.DataFrame.from_dict(grid_search.cv_results)
def convert_grid_results(df):
    my_dict = df.to_dict('list')
    min_support = []
    name = []
    user_based = []
    for x in range(len(my_dict['sim_options'])):
        y = my_dict['sim_options'][x]
        min_support.append(y['min_support'])
        name.append(y['name'])
        user_based.append(y['user_based'])
    del my_dict['params']
    del my_dict['scores']
    del my_dict['sim_options']
    my_dict['min_support'] = min_support
    my_dict['name'] = name
    my_dict['user_based'] = user_based
    my_df = pd.DataFrame.from_dict(my_dict)
    return my_df

In [29]:
reader = surprise.Reader(rating_scale=(1,5))

In [42]:
data = surprise.Dataset.load_from_df(train_ratings, reader)

In [47]:
pd.set_option('max_colwidth',100)

Unnamed: 0,MAE,RMSE,k,params,scores,sim_options
0,0.728966,0.941928,10,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.728966240258, u'RMSE': 0.941928227992}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
1,0.705857,0.913228,30,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.705857204072, u'RMSE': 0.913227742474}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
2,0.700889,0.907006,50,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.700888543408, u'RMSE': 0.907005596044}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
3,0.760297,0.99139,10,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.76029730044, u'RMSE': 0.991390484743}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
4,0.768088,0.992235,30,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.76808817136, u'RMSE': 0.99223484538}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
5,0.772903,0.995781,50,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.772902749466, u'RMSE': 0.995781050779}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
6,0.786464,0.987944,10,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.786464391877, u'RMSE': 0.987943507298}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
7,0.76958,0.963227,30,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 30}","{u'MAE': 0.7695800303, u'RMSE': 0.963226815035}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
8,0.767056,0.95937,50,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}","{u'MAE': 0.767056114197, u'RMSE': 0.959369785109}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
9,0.799204,1.025064,10,"{u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.799204268804, u'RMSE': 1.02506390957}","{u'min_support': 5, u'name': u'pearson', u'user_based': True}"


In [48]:
param_grid_2 = {'k': [20, 50, 70, 100, 150], 'sim_options': {'name': ['msd', 'pearson_baseline'], 'min_support': [1], 'user_based': [True]}}

In [51]:
grid_search_2 = surprise.GridSearch(algo, param_grid_2, measures = ['RMSE', 'MAE'])

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 20}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 70}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 100}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 150}, {u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 20}, {u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 70}, {u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 100}, {u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 150}]


In [52]:
grid_search_2.evaluate(data)

------------
Parameters combination 1 of 10
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 20}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9208
Mean MAE : 0.7128
------------
------------
Parameters combination 2 of 10
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9070
Mean MAE : 0.7009
------------
------------
Parameters combination 3 of 10
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 70}
Computing the msd sim

In [54]:
results_df2 = pd.DataFrame.from_dict(grid_search_2.cv_results)
results_df2

Unnamed: 0,MAE,RMSE,k,params,scores,sim_options
0,0.712761,0.920839,20,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 20}","{u'MAE': 0.712761113634, u'RMSE': 0.92083898771}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
1,0.700889,0.907006,50,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.700888543408, u'RMSE': 0.907005596044}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
2,0.699758,0.904974,70,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 70}","{u'MAE': 0.699757982434, u'RMSE': 0.904974072278}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
3,0.702028,0.905823,100,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 100}","{u'MAE': 0.702028176043, u'RMSE': 0.905822929171}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
4,0.711136,0.911573,150,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 150}","{u'MAE': 0.711136465362, u'RMSE': 0.911573075844}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
5,0.714442,0.916862,20,"{u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 20}","{u'MAE': 0.714441781207, u'RMSE': 0.916861587753}","{u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}"
6,0.711494,0.911051,50,"{u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 50}","{u'MAE': 0.711493932251, u'RMSE': 0.911051465878}","{u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}"
7,0.712436,0.911733,70,"{u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 70}","{u'MAE': 0.71243623182, u'RMSE': 0.911733370374}","{u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}"
8,0.714273,0.91333,100,"{u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 100}","{u'MAE': 0.714273429679, u'RMSE': 0.913330466913}","{u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}"
9,0.716635,0.914807,150,"{u'sim_options': {u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}, u'k': 150}","{u'MAE': 0.716634713719, u'RMSE': 0.914806540231}","{u'min_support': 1, u'name': u'pearson_baseline', u'user_based': True}"


In [60]:
algo = surprise.prediction_algorithms.knns.KNNWithZScore
param_grid = {'k': [10, 30, 50], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 5], 'user_based': [True]}}
grid_search_knn_z = surprise.GridSearch(algo, param_grid, measures=['MAE', 'RMSE'])

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 30}, {u'sim_options': {u'mi

In [61]:
grid_search_knn_z.evaluate(data)

------------
Parameters combination 1 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean MAE : 0.7154
Mean RMSE: 0.9375
------------
------------
Parameters combination 2 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean MAE : 0.7020
Mean RMSE: 0.9174
------------
------------
Parameters combination 3 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}
Computing the msd sim

In [62]:
results_knn_z = pd.DataFrame.from_dict(grid_search_knn_z.cv_results)
results_knn_z

Unnamed: 0,MAE,RMSE,k,params,scores,sim_options
0,0.715423,0.937455,10,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.715422525348, u'RMSE': 0.937455189091}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
1,0.701981,0.917424,30,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.701981425043, u'RMSE': 0.91742419526}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
2,0.698123,0.913268,50,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.698122850452, u'RMSE': 0.913268385227}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
3,0.716903,0.957006,10,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.716903213679, u'RMSE': 0.957005667849}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
4,0.712304,0.950014,30,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.712304021592, u'RMSE': 0.950013668519}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
5,0.712783,0.950634,50,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.712783070101, u'RMSE': 0.950633551959}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
6,0.689984,0.936597,10,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.689984053931, u'RMSE': 0.936597199821}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
7,0.67886,0.918028,30,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 30}","{u'MAE': 0.678859666337, u'RMSE': 0.918027790773}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
8,0.674817,0.912916,50,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}","{u'MAE': 0.674817359406, u'RMSE': 0.912916240877}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
9,0.706012,0.954564,10,"{u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.706011979267, u'RMSE': 0.954563649511}","{u'min_support': 5, u'name': u'pearson', u'user_based': True}"


In [63]:
algo = surprise.prediction_algorithms.knns.KNNWithMeans
param_grid = {'k': [10, 30, 50], 'sim_options': {'name': ['msd', 'pearson', 'pearson_baseline'], 'min_support': [1, 5], 'user_based': [True]}}
grid_search_knn_mean = surprise.GridSearch(algo, param_grid, measures=['MAE', 'RMSE'])

[{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 30}, {u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}, {u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 30}, {u'sim_options': {u'mi

In [64]:
grid_search_knn_mean.evaluate(data)

------------
Parameters combination 1 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean MAE : 0.7109
Mean RMSE: 0.9302
------------
------------
Parameters combination 2 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean MAE : 0.6980
Mean RMSE: 0.9122
------------
------------
Parameters combination 3 of 18
params:  {u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}
Computing the msd sim

In [65]:
results_knn_mean = pd.DataFrame.from_dict(grid_search_knn_mean.cv_results)
results_knn_mean

Unnamed: 0,MAE,RMSE,k,params,scores,sim_options
0,0.710925,0.930242,10,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.710925176595, u'RMSE': 0.930241904917}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
1,0.698016,0.912158,30,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.69801574464, u'RMSE': 0.912157977967}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
2,0.694291,0.908162,50,"{u'sim_options': {u'min_support': 1, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.694291443262, u'RMSE': 0.908162363368}","{u'min_support': 1, u'name': u'msd', u'user_based': True}"
3,0.71766,0.952762,10,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 10}","{u'MAE': 0.717660101501, u'RMSE': 0.952761705724}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
4,0.713916,0.948218,30,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 30}","{u'MAE': 0.713915633236, u'RMSE': 0.948217630033}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
5,0.714094,0.948755,50,"{u'sim_options': {u'min_support': 5, u'name': u'msd', u'user_based': True}, u'k': 50}","{u'MAE': 0.714093968606, u'RMSE': 0.948755016393}","{u'min_support': 5, u'name': u'msd', u'user_based': True}"
6,0.691918,0.93601,10,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.691918178425, u'RMSE': 0.936009829352}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
7,0.678385,0.91684,30,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 30}","{u'MAE': 0.678384935198, u'RMSE': 0.916840231167}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
8,0.6742,0.912068,50,"{u'sim_options': {u'min_support': 1, u'name': u'pearson', u'user_based': True}, u'k': 50}","{u'MAE': 0.67419951946, u'RMSE': 0.91206752072}","{u'min_support': 1, u'name': u'pearson', u'user_based': True}"
9,0.711109,0.956709,10,"{u'sim_options': {u'min_support': 5, u'name': u'pearson', u'user_based': True}, u'k': 10}","{u'MAE': 0.711109367272, u'RMSE': 0.956708962269}","{u'min_support': 5, u'name': u'pearson', u'user_based': True}"


In [49]:
def sample_users(df, min_rated):
    user_counts = pd.DataFrame(df['user_id'].value_counts())
    users = list(user_counts[user_counts['user_id'] >= min_rated].index)
    
    new_df = df[df.user_id.isin(users)]
    
    return new_df

In [140]:
sampled = sample_users(ratings, 100)

In [141]:
len(sampled.user_id.unique())

37084

In [80]:
print('sparsity is {.8f}'.format(100.0*len(ratings.USER)/(len(ratings.USER.unique())*len(ratings.ITEM.unique()/len(ratings.USER)))))

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [73]:
sampled_ratings = ratings.sample(1000000, random_state=42)

In [72]:
print('{} users and {} items'.format(len(ratings.USER.unique()), len(ratings.ITEM.unique())))

8026324 users and 2330066 items


In [42]:
item_counts = pd.DataFrame(ratings.ITEM.value_counts())

In [65]:
item_counts.ITEM.mean()

9.659449560656222

In [50]:
item_counts.head()

Unnamed: 0,ITEM
0439023483,21398
030758836X,19867
0439023513,14114
0385537859,12973
0007444117,12629


In [53]:
item_counts.ITEM.between(100,5000,inclusive=True).sum()

29243

In [7]:
#Same method as Mark's
def sample_books(df, n_samples, min_user_ratings, max_user_ratings, min_item_ratings, max_item_ratings):
    users_temp = pd.DataFrame(df['USER'].value_counts())
    users_list = list(users_temp[users_temp['USER'].between(min_user_ratings,max_user_ratings,inclusive=True)].index)
    
    items_temp = pd.DataFrame(df['ITEM'].value_counts())
    items_list = list(items_temp[items_temp['ITEM'].between(min_item_ratings,max_item_ratings,inclusive=True)].index)
    
    limited_df = df[df['USER'].isin(users_list) & df['ITEM'].isin(items_list)]
    
    sample = limited_df.sample(n_samples, random_state=42)
    return sample

In [8]:
sampled_df = sample_books(ratings, 100000, 30, 1000000, 75, 1000000)

In [10]:
len(sampled_df['ITEM'].unique())

30603

In [11]:
sampled_df.head()

Unnamed: 0,USER,ITEM,RATING,TIMESTAMP
17009092,A1KQ31B1998R79,1595547819,3.0,1361750400
5443680,A1M5ZT35YX6TIN,441008534,5.0,1244246400
11941185,A1JLU5H1CCENWX,985872586,5.0,1369353600
603401,A209J4NK7GR9S3,61491896,3.0,1276387200
6793222,A103Q9172H9419,578032147,4.0,1382745600


In [85]:
#Can first input the ratings dataframe, kind = 'USER', then run it again with the resulting df and kind = 'ITEM'
#returns the subset of users or items we have remaining, depending on kind and a new dataframe with only those remaining
def sample_kind(df, n_users, min_ratings, max_ratings, kind):
    counts = pd.DataFrame(df[kind].value_counts())
    counts = counts[counts[kind].between(min_ratings, max_ratings,inclusive=True)].reset_index()
    counts = counts.drop(kind, axis = 1)
#    subset = pd.DataFrame(counts.sample(n_users, random_state = 42))
    subset = pd.DataFrame(counts)
    new_df = subset.merge(right = df, how = 'left', left_on = 'index', right_on = kind, copy = False)
    new_df = new_df.drop('index', axis = 1)
    return subset, new_df

In [86]:
new_ratings = sample_kind(ratings, 100, 5, 100, 'ITEM')

AttributeError: 'tuple' object has no attribute 'shape'

In [88]:
items, new_ratings = new_ratings[0], new_ratings[1]

In [89]:
new_ratings.shape

(11496998, 4)

In [90]:
len(new_ratings.USER.unique())

5073473

In [91]:
users, new_ratings2 = sample_kind(new_ratings, 10000, 5, 100, 'USER')

In [93]:
new_ratings2.shape

(3873416, 4)

In [191]:
#returns the new dataframe with range restrictions before sampling, the remaining items, and the remaining users
def sample_merge(df, min_ratings, max_ratings, min_times_rated, max_times_rated):
    user_counts = pd.DataFrame(df['USER'].value_counts())
    user_counts = user_counts[user_counts['USER'].between(min_ratings, max_ratings, inclusive=True)].reset_index()
    # might have to delete next line
    user_counts = pd.DataFrame(user_counts.drop('USER', axis = 1))
    print(user_counts.isnull().sum())
    
    item_counts = pd.DataFrame(df['ITEM'].value_counts())
    item_counts = item_counts[item_counts['ITEM'].between(min_times_rated, max_times_rated, inclusive=True)].reset_index()
    # might have to delete next line
    item_counts = pd.DataFrame(item_counts.drop('ITEM', axis = 1))
    print(item_counts.isnull().sum())
    
    df2 = user_counts.merge(right = df, how = 'left', left_on = 'index', right_on = 'USER')
    df2.drop('index', axis = 1, inplace = True)
    df3 = item_counts.merge(right = df2, how = 'left', left_on = 'index', right_on = 'ITEM')
    df3.drop('index', axis = 1, inplace = True)
    df3.dropna(axis=0, inplace=True)
    
    return df3, item_counts, user_counts