In [127]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno
import surprise as surprise

In [155]:
data = pd.read_csv('nevada_climbs.csv')

In [156]:
data.head()

Unnamed: 0,users,ratings,route_id,name,grade,type
0,0,4,117859314,Sword of Wotan,5.12-,trad
1,1,3,116561977,Wrath of Sores,5.12c,sport
2,1,3,116561849,Waste Not Want Not,5.12b,sport
3,2,3,113987936,Scrum Felcher,5.10c,sport
4,3,3,113987936,Scrum Felcher,5.10c,sport


Get average ratings per climb

In [157]:
data['avg_rating'] = data.groupby('name').ratings.transform('mean')

Get average user ratings

In [158]:
data['avg_user_rating'] = data.groupby('users').ratings.transform('mean')

Get user rating std

In [159]:
data['user_rating_std'] = data.groupby('users').ratings.transform('std')

Get rating std overall

In [160]:
data['rating_std'] = data.groupby('name').ratings.transform('std')

Getting user std

In [161]:
data['user_rating_std'] = data.groupby('users').ratings.transform('std')

Mean coded for every user (rating - mean of ratings)

In [162]:
data['mean_normalized_rating'] = data['ratings'] - data['avg_user_rating']

In [163]:
data.sort_values('users', inplace = True)

In [137]:
data.rename(columns = {'ratings':'raw_ratings'}, inplace = True)

In [138]:
data.head()

Unnamed: 0,users,raw_ratings,route_id,name,grade,type,avg_rating,avg_user_rating,user_rating_std,rating_std,mean_normalized_rating
0,0,4,117859314,Sword of Wotan,5.12-,trad,4.0,2.88,0.881287,,1.12
78593,0,4,117859261,Forbidden Fruit,5.11c,trad,4.0,2.88,0.881287,,1.12
78592,0,3,117113230,Rattlesnake Surprise,5.11-,trad,3.0,2.88,0.881287,,0.12
78591,0,4,117859274,The Wrath,5.12a,trad,4.0,2.88,0.881287,,1.12
78590,0,1,117113213,Mosquitoes in My Mouth,5.9,trad,1.0,2.88,0.881287,,-1.88


In [139]:
df = data[['users', 'raw_ratings', 'route_id']]

In [140]:
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import KNNWithZScore

In [141]:
reader = surprise.Reader(line_format = 'user item rating', sep = '\t')
data = surprise.Dataset.load_from_df(df[['users', 'route_id', 'raw_ratings']], reader = reader)

In [142]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.626499,97.000563,4.118329
KNNBaseline,0.630445,3.69999,11.987581
BaselineOnly,0.638764,0.34734,0.414231
SVD,0.638825,8.423839,0.584
KNNBasic,0.669372,3.801553,12.272887
SlopeOne,0.67357,1.211908,3.45959
NMF,0.685952,9.697476,0.430234
CoClustering,0.691237,5.567149,0.505646
KNNWithZScore,0.716883,4.04019,11.94146
KNNWithMeans,0.719275,4.100542,10.785148


In [144]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVDpp()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.6215


0.6215342733434215

In [145]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [147]:
df.shape

(21197, 8)

In [251]:
df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,641,105732260,3.0,2.935211,{'was_impossible': False},16,222,0.064789
1,716,105732257,3.0,3.378903,{'was_impossible': False},17,473,0.378903
2,329,105813275,2.0,2.598779,{'was_impossible': False},86,321,0.598779
3,1956,105732590,3.0,3.264059,{'was_impossible': False},0,266,0.264059
4,274,105732839,4.0,3.61443,{'was_impossible': False},118,80,0.38557


In [190]:
route_ids = pd.DataFrame(data['route_id'].unique())

In [195]:
route_ids = route_ids.rename(columns = {0:'route_ids'})

## Getting predicted ratings based on user ID

In [264]:
data[data['users'] == 38]

Unnamed: 0,users,ratings,route_id,name,grade,type,avg_rating,avg_user_rating,user_rating_std,rating_std,mean_normalized_rating
11982,38,2,105732281,Crimson Chrysalis,5.8+,trad,3.604366,2.723404,0.771845,0.630797,-0.723404
13474,38,2,105732791,Sunflower,5.9,trad,2.795455,2.723404,0.771845,0.667503,-0.723404
17332,38,3,106725290,Xyphoid Fever,5.1,trad,2.75,2.723404,0.771845,0.621582,0.276596
24824,38,3,105732788,C11H17NO3,5.8,trad,2.6,2.723404,0.771845,0.699206,0.276596
22526,38,3,105732287,Y2K,5.10a,trad,2.928,2.723404,0.771845,0.654622,0.276596
15558,38,2,105836970,Solar Flare,5.10-,trad,2.484375,2.723404,0.771845,0.642223,-0.723404
12706,38,4,106413414,Cayenne Corners,5.10d,trad,3.266667,2.723404,0.771845,0.883715,1.276596
17472,38,2,105732854,The Friar,5.9,trad,2.470588,2.723404,0.771845,0.64352,-0.723404
21398,38,4,107114058,Chocolate Flakes,5.10+,trad,3.153846,2.723404,0.771845,0.5547,1.276596
613,38,2,105787657,Peanut Butter and Jam,5.9,trad,2.894737,2.723404,0.771845,0.567131,-0.723404


In [283]:
pd.DataFrame(np.array(algo.predict(1,117859314)).reshape(1, 5))

Unnamed: 0,0,1,2,3,4
0,1,117859314,,2.52286,{'was_impossible': False}


In [None]:
for i in recommendations:
    recommendations.append(i.split('\t')[0])

In [247]:
def get_recommendations(user_id):
    recommendations = []
    for x in route_ids['route_ids']:
        recommendations.append(algo.predict(user_id, x))
    return recommendations
get_recommendations(641)

[Prediction(uid=641, iid=117859314, r_ui=None, est=2.4779812591926116, details={'was_impossible': False}),
 Prediction(uid=641, iid=117859261, r_ui=None, est=2.4779812591926116, details={'was_impossible': False}),
 Prediction(uid=641, iid=117113230, r_ui=None, est=2.4908374967324667, details={'was_impossible': False}),
 Prediction(uid=641, iid=117859274, r_ui=None, est=2.5666405563370653, details={'was_impossible': False}),
 Prediction(uid=641, iid=117113213, r_ui=None, est=2.2482607149932585, details={'was_impossible': False}),
 Prediction(uid=641, iid=117113260, r_ui=None, est=2.616946923768239, details={'was_impossible': False}),
 Prediction(uid=641, iid=117113203, r_ui=None, est=2.5690986147676913, details={'was_impossible': False}),
 Prediction(uid=641, iid=105889858, r_ui=None, est=2.4948263101347123, details={'was_impossible': False}),
 Prediction(uid=641, iid=105732251, r_ui=None, est=2.942862096941225, details={'was_impossible': False}),
 Prediction(uid=641, iid=105809181, r_u