In [12]:
import pandas as pd
import numpy as np

import plotly
from plotly.offline import plot, iplot
import plotly.graph_objs as go
from plotly.offline import *


from surprise.prediction_algorithms import SVD, KNNWithMeans
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

In [2]:
# param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
# gs_model.fit(jokes)
#     svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
#     svd.fit(trainset)
#     predictions = svd.test(testset)
#     print(accuracy.rmse(predictions))

In [3]:
df_init = pd.read_csv('./Movie_data/ratings.csv')
df_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
df = df_init.drop(columns='timestamp')
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [5]:
from surprise import Reader, Dataset
reader = Reader()
df_3 = Dataset.load_from_df(df,reader)

In [6]:
dataset = df_3.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [22]:
ratings_dist = df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = ratings_dist.index,
               text = ['{:.1f} %'.format(val) for val in (ratings_dist.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = ratings_dist.values,
               )
layout = dict(title = 'Distribution of Ratings',
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
graph_r = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(graph_r)

'temp-plot.html'

In [8]:
moive_rated_dist = df.groupby(['movieId'])['rating'].count()

# Create trace
trace = go.Histogram(x = moive_rated_dist.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Movie',
                   xaxis = dict(title = 'Number of Ratings Per Movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

'temp-plot.html'

In [37]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNWithMeans()]:
    # Perform cross validation
    results = cross_validate(algorithm, df_3, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.880398,3.890232,0.281906
KNNWithMeans,0.904388,0.103577,2.233333
