In [1]:
import pandas as pd
import numpy as np

import plotly
from plotly.offline import plot, iplot
import plotly.graph_objs as go
from plotly.offline import *

from sklearn.metrics import ndcg_score

from surprise.prediction_algorithms import SVD, KNNWithMeans, KNNBasic
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

In [15]:
df_init = pd.read_csv('./Movie_data/ratings.csv')
df_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [16]:
df = df_init.drop(columns='timestamp')
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [17]:
df_count = df
df_count['count'] = 1
df_count

Unnamed: 0,userId,movieId,rating,count
0,1,1,4.0,1
1,1,3,4.0,1
2,1,6,4.0,1
3,1,47,5.0,1
4,1,50,5.0,1
...,...,...,...,...
100831,610,166534,4.0,1
100832,610,168248,5.0,1
100833,610,168250,5.0,1
100834,610,168252,5.0,1


In [18]:
from surprise import Reader, Dataset
reader = Reader()
df_3 = Dataset.load_from_df(df,reader)

ValueError: too many values to unpack (expected 3)

In [None]:
trainset, testset = train_test_split(df_3, test_size=0.2)

In [None]:
ratings_dist = df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = ratings_dist.index,
               text = ['{:.1f} %'.format(val) for val in (ratings_dist.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = ratings_dist.values,
               )
layout = dict(title = 'Distribution of Ratings',
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
graph_r = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(graph_r)

In [None]:
moive_rated_dist = df.groupby(['movieId'])['rating'].count()

# Create trace
trace = go.Histogram(x = moive_rated_dist.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Movie',
                   xaxis = dict(title = 'Number of Ratings Per Movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

In [None]:
moive_rated_dist = df.groupby(['userId'])['rating'].count()

# Create trace
trace = go.Histogram(x = moive_rated_dist.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Movie',
                   xaxis = dict(title = 'Number of Ratings Per Movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNWithMeans(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, df_3, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [None]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

In [25]:
rating_count = df_count.groupby(['userId']).sum()
rating_count

Unnamed: 0_level_0,movieId,rating,count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,430268,1013.0,232
2,2040158,114.5,29
3,275277,95.0,39
4,428140,768.0,216
5,15129,160.0,44
...,...,...,...
606,10806800,4078.0,1115
607,347939,708.0,187
608,3741665,2604.5,831
609,17877,121.0,37


In [None]:
def suggest_movies(x):
    pred_rating = {}
    for y in list(range(0, len(df['movieId']))):
        user_prediction = svd.predict(x, y)
        pred_rating.update({user_prediction[1]: user_prediction[3]})
    pred_rating = {k: v for k, v in sorted(pred_rating.items(), key=lambda item: item[1])}
    if 
        rec = list(pred_rating)[-6:-1]
    return rec, pred_rating

In [None]:
predictions = suggest_movies(456)[1]

In [None]:
pred = []
n=0
for key, value in predictions.items():
    pred.append(value)
    n = n+1

In [None]:
n

In [None]:
keys = [] 
values = [] 
items = predictions.items() 
for item in items: 
    keys.append(item[0]), values.append(item[1]) 

In [None]:
scores.dtype

In [None]:
np.asarray(df['rating']).astype(int)

In [None]:
def ndcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [None]:
scores = np.asarray(pred).round().astype(int)
true_relevance = df['rating'].values.astype(int)
ndcg_score(true_relevance, scores)

In [None]:
params = {'n_factors': [65, 75, 85],
         'reg_all': [0.03, 0.04, 0.045]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(df_3)
print(g_s_svd.best_score)
print(g_s_svd.best_params)


In [None]:
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


print(ndcg_at_k([0],5, method=1))
print(ndcg_at_k([1],5, method=1))
print(ndcg_at_k([1,0],5, method=1))
print(ndcg_at_k([0,1],5, method=1))
print(ndcg_at_k([0,1,1],5, method=1))
print(ndcg_at_k([0,1,1,1],5, method=1))