# K-NN #

### Importing Libraries ###

In [2]:
import pickle
import os

import pandas as pd
import numpy as np

from surprise import KNNBaseline
from surprise import Dataset                                                     
from surprise import Reader                                                      
from surprise import dump
from surprise.accuracy import rmse
import plotly.offline as plot
import plotly.plotly as py
import plotly.graph_objs as go
# Enable inline plotly rendering in the notebook. `plot` is already the
# `plotly.offline` package, so call its public function directly (as the
# plotting cells do with `plot.iplot`) rather than going through the inner
# `offline` submodule.
plot.init_notebook_mode(connected=True)

In [3]:
# Location of the pre-split MovieLens-100k fold files (u1.base / u1.test).
# os.path.join builds the paths instead of hard-coding '/' separators.
data_dir = os.path.join(os.path.expanduser('~'), 'Downloads', 'ml-100k')
train_file = os.path.join(data_dir, 'u1.base')
test_file = os.path.join(data_dir, 'u1.test')

# One predefined (train, test) pair is supplied, parsed with the built-in
# 'ml-100k' reader format.
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


In [4]:
# k-NN collaborative filtering with baseline ratings, library default settings.
algo = KNNBaseline()

# Train on each predefined fold, score its test part, and report the error.
for trainset, testset in data.folds():
    algo.train(trainset)
    predictions = algo.test(testset)
    rmse(predictions)  # prints the RMSE for this fold

    # Persist predictions, trainset and algorithm so later cells can reload
    # them; the same path is reused, so the file is rewritten on every fold.
    dump('./dump_file', predictions, trainset, algo)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9418
The dump has been saved as file ./dump_file


In [5]:
# Reload the dump written by the training cell. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load dump files produced
# by this notebook or another trusted source.
with open('./dump_file', 'rb') as dump_fh:
    dump_obj = pickle.load(dump_fh)

In [6]:
# Preview only the first few predictions; echoing the full list floods the
# notebook output.
predictions[:10]

[Prediction(uid='1', iid='6', r_ui=5.0, est=3.6891528990657045, details={'actual_k': 20, 'was_impossible': False}),
 Prediction(uid='1', iid='10', r_ui=3.0, est=3.8733142460361298, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='12', r_ui=5.0, est=4.4802761852376403, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='14', r_ui=5.0, est=4.1854628211126519, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='17', r_ui=3.0, est=3.3782047630766718, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='20', r_ui=4.0, est=3.5347206648385083, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='23', r_ui=4.0, est=4.262590846191932, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='24', r_ui=3.0, est=3.4717203386096851, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='1', iid='27', r_ui=2.0, est=3.3272381889116689, d

In [7]:
# Pull the three stored entries back out of the dump dictionary.
predictions, trainset, algo = (
    dump_obj['predictions'],
    dump_obj['trainset'],
    dump_obj['algo'],
)

# `algo` is indexed like a mapping here; report the settings it was saved with.
summary_template = 'algo: {0}, k = {1}, min_k = {2}'
print(summary_template.format(algo['name'], algo['k'], algo['min_k']))

algo: KNNBaseline, k = 40, min_k = 1


In [8]:
# One row per prediction. Column names are spelled out (matching the fields of
# the Prediction records shown above) because later cells access `df.est` and
# `df.r_ui`; this avoids depending on implicit namedtuple column inference.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

## Best and Worst Predictions ##

In [16]:
# Absolute error between the estimated and the true rating.
df['err'] = abs(df.est - df.r_ui)

# Sort once by error and slice both ends, instead of sorting the frame twice.
by_error = df.sort_values(by='err')
best_predictions = by_error[:10]    # ten smallest errors
worst_predictions = by_error[-10:]  # ten largest errors, in ascending order

## Histogram Plot ##

In [9]:
# Histogram of the signed residuals (estimated rating minus true rating).
data = [go.Histogram(x=df.est - df.r_ui)]
layout = go.Layout(
    title='Frequency of residuals',
    xaxis={'title': 'Residuals'},
    yaxis={'title': 'Frequency'},
)
fig = go.Figure(data=data, layout=layout)
plot.iplot(fig, filename='residual histogram')


## Box Plot ##

In [11]:
# Box plot of the signed residuals (estimated rating minus true rating).
# The values are drawn along the y-axis, so that axis is labelled as residuals;
# a box plot has no frequency axis. The trace name labels the single box.
data = [go.Box(y=df.est - df.r_ui, name='est - r_ui')]
layout = go.Layout(
    title='Distribution of residuals',
    yaxis=dict(
        title='Residuals'
    )
)
fig = go.Figure(data=data, layout=layout)
plot.iplot(fig, filename='residuals')