In [None]:
# Shell out to pip: upgrade pip itself, then install cufflinks pinned to version 0.8.2.
# NOTE(review): the reason for the exact 0.8.2 pin is not recorded here — confirm before bumping it.
!pip install --upgrade pip
!pip install cufflinks==0.8.2

In [None]:
import io, os
import re, json
import pickle, gzip
import itertools
import warnings
import pandas as pd
import numpy as np
import scipy as sp


import boto3
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import cufflinks as cf
import matplotlib.pyplot as plt
from PIL import  Image
%matplotlib inline
import seaborn as sns

from sagemaker import get_execution_role
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# plotly + cufflinks work offline
# NOTE(review): connected=True presumably makes plotly load plotly.js from the CDN
# instead of embedding it in the notebook — confirm that matches the "offline" intent.
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
# Storage location settings; neither name is read again in the cells shown here.
# NOTE(review): presumably an S3 bucket name and key prefix (boto3 / sagemaker are imported) — confirm.
bucket = 'slalom-ml'
prefix = 'tmp/sagemaker/demo/recsys/'

## Get Movie Data 

In [None]:
# Fetch the MovieLens 100k archive and unpack it (-o overwrites files left by an earlier unzip).
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip
# Change the kernel's working directory so the relative paths read later
# ('u.user', 'u.item', 'ua.base', 'ua.test', 'README') resolve inside ml-100k.
%cd ml-100k
# Optional shuffle of the training split, currently disabled.
#!shuf ua.base -o ua.base.shuffled

In [None]:
# Print the README found in the current working directory (ml-100k after the %cd cell).
!cat README

## Inspect Data and Exploratory Data Analysis

We observe the ratings are not in a matrix format, but are in a _long and skinny_ format.  We'll need to build the matrix ourselves.

We also observe there is a user dataset in **u.user**, providing some information about age, gender, occupation, and zip code. The **u.item** file holds information about each movie: title, release date, URL, and category.  Lastly, I think about how I've rated movies; I'm curious if there is any skew to the ratings themselves.  

In [None]:
# Column names applied to the MovieLens files at read time (passed via `names=`).
user_column_names = ['user_id', 'age', 'gender', 'occupation', 'zip code']
film_column_names = [
    'film_id', 'title', 'release date', 'home release date', 'URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
data_column_names = ['user_id', 'film_id', 'rating', 'timestamp']

# User and film metadata are pipe-delimited.
user_df = pd.read_csv('u.user', names=user_column_names, sep='|')
film_df = pd.read_csv('u.item', names=film_column_names, sep='|', encoding="ISO-8859-1")

# The two UA rating splits are tab-delimited; the timestamp column is discarded.
ua_data, ua_test = (
    pd.read_csv(path, names=data_column_names, sep='\t').drop(['timestamp'], axis=1)
    for path in ('ua.base', 'ua.test')
)

# Working aliases used by the rest of the notebook.
data_df, test_df = ua_data, ua_test

# Summary statistics followed by five random rows from each split.
print('\nDESCRIPTION of Ratings data\n', data_df.describe(), sep='\n')
print('\n\nSAMPLE of UA Training (ratings)  data\n', data_df.sample(n=5), sep='\n')
print('\n\nSAMPLE of UA Testing (ratings) data\n', test_df.sample(n=5), sep='\n')

In [None]:
# Largest user and film ids present in the training ratings; the matrix-building
# cell further down uses them as the matrix dimensions.
users = data_df['user_id'].max()
films = data_df['film_id'].max()

# One-row frame so both numbers show up side by side in a bar chart.
details = pd.DataFrame(dict(users=[users], films=[films]))
details.iplot(kind='bar')

### EDA
Is there a lot of skew in our data?  What does the rating distribution look like? 

In [None]:
# How many ratings were given for each rating value, drawn with cufflinks' default chart.
data_df.groupby('rating')['film_id'].count().iplot()

In [None]:
# Per-user row counts; the film_id count is relabelled as the number of films that user rated.
tmp = (
    data_df
    .groupby('user_id')
    .count()
    .rename(columns={'film_id': 'film_count'})
)

# Number of users for each distinct films-rated count.
tmp.groupby('film_count').size().iplot(kind='bar')

### Create and populate matrix for Matrix-Factorization
We observe our dataset has 1682 films rated by 943 users, so our matrix will be 943 × 1682 (users × films). We also want to know how sparse the matrix is, so we'll calculate the percentage of cells that actually hold a rating too.

In [81]:
def build_matrix(user_max, film_max, df, name=''):
    """
    Build a dense USER x FILM ratings matrix from long-format ratings.

    Params
    ======
    user_max : (int)
        Number of rows. User ids must lie in 1..user_max (row = user_id - 1).

    film_max : (int)
        Number of columns. Film ids must lie in 1..film_max (column = film_id - 1).

    df : (DataFrame)
        Long-format ratings with `user_id`, `film_id` and `rating` columns.
        If a (user_id, film_id) pair occurs more than once, the last row wins.

    name : (str)
        Label used only in the progress message.

    Returns
    =======
    matrix : (ndarray)
        user_max x film_max float array; cells without a rating stay 0.

    Raises
    ======
    IndexError
        If any id falls outside 1..user_max / 1..film_max. Without this check
        an id of 0 would index position -1 and silently land in the last
        row/column.

    The "Sparsity" figure that is printed is the percentage of cells that are
    non-zero (i.e. filled), reported as 0 for a zero-sized matrix.
    """
    print('Building {name} Matrix'.format(name=name))

    matrix = np.zeros((user_max, film_max))

    # Keep only the last rating per (user, film) so the vectorised write below
    # gives the same result as assigning row by row.
    ratings = df[['user_id', 'film_id', 'rating']].drop_duplicates(
        subset=['user_id', 'film_id'], keep='last')
    rows = ratings['user_id'].values.astype(int) - 1
    cols = ratings['film_id'].values.astype(int) - 1

    if rows.size:
        out_of_range = (rows.min() < 0 or rows.max() >= user_max or
                        cols.min() < 0 or cols.max() >= film_max)
        if out_of_range:
            raise IndexError(
                'user_id/film_id values must lie in 1..{} and 1..{}'.format(user_max, film_max))
        matrix[rows, cols] = ratings['rating'].values

    filled = np.count_nonzero(matrix)
    # Guard the division so an empty matrix reports 0% instead of raising.
    sparsity = float(filled) / matrix.size * 100 if matrix.size else 0.0
    print('Sparsity: {:4.2f}%'.format(sparsity))
    return matrix

# Dense USER x FILM matrices for the two UA splits, built with identical dimensions
# so they can be compared cell by cell.
training_matrix, testing_matrix = (
    build_matrix(users, films, split_df, name=label)
    for split_df, label in ((data_df, 'UA Training'), (test_df, 'UA Testing'))
)

# Validate we have disjoint training/testing datasets: a cell rated in both
# splits would make the element-wise product non-zero at that position.
assert not np.any(training_matrix * testing_matrix)

Building UA Training Matrix
Sparsity: 5.71%
Building UA Testing Matrix
Sparsity: 0.59%


In [142]:
# Source: https://gist.github.com/EthanRosenthal/a0816d8fea4394baf732
from numpy.linalg import solve

class ExplicitMF():
    def __init__(self, ratings, iterations=None, n_factors=40, item_reg=0.0, user_reg=0.0, verbose=False):
        """
        Train a matrix factorization model to predict all empty entries in a matrix.
        The terminology assumes a ratings matrix which is ~ USER x ITEM

        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings

        iterations : (iterable of int)
            Cumulative iteration counts at which calculate_learning_curve
            records the MSE. Defaults to [10]. The values are copied, so the
            caller's list is never modified.

        n_factors : (int)
            Number of latent factors (to assume) in factorization model

        item_reg : (float)
            Regularization term for item latent factors

        user_reg : (float)
            Regularization term for user latent factors

        verbose : (bool)
            Whether or not to printout training progress
        """

        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_reg = item_reg
        self.user_reg = user_reg
        # Private copy: calculate_learning_curve sorts this list, which must not
        # reorder the caller's list or leak between instances via a shared default.
        self.iterations = [10] if iterations is None else list(iterations)
        self._v = verbose


    def als_step(self, latent_vectors, fixed_vecs, ratings, _lambda, type='user'):
        """
        One of two ALS steps. Solve for the latent vectors specified by type.

        Params
        ======
        latent_vectors : (ndarray)
            Factors to update; overwritten in place and also returned

        fixed_vecs : (ndarray)
            Factors held constant during this step

        ratings : (ndarray)
            User x Item matrix. It is used as-is, so zero entries enter the
            least-squares fit like any other value.

        _lambda : (float)
            Regularization strength added to the diagonal of the normal equations

        type : (str)
            'user' to update user factors, 'item' to update item factors

        Raises
        ======
        ValueError
            If type is neither 'user' nor 'item'
        """

        if type == 'user':
            targets = ratings
        elif type == 'item':
            # Item factors are fitted against the columns of the ratings matrix.
            targets = ratings.T
        else:
            raise ValueError("type must be 'user' or 'item', got {!r}".format(type))

        # Every row shares the same left-hand side, so build it once ...
        gram = fixed_vecs.T.dot(fixed_vecs)
        lambdaI = np.eye(gram.shape[0]) * _lambda
        # ... and solve for all rows in one call (one right-hand side per column).
        latent_vectors[:, :] = solve(gram + lambdaI, targets.dot(fixed_vecs).T).T

        return latent_vectors


    def train(self, n_iter = 10):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors with uniform random values in [0, 1)
        self.user_vecs = np.random.random((self.n_users, self.n_factors))
        self.item_vecs = np.random.random((self.n_items, self.n_factors))
        self.partial_train(n_iter)


    def partial_train(self, n_iter):
        """
        Train model for n_iter iterations. Can be called multiple times for further training.
        A zero or negative n_iter performs no iterations.
        """
        # Counting down with range() terminates even for a negative n_iter.
        for remaining in range(int(n_iter), 0, -1):
            if self._v:
                print('\titerations left: {}'.format(remaining))
            self.user_vecs = self.als_step(self.user_vecs, self.item_vecs, self.ratings, self.user_reg, type='user')
            self.item_vecs = self.als_step(self.item_vecs, self.user_vecs, self.ratings, self.item_reg, type='item')


    def predict_all(self):
        """ Predict ratings for every user and item (returns a USER x ITEM ndarray). """
        # Entry [u, i] is the dot product of user u's and item i's factors,
        # i.e. the same value predict(u, i) returns.
        return self.user_vecs.dot(self.item_vecs.T)


    def predict(self, u, i):
        """ Single user and item prediction. """
        return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)


    def calculate_learning_curve(self, test_matrix):
        """
        Track MSE as a function of training iterations.

        Params
        ======
        test_matrix : (2D ndarray)
            Testing dataset (assumed to be USER x ITEM).

        The function creates two new class attributes:

        train_mse : (list)
            Training data MSE values for each value of iterations
        test_mse : (list)
            Test data MSE values for each value of iterations

        The MSE values come from the notebook-level get_mse(pred, actual)
        helper, which has to be defined before this method is called.
        """

        print ("Calculate learning curve")

        self.iterations = sorted(self.iterations)
        self.train_mse = []
        self.test_mse  = []
        iter_diff = 0

        for (i, n_iter) in enumerate(self.iterations):
            # The counts are cumulative: only the difference to the previous
            # checkpoint still has to be trained.
            step = n_iter - iter_diff
            if self._v:
                print ('{}, {}'.format(i, n_iter))
                print('Iteration: {}'.format(n_iter))
            if i == 0:
                if self._v:
                    print('i = 0; train({})'.format(step))
                self.train(step)
            else:
                if self._v:
                    print('partial_train({})'.format(step))
                self.partial_train(step)

            predictions = self.predict_all()

            self.train_mse += [get_mse(predictions, self.ratings)]
            self.test_mse  += [get_mse(predictions, test_matrix)]
            if (self._v):
                print('Train mse: ' + str(self.train_mse[-1]))
                print('Test mse:  ' + str(self.test_mse[-1]))
            iter_diff = n_iter

In [143]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """
    Mean squared error between predictions and the known ratings.

    Only the cells where `actual` is non-zero are scored; cells where `actual`
    is 0 are left out of the error.

    Params
    ======
    pred : (ndarray)
        Predicted values, same shape as `actual`

    actual : (ndarray)
        Reference values; its non-zero positions select what is compared

    Returns
    =======
    The value returned by sklearn's mean_squared_error for the selected cells.
    """
    # Indexing with the nonzero() tuple already yields flat 1-D arrays.
    rated = actual.nonzero()
    # mean_squared_error takes (y_true, y_pred).
    return mean_squared_error(actual[rated], pred[rated])

In [None]:
# Fit on the UA training matrix (the notebook never defines `ratings_matrix`, so
# the previous call failed on a fresh kernel) and record train/test MSE at each
# cumulative iteration count.
iter_array = [1, 2, 5, 10, 15, 25, 50, 75, 100]
MF_ALS = ExplicitMF(training_matrix, n_factors=40, user_reg=0.0, item_reg=0.0, iterations=iter_array, verbose=True)
MF_ALS.calculate_learning_curve(testing_matrix)

In [146]:
def plot_learning_curve(model):
    """
    Plot training and testing MSE against the iteration counts of a model.

    Params
    ======
    model : object
        Must expose `iterations`, `train_mse` and `test_mse`; the iteration
        counts are the x values for both curves.
    """
    # One scatter trace per curve, sharing the same x axis.
    curves = (('training', model.train_mse), ('testing', model.test_mse))
    traces = [
        go.Scatter(x=model.iterations, y=errors, name=label)
        for label, errors in curves
    ]

    figure = {
        'data': traces,
        'layout': {
            'title': "MovieLens Learning Curve",
            'xaxis': {'title': "Iterations"},
            'yaxis': {'title': "Mean Squared Error"},
        },
    }
    py.iplot(figure)

In [None]:
# Draw the training and testing MSE stored on MF_ALS against its iteration counts.
plot_learning_curve(MF_ALS)