In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import svd_tests as t
%matplotlib inline

# Read in the datasets
movies = pd.read_csv('movies_clean.csv')
reviews = pd.read_csv('reviews_clean.csv')

del movies['Unnamed: 0']
del reviews['Unnamed: 0']

# Create user-by-item matrix
user_items = reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
user_by_movie = user_items.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

# Create data subset
user_movie_subset = user_by_movie[[73486, 75314,  68646, 99685]].dropna(axis=0)

[[10. 10. 10. 10.]
 [10.  4.  9. 10.]
 [ 8.  9. 10.  5.]
 [ 9.  8. 10. 10.]
 [10.  5.  9.  9.]
 [ 6.  4. 10.  6.]
 [ 9.  8. 10.  9.]
 [10.  5.  9.  8.]
 [ 7.  8. 10.  8.]
 [ 9.  5.  9.  7.]
 [ 9.  8. 10.  8.]
 [ 9. 10. 10.  9.]
 [10.  9. 10.  8.]
 [ 5.  8.  5.  8.]
 [10.  8. 10. 10.]
 [ 9.  9. 10. 10.]
 [ 9.  8.  8.  8.]
 [10.  8.  1. 10.]
 [ 5.  6. 10. 10.]
 [ 8.  7. 10.  7.]]


In [78]:
ratings_mat = np.matrix(user_movie_subset)
np.count_nonzero(~np.isnan(ratings_mat))

80

In [2]:
user_movie_subset.shape

(20, 4)

In [79]:
def FunkSVD(ratings_mat, latent_features=4, learning_rate=0.0001, iters=100):
    '''
    This function performs matrix factorization using a basic form of FunkSVD with no regularization
    
    INPUT:
    ratings_mat - (numpy array) a matrix with users as rows, movies as columns, and ratings as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate 
    iters - (int) the number of iterations
    
    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by movie matrix
    '''
    
    # Set up useful values to be used through the rest of the function
    print(ratings_mat.shape)
    n_users = ratings_mat.shape[0]
    n_movies = ratings_mat.shape[1]
    num_ratings = np.count_nonzero(~np.isnan(ratings_mat))
    
    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(n_users,latent_features)
    movie_mat = np.random.rand(latent_features,n_movies)
    movie_mat = movie_mat.T 

    print("Shape of user matrx = ",user_mat.shape)
    print("Shape of movie matrix = ",movie_mat.shape)

    # initialize sse at 0 for first iteration
    sse_accum = 0
    
    # header for running results
    print("Optimization Statistics")
    print("Iterations | Mean Squared Error ")
    
    for i in range(iters):
        # update our sse
        sse_accum = 0
        
        # For each user-movie pair
        for row in range(n_users):
            for column in range(n_movies):
                if(not np.isnan(ratings_mat.item(row,column))):
                    #print(ratings_mat.item(row,column))
                    actual = ratings_mat.item(row,column)
                    #print(row,column)
                    pred = np.dot(user_mat[row],movie_mat[column])
                    error = actual-pred
                    sse_accum += error**2

                    user_mat[row] = user_mat[row] + 2*learning_rate*error*movie_mat[column]
                    movie_mat[column] = movie_mat[column] +  2*learning_rate*error*user_mat[row]    
                # if the rating exists
                    
                    # compute the error as the actual minus the dot product of the user and movie latent features

                    # Keep track of the total sum of squared errors for the matrix
                    
                    # update the values in each matrix in the direction of the gradient
        old_sse = sse_accum
        print("Iteration = ",i+1," Error = ",old_sse/num_ratings)
        
    return user_mat, movie_mat 

In [80]:
user_mat, movie_mat = FunkSVD(ratings_mat, latent_features=4, learning_rate=0.005, iters=250)

(20, 4)
Shape of user matrx =  (20, 4)
Shape of movie matrix =  (4, 4)
Optimization Statistics
Iterations | Mean Squared Error 
Iteration =  1  Error =  47.68873528688228
Iteration =  2  Error =  17.322149596682877
Iteration =  3  Error =  4.3747366781617645
Iteration =  4  Error =  2.817888415056924
Iteration =  5  Error =  2.6942561935065714
Iteration =  6  Error =  2.651585813633287
Iteration =  7  Error =  2.618874673512891
Iteration =  8  Error =  2.5872204943086574
Iteration =  9  Error =  2.5542028277975217
Iteration =  10  Error =  2.5189496549501795
Iteration =  11  Error =  2.4809985207279217
Iteration =  12  Error =  2.4400382083064303
Iteration =  13  Error =  2.3958578103288843
Iteration =  14  Error =  2.348339921808173
Iteration =  15  Error =  2.2974602740344396
Iteration =  16  Error =  2.243284334237894
Iteration =  17  Error =  2.1859579203395354
Iteration =  18  Error =  2.1256910439350087
Iteration =  19  Error =  2.0627357430409856
Iteration =  20  Error =  1.9973

In [48]:
ratings_mat[0, 0] = np.nan
type(ratings_mat.item(0,0))

float

In [54]:
user_mat, movie_mat = FunkSVD(ratings_mat, latent_features=4, learning_rate=0.005, iters=250)

Optimization Statistics
Iterations | Mean Squared Error 
Iteration =  1  Error =  3604.670986419464
Iteration =  2  Error =  1358.766012516817
Iteration =  3  Error =  370.8403479574037
Iteration =  4  Error =  225.89449796048615
Iteration =  5  Error =  210.28836458660888
Iteration =  6  Error =  204.64107778967312
Iteration =  7  Error =  199.7866963414769
Iteration =  8  Error =  194.733229468784
Iteration =  9  Error =  189.27245289883317
Iteration =  10  Error =  183.33876052488566
Iteration =  11  Error =  176.91402449975357
Iteration =  12  Error =  170.01238208266082
Iteration =  13  Error =  162.67871841285645
Iteration =  14  Error =  154.98736020996822
Iteration =  15  Error =  147.03777613948583
Iteration =  16  Error =  138.94678258441775
Iteration =  17  Error =  130.83808716377993
Iteration =  18  Error =  122.83096459992377
Iteration =  19  Error =  115.03033168969614
Iteration =  20  Error =  107.52023878662708
Iteration =  21  Error =  100.3618390518998
Iteration =  2

In [55]:
# Run this cell to see if you were able to predict for the missing value
preds = np.dot(user_mat, movie_mat)
print("The predicted value for the missing rating is {}:".format(preds[0,0]))
print()
print("The actual value for the missing rating is {}:".format(ratings_mat[0,0]))
print()
assert np.isnan(preds[0,0]) == False
print("That's right! You just predicted a rating for a user-movie pair that was never rated!")
print("But if you look in the original matrix, this was actually a value of 10. Not bad!")

The predicted value for the missing rating is 5.434455208883741:

The actual value for the missing rating is nan:

That's right! You just predicted a rating for a user-movie pair that was never rated!
But if you look in the original matrix, this was actually a value of 10. Not bad!


In [74]:
first_1000_users = np.matrix(user_by_movie.head(1000))
first_1000_users.shape

(1000, 31245)

In [75]:
user_mat, movie_mat = FunkSVD(first_1000_users, latent_features=4, learning_rate=0.005, iters=20)

(1000, 31245)
Shape of user matrx =  (1000, 4)
Shape of movie matrix =  (31245, 4)
Optimization Statistics
Iterations | Mean Squared Error 
Iteration =  1  Error =  254443.739932123
Iteration =  2  Error =  117755.06874536072
Iteration =  3  Error =  80348.73484448594
Iteration =  4  Error =  61859.56006797303
Iteration =  5  Error =  50282.16049464488
Iteration =  6  Error =  42206.28564273156
Iteration =  7  Error =  36201.00410491364
Iteration =  8  Error =  31547.344517583362
Iteration =  9  Error =  27838.49907928627
Iteration =  10  Error =  24822.283049278165
Iteration =  11  Error =  22330.986614641974
Iteration =  12  Error =  20247.384439600988
Iteration =  13  Error =  18486.703639412113
Iteration =  14  Error =  16986.090360268034
Iteration =  15  Error =  15697.91888932078
Iteration =  16  Error =  14585.318223325394
Iteration =  17  Error =  13619.141531051459
Iteration =  18  Error =  12775.932940154826
Iteration =  19  Error =  12036.571012123348
Iteration =  20  Error 

In [81]:
# How many actual ratings exist in first_1000_users
num_ratings = np.count_nonzero(~np.isnan(first_1000_users))
print("The number of actual ratings in the first_1000_users is {}.".format(num_ratings))
print()

# How many ratings did we make for user-movie pairs that didn't have ratings
ratings_for_missing = first_1000_users.shape[0]*first_1000_users.shape[1] - num_ratings
print("The number of ratings made for user-movie pairs that didn't have ratings is {}".format(ratings_for_missing))

The number of actual ratings in the first_1000_users is 10852.

The number of ratings made for user-movie pairs that didn't have ratings is 31234148


In [82]:
# Test your results against the solution
assert num_ratings == 10852, "Oops!  The number of actual ratings doesn't quite look right."
assert ratings_for_missing == 31234148, "Oops!  The number of movie-user pairs that you made ratings for that didn't actually have ratings doesn't look right."

# Make sure you made predictions on all the missing user-movie pairs
preds = np.dot(user_mat, movie_mat)
assert np.isnan(preds).sum() == 0
print("Nice job!  Looks like you have predictions made for all the missing user-movie pairs! But I still have one question... How good are they?")

Nice job!  Looks like you have predictions made for all the missing user-movie pairs! But I still have one question... How good are they?
