In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds

%matplotlib inline
np.random.seed(0)
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
from sklearn.utils import shuffle

# Dataset: Movielens 100k

https://grouplens.org/datasets/movielens/

A secondary notebook to experiment with GD

In [2]:
movies_df = pd.read_csv('./ml-latest-small/movies.csv', names=['MovieID', 'Title', 'Genres'], header=0)
ratings_df = pd.read_csv('./ml-latest-small/ratings.csv', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], header=0)
tags_df = pd.read_csv('./ml-latest-small/tags.csv', names=['UserID', 'MovieID', 'tag', 'Timestamp'], header=0)
links_df = pd.read_csv('./ml-latest-small/links.csv', names=['MovieID', 'imdbId', 'tmbdId'], header=0)

In [3]:
links_df = pd.read_csv('./ml-latest-small/links.csv', names=['MovieID', 'imdbId', 'tmbdId'], header=0)

### Train-test split. 

Each user's latest ratings (according to timestamp) are set as the test data. 

In [87]:
def train_test_split(data, test_size):
    '''Creates a train/test split from a pandas DataFrame that contains rating data. The dataframe should have
    columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']. For each user, items are sorted by timestamp before sending to
    test set. Hence test set will contain users latest ratings.
    
    Args:
        data: Ratings dataframe
        train_ratio: float between 0 and 1
    
    '''
    
    
    train = pd.DataFrame(columns=data.columns)
    test = pd.DataFrame(columns=data.columns)
    users = list(set(data['UserID']))
    dummy_movieid = list(ratings_df.MovieID.unique()) #To ensure our matrix later contains all MovieID.
    dummy_movieid = pd.DataFrame({'UserID':999, 'MovieID':dummy_movieid, 'Rating':0, 'Timestamp':0})
    for u in users:
        temp = data[data['UserID'] == u]
        n = len(temp)
        test_size = int(test_size*n)
        temp = temp.sort_values('Timestamp', ascending=False).reset_index(drop=True)
        dummy_train = temp.iloc[:test_size]
        dummy_test = temp.iloc[test_size:]
        train = pd.concat([train, dummy_train])
        test = pd.concat([test, dummy_test])
    train = train.append(dummy_movieid)
    test = test.append(dummy_movieid)

    
    return train, test
    

### Factorizing the matrix and minimizing error with gradient descent.

In [5]:
 def initialize_random_matrices(ratings, K):
        '''
        Returns 2 matrices of size N,K and M,K that contains random floats from 0 to 1. 

        Args:
            ratings: N by M matrix containing ratings. Rows are users, columns are products.
            K: Number of latent factors to discover
            
        '''
        R = np.array(ratings)
        N = len(R) 
        M = len(R[0])  
        P = np.random.random((N,K))
        Q = np.random.random((M,K)) #Note: We will transpose this matrix later.
        return P, Q


Quick demonstration of the GD in action. In this demonstration I've set the learning rate to be quite high (0.01) with a small amount of steps (5), and to print the product of the 2 smaller matrixes. Let's observe how the product of the component matrices converge towards the original matrix.

In [6]:
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)): # iterate over rows (or users)
            for j in xrange(len(R[i])): # for each user, iterate all items. 30 times.
                #Hence, iterate over entire row of matrix R, before moving on to next row.
                if R[i][j] > 0: # If a given element in a matrix is > 0, aka user has actually rated an item
                    # eij is the error for a given element in matrix R
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j]) 
                    # Iterate over the user:feature matrix.
                    for k in xrange(K): # K is number of latent features. 
                        # Updating each indivudual feature of the matrix P and matrix Q. 
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])): #for elements in the original matrix:
                if R[i][j] > 0: #if element > 0 (aka if a rating is there):
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2) #sum up the square of the error
                    for k in xrange(K): 
                        #for each element in R matrx, iterate K also
                            e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        print '==============='
        print np.dot(P,Q)
        if e < 0.001:
            break
    return P, Q.T



In [7]:
test_P = np.random.random((10,10))
test_Q = np.random.random((10,10))

test_R = np.random.randint(0,6, (10,10))

print 'This is the ground truth matrix'
print test_R

matrix_factorization(test_R, test_P, test_Q, 5, steps=5, alpha=0.01)

This is the ground truth matrix
[[2 4 3 3 1 1 5 5 1 1]
 [2 0 3 1 4 3 1 1 2 0]
 [0 2 4 0 4 3 3 0 0 3]
 [5 3 0 3 0 5 2 1 0 4]
 [0 3 4 4 1 0 2 4 4 2]
 [4 3 2 3 0 5 5 5 0 0]
 [5 3 4 4 5 2 2 4 1 3]
 [3 3 1 2 5 2 3 5 2 3]
 [4 4 2 5 1 2 5 0 3 0]
 [2 4 3 2 1 2 1 4 5 1]]
[[ 3.22088405  3.85021204  3.07641667  3.12552275  3.9120916   2.96848505
   3.42858391  2.8978212   3.23573653  2.08611764]
 [ 3.42643422  3.15843631  2.88755411  2.56510073  3.96938339  2.22013415
   2.90226031  2.24551108  2.31522888  2.08013568]
 [ 3.5840489   3.40940576  3.2535554   3.02333637  4.16399847  2.99057265
   3.48947508  2.63752093  2.75099807  2.39349639]
 [ 3.38816005  3.79459314  2.90629338  2.91923434  3.68275086  3.13576247
   3.04614564  2.24186472  2.98310482  2.04816713]
 [ 2.31329371  2.82325193  2.41216176  2.12574213  2.68559878  1.81942521
   2.80080972  1.97871924  2.47845576  1.71582651]
 [ 2.77026012  3.22633959  2.67256735  2.26183412  2.68519016  2.09351318
   2.67218552  2.1941573   2.4550941  

(array([[ 0.40992751,  0.42949825,  0.46110784,  0.13047875,  0.32697438,
          0.64589411,  0.43758721,  0.891773  ,  0.96366276,  0.38344152],
        [ 0.46677015,  0.41135125,  0.30653357,  0.69678008, -0.31092781,
          0.0871293 ,  0.0202184 ,  0.83261985,  0.77815675,  0.87001215],
        [ 0.97043833,  0.71750185,  0.428843  ,  0.81139294,  0.03821421,
          0.63992102,  0.14335329,  0.94466892,  0.52184832,  0.41466194],
        [ 0.27082142,  0.86907626,  0.45554291,  0.88616309, -0.12057672,
          0.6176355 ,  0.61209572,  0.616934  ,  0.94374808,  0.6818203 ],
        [ 0.49127401,  0.65230772,  0.87926543,  0.05068005,  0.92043856,
          0.67063787,  0.21038256,  0.1289263 ,  0.31542835,  0.36371077],
        [ 0.913725  ,  0.77797309,  1.2587    ,  0.49252727,  0.63771653,
          0.16130952,  0.65310833,  0.2532916 ,  0.46631077,  0.24442559],
        [ 0.73581954,  0.21207747,  1.00111539,  0.64381136,  0.49866972,
          0.36872517,  0.8209932

In [8]:
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)): # iterate over rows (or users)
            for j in xrange(len(R[i])): # for each user, iterate all items. 30 times.
                #Hence, iterate over entire row of matrix R, before moving on to next row.
                if R[i][j] > 0: # If a given element in a matrix is > 0, aka user has actually rated an item
                    # eij is the error for a given element in matrix R
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j]) 
                    # Iterate over the user:feature matrix.
                    for k in xrange(K): # K is number of latent features. 
                        # Updating each indivudual feature of the matrix P and matrix Q. 
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])): #for elements in the original matrix:
                if R[i][j] > 0: #if element > 0 (aka if a rating is there):
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2) #sum up the square of the error
                    for k in xrange(K): 
                        #for each element in R matrx, iterate K also
                            e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T



### Now let's create our train and test sets. Test 20 latent factors.

In [88]:
import timeit
start_time = timeit.default_timer()

train, test = train_test_split(ratings_df, 0.7)

train_matrix = train.pivot(index='UserID', columns = 'MovieID', values='Rating')
test_matrix = test.pivot(index='UserID', columns = "MovieID", values='Rating')
train_matrix.drop(999, inplace=True)
test_matrix.drop(999, inplace=True)
test_matrix.fillna(0,inplace=True)
train_matrix.fillna(0, inplace=True)



train_matrix = np.array(train_matrix)
test_matrix = np.array(test_matrix)

ground_truth = ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')


elapsed = timeit.default_timer() - start_time

In [92]:
test.shape

(9072, 4)

In [11]:
P, Q = initialize_random_matrices(train_matrix, 20)

print P.shape
print Q.shape

(671, 20)
(9066, 20)


In [12]:
import timeit
start_time = timeit.default_timer()


result_P, result_Q = matrix_factorization(train_matrix, P, Q, 20, steps=100)


elapsed = timeit.default_timer() - start_time
print elapsed

1528.25876904


In [13]:
print result_P.shape
print result_Q.shape
result_Q = result_Q.T
print result_Q.shape

(671, 20)
(9066, 20)
(20, 9066)


In [14]:
result = np.dot(result_P, result_Q)

In [15]:
result

array([[ 2.7955247 ,  2.12994515,  2.48271815, ...,  2.53698947,
         3.13278153,  2.69160778],
       [ 3.89258677,  3.69984534,  3.02682759, ...,  3.59490059,
         4.34165606,  4.08231956],
       [ 3.85433844,  3.2413135 ,  2.95995802, ...,  3.32502532,
         3.5075291 ,  3.75279253],
       ..., 
       [ 3.871197  ,  3.98846892,  3.69473605, ...,  3.7778753 ,
         4.28127537,  3.71942885],
       [ 3.84962331,  3.79075648,  3.10786541, ...,  4.14327625,
         4.53302461,  4.49214904],
       [ 3.96592745,  3.96348059,  3.4620479 , ...,  3.74878023,
         4.52847665,  5.02011101]])

In [16]:
ground_truth

MovieID,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,3.0,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [18]:
print ratings_df.pivot(index = 'UserID', columns='MovieID', values='Rating').shape
print result.shape

(671, 9066)
(671, 9066)


In [68]:
ground_truth_list = []
result_list = []

ground_truth_array = np.array(ground_truth)
result_array = np.array(result)

i, j = np.where(test_matrix != 0)
locators = zip(i,j)

for i in locators:
    ground_truth_list.append(ground_truth_array[i])
    result_list.append(result_array[i])

ground_truth_array = np.array(ground_truth_list)
result_array = np.array(result_list)

rmse = np.sqrt(sum((ground_truth_array - result_array) ** 2) / len(ground_truth_array))

print 'Rmse is %s ' % rmse





                                              
    

Rmse is 1.0118604462 


### Function to recommend movies, given a user

In [70]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    '''Generates top k predictions for a user. 
    Args:
    predictions_df: Dataframe of predicted ratings where columns = products, rows = users.
    userID: ID of the user to generate recommendations for.
    movies_df: Dataframe of movies, where movie ID column is named "MovieID"
    original_ratings_df: Original dataframe of matrix where columns = products, rows = users
    '''
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False)
                 )

    print 'User {0} has already rated {1} movies.'.format(userID, user_full.shape[0])
    print 'Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'MovieID',
               right_on = 'MovieID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      ) 
    return user_full, recommendations



### Let's try testing with 4 folds.

In [84]:
def create_fold1(data, test_size):
        
    train = pd.DataFrame(columns=data.columns)
    test = pd.DataFrame(columns=data.columns)
    users = list(set(data['UserID']))
    dummy_movieid = list(ratings_df.MovieID.unique()) #To ensure our matrix later contains all MovieID.
    dummy_movieid = pd.DataFrame({'UserID':999, 'MovieID':dummy_movieid, 'Rating':0, 'Timestamp':0})
    for u in users:
        temp = data[data['UserID'] == u]
        n = len(temp)
        test_size = int(test_size*n)
        temp = temp.sort_values('Timestamp', ascending=False).reset_index(drop=True)
        dummy_train = temp.iloc[:test_size]
        dummy_test = temp.iloc[test_size:]
        train = pd.concat([train, dummy_train])
        test = pd.concat([test, dummy_test])
    train = train.append(dummy_movieid)
    test = test.append(dummy_movieid)

    
    return train, test
    

In [85]:
X_train, X_test = create_fold1(ratings_df, 0.5)

In [86]:
print X_train.shape
print X_test.shape

(109060, 4)
(9076, 4)
