In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds

%matplotlib inline
np.random.seed(0)
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
from sklearn.utils import shuffle

# Dataset: Movielens 100k

https://grouplens.org/datasets/movielens/

A secondary notebook to experiment with GD

In [2]:
movies_df = pd.read_csv('./ml-latest-small/movies.csv', names=['MovieID', 'Title', 'Genres'], header=0)
ratings_df = pd.read_csv('./ml-latest-small/ratings.csv', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], header=0)
tags_df = pd.read_csv('./ml-latest-small/tags.csv', names=['UserID', 'MovieID', 'tag', 'Timestamp'], header=0)
links_df = pd.read_csv('./ml-latest-small/links.csv', names=['MovieID', 'imdbId', 'tmbdId'], header=0)

In [3]:
links_df = pd.read_csv('./ml-latest-small/links.csv', names=['MovieID', 'imdbId', 'tmbdId'], header=0)

### Train-test split. 

Each user's latest ratings (according to timestamp) are set as the test data. 

In [4]:
def train_test_split(data, train_ratio):
    '''Creates a train/test split from a pandas DataFrame that contains rating data. The dataframe should have
    columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']. For each user, items are sorted by timestamp before sending to
    test set. Hence test set will contain users latest ratings.
    
    Args:
        data: Ratings dataframe
        train_ratio: float between 0 and 1
    
    '''
    
    
    train = pd.DataFrame(columns=data.columns)
    test = pd.DataFrame(columns=data.columns)
    users = list(set(data['UserID']))
    dummy_movieid = list(ratings_df.MovieID.unique()) #To ensure our matrix later contains all MovieID.
    dummy_movieid = pd.DataFrame({'UserID':999, 'MovieID':dummy_movieid, 'Rating':0, 'Timestamp':0})
    for u in users:
        temp = data[data['UserID'] == u]
        n = len(temp)
        test_size = int(train_ratio*n)
        temp = temp.sort_values('Timestamp', ascending=False).reset_index(drop=True)
        dummy_train = temp.iloc[:test_size]
        dummy_test = temp.iloc[test_size:]
        train = pd.concat([train, dummy_train])
        test = pd.concat([test, dummy_test])
    train = train.append(dummy_movieid)
    test = test.append(dummy_movieid)

    
    return train, test
    

In [5]:
train, test = train_test_split(ratings_df, 0.5)

In [6]:
print train.shape
print test.shape

(58904, 4)
(59232, 4)


### Factorizing the matrix and minimizing error with gradient descent.

In [7]:
 def initialize_random_matrices(ratings, K):
        '''
        Returns 2 matrices of size N,K and M,K that contains random floats from 0 to 1. 

        Args:
            ratings: N by M matrix containing ratings. Rows are users, columns are products.
            K: Number of latent factors to discover
            
        '''
        R = np.array(ratings)
        N = len(R) 
        M = len(R[0])  
        P = np.random.random((N,K))
        Q = np.random.random((M,K)) #Note: We will transpose this matrix later.
        return P, Q


Quick demonstration of the GD in action. In this demonstration I've set the learning rate to be quite high (0.01) with a small amount of steps (5), and to print the product of the 2 smaller matrixes. Let's observe how the product of the component matrices converge towards the original matrix.

In [12]:
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)): # iterate over rows (or users)
            for j in xrange(len(R[i])): # for each user, iterate all items. 30 times.
                #Hence, iterate over entire row of matrix R, before moving on to next row.
                if R[i][j] > 0: # If a given element in a matrix is > 0, aka user has actually rated an item
                    # eij is the error for a given element in matrix R
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j]) 
                    # Iterate over the user:feature matrix.
                    for k in xrange(K): # K is number of latent features. 
                        # Updating each indivudual feature of the matrix P and matrix Q. 
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])): #for elements in the original matrix:
                if R[i][j] > 0: #if element > 0 (aka if a rating is there):
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2) #sum up the square of the error
                    for k in xrange(K): 
                        #for each element in R matrx, iterate K also
                            e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        print '=============== Step %s =============' % step
        print np.dot(P,Q)
        if e < 0.001:
            break
    return P, Q.T



In [None]:
test_P = np.random.random((4,2))
test_Q = np.random.random((2,6))

test_R = np.random.randint(0,6, (4,6))

print 'This is the ground truth matrix'
print test_R

matrix_factorization(test_R, test_P, test_Q, 5, steps=5, alpha=0.01)

In [14]:
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)): # iterate over rows (or users)
            for j in xrange(len(R[i])): # for each user, iterate all items. 30 times.
                #Hence, iterate over entire row of matrix R, before moving on to next row.
                if R[i][j] > 0: # If a given element in a matrix is > 0, aka user has actually rated an item
                    # eij is the error for a given element in matrix R
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j]) 
                    # Iterate over the user:feature matrix.
                    for k in xrange(K): # K is number of latent features. 
                        # Updating each indivudual feature of the matrix P and matrix Q. 
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])): #for elements in the original matrix:
                if R[i][j] > 0: #if element > 0 (aka if a rating is there):
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2) #sum up the square of the error
                    for k in xrange(K): 
                        #for each element in R matrx, iterate K also
                            e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T



### Now let's create our train and test sets. Test 20 latent factors.

In [15]:
import timeit
start_time = timeit.default_timer()

train, test = train_test_split(ratings_df, 0.7)

train_matrix = train.pivot(index='UserID', columns = 'MovieID', values='Rating')
test_matrix = test.pivot(index='UserID', columns = "MovieID", values='Rating')
train_matrix.drop(999, inplace=True)
test_matrix.drop(999, inplace=True)
test_matrix.fillna(0,inplace=True)
train_matrix.fillna(0, inplace=True)



train_matrix = np.array(train_matrix)
test_matrix = np.array(test_matrix)

ground_truth = ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')


elapsed = timeit.default_timer() - start_time

In [17]:
P, Q = initialize_random_matrices(train_matrix, 20)

print P.shape
print Q.shape

(671, 20)
(9066, 20)


In [18]:
import timeit
start_time = timeit.default_timer()


result_P, result_Q = matrix_factorization(train_matrix, P, Q, 20, steps=500)


elapsed = timeit.default_timer() - start_time
print elapsed

7202.86177588


In [13]:
print result_P.shape
print result_Q.shape
result_Q = result_Q.T
print result_Q.shape

(671, 20)
(9066, 20)
(20, 9066)


In [14]:
result = np.dot(result_P, result_Q)

In [15]:
result

array([[ 2.7955247 ,  2.12994515,  2.48271815, ...,  2.53698947,
         3.13278153,  2.69160778],
       [ 3.89258677,  3.69984534,  3.02682759, ...,  3.59490059,
         4.34165606,  4.08231956],
       [ 3.85433844,  3.2413135 ,  2.95995802, ...,  3.32502532,
         3.5075291 ,  3.75279253],
       ..., 
       [ 3.871197  ,  3.98846892,  3.69473605, ...,  3.7778753 ,
         4.28127537,  3.71942885],
       [ 3.84962331,  3.79075648,  3.10786541, ...,  4.14327625,
         4.53302461,  4.49214904],
       [ 3.96592745,  3.96348059,  3.4620479 , ...,  3.74878023,
         4.52847665,  5.02011101]])

In [16]:
ground_truth

MovieID,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,3.0,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [18]:
print ratings_df.pivot(index = 'UserID', columns='MovieID', values='Rating').shape
print result.shape

(671, 9066)
(671, 9066)


In [68]:
ground_truth_list = []
result_list = []

ground_truth_array = np.array(ground_truth)
result_array = np.array(result)

i, j = np.where(test_matrix != 0)
locators = zip(i,j)

for i in locators:
    ground_truth_list.append(ground_truth_array[i])
    result_list.append(result_array[i])

ground_truth_array = np.array(ground_truth_list)
result_array = np.array(result_list)

rmse = np.sqrt(sum((ground_truth_array - result_array) ** 2) / len(ground_truth_array))

print 'Rmse is %s ' % rmse





                                              
    

Rmse is 1.0118604462 


### Trying: 40 latent factors

In [26]:
P_40, Q_40 = initialize_random_matrices(train_matrix, 40)



In [30]:
print P_40.shape
print Q_40.shape

(671, 40)
(9066, 40)


In [None]:
start_time = timeit.default_timer()


result_P_40, result_Q_40 = matrix_factorization(train_matrix, P_40, Q_40, 40, steps=500)

elapsed = timeit.default_timer() - start_time
print elapsed





### Try creating 4 . WIP

In [120]:
def train_test_split(data, train_ratio):
    '''Creates a train/test split from a pandas DataFrame that contains rating data. The dataframe should have
    columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']. For each user, items are sorted by timestamp before sending to
    test set. Hence test set will contain users latest ratings.
    
    Args:
        data: Ratings dataframe
        train_ratio: float between 0 and 1
    
    '''
    
    
    train = pd.DataFrame(columns=data.columns)
    test = pd.DataFrame(columns=data.columns)
    users = list(set(data['UserID']))
    dummy_movieid = list(ratings_df.MovieID.unique()) #To ensure our matrix later contains all MovieID.
    dummy_movieid = pd.DataFrame({'UserID':999, 'MovieID':dummy_movieid, 'Rating':0, 'Timestamp':0})
    for u in users:
        temp = data[data['UserID'] == u]
        n = len(temp)
        test_size = int(train_ratio*n)
        temp = temp.sort_values('Timestamp', ascending=False).reset_index(drop=True)
        dummy_train = temp.iloc[:test_size]
        dummy_test = temp.iloc[test_size:]
        train = pd.concat([train, dummy_train])
        test = pd.concat([test, dummy_test])
    train = train.append(dummy_movieid)
    test = test.append(dummy_movieid)

    
    return train, test
    

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,0.0,0.0,0.0,0.0
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


ValueError: Must pass DataFrame with boolean values only