In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat

# Load Movie Data 

In [2]:
mat = loadmat('Data/ex8_movies.mat')
Y = mat['Y']
R = mat['R']

In [3]:
print('Average rating of the first movie: {}'.format(np.mean(Y[0, R[0]])))

Average rating of the first movie: 4.52067868504772


In [4]:
mat2 = loadmat('Data/ex8_movieParams.mat')
X = mat2['X']
theta = mat2['Theta']

# Small Sample Data 

In [5]:
num_users = 4
num_movies = 5
num_features = 3

In [6]:
X_s = X[:num_movies, :num_features]
theta_s = theta[:num_users, :num_features]
Y_s = Y[:num_movies, :num_users]
R_s = R[:num_movies, :num_users]

# Cost, Gradient & Regularization 

In [7]:
def cost_function(X, Y, R, theta, lmbd=0):
    J_ij = Y - np.dot(X, theta.T)
    J_ij_rated = np.multiply(R, J_ij**2)
    
    # cost function 
    J = np.sum(J_ij_rated)/2
    
    # gradient 
    X_grad = np.matmul(J_ij_rated, theta)
    theta_grad = np.matmul(X.T, J_ij_rated).T  
    
    # regularization of cost function 
    J_reg = np.sum(np.multiply(X, X)) + np.sum(np.multiply(theta, theta))
    J_tot = J + (lmbd/2) * J_reg
    
    # regularization of gradients 
    X_grad_reg = X_grad + (lmbd/2) * X 
    theta_grad_reg = theta_grad + (lmbd/2) * theta
    
    return J_tot, X_grad_reg, theta_grad_reg

In [8]:
# Without regularization 
cost_function(X_s, Y_s, R_s, theta_s)

(22.224603725685675,
 array([[  8.20892934, -27.06295833,   6.31125296],
        [  1.13103477,  -6.67369063,   1.04186212],
        [  2.42745536, -14.32324317,   2.23607076],
        [  0.51546346,  -3.04150121,   0.47482346],
        [  2.26336254, -13.35501059,   2.08491529]]),
 array([[ 31.25698589, -12.30737457,  21.91029487],
        [  8.87638892,  -3.38768347,  10.10738551],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]]))

In [9]:
# With regularization 
cost_function(X_s, Y_s, R_s, theta_s, lmbd=1.5)

(31.34405624427422,
 array([[  8.99544347, -27.3631323 ,   7.20684254],
        [  1.7166732 ,  -6.96291006,   1.43276046],
        [  2.90858701, -14.73413356,   2.17322348],
        [  0.85567683,  -3.64166504,   0.98518443],
        [  2.96651596, -13.27544317,   2.35638001]]),
 array([[ 31.4710686 , -13.57057338,  22.10749894],
        [  9.25514883,  -3.72866981,  10.34548234],
        [ -0.32393742,  -0.35910337,   0.63503333],
        [  0.54644879,  -0.20392043,   0.2451327 ]]))

# Loading Movies Names 

In [10]:
with open('Data/movie_ids.txt') as file: 
    fid = file.read()
    
fid = fid.split('\n')[:-1]

In [11]:
movie_list = [text.split(' ', 1) for text in fid]
movie_list = pd.DataFrame(movie_list, 
                          columns=['Index', 'Movie']).set_index('Index')
movie_list.sample(5)

Unnamed: 0_level_0,Movie
Index,Unnamed: 1_level_1
1265,Star Maps (1997)
216,When Harry Met Sally... (1989)
830,Power 98 (1995)
1158,"Fille seule, La (A Single Girl) (1995)"
18,"White Balloon, The (1995)"


# My Ratings 

In [12]:
my_ratings = np.zeros(len(fid))

my_ratings[7] = 3
my_ratings[12] = 5
my_ratings[54] = 4
my_ratings[64] = 5
my_ratings[66] = 3
my_ratings[69] = 5
my_ratings[183] = 4
my_ratings[226] = 5
my_ratings[355] = 5

my_ratings

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
movie_list[my_ratings.astype('bool')]

Unnamed: 0_level_0,Movie
Index,Unnamed: 1_level_1
8,Babe (1995)
13,Mighty Aphrodite (1995)
55,"Professional, The (1994)"
65,What's Eating Gilbert Grape (1993)
67,Ace Ventura: Pet Detective (1994)
70,Four Weddings and a Funeral (1994)
184,Army of Darkness (1993)
227,Star Trek VI: The Undiscovered Country (1991)
356,"Client, The (1994)"


# My Recommendations 