In [147]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio 
 
mat = sio.loadmat('ex8_movies.mat')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [148]:
Y,R = mat['Y'],mat['R']
Y.shape,R.shape

((1682, 943), (1682, 943))

In [149]:
data = sio.loadmat('ex8_movieParams.mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [150]:
X,Theta,nu,nm,nf = data['X'],data['Theta'],data['num_users'],data['num_movies'],data['num_features']
X.shape,Theta.shape,nu,nm,nf

((1682, 10),
 (943, 10),
 array([[943]], dtype=uint16),
 array([[1682]], dtype=uint16),
 array([[10]], dtype=uint8))

In [151]:
nu = int(nu)
nm = int(nm)
nf = int(nf)
nu,nm,nf

(943, 1682, 10)

In [152]:
def serialize(X,Theta):
    return np.append(X.flatten(),Theta.flatten())

In [153]:
def deserialize(params,nm,nu,nf):
    X = params[:nm*nf].reshape(nm,nf)
    Theta = params[nm*nf:].reshape(nu,nf)
    return X,Theta

In [154]:
def costFunction(params,Y,R,nm,nu,nf,lamda):
    X,Theta = deserialize(params,nm,nu,nf)
    cost = 0.5 * np.square((X @ Theta.T - Y) * R).sum()
 
    reg1=  0.5 * lamda * np.square(X).sum()
    reg2=  0.5 * lamda * np.square(Theta) .sum()
    return cost + reg1 + reg2

In [155]:
users = 4
movies = 5
features = 3
X_sub = X[:movies,:features]
Theta_sub = Theta[:users,:features]
Y_sub = Y[:movies,:users]
R_sub = R[:movies,:users]

In [156]:
cost1 = costFunction(serialize(X_sub,Theta_sub ),Y_sub,R_sub,movies,users,features,lamda = 0)
cost1

22.224603725685675

In [157]:
cost2 = costFunction(serialize(X_sub,Theta_sub ),Y_sub,R_sub,movies,users,features,lamda = 1.5)
cost2

31.344056244274217

In [158]:
def costGradient(params,Y,R,nm,nu,nf,lamda):
    X,Theta = deserialize(params,nu,nm,nf)
    X_grad = ((Theta @ X.T - Y) * R).T @ Theta + lamda * X
    Theta_grad = ((Theta @ X.T  - Y) * R) @ X + lamda * Theta 
    return serialize(X_grad,Theta_grad)
 
grad1 = costGradient(serialize(X_sub,Theta_sub ),Y_sub,R_sub,movies,users,features,lamda = 0)

In [159]:
my_ratings = np.zeros((nm, 1))
 
my_ratings[9] = 5 
my_ratings[66] = 5
my_ratings[96] = 5
my_ratings[121] = 4
my_ratings[148] = 4
my_ratings[285] = 3
my_ratings[490] = 4
my_ratings[599] = 4
my_ratings[643] = 4
my_ratings[958] = 5
my_ratings[1117] = 3

In [160]:
def normalizeRatings(Y,R):
    Y_mean = (Y.sum(axis = 1)/R.sum(axis = 1)).reshape(-1,1)
    Y_norm = (Y - Y_mean)*R
    return Y_norm,Y_mean
 
Y_norm, Y_mean= normalizeRatings(Y,R)

In [161]:
X = np.random.random((nm,nf))
Theta = np.random.random((nu,nf))
params = serialize(X,Theta)
lamda = 5
Y.shape

(1682, 943)

In [162]:
from scipy.optimize import minimize
res = minimize(x0 = params,
               fun = costFunction,
               args = (Y_norm,R,nm,nu,nf,lamda),
               method = 'TNC',
               jac = costGradient,
               options = {'maxfun': 100})

In [163]:
params_fit=res.x

In [164]:
fit_X,fit_Theta=deserialize(params_fit,nm,nu,nf)

In [165]:
Y_pred=fit_X@fit_Theta.T

In [166]:
y_pred=Y_pred[:,-1]+Y_mean.flatten()

In [167]:
index=np.argsort(-y_pred)
# np.argsort(y_pred)[::-1]

In [168]:
index[:10]

array([1292, 1535, 1499, 1188, 1598, 1466, 1121,  426, 1652,  518],
      dtype=int64)

In [171]:
movies=[]

with open('movie_ids.txt','r',encoding='latin 1') as f:
    for line in f:
        tokens=line.strip().split(' ')
        movies.append(' '.join(tokens[1:]))


In [172]:
len(movies)

1682

In [173]:
for i in range(10):
    print(index[i],movies[index[i]],y_pred[index[i]])

1292 Star Kid (1997) 5.33078118762216
1535 Aiqing wansui (1994) 5.321218015411277
1499 Santa with Muscles (1996) 5.266718658797148
1188 Prefontaine (1997) 5.240603214698998
1598 Someone Else's America (1995) 5.215628109983913
1466 Saint of Fort Washington, The (1993) 5.199067495413139
1121 They Made Me a Criminal (1939) 5.153490403220272
426 To Kill a Mockingbird (1962) 5.069016610640901
1652 Entertaining Angels: The Dorothy Day Story (1996) 5.013292925697334
518 Treasure of the Sierra Madre, The (1948) 4.908243609945259
