In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report  #这个包是评价报告

In [2]:
data_1=sio.loadmat('/Users/yangguangqiang/Music/career-2021/ML-startup/吴恩达/ML-homework-main/ex8-anomaly detection and recommendation/data/ex8_movies.mat')
data_rate=data_1['Y']
data_did=data_1['R']
data_rate.shape,data_did.shape    # 1682 movies and 943 users

((1682, 943), (1682, 943))

In [3]:
data_2=sio.loadmat('/Users/yangguangqiang/Music/career-2021/ML-startup/吴恩达/ML-homework-main/ex8-anomaly detection and recommendation/data/ex8_movieParams.mat')
data_x=data_2['X']
data_theta=data_2['Theta']
data_x.shape,data_theta.shape   # 10 个电影的判断 features

((1682, 10), (943, 10))

In [4]:
def combine(X,theta):
    return np.concatenate((X.ravel(),theta.ravel()))     # ravel():将多维数组转换为一维数组
def decombine(para,num_movie,num_user,num_feature):
    return para[:num_movie*num_feature].reshape(num_movie,num_feature) , para[num_movie*num_feature:].reshape(num_user,num_feature)

In [5]:
def cost(parameter,Y,R,num_feature):  # parameter : store X and theta
    movie_n,user_n=Y.shape
    X,theta=decombine(parameter,movie_n,user_n,num_feature)
    # R==1时，才计算损失
    inner= (X@theta.T-Y)*R    # X:num_movie,num_feature ; theta:num_user,num_feature ; R: num_movie,num_user
    return (inner**2).sum()/2

def regularized_cost(parameter,Y,R,num_feature,fi=1):  #增加学习率
    third=(parameter**2).sum()/2
    return cost(parameter,Y,R,num_feature)+third    

In [6]:
def gradient(parameter,Y,R,num_feature):
    movie_n,user_n=Y.shape
    X,theta=decombine(parameter,movie_n,user_n,num_feature)
    
    inner=(X@theta.T-Y)*R 
    x_grad=inner @ theta   # inner: num_movie,num_user ; theta:num_user,num_feature
    theta_grad=  inner.T @ X           # inner: num_movie,num_user ; X:num_movie,num_feature
    
    return combine(x_grad,theta_grad)

def regularized_gradient(parameter,Y,R,num_feature,fi=1):
    third= fi*parameter
    return  gradient(parameter,Y,R,num_feature)+third 

In [None]:
### prepare data we need

In [14]:
ratings = np.zeros(1682)  #add new users score
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

In [15]:
Y = np.insert(data_rate, 0, ratings, axis=1)
R = np.insert(data_did, 0, ratings != 0, axis=1)
Y.shape, R.shape

((1682, 944), (1682, 944))

In [16]:
n_feature = 50 ; n_movie, n_user = Y.shape  ; fi = 10

X = np.random.standard_normal((n_movie, n_feature))    # initialize X  and theta
theta = np.random.standard_normal((n_user, n_feature))   # set there are 50 features
X.shape, theta.shape

((1682, 50), (944, 50))

In [17]:
## 均值规范化
param = combine(X, theta)

Y_norm = Y - Y.mean()
Y.mean(), Y_norm.mean()

(0.222338595094621, 4.6862111343939375e-17)

In [18]:
### start to train model

In [19]:
result= opt.minimize(fun=regularized_cost, x0=param, args=(Y_norm, R, n_feature, fi),
                  method='TNC', jac=regularized_gradient)

In [20]:
result

     fun: 22354.954453714294
     jac: array([-9.38706287,  1.9944128 , -3.26817454, ...,  1.19219805,
        2.15462181, -1.23924965])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 223
     nit: 15
  status: 1
 success: True
       x: array([-0.10462144,  0.06016626, -0.09774011, ..., -0.22601338,
       -0.04967282,  0.86790336])

In [None]:
### predictions

In [24]:
X_trained, theta_trained =decombine(result.x, n_movie, n_user, n_feature)
prediction = X_trained @ theta_trained.T+Y.mean()
prediction

array([[3.77692478, 4.49422069, 4.27651842, ..., 4.4010995 , 4.5385381 ,
        5.43011502],
       [2.66783916, 3.23761623, 2.57845811, ..., 2.79063916, 3.55624699,
        4.01193114],
       [2.23524393, 3.84370854, 1.74031206, ..., 2.31295274, 2.65612175,
        3.15443128],
       ...,
       [0.54384763, 0.65863084, 0.52682817, ..., 0.52381907, 0.60440925,
        0.62043069],
       [0.65349248, 0.77706383, 0.70539754, ..., 0.69769557, 0.89914988,
        0.79658246],
       [0.62407387, 1.11534957, 0.81464207, ..., 0.74149189, 0.84014084,
        0.84049639]])

In [None]:
### for user 0 , recommend top 10

In [25]:
movie_list = []   # get movie name list
with open('/Users/yangguangqiang/Music/career-2021/ML-startup/吴恩达/ML-homework-main/ex8-anomaly detection and recommendation/data/movie_ids.txt', encoding='latin-1') as f:
    for line in f:
        tokens = line.strip().split(' ')
        movie_list.append(' '.join(tokens[1:]))
movie_list = np.array(movie_list)

In [36]:
index= np.argsort(prediction[:,0])[::-1]
for id in index[:10]:
    print((prediction[id,0],movie_list[id]))

(4.418256696939427, 'Forrest Gump (1994)')
(4.268612254141631, 'Star Wars (1977)')
(4.175845335026573, 'Titanic (1997)')
(4.0562781211542625, 'Air Force One (1997)')
(4.0533777283130075, 'Shawshank Redemption, The (1994)')
(4.018085301807867, 'Return of the Jedi (1983)')
(3.9764433951496048, 'Usual Suspects, The (1995)')
(3.802771044676928, 'Good Will Hunting (1997)')
(3.7769247826739685, 'Toy Story (1995)')
(3.775380523148549, 'Empire Strikes Back, The (1980)')
