<a href="https://colab.research.google.com/github/renatavel/goit_ds_homeworks/blob/main/hw_10_add.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from scipy.io import loadmat

In [2]:
def load_movie_list(movie_list):
  with open(movie_list, encoding='ISO-8859-1') as fd:
    movies = fd.readlines()

  movieNames = []
  for movie in movies:
    parts = movie.split()
    movieNames.append(''.join(parts[1:]).strip())
  return movieNames

In [4]:
names = load_movie_list('movie_ids.txt')
names_df = pd.DataFrame(names).rename(lambda x: 'movie_name', axis=1).rename_axis('movie_number')
names_df.head()

Unnamed: 0_level_0,movie_name
movie_number,Unnamed: 1_level_1
0,ToyStory(1995)
1,GoldenEye(1995)
2,FourRooms(1995)
3,GetShorty(1995)
4,Copycat(1995)


In [6]:
data = loadmat('movies.mat')
Y, R = data['Y'], data['R']

In [7]:
pd.DataFrame(Y).head().iloc[:, 0:5].rename(lambda x: f'movie_{x}', axis=0).rename(lambda x: f'user_{x}', axis=1)


Unnamed: 0,user_0,user_1,user_2,user_3,user_4
movie_0,5,4,0,0,4
movie_1,3,0,0,0,3
movie_2,4,0,0,0,0
movie_3,3,0,0,0,0
movie_4,3,0,0,0,0


In [8]:
num_movies, num_users = Y.shape
avg_ratings = np.zeros(num_movies)

for movie_number in range(num_movies):
  binary_mask = R[movie_number]
  row_selected_elements = Y[movie_number][binary_mask]
  avg_ratings[movie_number] = np.mean(row_selected_elements).round(1)

movie_avg_rating_df = pd.DataFrame(avg_ratings).rename(lambda x: 'avg_rating', axis=1).rename_axis('movie_number')
pd.concat([names_df, movie_avg_rating_df], axis=1)

Unnamed: 0_level_0,movie_name,avg_rating
movie_number,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ToyStory(1995),4.5
1,GoldenEye(1995),2.6
2,FourRooms(1995),3.6
3,GetShorty(1995),2.3
4,Copycat(1995),2.7
...,...,...
1677,Mat'isyn(1997),0.0
1678,B.Monkey(1998),0.0
1679,SlidingDoors(1998),0.0
1680,YouSoCrazy(1994),0.0


In [9]:
num_features = 10
learning_rate = 0.01
lambda_val = 10

In [10]:
def cofi_cost_func(params, Y, R, num_users, num_movies, num_features, lambda_val=0):
  X = params[:num_movies * num_features].reshape(num_movies, num_features)
  W = params[num_movies * num_features:].reshape(num_users, num_features)

  error = (X @ W.T - Y) * R
  J = 1 / 2 * np.sum(error**2)

  reg_X = 1 / 2 * lambda_val * np.sum(X**2)
  reg_W = 1 / 2 * lambda_val * np.sum(W**2)
  J += reg_X + reg_W

  X_grad = (error @ W) + lambda_val * X
  W_grad = (error.T @ X) + lambda_val * W

  grad = np.concatenate((X_grad.ravel(), W_grad.ravel()))

  return J, grad

In [11]:
def normalization(Y, R):
  m, n = Y.shape
  Ymean = np.zeros(m)
  Ynorm = np.zeros(Y.shape)

  for i in range(m):
        index = R[i, :] == 1
        if np.any(index):
            Ymean[i] = np.mean(Y[i, index])
            Ynorm[i, index] = Y[i, index] - Ymean[i]
        else:
            Ymean[i] = 0

  return Ynorm, Ymean

In [12]:
Ynorm, Ymean = normalization(Y, R)

In [13]:
print("Чи всі значення в межах 0-5?", np.all((Ymean[:10] >= 0) & (Ymean[:10] <= 5)))

Чи всі значення в межах 0-5? True


In [14]:
X = np.random.rand(num_movies, num_features)
W = np.random.rand(num_users, num_features)

initial_parameters = np.concatenate([X.ravel(), W.ravel()])

In [15]:
from scipy.optimize import minimize
result = minimize(lambda x: cofi_cost_func(x, Ynorm, R, num_users, num_movies, num_features, lambda_val), initial_parameters, method='TNC', jac=True)

optimal_parameters = result.x
X = optimal_parameters[:num_movies * num_features].reshape(num_movies, num_features)
W = optimal_parameters[num_movies * num_features:].reshape(num_users, num_features)

In [18]:
p = X @ W.T
my_predictions = np.clip(p[:, 0] + Ymean, 0, 5)

ix = np.argsort(my_predictions)[::-1]

recommendations = []
for i in range(50):
  j = ix[i]
  movie_name = names[j]
  predicted_rating = my_predictions[j].round(1)
  recommendations.append({'ids': j, 'movie_name': movie_name, 'predicted_rating': predicted_rating})


predicted_df = pd.DataFrame(recommendations).set_index('ids').sort_values(by='predicted_rating', ascending = False)
predicted_df

Unnamed: 0_level_0,movie_name,predicted_rating
ids,Unnamed: 1_level_1,Unnamed: 2_level_1
49,StarWars(1977),5.0
1188,Prefontaine(1997),5.0
1200,MarleneDietrich:ShadowandLight(1996),5.0
99,Fargo(1996),5.0
510,LawrenceofArabia(1962),5.0
1292,StarKid(1997),5.0
180,ReturnoftheJedi(1983),5.0
167,MontyPythonandtheHolyGrail(1974),5.0
168,"WrongTrousers,The(1993)",5.0
171,"EmpireStrikesBack,The(1980)",5.0
