In [16]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
from scipy import optimize
from scipy.io import loadmat

# Load data for the recommender system
tags_df = pd.read_csv('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\tags.csv')
movies_df = pd.read_csv('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\movies.csv')
links_df = pd.read_csv('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\links.csv')
ratings_df = pd.read_csv('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\ratings_filtered.csv')

# Merge files based on movieId
merged_df = movies_df.merge(links_df, on='movieId', how='inner')
merged_df = merged_df.merge(tags_df, on='movieId', how='inner')
merged_df = merged_df.merge(ratings_df[['userId', 'movieId', 'rating']], on=['userId', 'movieId'], how='inner')

# Define a Reader for the surprise library
reader = Reader(rating_scale=(0, 5))

# Create a dataset from the data
data = Dataset.load_from_df(merged_df[['userId', 'movieId', 'rating']], reader)

# Split the dataset into a training and testing set for the recommender system
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build and train an SVD model for the recommender system
model = SVD()
model.fit(trainset)

# Predict ratings for the test set for the recommender system
predictions = model.test(testset)

# Calculate the Root Mean Squared Error (RMSE) for the recommender system
rmse = accuracy.rmse(predictions)
print(f'RMSE for Recommender System (SVD): {rmse}')

# Load user-item rating matrix for matrix factorization
ratings_matrix = np.array([
    [5, 4, 0, 0, 1],
    [0, 0, 5, 4, 2],
    [3, 0, 0, 0, 5],
    [0, 2, 4, 0, 0],
    [1, 0, 0, 3, 4]
])

# Number of users and items
num_users, num_items = ratings_matrix.shape

# Initialize user and item matrices with random values
num_factors = 3  # Number of factors
user_matrix = np.random.rand(num_users, num_factors)
item_matrix = np.random.rand(num_items, num_factors)

# Gradient descent for matrix factorization
learning_rate = 0.01
num_iterations = 100

for _ in range(num_iterations):
    # Make predictions for matrix factorization
    predicted_ratings = np.dot(user_matrix, item_matrix.T)

    # Calculate loss for matrix factorization
    loss = np.sum((ratings_matrix - predicted_ratings) ** 2)

    # Calculate gradients for matrix factorization
    user_gradient = -2 * np.dot((ratings_matrix - predicted_ratings), item_matrix)
    item_gradient = -2 * np.dot((ratings_matrix - predicted_ratings).T, user_matrix)

    # Update user and item matrices for matrix factorization
    user_matrix -= learning_rate * user_gradient
    item_matrix -= learning_rate * item_gradient

# Predict ratings for new users and items for matrix factorization
new_user_ratings = np.dot(user_matrix[-1, :], item_matrix.T)
print(f'Predicted Ratings for New User (Matrix Factorization): {new_user_ratings}')

# Define a list of algorithms for collaborative filtering
algorithms = [SVD(), SVDpp(), NMF()]

# Iterate over algorithms and perform cross-validation
for algo in algorithms:
    results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    print(f"Results for {algo.__class__.__name__}:")
    print("RMSE:", np.mean(results['test_rmse']))
    print("MAE:", np.mean(results['test_mae']))

# Collaborative Filtering - Your Movie Ratings and Recommendations
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, lambda_=0.0):
    # Unfold the X and Theta matrices from params
    X = params[:num_movies * num_features].reshape(num_movies, num_features)
    Theta = params[num_movies * num_features:].reshape(num_users, num_features)

    # Compute the cost J
    predictions = np.dot(X, Theta.T)
    error = (predictions - Y) * R  # Apply mask R to only consider rated movies
    squared_error = error**2
    regularization_term = (lambda_ / 2) * (np.sum(Theta**2) + np.sum(X**2))
    J = (1 / 2) * np.sum(squared_error) + regularization_term

    # Compute the gradients
    X_grad = np.dot(error, Theta) + lambda_ * X
    Theta_grad = np.dot(error.T, X) + lambda_ * Theta

    # Unroll the gradients into a single vector
    grad = np.concatenate([X_grad.ravel(), Theta_grad.ravel()])

    return J, grad

def normalizeRatings(Y, R):
    # Compute Ymean, the mean of ratings
    m, n = Y.shape
    Ymean = np.zeros(m)
    Ynorm = np.zeros_like(Y)

    for i in range(m):
        idx = R[i, :] == 1
        Ymean[i] = np.mean(Y[i, idx])
        Ynorm[i, idx] = Y[i, idx] - Ymean[i]

    return Ynorm, Ymean

# Step 1: Read 'movie_ids.txt' to create a list of movie names
with open('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\movie_ids.txt', 'r', encoding='ISO-8859-1') as file:
    movie_names = [line.strip().split(' ', 1)[1] for line in file]

# Step 2: Load 'movies.mat' to explore its contents and find the correct key
movies_data = loadmat('C:\\Users\\Сергій\\Documents\\Code\\Current_task\\Math-test\\movies.mat')

# Access the ratings data from the 'Y' and 'R' matrices
Y = movies_data['Y']
R = movies_data['R']

# Add your movie ratings and perform collaborative filtering (replace this with your ratings)
n_m = len(movie_names)
my_ratings = np.zeros(n_m)

# Set your movie ratings here (for example)
my_ratings[0] = 4
my_ratings[97] = 2
# Add more ratings as needed

# Add your ratings to the data matrix
Y = np.hstack([my_ratings[:, None], Y])
R = np.hstack([(my_ratings > 0)[:, None], R])

# Normalize Ratings
Ynorm, Ymean = normalizeRatings(Y, R)

# Useful Values
num_movies, num_users = Y.shape
num_features = 7

# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)

initial_parameters = np.concatenate([X.ravel(), Theta.ravel()])

# Set Regularization
lambda_ = 10
res = optimize.minimize(lambda x: cofiCostFunc(x, Ynorm, R, num_users,
                                      num_movies, num_features, lambda_),
               initial_parameters,
               method='TNC',
               jac=True,
               options={'maxfun': 100})

theta = res.x

# Unfold the returned theta back into U and W
X = theta[:num_movies*num_features].reshape(num_movies, num_features)
Theta = theta[num_movies*num_features:].reshape(num_users, num_features)

# Make recommendations by computing the predictions matrix
p = np.dot(X, Theta.T)
my_predictions = p[:, 0] + Ymean

# Print recommendations and original ratings
print('Top recommendations for you:')
sorted_indices = np.argsort(my_predictions)[::-1]

for i in range(10):
    movie_idx = sorted_indices[i]
    print(f'Predicted rating: {my_predictions[movie_idx]:.1f}, Movie: {movie_names[movie_idx]}')

print('\nOriginal ratings provided:')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Rated {my_ratings[i]:.1f} for {movie_names[i]}')


RMSE: 0.4088
RMSE for Recommender System (SVD): 0.40875669882614696
Predicted Ratings for New User (Matrix Factorization): [ 1.28976263 -0.60246509  0.52589108  2.21178455  4.17114842]
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3974  0.4421  0.3770  0.4347  0.4815  0.4265  0.0364  
MAE (testset)     0.2202  0.2420  0.2132  0.2344  0.2525  0.2325  0.0143  
Fit time          0.09    0.08    0.08    0.09    0.08    0.09    0.00    
Test time         0.02    0.03    0.01    0.02    0.01    0.02    0.01    
Results for SVD:
RMSE: 0.42654103749526645
MAE: 0.2324793440187359
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3983  0.3695  0.3959  0.4124  0.3564  0.3865  0.0205  
MAE (testset)     0.1981  0.1880  0.1845  0.1956  0.1890  0.1910  0.0050  
Fit time          4.16    3.97    3.55