In [1]:
# Ensure all necessary imports are done
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping

# Load your dataset and movies dataframes
dataset = pd.read_csv('/home/shahaf.hen@Digital-Grenoble.local/Downloads/ml-latest-small/ratings.csv')
movies_df = pd.read_csv('/home/shahaf.hen@Digital-Grenoble.local/Downloads/ml-latest-small/movies.csv')

# Prepare the data for the model
X = [dataset["userId"].to_numpy(), dataset["movieId"].to_numpy()]
y = dataset["rating"].to_numpy()

# Define your model creation function if it's not already defined
def get_mf_bias_l2_reg_model(nb_users, nb_movies, k, lambda_):
    from keras.models import Model
    from keras.layers import Embedding, Input, Reshape, Dot, Add, Flatten
    from keras.regularizers import l2

    dim_embeddings = k

    u = Input(shape=(1,), dtype='int32', name='u__user_id')
    i = Input(shape=(1,), dtype='int32', name='i__movie_id')

    p_u = Embedding(nb_users, dim_embeddings, embeddings_regularizer=l2(lambda_), name='p_u__user_embedding')(u)
    p_u = Reshape((dim_embeddings,), name='p_u__user_embedding_reshaped')(p_u)

    q_i = Embedding(nb_movies, dim_embeddings, embeddings_regularizer=l2(lambda_), name='q_i__movie_embedding')(i)
    q_i = Reshape((dim_embeddings,), name='q_i__movie_embedding_reshaped')(q_i)

    b_u = Embedding(nb_users, 1, embeddings_regularizer=l2(lambda_), name='b_u__user_bias')(u)
    b_u = Reshape((1,), name='b_u__user_bias_reshaped')(b_u)

    b_i = Embedding(nb_movies, 1, embeddings_regularizer=l2(lambda_), name='b_i__movie_bias')(i)
    b_i = Reshape((1,), name='b_i__movie_bias_reshaped')(b_i)

    r_hat = Dot(axes=1)([p_u, q_i])
    r_hat = Add()([r_hat, b_u, b_i])
    r_hat = Flatten()(r_hat)

    model = Model(inputs=[u, i], outputs=r_hat)
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])

    return model

# Get the number of unique users and movies
nb_users = dataset['userId'].nunique()
nb_movies = dataset['movieId'].nunique()

# Train the model
model = get_mf_bias_l2_reg_model(nb_users, nb_movies, k=15, lambda_=2e-05)

early_stopping = EarlyStopping(monitor='val_mse', patience=10, verbose=1, restore_best_weights=True)
model.fit(X, y, epochs=500, batch_size=512, validation_split=0.1, callbacks=[early_stopping])

# Define the recommendation function
def get_top10_for_users(model, user_ids, dataset, movies_df, diversity_factor=0.5):
    recommendations = {}

    for user_id in user_ids:
        all_movie_ids = dataset['movieId'].unique()
        rated_movie_ids = dataset.loc[dataset['userId'] == user_id, 'movieId'].values
        unrated_movie_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

        user_unrated_pairs = pd.DataFrame({
            'userId': np.full_like(unrated_movie_ids, user_id),
            'movieId': unrated_movie_ids
        })

        predictions = model.predict([user_unrated_pairs['userId'], user_unrated_pairs['movieId']])
        user_unrated_pairs['prediction'] = predictions.flatten()
        user_unrated_pairs = pd.merge(user_unrated_pairs, movies_df, on='movieId', how='left')
        sorted_predictions = user_unrated_pairs.sort_values(by='prediction', ascending=False)

        num_top_rated = int(10 * (1 - diversity_factor))
        num_diverse = 10 - num_top_rated

        top_rated_predictions = sorted_predictions.head(num_top_rated)
        diverse_predictions = sorted_predictions.iloc[num_top_rated:].sample(num_diverse, random_state=42)

        final_recommendations = pd.concat([top_rated_predictions, diverse_predictions]).sort_values(by='prediction', ascending=False)
        top10_predictions = final_recommendations.head(10)

        ten_best_movies = top10_predictions['title'].tolist()
        ten_best_ratings = top10_predictions['prediction'].tolist()

        recommendations[user_id] = (ten_best_movies, ten_best_ratings)

    return recommendations

# Example usage
user_ids = [1, 2, 3, 4, 5]  #  user IDs
recommendations = get_top10_for_users(model, user_ids, dataset, movies_df)

for user_id, (movies, ratings) in recommendations.items():
    print(f"Recommendations for User {user_id}:")
    for movie, rating in zip(movies, ratings):
        print(f"{movie}: {rating:.2f}")
    print()


2024-05-17 12:52:26.555535: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 12:52:26.555585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 12:52:26.557681: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-17 12:52:26.569913: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-17 12:52:29.661047: I tensorflow/core

Epoch 1/500


2024-05-17 12:52:31.369706: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-05-17 12:52:31.939221: I external/local_xla/xla/service/service.cc:168] XLA service 0x7eec546a5680 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-17 12:52:31.939267: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660, Compute Capability 7.5
2024-05-17 12:52:31.951464: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-17 12:52:31.983407: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1715943152.161632  103483 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 25: early stopping
Recommendations for User 1:
Streetcar Named Desire, A (1951): 5.51
Shawshank Redemption, The (1994): 5.45
Hoop Dreams (1994): 5.42
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964): 5.39
Guess Who's Coming to Dinner (1967): 5.38
Frances (1982): 2.35
Texas Chainsaw Massacre 2, The (1986): 2.26
Mortuary (1983): 0.60
Imperium (2016): 0.60
Conan the Barbarian (2011): 0.60

Recommendations for User 2:
Schindler's List (1993): 3.41
Streetcar Named Desire, A (1951): 3.39
Forrest Gump (1994): 3.38
Pulp Fiction (1994): 3.37
Hoop Dreams (1994): 3.37
Reservoir Dogs (1992): 3.24
Gigantic (A Tale of Two Johns) (2002): 2.56
Wild Strawberries (Smultron

In [2]:

def get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df, diversity_factor=0.5):
    recommendations = {}

    for user_id in user_ids:
        all_movie_ids = dataset['movieId'].unique()
        rated_movie_ids = dataset.loc[dataset['userId'] == user_id, 'movieId'].values
        unrated_movie_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

        user_unrated_pairs = pd.DataFrame({
            'userId': np.full_like(unrated_movie_ids, user_id),
            'movieId': unrated_movie_ids
        })

        predictions = model.predict([user_unrated_pairs['userId'], user_unrated_pairs['movieId']])
        user_unrated_pairs['prediction'] = predictions.flatten()
        user_unrated_pairs = pd.merge(user_unrated_pairs, movies_df, on='movieId', how='left')
        sorted_predictions = user_unrated_pairs.sort_values(by='prediction', ascending=False)

        num_top_rated = int(10 * (1 - diversity_factor))
        num_diverse = 10 - num_top_rated

        top_rated_predictions = sorted_predictions.head(num_top_rated)
        diverse_predictions = sorted_predictions.iloc[num_top_rated:].sample(num_diverse, random_state=42)

        final_recommendations = pd.concat([top_rated_predictions, diverse_predictions]).sort_values(by='prediction', ascending=False)
        top10_predictions = final_recommendations.head(10)

        ten_best_movies = top10_predictions['title'].tolist()
        ten_best_ratings = top10_predictions['prediction'].tolist()

        # Get user's usual recommendations
        user_ratings = dataset.loc[dataset['userId'] == user_id].merge(movies_df, on='movieId', how='left')
        usual_recommendations = user_ratings.sort_values(by='rating', ascending=False).head(10)['title'].tolist()

        # Calculate variety increase
        variety_increase = len(set(ten_best_movies) - set(usual_recommendations))

        recommendations[user_id] = {
            'recommended_movies': ten_best_movies,
            'predicted_ratings': ten_best_ratings,
            'variety_increase': variety_increase
        }

    return recommendations

def save_recommendations_to_file(recommendations, filename_prefix='user_recommendations_'):
    for user_id, data in recommendations.items():
        filename = f"{filename_prefix}{user_id}.txt"
        with open(filename, 'w') as f:
            f.write(f"Recommendations for User {user_id}:\n\n")
            for movie, rating in zip(data['recommended_movies'], data['predicted_ratings']):
                f.write(f"{movie}: {rating:.2f}\n")
            f.write(f"\nVariety Increase: {data['variety_increase']}\n")

# Example usage:
user_ids = [1, 2, 3, 4, 5]  # Replace with actual user IDs
recommendations = get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df)

# Save recommendations to text files
save_recommendations_to_file(recommendations)




    get_top10_for_users_with_diversity Function:
        This function generates movie recommendations for each user while considering diversity. It takes the following parameters:
            model: The trained matrix factorization model.
            user_ids: A list of user IDs for which recommendations are to be generated.
            dataset: The dataset containing user-movie ratings.
            movies_df: A DataFrame containing movie information, including titles.
            diversity_factor: A parameter controlling the balance between top-rated and diverse movie recommendations.
        For each user:
            It finds the movies the user hasn't rated.
            Predicts ratings for these unrated movies using the model.
            Sorts the predictions to get the highest-rated movies.
            Introduces diversity by selecting a portion of top-rated movies and a portion of randomly selected movies from the remaining list.
            Calculates the variety increase, i.e., the number of recommended movies that are not in the user's usual top-rated movies.

    save_recommendations_to_file Function:
        This function saves the recommendations to text files for each user. It takes the following parameters:
            recommendations: A dictionary containing recommendations for each user.
            filename_prefix: A prefix to be used for the filenames of the text files containing recommendations.
        For each user in the recommendations dictionary:
            It creates a text file named filename_prefix + user_id.txt.
            Writes the recommendations for that user to the text file, along with the variety increase information.

    Example Usage:
        Define a list of user IDs for which you want to generate recommendations.
        Call the get_top10_for_users_with_diversity function with the model, user IDs, dataset, and movies DataFrame to generate recommendations.
        Call the save_recommendations_to_file function to save the recommendations to text files.

In [4]:
import numpy as np
from itertools import combinations

def get_top_movies_for_users(recommendations, num_users=5, num_movies=50):
    top_movies_matrix = np.zeros((num_users, num_movies), dtype=int)

    for idx, (user_id, data) in enumerate(recommendations.items()):
        top_movies = data['recommended_movies'][:num_movies]
        for movie in top_movies:
            movie_idx = movies_df.index[movies_df['title'] == movie][0]  # Corrected line
            top_movies_matrix[idx, movie_idx] = 1

    return top_movies_matrix

def find_farthest_users(top_movies_matrix):
    num_users = top_movies_matrix.shape[0]
    distances = np.zeros((num_users, num_users))

    # Calculate cosine similarity matrix
    for i, j in combinations(range(num_users), 2):
        vector_i = top_movies_matrix[i]
        vector_j = top_movies_matrix[j]
        cosine_similarity = np.dot(vector_i, vector_j) / (np.linalg.norm(vector_i) * np.linalg.norm(vector_j))
        distances[i, j] = cosine_similarity
        distances[j, i] = cosine_similarity  # cosine similarity is symmetric

    # Find the farthest pairs of users
    farthest_pairs = []
    for _ in range(5):
        farthest_idx = np.unravel_index(np.argmax(distances), distances.shape)
        farthest_pairs.append((farthest_idx[0], farthest_idx[1], distances[farthest_idx]))
        distances[farthest_idx] = -1  # Mark as visited to find the next farthest pair

    return farthest_pairs

# Example Usage:
num_users = 5
num_movies = 50

# Get recommendations for 5 users
user_ids = [1, 2, 3, 4, 5]  # Replace with actual user IDs
recommendations = get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df)

# Get top movies matrix
top_movies_matrix = get_top_movies_for_users(recommendations, num_users, num_movies)

# Find farthest users
farthest_pairs = find_farthest_users(top_movies_matrix)

# Display farthest pairs
for idx, (user1, user2, distance) in enumerate(farthest_pairs, 1):
    print(f"Farthest Pair {idx}:")
    print(f"User {user1+1} and User {user2+1} (Distance: {distance:.2f})")




IndexError: index 841 is out of bounds for axis 1 with size 50

In [7]:
import numpy as np
from itertools import combinations

def get_top_movies_for_users(recommendations, num_users=5, num_movies=50):
    top_movies_matrix = np.zeros((num_users, num_movies), dtype=int)
    movie_ids = movies_df['movieId'].values  # Get movie IDs from the DataFrame

    for idx, (user_id, data) in enumerate(recommendations.items()):
        top_movies = data['recommended_movies'][:num_movies]
        for movie in top_movies:
            movie_id = movies_df[movies_df['title'] == movie]['movieId'].values
            if len(movie_id) > 0:  # Check if movie ID exists
                movie_idx = np.where(movie_ids == movie_id[0])[0][0]  # Find movie index
                top_movies_matrix[idx, movie_idx] = 1

    return top_movies_matrix

def calculate_cosine_similarity_matrix(matrix):
    # Calculate cosine similarity matrix
    cosine_similarity_matrix = np.dot(matrix, matrix.T) / (np.linalg.norm(matrix, axis=1)[:, np.newaxis] * np.linalg.norm(matrix, axis=1))
    return cosine_similarity_matrix

def find_farthest_vectors(cosine_similarity_matrix):
    num_vectors = cosine_similarity_matrix.shape[0]
    farthest_pairs = []

    # Find the farthest pairs of vectors
    for i, j in combinations(range(num_vectors), 2):
        distance = 1 - cosine_similarity_matrix[i, j]  # Distance is 1 - cosine similarity
        farthest_pairs.append((i, j, distance))

    farthest_pairs.sort(key=lambda x: x[2], reverse=True)  # Sort by distance in descending order

    return farthest_pairs[:5]  # Return the top 5 farthest pairs

# Example Usage:
num_users = 5
num_movies = 50

# Get recommendations for 5 users
user_ids = [1, 2, 3, 4, 5]  # Replace with actual user IDs
recommendations = get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df)

# Get top movies matrix
top_movies_matrix = get_top_movies_for_users(recommendations, num_users, num_movies)

# Calculate cosine similarity matrix
cosine_similarity_matrix = calculate_cosine_similarity_matrix(top_movies_matrix)

# Find farthest vectors
farthest_pairs = find_farthest_vectors(cosine_similarity_matrix)

# Display farthest pairs
for idx, (vector1, vector2, distance) in enumerate(farthest_pairs, 1):
    print(f"Farthest Pair {idx}:")
    print(f"Vector {vector1+1} and Vector {vector2+1} (Distance: {distance:.2f})")




IndexError: index 841 is out of bounds for axis 1 with size 50

In [9]:
def get_top_movies_for_users_with_recommendations(recommendations, movies_df, num_users=5, num_movies=50):
    top_movies_matrix = np.zeros((num_users, num_movies), dtype=int)
    recommended_movies = []

    movie_ids = movies_df['movieId'].values  # Get movie IDs from the DataFrame

    for idx, (user_id, data) in enumerate(recommendations.items()):
        top_movies = data['recommended_movies'][:num_movies]
        recommended_movies.append(top_movies)  # Store recommended movies for display
        for movie in top_movies:
            movie_id = movies_df[movies_df['title'] == movie]['movieId'].values
            if len(movie_id) > 0:  # Check if movie ID exists
                movie_idx = np.where(movie_ids == movie_id[0])[0][0]  # Find movie index
                if movie_idx < num_movies:  # Ensure movie index is within bounds
                    top_movies_matrix[idx, movie_idx] = 1

    return top_movies_matrix, recommended_movies

# Example Usage:
num_users = 5
num_movies = 50

# Get recommendations for 5 users
user_ids = [1, 2, 3, 4, 5]  # Replace with actual user IDs
recommendations = get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df)

# Get top movies matrix and recommended movies
top_movies_matrix, recommended_movies = get_top_movies_for_users_with_recommendations(recommendations, movies_df, num_users, num_movies)

# Display recommended movies for each user
for user_id, movies in zip(user_ids, recommended_movies):
    print(f"Recommended movies for User {user_id}:")
    for i, movie in enumerate(movies, 1):
        print(f"{i}. {movie}")
    print()

# Calculate cosine similarity matrix
cosine_similarity_matrix = calculate_cosine_similarity_matrix(top_movies_matrix)

# Find farthest vectors
farthest_pairs = find_farthest_vectors(cosine_similarity_matrix)

# Display farthest pairs
for idx, (vector1, vector2, distance) in enumerate(farthest_pairs, 1):
    print(f"Farthest Pair {idx}:")
    print(f"Vector {vector1+1} and Vector {vector2+1} (Distance: {distance:.2f})")


Recommended movies for User 1:
1. Streetcar Named Desire, A (1951)
2. Shawshank Redemption, The (1994)
3. Hoop Dreams (1994)
4. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)
5. Guess Who's Coming to Dinner (1967)
6. Frances (1982)
7. Texas Chainsaw Massacre 2, The (1986)
8. Mortuary (1983)
9. Imperium (2016)
10. Conan the Barbarian (2011)

Recommended movies for User 2:
1. Schindler's List (1993)
2. Streetcar Named Desire, A (1951)
3. Forrest Gump (1994)
4. Pulp Fiction (1994)
5. Hoop Dreams (1994)
6. Reservoir Dogs (1992)
7. Gigantic (A Tale of Two Johns) (2002)
8. Wild Strawberries (Smultronstället) (1957)
9. 11'09"01 - September 11 (2002)
10. Cabin Boy (1994)

Recommended movies for User 3:
1. Shawshank Redemption, The (1994)
2. Streetcar Named Desire, A (1951)
3. Fight Club (1999)
4. Pulp Fiction (1994)
5. Forrest Gump (1994)
6. Happy Gilmore (1996)
7. Man with the Golden Gun, The (1974)
8. National Velvet (1944)
9. Memories of Murder (Salinui chueok) 

  cosine_similarity_matrix = np.dot(matrix, matrix.T) / (np.linalg.norm(matrix, axis=1)[:, np.newaxis] * np.linalg.norm(matrix, axis=1))


    Generate Recommendations: We use the trained model to generate recommendations for each user. These recommendations are typically sorted by predicted ratings in descending order.

    Extract Top Movies: From the generated recommendations, we extract the top 50 movies for each user. These are the movies that are most highly rated by the model for that user.

    Prepare Binary Matrix: We prepare a binary matrix where each row represents a user's recommended movies. Each column corresponds to a movie, and a value of 1 indicates that the movie is recommended for that user, while 0 indicates it is not.

In [11]:
def write_recommendations_to_file(file_path, user_ids, recommended_movies):
    with open(file_path, 'w') as f:
        for user_id, movies in zip(user_ids, recommended_movies):
            f.write(f"Recommended movies for User {user_id}:\n")
            for i, movie in enumerate(movies, 1):
                f.write(f"{i}. {movie}\n")
            f.write("\n")

# Example Usage:
num_users = 5
num_movies = 50
file_path = "recommended_movies.txt"  # Path to the output text file

# Get recommendations for 5 users
user_ids = [1, 2, 3, 4, 5]  # Replace with actual user IDs
recommendations = get_top10_for_users_with_diversity(model, user_ids, dataset, movies_df)

# Get top movies matrix and recommended movies
top_movies_matrix, recommended_movies = get_top_movies_for_users_with_recommendations(recommendations, movies_df, num_users, num_movies)

# Write recommended movies to file
write_recommendations_to_file(file_path, user_ids, recommended_movies)

print(f"Recommended movies have been written to {file_path}.")


Recommended movies have been written to recommended_movies.txt.


To calculate the distance between vectors in the cosine similarity matrix, we can use the cosine similarity itself as a measure of similarity. Cosine similarity measures the cosine of the angle between two vectors and ranges from -1 to 1.

The cosine similarity similarity(A,B)similarity(A,B) between two vectors AA and BB is defined as:
similarity(A,B)=A⋅B∥A∥∥B∥
similarity(A,B)=∥A∥∥B∥A⋅B​

Where A⋅BA⋅B is the dot product of vectors AA and BB, and ∥A∥∥A∥ and ∥B∥∥B∥ are the magnitudes of vectors AA and BB respectively.

To convert the cosine similarity to a distance measure, we subtract it from 1. This is because cosine similarity of 1 means the vectors are perfectly aligned (have no angle between them), and cosine similarity of -1 means they are perfectly anti-aligned. So, subtracting from 1 gives us a value where higher values mean vectors are more similar (closer) and lower values mean they are more dissimilar (farther).

Therefore, the distance distance(A,B)distance(A,B) between two vectors AA and BB is defined as:
distance(A,B)=1−similarity(A,B)
distance(A,B)=1−similarity(A,B)

We can use this formula to calculate the distance between vectors in the cosine similarity matrix. The greater the distance, the more dissimilar the vectors are, and vice versa.