# 1. Data Preparation

This section prepares the raw `movies`, `ratings`, and `tags` datasets:
1. Extracts necessary columns (e.g., `movieId`, `title`, `genres`).
2. Cleans titles by removing years and converts them to lowercase.
3. Groups and merges tags with movies.
4. Combines genres, tags, and cleaned titles into a single feature (`related`) for vectorization.

## Load and Clean Data

In [75]:
import pandas as pd
import numpy as np
import re

# Sklearn libraries for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score

# Surprise library for collaborative filtering
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold as SurpriseKFold
from surprise.accuracy import rmse, mae







In [3]:
# Load datasets
movies = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\movies.csv')
ratings = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\ratings.csv')
tags = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\tags.csv')

# Keep necessary columns
movies = movies[['movieId', 'title', 'genres']]
ratings = ratings[['userId', 'movieId', 'rating']]
tags = tags[['movieId', 'tag']]


## Process Movie Data

Extract year, clean titles, and combine genres and tags.

In [4]:
# Function to extract year from title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))
    else:
        return np.nan

# Apply the function to create 'year' column
movies['year'] = movies['title'].apply(extract_year)

# Clean the 'title' by removing the year and converting to lowercase
movies['title_clean'] = movies['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)', '', x).lower())

# Group tags by 'movieId' and concatenate them into a single string
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge tags with movies
movies = pd.merge(movies, tags_grouped, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres, title_clean, tags, and year into the 'related' column
movies['year_str'] = movies['year'].astype(str)
movies['related'] = movies['genres'].str.replace('|', ' ') + ' ' + movies['title_clean'] + ' ' + movies['tag'] + ' ' + movies['year_str']

# Preprocess the 'related' column
movies['related'] = movies['related'].str.lower()
movies['related'] = movies['related'].str.replace(r'\d+', '', regex=True)
movies['related'] = movies['related'].str.replace(r'[^a-z\s]', '', regex=True)
movies['related'] = movies['related'].str.strip()




In [5]:
movies.head()
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# 2. Content Based

1. Splits `ratings` data into training, validation, and testing sets.
   - **Validation Set:** 10% of data.
   - **Testing Set:** 20% of the remaining data.
2. Uses `TfidfVectorizer` to create a matrix of movie-related text features.
3. Computes pairwise cosine similarity to identify movies similar to a given movie.
4. Implements a helper function (`get_similar_movies`) to fetch the top N most similar movies for a given `movieId`.

In [6]:
remaining, validation = train_test_split(ratings, test_size=0.1, random_state=42, stratify=ratings['userId'])
training, testing = train_test_split(remaining, test_size=0.2, random_state=42)


In [7]:

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['related'])


In [8]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [9]:
def get_similar_movies(movie_id, top_n=10):
    idx = movies.index[movies['movieId'] == movie_id][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies['movieId'].iloc[movie_indices]


## Generating Recommendations
1. For each user in the testing set:
   - Finds movies the user has rated in the training set.
   - Recommends the top 10 most frequent movies similar to those rated by the user.
2. Ensures recommended movies are not already rated by the user.


In [10]:
user_recommendations = {}

for user_id in testing['userId'].unique():
    user_movies = training[training['userId'] == user_id]['movieId']
    rec_movies_list = []  # Use a list to collect similar movies

    for movie_id in user_movies:
        similar_movies = get_similar_movies(movie_id)
        rec_movies_list.extend(similar_movies)  # Append similar movies to the list

    # Convert the list to a pandas Series
    rec_movies = pd.Series(rec_movies_list)

    # Remove movies the user has already rated
    rec_movies = rec_movies[~rec_movies.isin(user_movies)]

    # Count occurrences of recommendations and get the top 10
    user_recommendations[user_id] = rec_movies.value_counts().index[:10]




In [11]:
# Define a rating threshold to consider a movie as "liked"
rating_threshold = 4.0

# Create binary relevance for test data
testing['relevant'] = testing['rating'] >= rating_threshold


## Evaluation
1. Predicts ratings for the testing set using average ratings of similar movies.
2. Computes RMSE and MAE to evaluate the accuracy of predictions.


In [12]:
# Example of estimating ratings
def estimate_rating(user_id, movie_id):
    similar_movies = get_similar_movies(movie_id)
    similar_ratings = training[(training['userId'] == user_id) & (training['movieId'].isin(similar_movies))]['rating']
    if not similar_ratings.empty:
        return similar_ratings.mean()
    else:
        return training[training['userId'] == user_id]['rating'].mean()

testing['predicted_rating'] = testing.apply(lambda x: estimate_rating(x['userId'], x['movieId']), axis=1)


In [13]:


rmse = root_mean_squared_error(testing['rating'], testing['predicted_rating'])
mae = mean_absolute_error(testing['rating'], testing['predicted_rating'])
print(rmse, mae)

1.001351598477931 0.7508850234582828


In [14]:
# Generate binary predictions based on whether the movie was recommended
testing['predicted_relevant'] = testing.apply(
    lambda x: x['movieId'] in user_recommendations.get(x['userId'], []), axis=1)


In [15]:


precision = precision_score(testing['relevant'], testing['predicted_relevant'])
recall = recall_score(testing['relevant'], testing['predicted_relevant'])
f1 = f1_score(testing['relevant'], testing['predicted_relevant'])

print(precision, recall, f1)


0.6867469879518072 0.006502395619438741 0.01288281161713188


In [16]:
# Define a function to calculate metrics for each user
def calculate_user_metrics(group):
    try:
        precision = precision_score(group['relevant'], group['predicted_relevant'], zero_division=0)
        recall = recall_score(group['relevant'], group['predicted_relevant'], zero_division=0)
        f1 = f1_score(group['relevant'], group['predicted_relevant'], zero_division=0)
    except ValueError:  # Handle cases where there are no positive samples
        precision, recall, f1 = 0.0, 0.0, 0.0
    
    rmse = root_mean_squared_error(group['rating'], group['predicted_rating'])
    mae = mean_absolute_error(group['rating'], group['predicted_rating'])
    
    return pd.Series({'precision': precision, 'recall': recall, 'f1': f1, 'rmse': rmse, 'mae': mae})

# Group by userId and compute metrics, excluding the grouping column explicitly
user_metrics = (
    testing.groupby('userId')  # Ensures only grouped rows are passed
    .apply(calculate_user_metrics)
)

# Compute average metrics
avg_precision = user_metrics['precision'].mean()
avg_recall = user_metrics['recall'].mean()
avg_f1 = user_metrics['f1'].mean()
avg_rmse = user_metrics['rmse'].mean()
avg_mae = user_metrics['mae'].mean()

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average RMSE: {avg_rmse:.4f}")
print(f"Average MAE: {avg_mae:.4f}")


# Collaborative filtering model metrics
content_results = pd.DataFrame({
    'Model': ['Collaborative Filtering'],
    'RMSE': [avg_rmse],
    'MAE': [avg_mae],
    'Precision': [avg_precision],
    'Recall': [avg_recall],
    'F1-Score': [avg_f1]
})

Average Precision: 0.0743
Average Recall: 0.0141
Average F1 Score: 0.0212
Average RMSE: 0.9231
Average MAE: 0.7400


  .apply(calculate_user_metrics)


In [17]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


In [18]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


In [19]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


## 5-Fold Cross-Validation
1. Implements 5-fold cross-validation to train and evaluate the content-based filtering model.
2. Splits the training data into 5 subsets (folds).
3. In each fold:
   - 4 folds are used for training (`cv_train_data`).
   - 1 fold is used for validation (`cv_val_data`).


In [20]:
# Define the number of splits for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store cross-validation metrics
rmse_list = []
mae_list = []
precision_list = []
recall_list = []
f1_list = []

# Prepare the 'movies' DataFrame to be used in each fold
movies_copy = movies.copy()

In [21]:
# Start the cross-validation loop
for fold, (train_index, val_index) in enumerate(kf.split(training)):
    print(f"\nProcessing Fold {fold + 1}/{n_splits}...")
    
    # Split the data into training and validation folds
    cv_train_data = training.iloc[train_index]
    cv_val_data = training.iloc[val_index]
    
    # Get the list of movies in cv_train_data and reset the index
    cv_train_movies = movies[movies['movieId'].isin(cv_train_data['movieId'].unique())].reset_index(drop=True)
    
    # Fit the TF-IDF vectorizer on cv_train_movies
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(cv_train_movies['related'])
    
    # Compute cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Create a mapping from movieId to index in cv_train_movies
    movie_id_to_idx = pd.Series(cv_train_movies.index, index=cv_train_movies['movieId']).to_dict()
    
    # Define a function to get similar movies within the current fold
    def get_similar_movies_cv(movie_id, top_n=10):
        if movie_id not in movie_id_to_idx:
            return []
        idx = movie_id_to_idx[movie_id]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n+1]  # Exclude the movie itself
        movie_indices = [i[0] for i in sim_scores]
        return cv_train_movies['movieId'].iloc[movie_indices].tolist()
    
    # Prepare cv_val_data for predictions
    cv_val_data = cv_val_data.copy()
    cv_val_data['predicted_rating'] = np.nan
    cv_val_data['predicted_relevant'] = False
    
    # Define rating threshold
    rating_threshold = 4.0
    cv_val_data['relevant'] = cv_val_data['rating'] >= rating_threshold
    
    # Generate recommendations and estimate ratings for cv_val_data
    user_recommendations_cv = {}
    
    for user_id in cv_val_data['userId'].unique():
        user_movies = cv_train_data[cv_train_data['userId'] == user_id]['movieId']
        rec_movies_list = []
        
        for movie_id in user_movies:
            similar_movies = get_similar_movies_cv(movie_id)
            rec_movies_list.extend(similar_movies)
        
        rec_movies = pd.Series(rec_movies_list)
        # Remove movies the user has already rated
        rec_movies = rec_movies[~rec_movies.isin(user_movies)]
        # Get top 10 recommendations
        user_recommendations_cv[user_id] = rec_movies.value_counts().index[:10]
    
    # Estimate ratings and predicted relevance for cv_val_data
    for idx, row in cv_val_data.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        
        # Estimate rating
        similar_movies = get_similar_movies_cv(movie_id)
        similar_ratings = cv_train_data[(cv_train_data['userId'] == user_id) &
                                        (cv_train_data['movieId'].isin(similar_movies))]['rating']
        if not similar_ratings.empty:
            predicted_rating = similar_ratings.mean()
        else:
            user_ratings = cv_train_data[cv_train_data['userId'] == user_id]['rating']
            if not user_ratings.empty:
                predicted_rating = user_ratings.mean()
            else:
                predicted_rating = cv_train_data['rating'].mean()
        cv_val_data.at[idx, 'predicted_rating'] = predicted_rating
        
        # Predicted relevance
        cv_val_data.at[idx, 'predicted_relevant'] = movie_id in user_recommendations_cv.get(user_id, [])
    
    # Compute metrics for the current fold
    rmse_fold = np.sqrt(root_mean_squared_error(cv_val_data['rating'], cv_val_data['predicted_rating']))
    mae_fold = mean_absolute_error(cv_val_data['rating'], cv_val_data['predicted_rating'])
    precision_fold = precision_score(cv_val_data['relevant'], cv_val_data['predicted_relevant'], zero_division=0)
    recall_fold = recall_score(cv_val_data['relevant'], cv_val_data['predicted_relevant'], zero_division=0)
    f1_fold = f1_score(cv_val_data['relevant'], cv_val_data['predicted_relevant'], zero_division=0)
    
    # Append metrics to the lists
    rmse_list.append(rmse_fold)
    mae_list.append(mae_fold)
    precision_list.append(precision_fold)
    recall_list.append(recall_fold)
    f1_list.append(f1_fold)
    
    # Print metrics for the current fold
    print(f"Fold {fold + 1} Metrics:")
    print(f"RMSE: {rmse_fold:.4f}, MAE: {mae_fold:.4f}, Precision: {precision_fold:.4f}, Recall: {recall_fold:.4f}, F1-Score: {f1_fold:.4f}")



Processing Fold 1/5...
Fold 1 Metrics:
RMSE: 1.0078, MAE: 0.7652, Precision: 0.5088, Recall: 0.0041, F1-Score: 0.0081

Processing Fold 2/5...
Fold 2 Metrics:
RMSE: 1.0033, MAE: 0.7605, Precision: 0.6203, Recall: 0.0070, F1-Score: 0.0139

Processing Fold 3/5...
Fold 3 Metrics:
RMSE: 0.9980, MAE: 0.7516, Precision: 0.5806, Recall: 0.0052, F1-Score: 0.0103

Processing Fold 4/5...
Fold 4 Metrics:
RMSE: 1.0060, MAE: 0.7605, Precision: 0.6857, Recall: 0.0069, F1-Score: 0.0136

Processing Fold 5/5...
Fold 5 Metrics:
RMSE: 1.0016, MAE: 0.7548, Precision: 0.7308, Recall: 0.0081, F1-Score: 0.0160


In [126]:
# Create a DataFrame with cross-validation metrics
cv_results = pd.DataFrame({
    'Fold': range(1, n_splits +1),
    'RMSE': rmse_list,
    'MAE': mae_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1-Score': f1_list
})

print("\nCross-Validation Metrics:")
print(cv_results)

# Compute average metrics
# For Content-Based Model
avg_metrics = cv_results.mean(numeric_only=True).to_frame().T
avg_metrics['Model'] = 'Content-Based'

print("\nAverage Cross-Validation Metrics:")
print(avg_metrics)



Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score
0     1  0.866018  0.661682   0.824173  0.365184  0.506114
1     2  0.851497  0.652010   0.815249  0.351686  0.491392
2     3  0.847085  0.649629   0.828666  0.361017  0.502928
3     4  0.853872  0.653935   0.826149  0.358663  0.500179
4     5  0.865285  0.662636   0.822017  0.349053  0.490026

Average Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score          Model
0   3.0  0.856751  0.655978   0.823251  0.357121  0.498128  Content-Based


# 3. Collaborative

In [92]:
from surprise.model_selection import KFold


# Define reader and load data
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


## Collaborative Filtering with Grid Search
1. Performs grid search to find optimal hyperparameters for the SVD algorithm:
   - Number of latent factors (`n_factors`).
   - Number of epochs (`n_epochs`).
   - Learning rate (`lr_all`).
   - Regularization (`reg_all`).
2. Uses 5-fold cross-validation to evaluate models.
3. Selects the model with the best RMSE.

In [93]:


# Define the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.05]
}

# Perform grid search with cross-validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

# Fit the model to the dataset
gs.fit(data)


In [94]:
# Best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# Best hyperparameters
print("Best hyperparameters:")
print(gs.best_params['rmse'])


Best RMSE score: 0.8572431459131458
Best hyperparameters:
{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.05}


In [95]:
# Build the training set
trainset = data.build_full_trainset()

# Use the best model
best_svd = gs.best_estimator['rmse']

# Train the model on the full training set
best_svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x226000bfbd0>

In [96]:
# Build the testset from the test data
testing_data = Dataset.load_from_df(testing[['userId', 'movieId', 'rating']], reader)
testingset = testing_data.build_full_trainset().build_testset()

# Predict ratings
predictions = best_svd.test(testingset)


In [97]:
# Extract true and predicted ratings
y_true = np.array([pred.r_ui for pred in predictions])
y_pred = np.array([pred.est for pred in predictions])


In [98]:
# Calculate RMSE and MAE
rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)


In [99]:
# Define rating threshold
rating_threshold = 4.0

# Convert ratings to binary relevance
y_true_binary = (y_true >= rating_threshold).astype(int)
y_pred_binary = (y_pred >= rating_threshold).astype(int)


In [100]:

precision = precision_score(y_true_binary, y_pred_binary, zero_division=0)
recall = recall_score(y_true_binary, y_pred_binary, zero_division=0)
f1 = f1_score(y_true_binary, y_pred_binary, zero_division=0)


In [101]:
# Create a DataFrame to display the results
results = pd.DataFrame({
    'RMSE': [rmse],
    'MAE': [mae],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

print("\nTesting Metrics:")
print(results)



Testing Metrics:
       RMSE       MAE  Precision    Recall  F1-Score
0  0.447033  0.345019   0.967654  0.569929  0.717352


## Testing and Validation
1. Uses the best SVD model to predict ratings for the test set.
2. Computes RMSE, MAE, Precision, Recall, and F1-Score to evaluate performance.


In [130]:
from surprise.model_selection import KFold
from surprise.accuracy import rmse, mae
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Define lists to store cross-validation metrics
rmse_list = []
mae_list = []
precision_list = []
recall_list = []
f1_list = []

# Define rating threshold
rating_threshold = 4.0

# Initialize Surprise KFold
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Perform 5-fold cross-validation
for trainset, testset in kf.split(data):
    # Train the algorithm on the training set
    best_svd.fit(trainset)
    
    # Test the algorithm on the testing set
    predictions = best_svd.test(testset)
    
    # Extract true and predicted ratings
    y_true = np.array([pred.r_ui for pred in predictions])
    y_pred = np.array([pred.est for pred in predictions])
    
    # Compute RMSE and MAE
    rmse_cv = rmse(predictions, verbose=False)
    mae_cv = mae(predictions, verbose=False)
    
    # Convert to binary relevance
    y_true_binary = (y_true >= rating_threshold).astype(int)
    y_pred_binary = (y_pred >= rating_threshold).astype(int)
    
    # Compute Precision, Recall, and F1-Score
    precision_cv = precision_score(y_true_binary, y_pred_binary, zero_division=0)
    recall_cv = recall_score(y_true_binary, y_pred_binary, zero_division=0)
    f1_cv = f1_score(y_true_binary, y_pred_binary, zero_division=0)
    
    # Append metrics to lists
    rmse_list.append(rmse_cv)
    mae_list.append(mae_cv)
    precision_list.append(precision_cv)
    recall_list.append(recall_cv)
    f1_list.append(f1_cv)

# Create a DataFrame with cross-validation metrics
cv_results = pd.DataFrame({
    'Fold': range(1, 6),
    'RMSE': rmse_list,
    'MAE': mae_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1-Score': f1_list
})

# Print Cross-Validation Metrics
print("\nCross-Validation Metrics:")
print(cv_results)

# For Collaborative Filtering Model
avg_metrics_collaborative = cv_results.mean(numeric_only=True).to_frame().T
avg_metrics_collaborative['Model'] = 'Collaborative'

print("\nAverage Cross-Validation Metrics:")
print(avg_metrics_collaborative)



Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score
0     1  0.880222  0.678225   0.825354  0.346546  0.488136
1     2  0.873649  0.671061   0.818644  0.346237  0.486650
2     3  0.871624  0.670733   0.802984  0.342098  0.479789
3     4  0.873311  0.671021   0.807221  0.338668  0.477149
4     5  0.876259  0.670357   0.811761  0.341873  0.481122

Average Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score          Model
0   3.0  0.875013  0.672279   0.813193  0.343084  0.482569  Collaborative


In [104]:
# Build the validationset from the validation data
validation_data = Dataset.load_from_df(validation[['userId', 'movieId', 'rating']], reader)
validationset = validation_data.build_full_trainset().build_testset()


# Predict ratings
predictions_test = best_svd.test(validationset)

# Extract true and predicted ratings
y_true_test = np.array([pred.r_ui for pred in predictions_test])
y_pred_test = np.array([pred.est for pred in predictions_test])

# Compute RMSE and MAE
rmse_test = root_mean_squared_error(y_true_test, y_pred_test)
mae_test = mean_absolute_error(y_true_test, y_pred_test)

# Convert to binary relevance
y_true_test_binary = (y_true_test >= rating_threshold).astype(int)
y_pred_test_binary = (y_pred_test >= rating_threshold).astype(int)

# Compute Precision, Recall, and F1-Score
precision_test = precision_score(y_true_test_binary, y_pred_test_binary, zero_division=0)
recall_test = recall_score(y_true_test_binary, y_pred_test_binary, zero_division=0)
f1_test = f1_score(y_true_test_binary, y_pred_test_binary, zero_division=0)

# Create a DataFrame to display the results
test_results = pd.DataFrame({
    'RMSE': [rmse_test],
    'MAE': [mae_test],
    'Precision': [precision_test],
    'Recall': [recall_test],
    'F1-Score': [f1_test]
})

print("\nValidation Metrics:")
print(test_results)



Validation Metrics:
       RMSE      MAE  Precision    Recall  F1-Score
0  0.523762  0.38002   0.956103  0.514185  0.668731


In [105]:
# Collaborative filtering model metrics
collaborative_results = pd.DataFrame({
    'Model': ['Collaborative Filtering'],
    'RMSE': [rmse],
    'MAE': [mae],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

# 4. Hybrid Model

In [106]:
# Define weights for each model
weight_content = 0.2
weight_collaborative = 1- weight_content


In [107]:
# Ensure that 'testing' DataFrame contains both content-based and collaborative predictions

# From content-based filtering
testing_content = testing.copy()
testing_content = testing_content[['userId', 'movieId', 'predicted_rating', 'relevant', 'predicted_relevant', 'rating']]

# From collaborative filtering
# We have 'predictions' list from collaborative filtering (Surprise library)
# Convert the predictions to a DataFrame
collab_preds = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions],
                            columns=['userId', 'movieId', 'predicted_rating_collab'])

# Merge the content-based and collaborative predictions on userId and movieId
hybrid_data = pd.merge(testing_content, collab_preds, on=['userId', 'movieId'], how='inner')

# Now, compute the hybrid predicted rating
hybrid_data['predicted_rating_hybrid'] = (weight_content * hybrid_data['predicted_rating'] +
                                          weight_collaborative * hybrid_data['predicted_rating_collab'])


In [108]:
testing.head()

Unnamed: 0,userId,movieId,rating,relevant,predicted_rating,predicted_relevant
82248,522,1265,4.5,True,3.734266,False
4631,28,47629,2.0,False,3.01699,False
84347,541,224,4.0,True,3.360656,False
95803,600,30749,2.0,False,3.018987,False
17706,111,94478,4.0,True,4.0,False


In [109]:
# True ratings
y_true_hybrid = hybrid_data['rating']

# Hybrid predicted ratings
y_pred_hybrid = hybrid_data['predicted_rating']

# Compute RMSE and MAE
rmse_hybrid = root_mean_squared_error(y_true_hybrid, y_pred_hybrid)
mae_hybrid = mean_absolute_error(y_true_hybrid, y_pred_hybrid)

print(rmse_hybrid, mae_hybrid)


1.0229241405079337 0.76762793550662


In [110]:
# Define rating threshold
rating_threshold = 4.0

# Convert true ratings to binary relevance
hybrid_data['relevant'] = hybrid_data['rating'] >= rating_threshold

# Convert hybrid predicted ratings to binary relevance
hybrid_data['predicted_relevant_hybrid'] = hybrid_data['predicted_rating_hybrid'] >= rating_threshold

# Compute Precision, Recall, and F1-Score
precision_hybrid = precision_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)
recall_hybrid = recall_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)
f1_hybrid = f1_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)


print(precision_hybrid, recall_hybrid, f1_hybrid)

0.8506787330316742 0.31438127090301005 0.4590964590964591


In [111]:
# Create a DataFrame to display the results
hybrid_results = pd.DataFrame({
    'RMSE': [rmse_hybrid],
    'MAE': [mae_hybrid],
    'Precision': [precision_hybrid],
    'Recall': [recall_hybrid],
    'F1-Score': [f1_hybrid]
})

print("\nHybrid Model Metrics:")
print(hybrid_results)



Hybrid Model Metrics:
       RMSE       MAE  Precision    Recall  F1-Score
0  1.022924  0.767628   0.850679  0.314381  0.459096


In [112]:


# Hybrid model metrics
hybrid_results['Model'] = 'Hybrid'

# Combine all results
all_results = pd.concat([content_results, collaborative_results, hybrid_results], ignore_index=True)

# Rearrange columns
all_results = all_results[['Model', 'RMSE', 'MAE', 'Precision', 'Recall', 'F1-Score']]

print("\nComparison of Models:")
print(all_results)



Comparison of Models:
                     Model                                   RMSE  \
0  Collaborative Filtering                                 0.9231   
1  Collaborative Filtering  <function rmse at 0x00000225E6BE7380>   
2                   Hybrid                               1.022924   

                                    MAE  Precision    Recall  F1-Score  
0                              0.739989   0.074317  0.014127  0.021159  
1  <function mae at 0x00000225E6BE74C0>   0.967654  0.569929  0.717352  
2                              0.767628   0.850679  0.314381  0.459096  


In [None]:
from surprise.model_selection import KFold
from surprise import Dataset, Reader, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, precision_score, recall_score, f1_score
from surprise.accuracy import rmse
import pandas as pd
import numpy as np

# Initialize lists to store cross-validation metrics
rmse_list_hybrid = []
mae_list_hybrid = []
precision_list_hybrid = []
recall_list_hybrid = []
f1_list_hybrid = []

# Define weights for the hybrid model (adjust as needed)
weight_content = 0.5
weight_collaborative = 1- weight_content

# Reader object for Surprise
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Use Surprise's KFold for cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Convert training data to Surprise Dataset
data = Dataset.load_from_df(training[['userId', 'movieId', 'rating']], reader)

for fold, (trainset, testset) in enumerate(kf.split(data)):
    print(f"\nProcessing Fold {fold + 1}/5 for Hybrid Model...")
    
    # ----------------------
    # Collaborative Filtering Model
    # ----------------------
    
    # Train the collaborative filtering model
    algo_collab = SVD(n_factors=150, n_epochs=30, lr_all=0.01, reg_all=0.05)
    algo_collab.fit(trainset)
    
    # Test collaborative filtering model
    predictions_collab = algo_collab.test(testset)
    
    # Extract predictions and true ratings for collaborative filtering
    collab_preds = pd.DataFrame([(pred.uid, pred.iid, pred.est, pred.r_ui) for pred in predictions_collab],
                                columns=['userId', 'movieId', 'predicted_rating_collab', 'rating'])
    collab_preds['userId'] = collab_preds['userId'].astype(int)
    collab_preds['movieId'] = collab_preds['movieId'].astype(int)
    
    # ----------------------
    # Content-Based Model
    # ----------------------
    
    # Filter movies in the current fold
    fold_movies = movies[movies['movieId'].isin(collab_preds['movieId'].unique())].reset_index(drop=True)
    
    # Fit the TF-IDF vectorizer
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(fold_movies['related'])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Map movieId to indices in the current fold
    movie_id_to_idx = pd.Series(fold_movies.index, index=fold_movies['movieId']).to_dict()
    
    # Define function to estimate content-based ratings
    def estimate_content_rating(user_id, movie_id):
        if movie_id not in movie_id_to_idx:
            return np.nan
        idx = movie_id_to_idx[movie_id]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
        similar_movies = [fold_movies.iloc[i[0]]['movieId'] for i in sim_scores]
        
        # Get user ratings for similar movies
        user_ratings = collab_preds[(collab_preds['userId'] == user_id) &
                                    (collab_preds['movieId'].isin(similar_movies))]['rating']
        return user_ratings.mean() if not user_ratings.empty else collab_preds[collab_preds['userId'] == user_id]['rating'].mean()

    # Estimate content-based ratings
    collab_preds['predicted_rating_content'] = collab_preds.apply(
        lambda row: estimate_content_rating(row['userId'], row['movieId']), axis=1)
    
    # ----------------------
    # Combine Predictions for Hybrid Model
    # ----------------------
    
    # Compute hybrid predictions
    collab_preds['predicted_rating_hybrid'] = (
        weight_content * collab_preds['predicted_rating_content'] +
        weight_collaborative * collab_preds['predicted_rating_collab']
    )
    
    # Define rating threshold for relevance
    collab_preds['relevant'] = collab_preds['rating'] >= 4.0
    collab_preds['predicted_relevant_hybrid'] = collab_preds['predicted_rating_hybrid'] >= 4.0
    
    # Compute metrics for the current fold
    rmse_fold = rmse(predictions_collab, verbose=False)
    mae_fold = mean_absolute_error(collab_preds['rating'], collab_preds['predicted_rating_hybrid'])
    precision_fold = precision_score(collab_preds['relevant'], collab_preds['predicted_relevant_hybrid'], zero_division=0)
    recall_fold = recall_score(collab_preds['relevant'], collab_preds['predicted_relevant_hybrid'], zero_division=0)
    f1_fold = f1_score(collab_preds['relevant'], collab_preds['predicted_relevant_hybrid'], zero_division=0)
    
    # Append metrics to lists
    rmse_list_hybrid.append(rmse_fold)
    mae_list_hybrid.append(mae_fold)
    precision_list_hybrid.append(precision_fold)
    recall_list_hybrid.append(recall_fold)
    f1_list_hybrid.append(f1_fold)
    
    # Print metrics for the current fold
    print(f"Fold {fold + 1} Metrics:")
    print(f"RMSE: {rmse_fold:.4f}, MAE: {mae_fold:.4f}, Precision: {precision_fold:.4f}, "
          f"Recall: {recall_fold:.4f}, F1-Score: {f1_fold:.4f}")

# Calculate average metrics
print("\nAverage Metrics Across Folds:")
print(f"RMSE: {np.mean(rmse_list_hybrid):.4f}, MAE: {np.mean(mae_list_hybrid):.4f}, "
      f"Precision: {np.mean(precision_list_hybrid):.4f}, Recall: {np.mean(recall_list_hybrid):.4f}, "
      f"F1-Score: {np.mean(f1_list_hybrid):.4f}")



Processing Fold 1/5 for Hybrid Model...
Fold 1 Metrics:
RMSE: 0.8802, MAE: 0.6789, Precision: 0.8374, Recall: 0.3106, F1-Score: 0.4531

Processing Fold 2/5 for Hybrid Model...
Fold 2 Metrics:
RMSE: 0.8760, MAE: 0.6769, Precision: 0.8352, Recall: 0.3038, F1-Score: 0.4455

Processing Fold 3/5 for Hybrid Model...
Fold 3 Metrics:
RMSE: 0.8720, MAE: 0.6696, Precision: 0.8424, Recall: 0.3074, F1-Score: 0.4505

Processing Fold 4/5 for Hybrid Model...
Fold 4 Metrics:
RMSE: 0.8749, MAE: 0.6728, Precision: 0.8392, Recall: 0.2975, F1-Score: 0.4393

Processing Fold 5/5 for Hybrid Model...
Fold 5 Metrics:
RMSE: 0.8782, MAE: 0.6700, Precision: 0.8321, Recall: 0.2935, F1-Score: 0.4339

Average Metrics Across Folds:
RMSE: 0.8763, MAE: 0.6736, Precision: 0.8373, Recall: 0.3026, F1-Score: 0.4445


In [115]:
# Convert lists to numpy arrays for convenience
rmse_array = np.array(rmse_list_hybrid)
mae_array = np.array(mae_list_hybrid)
precision_array = np.array(precision_list_hybrid)
recall_array = np.array(recall_list_hybrid)
f1_array = np.array(f1_list_hybrid)

# Compute means
mean_rmse = rmse_array.mean()
mean_mae = mae_array.mean()
mean_precision = precision_array.mean()
mean_recall = recall_array.mean()
mean_f1 = f1_array.mean()

# Compute standard deviations
std_rmse = rmse_array.std()
std_mae = mae_array.std()
std_precision = precision_array.std()
std_recall = recall_array.std()
std_f1 = f1_array.std()

# Print the results
print("\nAverage Cross-Validation Metrics for Hybrid Model:")
print(f"RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
print(f"MAE: {mean_mae:.4f} ± {std_mae:.4f}")
print(f"Precision: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall: {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1-Score: {mean_f1:.4f} ± {std_f1:.4f}")



Average Cross-Validation Metrics for Hybrid Model:
RMSE: 0.8763 ± 0.0028
MAE: 0.6736 ± 0.0037
Precision: 0.8373 ± 0.0035
Recall: 0.3026 ± 0.0063
F1-Score: 0.4445 ± 0.0071


In [128]:
# Create a DataFrame with cross-validation metrics
cv_results_hybrid = pd.DataFrame({
    'Fold': range(1, n_splits +1),
    'RMSE': rmse_list_hybrid,
    'MAE': mae_list_hybrid,
    'Precision': precision_list_hybrid,
    'Recall': recall_list_hybrid,
    'F1-Score': f1_list_hybrid
})

print("\nHybrid Model Cross-Validation Metrics:")
print(cv_results_hybrid)


# For Hybrid Model
avg_metrics_hybrid = cv_results_hybrid.mean(numeric_only=True).to_frame().T
avg_metrics_hybrid['Model'] = 'Hybrid'

print("\nAverage Cross-Validation Metrics:")
print(avg_metrics_hybrid)


Hybrid Model Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score
0     1  0.880243  0.678912   0.837405  0.310589  0.453119
1     2  0.875970  0.676916   0.835238  0.303799  0.445542
2     3  0.872038  0.669575   0.842439  0.307426  0.450466
3     4  0.874900  0.672817   0.839178  0.297514  0.439287
4     5  0.878160  0.669984   0.832123  0.293481  0.433923

Average Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score   Model
0   3.0  0.876262  0.673641   0.837276  0.302562  0.444467  Hybrid


In [129]:
print(avg_metrics.head(),"\n",avg_metrics_collaborative,"\n", avg_metrics_hybrid, "\n")

   Fold      RMSE       MAE  Precision    Recall  F1-Score          Model
0   3.0  0.856751  0.655978   0.823251  0.357121  0.498128  Content-Based 
    Fold      RMSE       MAE  Precision    Recall  F1-Score  \
0   3.0  0.874667  0.672173   0.814409  0.342893  0.482581   

                     Model  
0  Collaborative Filtering   
    Fold      RMSE       MAE  Precision    Recall  F1-Score   Model
0   3.0  0.876262  0.673641   0.837276  0.302562  0.444467  Hybrid 



In [132]:


# Combine the cross-validation results
avg_metrics_all = pd.concat([avg_metrics, avg_metrics_collaborative, avg_metrics_hybrid], ignore_index=True)
avg_metrics_all = avg_metrics_all[['Model', 'Fold', 'RMSE', 'MAE', 'Precision', 'Recall', 'F1-Score']]

print("\nCross-Validation Metrics Comparison:")
print(avg_metrics_all)



Cross-Validation Metrics Comparison:
                     Model  Fold      RMSE       MAE  Precision    Recall  \
0            Content-Based   3.0  0.856751  0.655978   0.823251  0.357121   
1  Collaborative Filtering   3.0  0.875013  0.672279   0.813193  0.343084   
2                   Hybrid   3.0  0.876262  0.673641   0.837276  0.302562   

   F1-Score  
0  0.498128  
1  0.482569  
2  0.444467  
