# 1. Data Preparation
## Load and Clean Data

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as root_mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import re
from scipy.sparse import csr_matrix

from sklearn.metrics import precision_recall_fscore_support, precision_score, recall_score, f1_score




In [14]:
# Load datasets
movies = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\movies.csv')
ratings = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\ratings.csv')
tags = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\tags.csv')

# Keep necessary columns
movies = movies[['movieId', 'title', 'genres']]
ratings = ratings[['userId', 'movieId', 'rating']]
tags = tags[['movieId', 'tag']]


## Process Movie Data

Extract year, clean titles, and combine genres and tags.

In [15]:
# Function to extract year from title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))
    else:
        return np.nan

# Apply the function to create 'year' column
movies['year'] = movies['title'].apply(extract_year)

# Clean the 'title' by removing the year and converting to lowercase
movies['title_clean'] = movies['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)', '', x).lower())

# Group tags by 'movieId' and concatenate them into a single string
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge tags with movies
movies = pd.merge(movies, tags_grouped, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres, title_clean, tags, and year into the 'related' column
movies['year_str'] = movies['year'].astype(str)
movies['related'] = movies['genres'].str.replace('|', ' ') + ' ' + movies['title_clean'] + ' ' + movies['tag'] + ' ' + movies['year_str']

# Preprocess the 'related' column
movies['related'] = movies['related'].str.lower()
movies['related'] = movies['related'].str.replace(r'\d+', '', regex=True)
movies['related'] = movies['related'].str.replace(r'[^a-z\s]', '', regex=True)
movies['related'] = movies['related'].str.strip()




In [16]:
movies.head()
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# 2. Content Based

In [17]:
remaining, validation = train_test_split(ratings, test_size=0.1, random_state=42, stratify=ratings['userId'])
training, testing = train_test_split(remaining, test_size=0.2, random_state=42)


In [18]:

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['related'])


In [19]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [20]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [21]:
for train_index, val_index in kf.split(training):
    cv_train_data = training.iloc[train_index]
    cv_val_data = training.iloc[val_index]
    # Train and evaluate your model here


In [22]:
def get_similar_movies(movie_id, top_n=10):
    idx = movies.index[movies['movieId'] == movie_id][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies['movieId'].iloc[movie_indices]


In [23]:
user_recommendations = {}

for user_id in testing['userId'].unique():
    user_movies = training[training['userId'] == user_id]['movieId']
    rec_movies_list = []  # Use a list to collect similar movies

    for movie_id in user_movies:
        similar_movies = get_similar_movies(movie_id)
        rec_movies_list.extend(similar_movies)  # Append similar movies to the list

    # Convert the list to a pandas Series
    rec_movies = pd.Series(rec_movies_list)

    # Remove movies the user has already rated
    rec_movies = rec_movies[~rec_movies.isin(user_movies)]

    # Count occurrences of recommendations and get the top 10
    user_recommendations[user_id] = rec_movies.value_counts().index[:10]




In [24]:
# Define a rating threshold to consider a movie as "liked"
rating_threshold = 4.0

# Create binary relevance for test data
testing['relevant'] = testing['rating'] >= rating_threshold


In [25]:
# Example of estimating ratings
def estimate_rating(user_id, movie_id):
    similar_movies = get_similar_movies(movie_id)
    similar_ratings = training[(training['userId'] == user_id) & (training['movieId'].isin(similar_movies))]['rating']
    if not similar_ratings.empty:
        return similar_ratings.mean()
    else:
        return training[training['userId'] == user_id]['rating'].mean()

testing['predicted_rating'] = testing.apply(lambda x: estimate_rating(x['userId'], x['movieId']), axis=1)


In [26]:


rmse = root_mean_squared_error(testing['rating'], testing['predicted_rating'])
mae = mean_absolute_error(testing['rating'], testing['predicted_rating'])
print(rmse, mae)

1.0027050237743078 0.7508850234582828


In [27]:
# Generate binary predictions based on whether the movie was recommended
testing['predicted_relevant'] = testing.apply(
    lambda x: x['movieId'] in user_recommendations.get(x['userId'], []), axis=1)


In [28]:


precision = precision_score(testing['relevant'], testing['predicted_relevant'])
recall = recall_score(testing['relevant'], testing['predicted_relevant'])
f1 = f1_score(testing['relevant'], testing['predicted_relevant'])

print(precision, recall, f1)


0.6867469879518072 0.006502395619438741 0.01288281161713188


In [29]:
# Define a function to calculate metrics for each user
def calculate_user_metrics(group):
    try:
        precision = precision_score(group['relevant'], group['predicted_relevant'], zero_division=0)
        recall = recall_score(group['relevant'], group['predicted_relevant'], zero_division=0)
        f1 = f1_score(group['relevant'], group['predicted_relevant'], zero_division=0)
    except ValueError:  # Handle cases where there are no positive samples
        precision, recall, f1 = 0.0, 0.0, 0.0
    
    rmse = root_mean_squared_error(group['rating'], group['predicted_rating'])
    mae = mean_absolute_error(group['rating'], group['predicted_rating'])
    
    return pd.Series({'precision': precision, 'recall': recall, 'f1': f1, 'rmse': rmse, 'mae': mae})

# Group by userId and compute metrics, excluding the grouping column explicitly
user_metrics = (
    testing.groupby('userId', group_keys=False)  # Ensures only grouped rows are passed
    .apply(calculate_user_metrics)
)

# Compute average metrics
avg_precision = user_metrics['precision'].mean()
avg_recall = user_metrics['recall'].mean()
avg_f1 = user_metrics['f1'].mean()
avg_rmse = user_metrics['rmse'].mean()
avg_mae = user_metrics['mae'].mean()

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average RMSE: {avg_rmse:.4f}")
print(f"Average MAE: {avg_mae:.4f}")

Average Precision: 0.0743
Average Recall: 0.0141
Average F1 Score: 0.0212
Average RMSE: 0.9865
Average MAE: 0.7400


  .apply(calculate_user_metrics)


In [30]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


In [31]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


In [32]:
# After each fold, store the metrics
precision_scores = []
recall_scores = []
f1_scores = []
rmse_scores = []
mae_scores = []

# Append metrics in each fold
precision_scores.append(precision)
# Similarly for other metrics...

# After cross-validation
print(f'Precision: {np.mean(precision_scores)} ± {np.std(precision_scores)}')
# Similarly for other metrics...


Precision: 0.6867469879518072 ± 0.0


In [33]:
# Content-based model metrics
content_results = pd.DataFrame({
    'Model': ['Content-Based'],
    'RMSE': [rmse],
    'MAE': [mae],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

# 3. Collaborative

In [34]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV, cross_validate, KFold
from surprise.accuracy import rmse, mae

In [35]:
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))


In [36]:
# Create Surprise datasets
data = Dataset.load_from_df(training[['userId', 'movieId', 'rating']], reader)


In [37]:
# Define the parameter grid to search
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.010],
    'reg_all': [0.02, 0.05]
}

# Define the cross-validation iterator
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Perform grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=kf, n_jobs=-1)

gs.fit(data)


In [38]:
# Best RMSE score
print(f"Best RMSE score: {gs.best_score['rmse']}")

# Best hyperparameters
print("Best hyperparameters:")
print(gs.best_params['rmse'])


Best RMSE score: 0.8746784691583634
Best hyperparameters:
{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.05}


In [39]:
# Build the training set
trainset = data.build_full_trainset()

# Use the best model
best_svd = gs.best_estimator['rmse']

# Train the model on the full training set
best_svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x18b048cb3d0>

In [40]:
# Build the testset from the test data
testing_data = Dataset.load_from_df(testing[['userId', 'movieId', 'rating']], reader)
testingset = testing_data.build_full_trainset().build_testset()

# Predict ratings
predictions = best_svd.test(testingset)


In [41]:
# Extract true and predicted ratings
y_true = np.array([pred.r_ui for pred in predictions])
y_pred = np.array([pred.est for pred in predictions])


In [42]:
# Calculate RMSE and MAE
rmse = np.sqrt(root_mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)


In [43]:
# Define rating threshold
rating_threshold = 4.0

# Convert ratings to binary relevance
y_true_binary = (y_true >= rating_threshold).astype(int)
y_pred_binary = (y_pred >= rating_threshold).astype(int)


In [44]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true_binary, y_pred_binary, zero_division=0)
recall = recall_score(y_true_binary, y_pred_binary, zero_division=0)
f1 = f1_score(y_true_binary, y_pred_binary, zero_division=0)


In [45]:
# Create a DataFrame to display the results
results = pd.DataFrame({
    'RMSE': [rmse],
    'MAE': [mae],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

print("\nTesting Metrics:")
print(results)



Testing Metrics:
       RMSE       MAE  Precision    Recall  F1-Score
0  0.859537  0.660689   0.820999  0.350559  0.491326


In [46]:
# Initialize lists to store cross-validation results
rmse_list = []
mae_list = []
precision_list = []
recall_list = []
f1_list = []

# Perform 5-fold cross-validation
for trainset_cv, testset_cv in kf.split(data):
    # Train the algorithm on trainset
    best_svd.fit(trainset_cv)
    
    # Test the algorithm on testset
    predictions_cv = best_svd.test(testset_cv)
    
    # Extract true and predicted ratings
    y_true_cv = np.array([pred.r_ui for pred in predictions_cv])
    y_pred_cv = np.array([pred.est for pred in predictions_cv])
    
    # Compute RMSE and MAE
    rmse_cv = np.sqrt(root_mean_squared_error(y_true_cv, y_pred_cv))
    mae_cv = mean_absolute_error(y_true_cv, y_pred_cv)
    
    # Convert to binary relevance
    y_true_cv_binary = (y_true_cv >= rating_threshold).astype(int)
    y_pred_cv_binary = (y_pred_cv >= rating_threshold).astype(int)
    
    # Compute Precision, Recall, and F1-Score
    precision_cv = precision_score(y_true_cv_binary, y_pred_cv_binary, zero_division=0)
    recall_cv = recall_score(y_true_cv_binary, y_pred_cv_binary, zero_division=0)
    f1_cv = f1_score(y_true_cv_binary, y_pred_cv_binary, zero_division=0)
    
    # Append metrics to lists
    rmse_list.append(rmse_cv)
    mae_list.append(mae_cv)
    precision_list.append(precision_cv)
    recall_list.append(recall_cv)
    f1_list.append(f1_cv)

# Create a DataFrame with cross-validation metrics
cv_results = pd.DataFrame({
    'Fold': range(1, 6),
    'RMSE': rmse_list,
    'MAE': mae_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1-Score': f1_list
})

print("\nCross-Validation Metrics:")
print(cv_results)

# Calculate average metrics
avg_metrics = cv_results.mean(numeric_only=True)

print("\nAverage Cross-Validation Metrics:")
print(avg_metrics)



Cross-Validation Metrics:
   Fold      RMSE       MAE  Precision    Recall  F1-Score
0     1  0.885114  0.682038   0.812895  0.346263  0.485655
1     2  0.874633  0.673629   0.810702  0.347527  0.486503
2     3  0.872095  0.670034   0.812078  0.347732  0.486951
3     4  0.872325  0.670060   0.803193  0.330666  0.468468
4     5  0.876103  0.671348   0.816533  0.333191  0.473264

Average Cross-Validation Metrics:
Fold         3.000000
RMSE         0.876054
MAE          0.673422
Precision    0.811080
Recall       0.341076
F1-Score     0.480168
dtype: float64


In [47]:
# Build the validationset from the validation data
validation_data = Dataset.load_from_df(validation[['userId', 'movieId', 'rating']], reader)
validationset = validation_data.build_full_trainset().build_testset()


# Predict ratings
predictions_test = best_svd.test(validationset)

# Extract true and predicted ratings
y_true_test = np.array([pred.r_ui for pred in predictions_test])
y_pred_test = np.array([pred.est for pred in predictions_test])

# Compute RMSE and MAE
rmse_test = np.sqrt(root_mean_squared_error(y_true_test, y_pred_test))
mae_test = mean_absolute_error(y_true_test, y_pred_test)

# Convert to binary relevance
y_true_test_binary = (y_true_test >= rating_threshold).astype(int)
y_pred_test_binary = (y_pred_test >= rating_threshold).astype(int)

# Compute Precision, Recall, and F1-Score
precision_test = precision_score(y_true_test_binary, y_pred_test_binary, zero_division=0)
recall_test = recall_score(y_true_test_binary, y_pred_test_binary, zero_division=0)
f1_test = f1_score(y_true_test_binary, y_pred_test_binary, zero_division=0)

# Create a DataFrame to display the results
test_results = pd.DataFrame({
    'RMSE': [rmse_test],
    'MAE': [mae_test],
    'Precision': [precision_test],
    'Recall': [recall_test],
    'F1-Score': [f1_test]
})

print("\nValidation Metrics:")
print(test_results)



Validation Metrics:
      RMSE       MAE  Precision    Recall  F1-Score
0  0.86899  0.668116   0.808086  0.326983  0.465576


In [48]:
# Collaborative filtering model metrics
collaborative_results = pd.DataFrame({
    'Model': ['Collaborative Filtering'],
    'RMSE': [rmse],
    'MAE': [mae],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

# 4. Hybrid Model

In [49]:
# Define weights for each model
weight_content = 0.2
weight_collaborative = 1- weight_content


In [50]:
# Ensure that 'testing' DataFrame contains both content-based and collaborative predictions

# From content-based filtering
testing_content = testing.copy()
testing_content = testing_content[['userId', 'movieId', 'predicted_rating', 'relevant', 'predicted_relevant', 'rating']]

# From collaborative filtering
# We have 'predictions' list from collaborative filtering (Surprise library)
# Convert the predictions to a DataFrame
collab_preds = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions],
                            columns=['userId', 'movieId', 'predicted_rating_collab'])

# Merge the content-based and collaborative predictions on userId and movieId
hybrid_data = pd.merge(testing_content, collab_preds, on=['userId', 'movieId'], how='inner')

# Now, compute the hybrid predicted rating
hybrid_data['predicted_rating_hybrid'] = (weight_content * hybrid_data['predicted_rating'] +
                                          weight_collaborative * hybrid_data['predicted_rating_collab'])


In [51]:
testing.head()

Unnamed: 0,userId,movieId,rating,relevant,predicted_rating,predicted_relevant
82248,522,1265,4.5,True,3.734266,False
4631,28,47629,2.0,False,3.01699,False
84347,541,224,4.0,True,3.360656,False
95803,600,30749,2.0,False,3.018987,False
17706,111,94478,4.0,True,4.0,False


In [52]:
# True ratings
y_true_hybrid = hybrid_data['rating']

# Hybrid predicted ratings
y_pred_hybrid = hybrid_data['predicted_rating']

# Compute RMSE and MAE
rmse_hybrid = np.sqrt(root_mean_squared_error(y_true_hybrid, y_pred_hybrid))
mae_hybrid = mean_absolute_error(y_true_hybrid, y_pred_hybrid)

print(rmse_hybrid, mae_hybrid)


1.001351598477931 0.7508850234582828


In [53]:
# Define rating threshold
rating_threshold = 4.0

# Convert true ratings to binary relevance
hybrid_data['relevant'] = hybrid_data['rating'] >= rating_threshold

# Convert hybrid predicted ratings to binary relevance
hybrid_data['predicted_relevant_hybrid'] = hybrid_data['predicted_rating_hybrid'] >= rating_threshold

# Compute Precision, Recall, and F1-Score
precision_hybrid = precision_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)
recall_hybrid = recall_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)
f1_hybrid = f1_score(hybrid_data['relevant'], hybrid_data['predicted_relevant_hybrid'], zero_division=0)


print(precision_hybrid, recall_hybrid, f1_hybrid)

0.8363376251788269 0.33344741044946385 0.47679634613816163


In [54]:
# Create a DataFrame to display the results
hybrid_results = pd.DataFrame({
    'RMSE': [rmse_hybrid],
    'MAE': [mae_hybrid],
    'Precision': [precision_hybrid],
    'Recall': [recall_hybrid],
    'F1-Score': [f1_hybrid]
})

print("\nHybrid Model Metrics:")
print(hybrid_results)



Hybrid Model Metrics:
       RMSE       MAE  Precision    Recall  F1-Score
0  1.001352  0.750885   0.836338  0.333447  0.476796


In [55]:


# Hybrid model metrics
hybrid_results['Model'] = 'Hybrid'

# Combine all results
all_results = pd.concat([content_results, collaborative_results, hybrid_results], ignore_index=True)

# Rearrange columns
all_results = all_results[['Model', 'RMSE', 'MAE', 'Precision', 'Recall', 'F1-Score']]

print("\nComparison of Models:")
print(all_results)



Comparison of Models:
                     Model      RMSE       MAE  Precision    Recall  F1-Score
0            Content-Based  1.002705  0.750885   0.686747  0.006502  0.012883
1  Collaborative Filtering  0.859537  0.660689   0.820999  0.350559  0.491326
2                   Hybrid  1.001352  0.750885   0.836338  0.333447  0.476796
