# 1. Data Preparation
## Load and Clean Data

In [27]:
import pandas as pd

# Load datasets
movies = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\movies.csv')
ratings = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\ratings.csv')
tags = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS340-Midterm\Small MovieLens\tags.csv')

# Keep necessary columns
movies = movies[['movieId', 'title', 'genres']]
ratings = ratings[['userId', 'movieId', 'rating']]
tags = tags[['movieId', 'tag']]


## Process Movie Data

Extract year, clean titles, and combine genres and tags.

In [28]:
import numpy as np
import re

# Function to extract year from title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))
    else:
        return np.nan

# Apply the function to create 'year' column
movies['year'] = movies['title'].apply(extract_year)

# Clean the 'title' by removing the year and converting to lowercase
movies['title_clean'] = movies['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)', '', x).lower())

# Group tags by 'movieId' and concatenate them into a single string
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge tags with movies
movies = pd.merge(movies, tags_grouped, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres, title_clean, tags, and year into the 'related' column
movies['year_str'] = movies['year'].astype(str)
movies['related'] = movies['genres'].str.replace('|', ' ') + ' ' + movies['title_clean'] + ' ' + movies['tag'] + ' ' + movies['year_str']

# Preprocess the 'related' column
movies['related'] = movies['related'].str.lower()
movies['related'] = movies['related'].str.replace(r'\d+', '', regex=True)
movies['related'] = movies['related'].str.replace(r'[^a-z\s]', '', regex=True)
movies['related'] = movies['related'].str.strip()


## Split Data into Training, Testing, and Validation Sets
We'll create training, testing, and validation sets for both CF and CBF models.

### For Ratings Data (CF Model)

In [29]:
from sklearn.model_selection import train_test_split

# Split 10% of the data for validation
cf_remaining, cf_validation = train_test_split(
    ratings, test_size=0.1, random_state=42, stratify=ratings['userId']
)

# Split remaining data into training and testing sets (80% train, 20% test)
cf_train, cf_test = train_test_split(
    cf_remaining, test_size=0.2, random_state=42, stratify=cf_remaining['userId']
)


### For Movies Data (CBF Model)

Since CBF relies on item features, we'll split the movies.



In [30]:
# Split 10% of the movies for validation
cbf_remaining, cbf_validation = train_test_split(
    movies, test_size=0.1, random_state=42
)

# Split remaining data into training and testing sets (80% train, 20% test)
cbf_train, cbf_test = train_test_split(
    cbf_remaining, test_size=0.2, random_state=42
)


# 2. Content-Based Filtering (CBF Model)
## Step 1: Vectorize Item Content
Use TF-IDF to vectorize the 'related' column in cbf_train.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer with English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'related' column
tfidf_matrix = tfidf.fit_transform(cbf_train['related'])


## Step 2: Build User Profiles
Create user profiles by aggregating the content of the movies they've rated.

### a. Merge Ratings with Movie Content

In [32]:
# Merge cf_train with cbf_train to get 'related' content
user_ratings = pd.merge(cf_train, cbf_train[['movieId', 'related']], on='movieId', how='inner')


### b. Transform 'related' Conten

In [33]:
# Transform the 'related' column in user_ratings
user_ratings_tfidf = tfidf.transform(user_ratings['related'])


### c. Build User Profiles
Aggregate the TF-IDF vectors weighted by the user's ratings.

In [34]:
import numpy as np

# Create a dictionary to store user profiles
user_profiles = {}

# Get unique user IDs
user_ids = user_ratings['userId'].unique()

for user_id in user_ids:
    # Get indices of movies rated by the user
    indices = user_ratings[user_ratings['userId'] == user_id].index
    
    # Get the TF-IDF vectors and ratings
    tfidf_vectors = user_ratings_tfidf[indices]
    ratings_values = user_ratings.loc[indices, 'rating'].values.reshape(-1, 1)
    
    # Compute the weighted average
    weighted_tfidf = tfidf_vectors.multiply(ratings_values)
    user_profile = weighted_tfidf.mean(axis=0)
    
    user_profiles[user_id] = user_profile


## Step 3: Predict Ratings
Predict ratings for user-item pairs in cf_test.

### a. Prepare Test Data

In [35]:
# Merge cf_test with cbf_test to get 'related' content
test_data = pd.merge(cf_test, cbf_test[['movieId', 'related']], on='movieId', how='inner')


### b. Transform 'related' Content in Test Data

In [36]:
# Transform the 'related' column in test_data
test_tfidf = tfidf.transform(test_data['related'])


### c. Predict Ratings
Compute the dot product between user profiles and item vectors.

In [37]:
predicted_ratings = []

for idx, row in test_data.iterrows():
    user_id = row['userId']
    movie_tfidf = test_tfidf[idx]  # This is a sparse matrix
    user_profile = user_profiles.get(user_id)  # This is also a sparse matrix
    
    if user_profile is not None:
        # Convert both user_profile and movie_tfidf to dense arrays
        user_profile_dense = user_profile.A.flatten()  # Convert to 1D array
        movie_tfidf_dense = movie_tfidf.toarray().flatten()  # Convert to 1D array
        
        # Compute dot product and append as float
        pred_rating = np.dot(user_profile_dense, movie_tfidf_dense)  # Dot product
        predicted_ratings.append(float(pred_rating))
    else:
        # Assign NaN if user profile is not available
        predicted_ratings.append(np.nan)

test_data['predicted_rating'] = predicted_ratings


## Step 4: Scale Predicted Ratings
Scale the predicted ratings to match the actual rating scale (0.5 to 5)

In [38]:
from sklearn.preprocessing import MinMaxScaler

# Remove NaN values
valid_predictions = test_data.dropna(subset=['predicted_rating'])

# Scale predicted ratings
scaler = MinMaxScaler(feature_range=(0.5, 5))
scaled_ratings = scaler.fit_transform(valid_predictions['predicted_rating'].values.reshape(-1, 1))
valid_predictions['predicted_rating'] = scaled_ratings.flatten()


## Step 5: Evaluate the CBF Model
Compute RMSE and MAE

## Step 5: Evaluate the CBF Model
Compute RMSE and MAE

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(valid_predictions['rating'], valid_predictions['predicted_rating'], squared=False)
mae = mean_absolute_error(valid_predictions['rating'], valid_predictions['predicted_rating'])

print(f"CBF Model RMSE: {rmse:.4f}")
print(f"CBF Model MAE: {mae:.4f}")


CBF Model RMSE: 2.5547
CBF Model MAE: 2.3343




Compute Precision, Recall, and F1-score
Define relevant items as those with ratings ≥ 4.

In [40]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Define relevance
valid_predictions['actual_relevant'] = valid_predictions['rating'] >= 4.0
valid_predictions['predicted_relevant'] = valid_predictions['predicted_rating'] >= 4.0

# Compute metrics
precision = precision_score(valid_predictions['actual_relevant'], valid_predictions['predicted_relevant'])
recall = recall_score(valid_predictions['actual_relevant'], valid_predictions['predicted_relevant'])
f1 = f1_score(valid_predictions['actual_relevant'], valid_predictions['predicted_relevant'])

print(f"CBF Model Precision: {precision:.4f}")
print(f"CBF Model Recall: {recall:.4f}")
print(f"CBF Model F1-Score: {f1:.4f}")


CBF Model Precision: 1.0000
CBF Model Recall: 0.0026
CBF Model F1-Score: 0.0052


# 3. Collaborative Filtering (CF) Model
We'll use the Surprise library to implement a CF model using SVD.

## Step 1: Prepare Data
Convert cf_train and cf_test into Surprise data structures.

In [41]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split

# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Create a validation set
train_data, validation_data = train_test_split(ratings, test_size=0.1, random_state=42, stratify=ratings['userId'])

# Load training data into Surprise format
trainset_full = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# Perform Surprise train-test split on training data
trainset = trainset_full.build_full_trainset()


## Step 2: Train CF Model

In [42]:
# Initialize SVD algorithm
algo = SVD()

# Train the algorithm on the trainset
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fef0ab9750>

## Step 3: Predict Ratings
Predict ratings for user-item pairs in cf_test.

In [43]:
# Prepare testset for prediction
testset = list(zip(cf_test['userId'], cf_test['movieId'], cf_test['rating']))

# Get predictions
predictions = algo.test(testset)


## Step 4: Evaluate the CF Model
Extract Predictions

In [44]:
# Convert predictions to DataFrame
pred_df = pd.DataFrame([(pred.uid, pred.iid, pred.r_ui, pred.est) for pred in predictions],
                        columns=['userId', 'movieId', 'rating', 'predicted_rating'])


Compute RMSE and MAE

In [45]:
from surprise import accuracy

rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

print(f"CF Model RMSE: {rmse:.4f}")
print(f"CF Model MAE: {mae:.4f}")


CF Model RMSE: 0.6269
CF Model MAE: 0.4892


Compute Precision, Recall, and F1-score


In [46]:
# Define relevance
pred_df['actual_relevant'] = pred_df['rating'] >= 4.0
pred_df['predicted_relevant'] = pred_df['predicted_rating'] >= 4.0

# Compute metrics
precision = precision_score(pred_df['actual_relevant'], pred_df['predicted_relevant'])
recall = recall_score(pred_df['actual_relevant'], pred_df['predicted_relevant'])
f1 = f1_score(pred_df['actual_relevant'], pred_df['predicted_relevant'])

print(f"CF Model Precision: {precision:.4f}")
print(f"CF Model Recall: {recall:.4f}")
print(f"CF Model F1-Score: {f1:.4f}")


CF Model Precision: 0.9308
CF Model Recall: 0.4589
CF Model F1-Score: 0.6147


# 4. Hybrid Model
Combine predictions from CBF and CF models.

## Step 1: Merge Predictions
Merge valid_predictions from CBF and pred_df from CF on userId and movieId.

In [47]:
# Merge predictions
hybrid_df = pd.merge(valid_predictions[['userId', 'movieId', 'predicted_rating']], 
                        pred_df[['userId', 'movieId', 'predicted_rating']], 
                        on=['userId', 'movieId'], 
                        suffixes=('_cbf', '_cf'))


## Step 2: Combine Predictions
Use a weighted average to combine the predicted ratings.

In [48]:
# Assign weights
weight_cbf = 0.5
weight_cf = 0.5

# Compute hybrid predicted rating
hybrid_df['predicted_rating'] = (weight_cbf * hybrid_df['predicted_rating_cbf'] + 
                                 weight_cf * hybrid_df['predicted_rating_cf'])


## Step 3: Evaluate the Hybrid Model
Merge with Actual Ratings

In [49]:
# Merge hybrid predictions with actual ratings
hybrid_df = pd.merge(hybrid_df, cf_test[['userId', 'movieId', 'rating']], on=['userId', 'movieId'])


Compute RMSE and MAE

In [50]:
rmse = mean_squared_error(hybrid_df['rating'], hybrid_df['predicted_rating'], squared=False)
mae = mean_absolute_error(hybrid_df['rating'], hybrid_df['predicted_rating'])

print(f"Hybrid Model RMSE: {rmse:.4f}")
print(f"Hybrid Model MAE: {mae:.4f}")


Hybrid Model RMSE: 1.4022
Hybrid Model MAE: 1.2378




Compute Precision, Recall, and F1-score



In [51]:
# Define relevance
hybrid_df['actual_relevant'] = hybrid_df['rating'] >= 4.0
hybrid_df['predicted_relevant'] = hybrid_df['predicted_rating'] >= 4.0

# Compute metrics
precision = precision_score(hybrid_df['actual_relevant'], hybrid_df['predicted_relevant'])
recall = recall_score(hybrid_df['actual_relevant'], hybrid_df['predicted_relevant'])
f1 = f1_score(hybrid_df['actual_relevant'], hybrid_df['predicted_relevant'])

print(f"Hybrid Model Precision: {precision:.4f}")
print(f"Hybrid Model Recall: {recall:.4f}")
print(f"Hybrid Model F1-Score: {f1:.4f}")


Hybrid Model Precision: 1.0000
Hybrid Model Recall: 0.0032
Hybrid Model F1-Score: 0.0065


# 5. Final validation

In [52]:
# For CF Model
validationset = list(zip(cf_validation['userId'], cf_validation['movieId'], cf_validation['rating']))
validation_predictions = algo.test(validationset)

# Compute RMSE and MAE on validation set
rmse_val = accuracy.rmse(validation_predictions, verbose=False)
mae_val = accuracy.mae(validation_predictions, verbose=False)

print(f"CF Model Validation RMSE: {rmse_val:.4f}")
print(f"CF Model Validation MAE: {mae_val:.4f}")

# For CBF Model
# Merge cf_validation with cbf_validation
validation_data = pd.merge(cf_validation, cbf_validation[['movieId', 'related']], on='movieId', how='inner')
validation_tfidf = tfidf.transform(validation_data['related'])

# Predict ratings for validation data
predicted_ratings_val = []

for idx, row in validation_data.iterrows():
    user_id = row['userId']
    movie_tfidf = validation_tfidf[idx]  # Sparse matrix
    user_profile = user_profiles.get(user_id)  # Sparse matrix
    
    if user_profile is not None:
        # Convert both user_profile and movie_tfidf to dense arrays
        user_profile_dense = user_profile.A.flatten()  # Convert to dense array
        movie_tfidf_dense = movie_tfidf.toarray().flatten()  # Convert to dense array

        # Compute the dot product and append the scalar result
        pred_rating = np.dot(user_profile_dense, movie_tfidf_dense)
        predicted_ratings_val.append(pred_rating)
    else:
        predicted_ratings_val.append(np.nan)

# Add the predicted ratings to the DataFrame
validation_data['predicted_rating'] = predicted_ratings_val

# Scale predicted ratings
valid_predictions_val = validation_data.dropna(subset=['predicted_rating'])
scaled_ratings_val = scaler.transform(valid_predictions_val['predicted_rating'].values.reshape(-1, 1))
valid_predictions_val['predicted_rating'] = scaled_ratings_val.flatten()

# Compute RMSE and MAE
rmse_val = mean_squared_error(valid_predictions_val['rating'], valid_predictions_val['predicted_rating'], squared=False)
mae_val = mean_absolute_error(valid_predictions_val['rating'], valid_predictions_val['predicted_rating'])

print(f"CBF Model Validation RMSE: {rmse_val:.4f}")
print(f"CBF Model Validation MAE: {mae_val:.4f}")


CF Model Validation RMSE: 0.8619
CF Model Validation MAE: 0.6617
CBF Model Validation RMSE: 2.5573
CBF Model Validation MAE: 2.3362


