Since I haven't scraped film data (directors, writers, genres, etc.) yet, I'll start with a collaborative filtering approach. Ultimately, I plan to compare these results to a content-based methodology, and then potentially ensemble the two systems. 

There are two key types of collaborative filtering approaches: memory-based and model-based. Memory-based approaches (user-based and item-based) hinge on grouping similar users/items, and using those similarities to inform the recommendations. For example, if Rob and Josie like a lot of the same movies, then Rob will be recommended a movie that Josie enjoyed. I decided to start here.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sparse
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, mean_squared_error, f1_score, roc_auc_score, confusion_matrix
from implicit import als
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data/combined_data.csv')

# Create outcome variable
df['enjoyed'] = df.apply(lambda row: 1 if row['Rating'] >= 4 or row['Liked'] == 1 else 0, axis=1)

user_item_matrix = df.pivot_table(index='user', columns='Title', values='enjoyed', fill_value=0)

In [3]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=4)

user_item_sparse_matrix = csr_matrix(user_item_matrix)

In [4]:
# Random Guess Baseline
random_predictions = np.random.choice([0, 1], size=len(test_data))

# Accuracy for random baseline
accuracy_random = accuracy_score(test_data['enjoyed'], random_predictions)
print(f"Random Guess Accuracy: {accuracy_random}")

# Precision, Recall, and F1 for Random Guess Baseline
precision_random = precision_score(test_data['enjoyed'], random_predictions)
recall_random = recall_score(test_data['enjoyed'], random_predictions)
f1_random = f1_score(test_data['enjoyed'], random_predictions)

print(f"Random Guess Precision: {precision_random}")
print(f"Random Guess Recall: {recall_random}")
print(f"Random Guess F1-Score: {f1_random}")

# ROC AUC for random baseline
auc_random = roc_auc_score(test_data['enjoyed'], random_predictions)
print(f"Random Guess AUC: {auc_random}")

# Confusion Matrix for Random Guess Baseline
conf_matrix_random = confusion_matrix(test_data['enjoyed'], random_predictions)
# Convert the confusion matrix to a pandas DataFrame for better readability
conf_matrix_df = pd.DataFrame(conf_matrix_random, 
                               columns=['Predicted: 0', 'Predicted: 1'], 
                               index=['Actual: 0', 'Actual: 1'])
print(f"Confusion Matrix (Random Guess): \n{conf_matrix_df}")

Random Guess Accuracy: 0.5006856286371316
Random Guess Precision: 0.4220181735441347
Random Guess Recall: 0.5002514455946557
Random Guess F1-Score: 0.45781666662685416
Random Guess AUC: 0.5006266511465138
Confusion Matrix (Random Guess): 
           Predicted: 0  Predicted: 1
Actual: 0        395308        393727
Actual: 1        287194        287483


In [5]:
# Compute user similarity matrix using cosine similarity
user_similarity = cosine_similarity(user_item_matrix)

# Convert the user similarity matrix into a DataFrame for better readability
user_similarity_df = pd.DataFrame(user_similarity, 
                                  index=user_item_matrix.index, 
                                  columns=user_item_matrix.index)



In [6]:
# Function to check if a key exists in the data
def check_data_availability(user_id, movie_title, user_similarity_df, user_item_matrix):
    # Check if user_id exists in the similarity dataframe
    if user_id not in user_similarity_df.index:
        print(f"Warning: {user_id} not found in user_similarity_df")
        return False
    # Check if movie_title exists in the user-item matrix
    if movie_title not in user_item_matrix.columns:
        print(f"Warning: {movie_title} not found in user_item_matrix")
        return False
    return True

def predict_user_based_binary(user_id, movie_title, user_similarity_df, user_item_matrix, top_n=5):
    # Check if the user and movie are available in the data
    if not check_data_availability(user_id, movie_title, user_similarity_df, user_item_matrix):
        return 0  # Fallback to predicting 0 if the user or movie is missing
    # Get the list of users who have rated the movie
    movie_ratings = user_item_matrix[movie_title]
    # Get the similarity scores for the target user with all other users
    similarities = user_similarity_df[user_id]
    # Filter out the target user from the similarity scores (no need to compare with themselves)
    similarities = similarities.drop(user_id)
    # Get the top N most similar users
    top_similar_users = similarities.nlargest(top_n).index
    # Get ratings of the top N similar users for the movie
    top_ratings = movie_ratings[top_similar_users]
    # Ensure we don't divide by zero: Check if the sum of similarities is non-zero
    similarity_sum = np.sum(similarities[top_similar_users])
    # Calculate the weighted average of ratings based on the similarity scores
    if similarity_sum == 0:
        # If no similarity (i.e., sum is zero), we can use a fallback approach.
        # Option 1: Use global average rating (or user average rating)
        weighted_avg = user_item_matrix.loc[user_id].mean()  # For user-based fallback
    else:
        # Calculate the weighted average of ratings based on the similarity scores
        weighted_avg = np.dot(similarities[top_similar_users], top_ratings) / similarity_sum
    # Convert the weighted average to a binary prediction (1 if weighted_avg > 0.5, else 0)
    return 1 if weighted_avg > 0.5 else 0

predict_user_based_binary("noir1946", "The Ox-Bow Incident", user_similarity_df, user_item_matrix)

1

In [10]:
# Function to evaluate using multiple metrics
def evaluate_user_based_metrics(user_item_matrix, user_similarity_df, test_data):
    predictions = []
    actuals = []

    # Wrap the iteration with tqdm to show a progress bar
    for _, row in tqdm(test_data.iterrows(), total=test_data.shape[0], desc="Evaluating", ncols=100):
        user_id = row['user']
        movie_title = row['Title']
        actual_rating = row['enjoyed']
        
        # Predict the rating for the user-movie pair (binary)
        predicted_rating = predict_user_based_binary(user_id, movie_title, user_similarity_df, user_item_matrix)
        
        predictions.append(predicted_rating)
        actuals.append(actual_rating)
    
    # Calculate the metrics
    accuracy = accuracy_score(actuals, predictions)
    precision = precision_score(actuals, predictions)
    recall = recall_score(actuals, predictions)
    f1 = f1_score(actuals, predictions)
    auc_roc = roc_auc_score(actuals, predictions)
    conf_matrix = confusion_matrix(actuals, predictions)

    # Display the results
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")
    print(f"Confusion Matrix:\n{conf_matrix}")

# Assuming you have a test dataset (test_data)
evaluate_user_based_metrics(user_item_matrix, user_similarity_df, test_data)

Evaluating:   0%|                                                       | 0/1363712 [00:00<?, ?it/s]

Accuracy: 0.6785750950347288
Precision: 0.6970135709992139
Recall: 0.41969488947704536
F1-Score: 0.5239201528387363
AUC-ROC: 0.6434099609767124
Confusion Matrix:
[[684192 104843]
 [333488 241189]]


- Accuracy: 0.6785750950347288
- Precision: 0.6970135709992139
- Recall: 0.41969488947704536
- F1-Score: 0.5239201528387363
- AUC-ROC: 0.6434099609767124
- Confusion Matrix:
- [[684192 104843]
 [333488 241189]]