In [1]:
# Step 1: Data Collection & Preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
url = 'https://raw.githubusercontent.com/siddhantbhattarai/Machine_Learning_Bootcamp_2024/refs/heads/main/Datasets/movies.csv'
movies_df = pd.read_csv(url)

In [3]:
# Data Cleaning
movies_df['VOTES'] = movies_df['VOTES'].replace(',', '', regex=True).astype(float)
movies_df['YEAR'] = pd.to_numeric(movies_df['YEAR'].str.extract(r'(\d{4})')[0], errors='coerce')
movies_df['GENRE'] = movies_df['GENRE'].str.replace('\n', '').str.strip()
movies_df['ONE-LINE'] = movies_df['ONE-LINE'].str.strip()

In [4]:
# Handling Missing Data
imputer = SimpleImputer(strategy='mean')

In [5]:
# Apply imputation to relevant columns
movies_df[['RATING', 'VOTES', 'RunTime']] = imputer.fit_transform(movies_df[['RATING', 'VOTES', 'RunTime']])

In [6]:
# Fill missing genres with 'Unknown'
movies_df['GENRE'] = movies_df['GENRE'].fillna('Unknown')

In [7]:
# Text Features using TF-IDF for 'ONE-LINE' and 'STARS'
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_one_line = tfidf_vectorizer.fit_transform(movies_df['ONE-LINE'])

In [8]:
# One-hot encode genres
onehot_encoder = OneHotEncoder()
genres_encoded = onehot_encoder.fit_transform(movies_df[['GENRE']]).toarray()

In [9]:
# Combine numeric features and TF-IDF vectors
numeric_columns = ['RATING', 'VOTES', 'RunTime']
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(movies_df[numeric_columns])

In [10]:
# Concatenate all features
features_combined = np.hstack([scaled_numeric, genres_encoded, tfidf_one_line.toarray()])

In [11]:
# Train-test split
X_train, X_test = train_test_split(features_combined, test_size=0.2, random_state=42)

In [12]:
# Step 2: Model Training

# 1. Collaborative Filtering using KNN
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(X_train)

In [13]:
def knn_recommend(movie_index, n_recommendations=5):
    distances, indices = knn.kneighbors(X_train[[movie_index]], n_neighbors=n_recommendations+1)
    return movies_df.iloc[indices[0][1:]]

In [14]:
# 2. Content-Based Filtering using Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X_train)

In [15]:
def content_based_recommend(movie_index, n_recommendations=5):
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i[0] for i in sim_scores[1:n_recommendations+1]]
    return movies_df.iloc[top_indices]

In [16]:
# 3. Hybrid Model (Combining KNN and Content-Based)
def hybrid_recommend(movie_index, n_recommendations=5):
    knn_rec = knn_recommend(movie_index, n_recommendations // 2)
    content_rec = content_based_recommend(movie_index, n_recommendations // 2)
    return pd.concat([knn_rec, content_rec]).drop_duplicates().head(n_recommendations)

In [17]:
# Step 3: Evaluation
from sklearn.metrics import mean_squared_error

def evaluate_model(X_test, model, n_recommendations=5):
    y_true = []
    y_pred = []
    for i in range(len(X_test[:10])):  # Evaluating first 10 samples
        recommendations = model(i, n_recommendations=n_recommendations)
        y_true.append(movies_df.iloc[i]['RATING'])
        y_pred.append(recommendations['RATING'].mean())  # Average of recommendations' ratings
    
    return mean_squared_error(y_true, y_pred)

In [18]:
mse_knn = evaluate_model(X_test, knn_recommend)
mse_content = evaluate_model(X_test, content_based_recommend)
mse_hybrid = evaluate_model(X_test, hybrid_recommend)

In [19]:
print(f'MSE - KNN: {mse_knn}, Content-Based: {mse_content}, Hybrid: {mse_hybrid}')

MSE - KNN: 1.864867559088586, Content-Based: 1.864867559088586, Hybrid: 1.8085060782931826
