In [23]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import SVD, Dataset as SurpriseDataset, Reader, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
import xgboost as xgb



In [24]:
# Load the MovieLens 100K dataset using Surprise
def load_movielens_data():
    data = SurpriseDataset.load_builtin('ml-100k')
    trainset, testset = surprise_train_test_split(data, test_size=0.2)
    return trainset, testset

# Load user and movie information from files
def load_user_movie_data():
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
    m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
    movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5), encoding='latin-1')
    return users, movies

In [35]:
# # Prepare data for Neural Collaborative Filtering (NCF)
# def prepare_data_for_ncf(data):
#     df = pd.DataFrame(data.raw_ratings, columns=["userId", "movieId", "rating", "timestamp"])
#     user_ids = df["userId"].unique()
#     movie_ids = df["movieId"].unique()
#     user_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
#     movie_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}
#     df["userId"] = df["userId"].map(user_to_index)
#     df["movieId"] = df["movieId"].map(movie_to_index)
#     train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#     return train_df, test_df, user_to_index, movie_to_index

def prepare_data_for_ncf(data):
    # Convert the data to a pandas DataFrame directly from the full dataset's raw ratings
    raw_ratings = data.raw_ratings
    df = pd.DataFrame(raw_ratings, columns=["userId", "movieId", "rating", "timestamp"])

    user_ids = df["userId"].unique()
    movie_ids = df["movieId"].unique()
    user_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
    movie_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}
    df["userId"] = df["userId"].map(user_to_index)
    df["movieId"] = df["movieId"].map(movie_to_index)

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    return train_df, test_df, user_to_index, movie_to_index

In [36]:
def load_and_prepare_data():
    data = SurpriseDataset.load_builtin('ml-100k')
    train_df, test_df, user_to_index, movie_to_index = prepare_data_for_ncf(data)
    return train_df, test_df, user_to_index, movie_to_index

In [26]:
# PyTorch dataset class for NCF
class MovieLensDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = torch.tensor(users, dtype=torch.long)
        self.movies = torch.tensor(movies, dtype=torch.long)
        self.ratings = torch.tensor(ratings, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]

# Define the Neural Collaborative Filtering model
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=20):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(2 * embedding_dim, 128), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    def forward(self, user_indices, item_indices):
        user_embedding = self.user_embedding(user_indices)
        item_embedding = self.item_embedding(item_indices)
        x = torch.cat([user_embedding, item_embedding], dim=-1)
        return self.fc_layers(x).squeeze()

In [27]:
#Train the NCF model
def train_ncf_model(ncf_model, train_loader, device, optimizer, criterion, epochs=20):
    ncf_model.train()
    for epoch in range(epochs):
        for users, movies, ratings in train_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            optimizer.zero_grad()
            predictions = ncf_model(users, movies)
            loss = criterion(predictions.squeeze(), ratings)
            loss.backward()
            optimizer.step()



In [28]:
# Predict using SVD and NCF models
def predict_svd_ncf(svd_model, testset, ncf_model, test_loader, device):
    svd_predictions = svd_model.test(testset)
    svd_estimates = np.array([pred.est for pred in svd_predictions])
    ncf_estimates = []
    ncf_model.eval()
    with torch.no_grad():
        for users, movies, _ in test_loader:
            users, movies = users.to(device), movies.to(device)
            predictions = ncf_model(users, movies)
            ncf_estimates.extend(predictions.cpu().numpy())
    return svd_estimates, ncf_estimates, svd_predictions



In [29]:
# Train a meta-learner using XGBoost
def train_meta_learner(svd_estimates, ncf_estimates, ratings):
    X = np.vstack((svd_estimates, ncf_estimates)).T
    y = np.array([pred.r_ui for pred in ratings])
    meta_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3)
    meta_model.fit(X, y)
    return meta_model



In [30]:
# Calculate the final RMSE using the meta-learner's predictions
def calculate_final_rmse(meta_model, X, y):
    combined_estimates = meta_model.predict(X)
    final_rmse = np.sqrt(mean_squared_error(y, combined_estimates))
    return final_rmse



In [None]:
# Main function to orchestrate the workflow
def main():
    trainset, testset = load_movielens_data()
    users, movies = load_user_movie_data()
    train_df, test_df, user_to_index, movie_to_index = prepare_data_for_ncf(trainset)

    train_dataset = MovieLensDataset(train_df['userId'].values, train_df['movieId'].values, train_df['rating'].values)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ncf_model = NCF(len(user_to_index), len(movie_to_index)).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(ncf_model.parameters(), lr=0.001)

    train_ncf_model(ncf_model, train_loader, device, optimizer, criterion)

    test_dataset = MovieLensDataset(test_df['userId'].values, test_df['movieId'].values, test_df['rating'].values)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    svd_model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02)
    svd_model.fit(trainset)

    svd_estimates, ncf_estimates, svd_predictions = predict_svd_ncf(svd_model, testset, ncf_model, test_loader, device)

    meta_model = train_meta_learner(svd_estimates, ncf_estimates, svd_predictions)

    X = np.vstack((svd_estimates, ncf_estimates)).T
    y = np.array([pred.r_ui for pred in svd_predictions])
    final_rmse = calculate_final_rmse(meta_model, X, y)

    print(f'Final Hybrid RMSE with Advanced Meta-Learner: {final_rmse}')

if __name__ == "__main__":
    main()


In [31]:
trainset, testset = load_movielens_data()
users, movies = load_user_movie_data()


In [37]:
# train_df, test_df, user_to_index, movie_to_index = prepare_data_for_ncf(trainset)
train_df, test_df, user_to_index, movie_to_index = load_and_prepare_data()

In [38]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp
75220,804,901,1.0,893082619
48955,467,488,5.0,887925187
44966,465,139,4.0,877384940
13568,321,289,4.0,879537844
92727,618,261,4.0,883799651
...,...,...,...,...
6265,81,146,2.0,880245109
54886,339,695,5.0,876403078
76820,434,355,3.0,880140288
860,40,51,3.0,885329671


In [39]:
train_dataset = MovieLensDataset(train_df['userId'].values, train_df['movieId'].values, train_df['rating'].values)


In [41]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)



In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ncf_model = NCF(len(user_to_index), len(movie_to_index)).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(ncf_model.parameters(), lr=0.001)



In [43]:
train_ncf_model(ncf_model, train_loader, device, optimizer, criterion)

In [44]:
test_dataset = MovieLensDataset(test_df['userId'].values, test_df['movieId'].values, test_df['rating'].values)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [45]:
svd_model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02)
svd_model.fit(trainset)

svd_estimates, ncf_estimates, svd_predictions = predict_svd_ncf(svd_model, testset, ncf_model, test_loader, device)



In [46]:
meta_model = train_meta_learner(svd_estimates, ncf_estimates, svd_predictions)



In [47]:
X = np.vstack((svd_estimates, ncf_estimates)).T
y = np.array([pred.r_ui for pred in svd_predictions])
final_rmse = calculate_final_rmse(meta_model, X, y)

print(f'Final Hybrid RMSE with Advanced Meta-Learner: {final_rmse}')

Final Hybrid RMSE with Advanced Meta-Learner: 0.9254847262724768
