## 作业二

In [None]:
import pandas as pd
import numpy as np
import torch
from scipy.sparse import coo_matrix
import time
import matplotlib.pyplot as plt

### 数据准备

In [None]:
# Load the train/test rating tables (whitespace-separated, no header row)
train_data = pd.read_csv(
    'data/netflix_train.txt',
    sep=r'\s+',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
test_data = pd.read_csv(
    'data/netflix_test.txt',
    sep=r'\s+',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Distinct ids seen in either split
all_users = pd.unique(pd.concat([train_data['user_id'], test_data['user_id']]))
all_movies = pd.unique(pd.concat([train_data['movie_id'], test_data['movie_id']]))

# Raw id -> contiguous 0-based index, shared by both splits
user_id_to_index = dict(zip(all_users, range(len(all_users))))
movie_id_to_index = dict(zip(all_movies, range(len(all_movies))))

num_users = len(all_users)
print(num_users)  # original note: 10000
num_movies = len(all_movies)
print(num_movies)  # original note: 10000

# Attach the contiguous indices as extra columns (user_idx first, then movie_idx)
train_data = train_data.assign(
    user_idx=train_data['user_id'].map(user_id_to_index),
    movie_idx=train_data['movie_id'].map(movie_id_to_index),
)
test_data = test_data.assign(
    user_idx=test_data['user_id'].map(user_id_to_index),
    movie_idx=test_data['movie_id'].map(movie_id_to_index),
)

In [None]:
# Build the sparse (user x movie) rating matrices from (value, (row, col)) triplets
train_ratings = coo_matrix(
    (
        train_data['rating'].to_numpy(),
        (train_data['user_idx'].to_numpy(), train_data['movie_idx'].to_numpy()),
    ),
    shape=(num_users, num_movies),
)
test_ratings = coo_matrix(
    (
        test_data['rating'].to_numpy(),
        (test_data['user_idx'].to_numpy(), test_data['movie_idx'].to_numpy()),
    ),
    shape=(num_users, num_movies),
)

# Densify into float32 Torch tensors so the computation can run on the GPU when available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)  # original note: cuda
train_ratings_tensor = torch.tensor(train_ratings.toarray(), dtype=torch.float32, device=device)
test_ratings_tensor = torch.tensor(test_ratings.toarray(), dtype=torch.float32, device=device)

### 协同过滤

In [None]:
# Compute user-user similarity
def compute_user_similarity(ratings):
    """Return the pairwise cosine similarity between the rows of ``ratings``.

    Args:
        ratings: 2-D tensor; each row is treated as one user's rating vector.

    Returns:
        Square tensor whose entry (i, j) is the dot product of the
        L2-normalised rows i and j. A row with zero norm is divided by
        1e-10 instead of 0, so it stays all-zero and its similarity to
        every row (itself included) is 0.
    """
    row_norms = ratings.norm(dim=1, keepdim=True)
    # Swap exact-zero norms for a tiny constant to avoid dividing by zero
    safe_norms = torch.where(row_norms == 0, torch.full_like(row_norms, 1e-10), row_norms)
    unit_rows = ratings / safe_norms
    return unit_rows @ unit_rows.T

# Predict ratings
def predict(ratings, similarity, k=50):
    """Predict every user's rating for every item from the k most similar users.

    For user ``i`` and item ``j`` the prediction is the similarity-weighted
    average of the neighbours' ratings, where only neighbours with a
    non-zero rating for ``j`` contribute to the normalising weight:

        sum_n sim[i, n] * ratings[n, j] / sum_n sim[i, n] * (ratings[n, j] != 0)

    Args:
        ratings: 2-D (users x items) tensor; 0 is treated as "not rated".
        similarity: 2-D (users x users) tensor of user-user similarities.
        k: number of neighbours per user. Values larger than the number of
            users are clamped, because ``torch.topk`` raises otherwise.

    Returns:
        Tensor with the shape and dtype of ``ratings``. Entries whose
        normalising weight is exactly 0 are divided by 1e-10 instead.

    Notes:
        The diagonal of ``similarity`` is not masked out, so a user can be
        selected as one of its own neighbours.
        The computation materialises a dense (users x users) weight matrix,
        trading memory for replacing the per-user Python loop with two
        matrix products.
    """
    # torch.topk requires k <= size of the selected dimension
    k = min(k, similarity.shape[1])
    top_k_scores, top_k_users = torch.topk(similarity, k=k, dim=1)

    # Row i keeps similarity[i, n] for the top-k neighbours n and 0 elsewhere,
    # so a matmul with this matrix sums over exactly the selected neighbours.
    weights = torch.zeros_like(similarity).scatter_(1, top_k_users, top_k_scores)

    work_dtype = weights.dtype
    numerator = weights @ ratings.to(work_dtype)
    denominator = weights @ (ratings != 0).to(work_dtype)
    # Avoid 0/0 for items none of the selected neighbours has rated
    denominator[denominator == 0] = 1e-10
    return numerator.div_(denominator).to(ratings.dtype)

# Compute RMSE
def compute_rmse(predictions, targets):
    """Return the root-mean-square error over the entries where ``targets`` is non-zero.

    Args:
        predictions: tensor of predicted values.
        targets: tensor of reference values; entries equal to 0 are excluded.

    Returns:
        0-dimensional tensor holding the RMSE of the selected entries.
    """
    observed = targets != 0
    residuals = (predictions - targets)[observed]
    return residuals.pow(2).mean().sqrt()

# Time the whole collaborative-filtering run: similarity -> prediction -> evaluation
start_time = time.time()

print("Calculating user similarity...")
user_similarity = compute_user_similarity(ratings=train_ratings_tensor)
print("User similarity calculated.")

print("Predicting ratings...")
pred_ratings = predict(
    ratings=train_ratings_tensor,
    similarity=user_similarity,
    k=50,
)
print("Ratings predicted.")

print("Computing RMSE on test set (Collaborative Filtering)...")
rmse_value_cf = compute_rmse(
    predictions=pred_ratings,
    targets=test_ratings_tensor,
)
print(f"Test RMSE (Collaborative Filtering): {rmse_value_cf:.4f}")

end_time = time.time()
print(f"Time elapsed: {end_time - start_time:.2f} seconds")

### 矩阵分解

In [None]:
# Matrix factorization model (with bias terms and dropout)
class MatrixFactorization(torch.nn.Module):
    """Rating model: dot product of user/item factors plus user, item and global biases.

    Dropout is applied to both looked-up factor vectors before the dot
    product. Factor tables are Xavier-uniform initialised; the per-user and
    per-item bias tables and the global bias start at zero.
    """

    def __init__(self, num_users, num_items, num_factors, dropout_rate=0.2):
        super().__init__()
        nn = torch.nn

        # Construction and init order is kept as-is so that random
        # initialisation consumes the RNG in the same sequence.
        self.user_factors = nn.Embedding(num_users, num_factors)
        self.item_factors = nn.Embedding(num_items, num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))
        self.dropout = nn.Dropout(dropout_rate)

        for factor_table in (self.user_factors, self.item_factors):
            nn.init.xavier_uniform_(factor_table.weight)
        for bias_table in (self.user_bias, self.item_bias):
            nn.init.zeros_(bias_table.weight)

    def forward(self, user_idx, item_idx):
        """Return one predicted rating per (user_idx[n], item_idx[n]) pair."""
        user_vec = self.dropout(self.user_factors(user_idx))
        item_vec = self.dropout(self.item_factors(item_idx))
        interaction = torch.sum(user_vec * item_vec, dim=1)
        b_user = self.user_bias(user_idx).squeeze()
        b_item = self.item_bias(item_idx).squeeze()
        return interaction + b_user + b_item + self.global_bias

# Plot the training loss and the test-set RMSE
def _plot_training_curves(train_losses, test_rmses):
    """Show per-epoch training loss and test RMSE in two side-by-side subplots."""
    epoch_axis = range(1, len(train_losses) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epoch_axis, train_losses, label='Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epoch_axis, test_rmses, label='Test RMSE', color='orange')
    plt.xlabel('Epochs')
    plt.ylabel('RMSE')
    plt.title('Test RMSE over Epochs')
    plt.legend()

    plt.tight_layout()
    plt.show()


# Train the matrix factorization model
def train_matrix_factorization(train_data, num_users, num_items, num_factors=20, lr=0.01, reg=0.001, epochs=50,
                               eval_user_idx=None, eval_item_idx=None, eval_ratings=None):
    """Train a ``MatrixFactorization`` model with full-batch AdamW and plot its curves.

    Each epoch performs one optimisation step on all training ratings at
    once, then evaluates RMSE on the evaluation triplets. The learning rate
    is multiplied by 0.9 every 10 epochs.

    Args:
        train_data: table indexable by 'user_idx', 'movie_idx' and 'rating',
            each exposing ``.values`` (e.g. a pandas DataFrame).
        num_users: number of rows of the user embedding tables.
        num_items: number of rows of the item embedding tables.
        num_factors: latent dimension.
        lr: initial AdamW learning rate.
        reg: AdamW weight decay.
        epochs: number of full-batch steps.
        eval_user_idx: user indices of the evaluation ratings. When omitted,
            the module-level ``test_user_idx`` is used.
        eval_item_idx: item indices of the evaluation ratings. When omitted,
            the module-level ``test_item_idx`` is used.
        eval_ratings: evaluation ratings aligned with the two index tensors.
            When omitted, the module-level ``test_ratings`` is used.

    Returns:
        The trained model (left in eval mode when ``epochs`` > 0).

    Raises:
        TypeError: if the evaluation ratings are not a tensor (for example
            when the module-level ``test_ratings`` still refers to the
            sparse test matrix).
    """
    # Backward compatibility: existing callers rely on the module-level test tensors
    if eval_user_idx is None:
        eval_user_idx = test_user_idx
    if eval_item_idx is None:
        eval_item_idx = test_item_idx
    if eval_ratings is None:
        eval_ratings = test_ratings
    if not torch.is_tensor(eval_ratings):
        raise TypeError(
            "evaluation ratings must be a tensor aligned with the evaluation "
            f"indices, got {type(eval_ratings).__name__}"
        )

    model = MatrixFactorization(num_users, num_items, num_factors).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=reg)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
    criterion = torch.nn.MSELoss()

    user_idx = torch.LongTensor(train_data['user_idx'].values).to(device)
    item_idx = torch.LongTensor(train_data['movie_idx'].values).to(device)
    ratings = torch.FloatTensor(train_data['rating'].values).to(device)

    train_losses = []
    test_rmses = []

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        predictions = model(user_idx, item_idx)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Read the scalar once; every .item() forces a device sync
        loss_value = loss.item()
        train_losses.append(loss_value)

        # RMSE on the evaluation set (dropout disabled, no autograd graph)
        model.eval()
        with torch.no_grad():
            eval_predictions = model(eval_user_idx, eval_item_idx)
            test_rmse = torch.sqrt(torch.nn.functional.mse_loss(eval_predictions, eval_ratings)).item()
        test_rmses.append(test_rmse)

        if epoch % 5 == 0:
            print(f"Epoch {epoch}/{epochs}, Loss: {loss_value:.4f}, Test RMSE: {test_rmse:.4f}")

    _plot_training_curves(train_losses, test_rmses)

    return model

# Matrix factorization
print("Training matrix factorization model...")
test_user_idx = torch.LongTensor(test_data['user_idx'].values).to(device)
test_item_idx = torch.LongTensor(test_data['movie_idx'].values).to(device)
# NOTE: this rebinds ``test_ratings`` (the sparse test matrix built earlier)
# to a flat tensor of test ratings aligned with the two index tensors above.
test_ratings = torch.FloatTensor(test_data['rating'].values).to(device)

mf_model = train_matrix_factorization(train_data, num_users, num_movies, num_factors=50, lr=0.01, reg=0.01, epochs=50)

print("Computing RMSE on test set (Matrix Factorization)...")
mf_model.eval()
# Pure evaluation: skip autograd bookkeeping, as the other evaluation sites do
with torch.no_grad():
    predictions = mf_model(test_user_idx, test_item_idx)
    mf_rmse = torch.sqrt(torch.nn.functional.mse_loss(predictions, test_ratings)).item()
print(f"Test RMSE (Matrix Factorization): {mf_rmse:.4f}")

# Compare results
print("Comparison of Collaborative Filtering and Matrix Factorization:")
print(f"RMSE (Collaborative Filtering): {rmse_value_cf:.4f}")
print(f"RMSE (Matrix Factorization): {mf_rmse:.4f}")

In [None]:
# Hyper-parameter tuning
def parameter_tuning(train_data, test_user_idx, test_item_idx, test_ratings, num_users, num_movies,
                     k_values=(20, 50), lambda_values=(0.001, 0.01, 0.1)):
    """Grid-search the latent dimension and weight decay of the matrix-factorization model.

    One model is trained per (k, lambda) pair with ``lr=0.01`` and
    ``epochs=50``, then scored by RMSE on the given test triplets. The test
    tensors passed here are used only for that final scoring; they are not
    forwarded to ``train_matrix_factorization``.

    Args:
        train_data: training table passed through to ``train_matrix_factorization``.
        test_user_idx: user indices of the test ratings.
        test_item_idx: item indices of the test ratings.
        test_ratings: test ratings aligned with the two index tensors.
        num_users: number of users passed to the model.
        num_movies: number of items passed to the model.
        k_values: latent dimensions to try.
        lambda_values: weight-decay values to try.

    Returns:
        ``(best_params, best_rmse)`` where ``best_params`` is the
        ``(k, lambda)`` pair with the lowest test RMSE, or ``None`` when no
        run produced an RMSE below infinity (e.g. every run yielded NaN or
        the grid is empty).
    """
    best_rmse = float('inf')
    best_params = None

    for k in k_values:
        for lambda_value in lambda_values:
            print(f"Training with k={k}, lambda={lambda_value}")
            mf_model = train_matrix_factorization(train_data, num_users, num_movies, num_factors=k, lr=0.01, reg=lambda_value, epochs=50)

            mf_model.eval()
            with torch.no_grad():
                predictions = mf_model(test_user_idx, test_item_idx)
                mf_rmse = torch.sqrt(torch.nn.functional.mse_loss(predictions, test_ratings)).item()
            print(f"Test RMSE for k={k}, lambda={lambda_value}: {mf_rmse:.4f}")

            # NaN never compares as smaller, so diverged runs are skipped here
            if mf_rmse < best_rmse:
                best_rmse = mf_rmse
                best_params = (k, lambda_value)

    if best_params is None:
        # Previously this case crashed on best_params[0]
        print("No parameter combination produced a finite test RMSE.")
    else:
        print(f"Best RMSE: {best_rmse:.4f} with k={best_params[0]}, lambda={best_params[1]}")

    return best_params, best_rmse

# Run the hyper-parameter grid search; the function prints its own results.
parameter_tuning(train_data, test_user_idx, test_item_idx, test_ratings, num_users, num_movies)

# Compare results
print("Comparison of Collaborative Filtering and Matrix Factorization:")
print(f"RMSE (Collaborative Filtering): {rmse_value_cf:.4f}")