In [1]:
!pip install torch



In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

In [5]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')


In [7]:
user_ids = ratings['userId'].unique()
movie_ids = movies['movieId'].unique()
tag_ids = tags['tag'].unique()

In [9]:
user_mapping = {id_: i for i, id_ in enumerate(user_ids)}
movie_mapping = {id_: i for i, id_ in enumerate(movie_ids)}
tag_mapping = {tag: i for i, tag in enumerate(tag_ids)}

In [11]:
ratings['user_idx'] = ratings['userId'].map(user_mapping)
ratings['movie_idx'] = ratings['movieId'].map(movie_mapping)

In [13]:
hyperedges_ratings = ratings[['user_idx', 'movie_idx']].to_numpy()

In [15]:
tags['user_idx'] = tags['userId'].map(user_mapping)
tags['movie_idx'] = tags['movieId'].map(movie_mapping)
tags['tag_idx'] = tags['tag'].map(tag_mapping)

In [17]:
hyperedges_tags = tags[['user_idx', 'movie_idx', 'tag_idx']].to_numpy()

In [19]:
padding = -1 * np.ones((hyperedges_ratings.shape[0], 1), dtype=np.int32)
hyperedges_ratings_padded = np.hstack((hyperedges_ratings, padding))

In [21]:
hyperedges = np.vstack((hyperedges_ratings_padded, hyperedges_tags))

In [23]:
hyperedges = torch.tensor(hyperedges, dtype=torch.long)

In [25]:
train_size = int(0.8 * len(hyperedges))
test_size = len(hyperedges) - train_size
train_hyperedges, test_hyperedges = random_split(hyperedges, [train_size, test_size])

In [27]:
embedding_dim = 64  # Dimension of the embeddings
num_users = len(user_mapping)
num_movies = len(movie_mapping)
num_tags = len(tag_mapping)

In [29]:
user_embeddings = nn.Embedding(num_users, embedding_dim)
movie_embeddings = nn.Embedding(num_movies, embedding_dim)
tag_embeddings = nn.Embedding(num_tags, embedding_dim)

In [31]:
nn.init.xavier_uniform_(user_embeddings.weight)
nn.init.xavier_uniform_(movie_embeddings.weight)
nn.init.xavier_uniform_(tag_embeddings.weight)

Parameter containing:
tensor([[ 0.0051, -0.0008,  0.0022,  ...,  0.0029,  0.0062, -0.0040],
        [-0.0018, -0.0047, -0.0050,  ...,  0.0060,  0.0054,  0.0058],
        [-0.0045, -0.0053,  0.0051,  ...,  0.0019,  0.0026,  0.0024],
        ...,
        [-0.0004,  0.0033,  0.0057,  ...,  0.0063, -0.0038,  0.0017],
        [-0.0030, -0.0051,  0.0050,  ...,  0.0029, -0.0033, -0.0028],
        [ 0.0064, -0.0021,  0.0027,  ...,  0.0064, -0.0049, -0.0033]],
       requires_grad=True)

In [33]:
print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")
print(f"Number of tags: {num_tags}")
print(f"Training hyperedges: {len(train_hyperedges)}")
print(f"Testing hyperedges: {len(test_hyperedges)}")

Number of users: 200948
Number of movies: 87585
Number of tags: 140980
Training hyperedges: 27200220
Testing hyperedges: 6800056


In [35]:
print(type(train_hyperedges))

<class 'torch.utils.data.dataset.Subset'>


In [37]:
print(train_hyperedges[:5])

tensor([[ 23030,   2305,     -1],
        [ 81283,   1856,     -1],
        [165781,    365,     -1],
        [123403,   2245,     -1],
        [ 17805,    133,     -1]])


In [39]:
from torch.utils.data import Dataset, DataLoader

class HyperedgeDataset(Dataset):
    def __init__(self, hyperedges):
        self.hyperedges = hyperedges

    def __len__(self):
        return len(self.hyperedges)

    def __getitem__(self, idx):
        return torch.tensor(self.hyperedges[idx])

# Create Dataset and DataLoader
dataset = HyperedgeDataset(train_hyperedges)
dataloader = DataLoader(dataset, batch_size=10000, shuffle=False)  # Adjust batch_size as needed

for batch in dataloader:
    batch[batch[:, 2] == -1, 2] = 0
    # Process the batch (e.g., update statistics or other operations)


  return torch.tensor(self.hyperedges[idx])


In [41]:
import torch

# Assuming 'train_hyperedges' is a Subset object
# Access the underlying dataset
original_dataset = train_hyperedges.dataset

# If 'original_dataset' is already a tensor, proceed with it
# Otherwise, you might need to convert it to tensor as shown earlier

# Extract data from the dataset and convert it to tensor if necessary
# For illustration, let's assume 'original_dataset' has data in a list of lists format
# Convert the data to tensor
train_hyperedges_np = np.array(original_dataset)  # Adjust if necessary
train_hyperedges = torch.tensor(train_hyperedges_np)

# Replace -1 with 0
train_hyperedges[train_hyperedges[:, 2] == -1, 2] = 0

# Extract max values
num_users = train_hyperedges[:, 0].max().item() + 1
num_movies = train_hyperedges[:, 1].max().item() + 1
num_tags = train_hyperedges[:, 2].max().item() + 1

print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")
print(f"Number of tags: {num_tags}")


Number of users: 200948
Number of movies: 87585
Number of tags: 140980


In [61]:
import torch
import torch.nn as nn

class HypergraphContrastiveModel(nn.Module):
    def __init__(self, num_users, num_movies, num_tags, embed_dim):
        super(HypergraphContrastiveModel, self).__init__()
        # Define embedding layers
        self.user_embeddings = nn.Embedding(num_users, embed_dim)
        self.movie_embeddings = nn.Embedding(num_movies, embed_dim)
        self.tag_embeddings = nn.Embedding(num_tags, embed_dim)
        
    def forward(self, hyperedges):
        users = hyperedges[:, 0]
        movies = hyperedges[:, 1]
        tags = hyperedges[:, 2] if hyperedges.size(1) > 2 else None
        
        # Get embeddings
        user_embeds = self.user_embeddings(users)
        movie_embeds = self.movie_embeddings(movies)
        tag_embeds = self.tag_embeddings(tags) if tags is not None else None
        
        # Debugging: print shapes of the embeddings
        print("User embeddings shape:", user_embeds.shape)
        print("Movie embeddings shape:", movie_embeds.shape)
        if tag_embeds is not None:
            print("Tag embeddings shape:", tag_embeds.shape)
        
        return user_embeds, movie_embeds, tag_embeds


In [67]:
import torch.nn.functional as F

def contrastive_loss(user_embeds, movie_embeds, tag_embeds, margin=1.0):
    # Compute cosine similarity
    user_movie_similarity = F.cosine_similarity(user_embeds.unsqueeze(1), movie_embeds.unsqueeze(0), dim=-1)
    user_tag_similarity = F.cosine_similarity(user_embeds.unsqueeze(1), tag_embeds.unsqueeze(0), dim=-1)
    
    # Define loss as a combination of similarity scores
    loss = F.mse_loss(user_movie_similarity, user_tag_similarity)
    
    return loss
def validate(model, val_hyperedges):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        user_embeds, movie_embeds, tag_embeds = model(val_hyperedges)
        val_loss = contrastive_loss(user_embeds, movie_embeds, tag_embeds)
    print(f"Validation Loss: {val_loss.item():.4f}")
    return val_loss.item()

# Training Function with Early Stopping
def train_contrastive_model(model, train_hyperedges, val_hyperedges, num_epochs=10, lr=0.001, batch_size=8, accumulation_steps=4, patience=3):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scaler = torch.cuda.amp.GradScaler()  # For mixed precision training
    
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        optimizer.zero_grad()
        
        for i in range(0, len(train_hyperedges), batch_size):
            hyperedge_batch = train_hyperedges[i:i+batch_size]
            
            with torch.cuda.amp.autocast():  # Mixed precision context
                user_embeds, movie_embeds, tag_embeds = model(hyperedge_batch)
                loss = contrastive_loss(user_embeds, movie_embeds, tag_embeds)
            
            scaler.scale(loss).backward()
            
            if (i // batch_size + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
        
        # Validate the model
        val_loss = validate(model, val_hyperedges)
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print("Early stopping due to no improvement.")
            break


In [69]:
embed_dim = 128  # Adjust as needed

model = HypergraphContrastiveModel(num_users, num_movies, num_tags, embed_dim)

# Example train and validation hyperedges tensors with reduced batch size
train_hyperedges = torch.randint(0, min(num_users, num_movies, num_tags), (32, 3))  # Simulated smaller batch size
val_hyperedges = torch.randint(0, min(num_users, num_movies, num_tags), (16, 3))  # Simulated validation set

# Train the model with validation and early stopping
train_contrastive_model(model, train_hyperedges, val_hyperedges)



User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
Epoch [1/10], Loss: 0.0160
User embeddings shape: torch.Size([16, 128])
Movie embeddings shape: torch.Size([16, 128])
Tag embeddings shape: torch.Size([16, 128])
Validation Loss: 0.0151
User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
User embeddings shape: torch.Size([8, 128])
Movie embeddings shape: torch.Size([8, 128])
Tag embeddings shape: torch.Size([8, 128])
User embeddings shape:

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def predict_labels(user_embeds, movie_embeds, threshold=0.5):
    # Compute cosine similarity between user and movie embeddings
    similarity = torch.cosine_similarity(user_embeds, movie_embeds, dim=-1)
    
    # Predict label based on the threshold
    predictions = (similarity > threshold).int()  # 1 if similarity > threshold, else 0
    
    return predictions

def evaluate_model(model, val_hyperedges, true_labels, threshold=0.5):
    model.eval()
    with torch.no_grad():
        user_embeds, movie_embeds, tag_embeds = model(val_hyperedges)
        predictions = predict_labels(user_embeds, movie_embeds, threshold)
    
    # Convert tensors to numpy arrays for compatibility with sklearn
    predictions = predictions.cpu().numpy()
    true_labels = true_labels.cpu().numpy()
    
    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return accuracy, precision, recall, f1

# Example usage
# true_labels should be the ground truth labels for your validation set
true_labels = torch.randint(0, 2, (val_hyperedges.size(0),))  # Simulated ground truth labels

# Evaluate the model on the validation set
evaluate_model(model, val_hyperedges, true_labels)


User embeddings shape: torch.Size([16, 128])
Movie embeddings shape: torch.Size([16, 128])
Tag embeddings shape: torch.Size([16, 128])
Accuracy: 0.8750
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.875, 0.0, 0.0, 0.0)

In [83]:
def predict_labels(user_embeddings, movie_embeddings, threshold=0.5):
    # Compute cosine similarity between user and movie embeddings
    similarity = torch.cosine_similarity(user_embeddings, movie_embeddings, dim=-1)
    print(f"Similarity stats: min={similarity.min().item()}, max={similarity.max().item()}, mean={similarity.mean().item()}")
    # Predict label based on the threshold
    predictions = (similarity > threshold).int()  # 1 if similarity > threshold, else 0
    return predictions


In [101]:
def evaluate_model(predictions, true_labels, thresholds):
    # Convert predictions and true_labels to numpy arrays or use PyTorch metrics
    predictions = predictions.numpy() if isinstance(predictions, torch.Tensor) else predictions
    true_labels = true_labels.numpy() if isinstance(true_labels, torch.Tensor) else true_labels
    
    accuracy = []
    precision = []
    recall = []
    f1 = []

    for threshold in thresholds:
        # Binarize predictions based on the threshold
        binarized_predictions = (predictions > threshold).astype(int)

        # Compute metrics
        accuracy.append((binarized_predictions == true_labels).mean())
        tp = ((binarized_predictions == 1) & (true_labels == 1)).sum()
        fp = ((binarized_predictions == 1) & (true_labels == 0)).sum()
        fn = ((binarized_predictions == 0) & (true_labels == 1)).sum()

        precision.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
        recall.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
        f1.append((2 * precision[-1] * recall[-1]) / (precision[-1] + recall[-1]) if (precision[-1] + recall[-1]) > 0 else 0)

    return accuracy, precision, recall, f1


In [105]:
import torch

# Define or import the evaluate_model function
def evaluate_model(predictions, true_labels, thresholds):
    # Convert predictions and true_labels to numpy arrays or use PyTorch metrics
    predictions = predictions.numpy() if isinstance(predictions, torch.Tensor) else predictions
    true_labels = true_labels.numpy() if isinstance(true_labels, torch.Tensor) else true_labels
    
    accuracy = []
    precision = []
    recall = []
    f1 = []

    for threshold in thresholds:
        # Binarize predictions based on the threshold
        binarized_predictions = (predictions > threshold).astype(int)

        # Compute metrics
        accuracy.append((binarized_predictions == true_labels).mean())
        tp = ((binarized_predictions == 1) & (true_labels == 1)).sum()
        fp = ((binarized_predictions == 1) & (true_labels == 0)).sum()
        fn = ((binarized_predictions == 0) & (true_labels == 1)).sum()

        precision.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
        recall.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
        f1.append((2 * precision[-1] * recall[-1]) / (precision[-1] + recall[-1]) if (precision[-1] + recall[-1]) > 0 else 0)

    return accuracy, precision, recall, f1

# Function to predict labels
def predict_labels(user_indices, movie_indices, model, threshold=0.0):
    user_embeddings, movie_embeddings, _ = model(user_indices, movie_indices)
    similarity = torch.mm(user_embeddings, movie_embeddings.t())  # (batch_size, batch_size)
    print(f"Adjusted Similarity stats: min={similarity.min().item()}, max={similarity.max().item()}, mean={similarity.mean().item()}")
    predictions = (similarity > threshold).int()
    return predictions

# Example validation indices
val_user_indices = torch.arange(16)
val_movie_indices = torch.arange(16)

# Example model
model = HypergraphContrastiveModel(num_users, num_movies, num_tags, embed_dim=128)

# Example usage with adjusted threshold
for threshold in [-0.1, 0.0, 0.1]:
    predictions = predict_labels(val_user_indices, val_movie_indices, model, threshold)
    # Assume true_labels is defined and corresponds to the same shape as predictions
    true_labels = torch.randint(0, 2, (16, 16))  # Sample true labels for demonstration
    accuracy, precision, recall, f1 = evaluate_model(predictions, true_labels, [threshold])
    print(f"Threshold: {threshold}, Accuracy: {accuracy[0]:.4f}, Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1 Score: {f1[0]:.4f}")


Adjusted Similarity stats: min=-41.15969467163086, max=36.23440933227539, mean=0.3133211135864258
Threshold: -0.1, Accuracy: 0.5000, Precision: 0.5000, Recall: 1.0000, F1 Score: 0.6667
Adjusted Similarity stats: min=-41.15969467163086, max=36.23440933227539, mean=0.3133211135864258
Threshold: 0.0, Accuracy: 0.4336, Precision: 0.3664, Recall: 0.4364, F1 Score: 0.3983
Adjusted Similarity stats: min=-41.15969467163086, max=36.23440933227539, mean=0.3133211135864258
Threshold: 0.1, Accuracy: 0.5430, Precision: 0.5538, Recall: 0.5496, F1 Score: 0.5517


In [107]:
import torch
import torch.nn as nn
import torch.optim as optim

class ImprovedHypergraphContrastiveModel(nn.Module):
    def __init__(self, num_users, num_movies, num_tags, embed_dim):
        super(ImprovedHypergraphContrastiveModel, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embed_dim)
        self.movie_embeddings = nn.Embedding(num_movies, embed_dim)
        self.tag_embeddings = nn.Embedding(num_tags, embed_dim)
        
        # Additional layers
        self.fc = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, hyperedges):
        users = hyperedges[:, 0]
        movies = hyperedges[:, 1]
        tags = hyperedges[:, 2] if hyperedges.size(1) > 2 else None
        
        user_embeds = self.user_embeddings(users)
        movie_embeds = self.movie_embeddings(movies)
        tag_embeds = self.tag_embeddings(tags) if tags is not None else None
        
        # Additional processing
        user_embeds = self.fc(user_embeds)
        movie_embeds = self.fc(movie_embeds)
        user_embeds = self.dropout(user_embeds)
        movie_embeds = self.dropout(movie_embeds)
        
        return user_embeds, movie_embeds, tag_embeds

def train_improved_model(model, train_hyperedges, num_epochs=10, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()  # Example loss function, adjust as needed
    
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        user_embeds, movie_embeds, _ = model(train_hyperedges)
        
        # Compute loss
        loss = contrastive_loss(user_embeds, movie_embeds)  # Define contrastive_loss as needed
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

def contrastive_loss(user_embeds, movie_embeds, margin=1.0):
    # Define contrastive loss function
    user_movie_similarity = torch.cosine_similarity(user_embeds, movie_embeds)
    loss = torch.mean(torch.clamp(margin - user_movie_similarity, min=0.0))
    return loss

# Example usage with improved model
embed_dim = 256  # Try different embedding sizes
model = ImprovedHypergraphContrastiveModel(num_users, num_movies, num_tags, embed_dim)
train_improved_model(model, train_hyperedges)


Epoch 1/10, Loss: 1.001325011253357
Epoch 2/10, Loss: 0.9710863828659058
Epoch 3/10, Loss: 0.9536166787147522
Epoch 4/10, Loss: 0.9417213797569275
Epoch 5/10, Loss: 0.910735011100769
Epoch 6/10, Loss: 0.867866039276123
Epoch 7/10, Loss: 0.8692584037780762
Epoch 8/10, Loss: 0.8301055431365967
Epoch 9/10, Loss: 0.8074963688850403
Epoch 10/10, Loss: 0.807449221611023


In [111]:
def get_indices_and_labels(hyperedges):
    # Assuming the hyperedges tensor has shape [N, 3]
    user_indices = hyperedges[:, 0]
    movie_indices = hyperedges[:, 1]
    true_labels = hyperedges[:, 2]
    return user_indices, movie_indices, true_labels

# Convert Subset objects to tensors if needed
def subset_to_tensor(subset, data):
    indices = subset.indices
    return data[indices]

# Get validation data
val_user_indices, val_movie_indices, val_true_labels = get_indices_and_labels(subset_to_tensor(test_hyperedges, hyperedges_tensor))

# Now, you can evaluate your model
accuracy, precision, recall, f1 = evaluate_model(model, val_user_indices, val_movie_indices, val_true_labels, threshold=0.1)
print(f"Threshold: 0.1, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

NameError: name 'hyperedges_tensor' is not defined