In [1]:
# FORCE FULL CPU UTILIZATION
import os
os.environ["MKL_NUM_THREADS"] = "20"
os.environ["OMP_NUM_THREADS"] = "20"
os.environ["NUMEXPR_NUM_THREADS"] = "20"
os.environ["OPENBLAS_NUM_THREADS"] = "20"


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.nn import Node2Vec

import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Device: cuda
GPU: NVIDIA GeForce RTX 5090


## 1. Load Data

In [2]:
data_dir = '../data/'

edge_index = torch.load(data_dir + 'edge_index.pt')
node_features = torch.load(data_dir + 'node_features.pt')
y = torch.load(data_dir + 'y.pt')
train_idx = torch.load(data_dir + 'train_idx.pt')
test_idx = torch.load(data_dir + 'test_idx.pt')

num_nodes = node_features.shape[0]
num_edges = edge_index.shape[1]
num_labels = y.shape[1]

print("="*80)
print("DATA LOADED")
print("="*80)
print(f"Nodes: {num_nodes:,}")
print(f"Edges: {num_edges:,}")
print(f"Labels: {num_labels}")
print(f"Train: {len(train_idx):,} | Test: {len(test_idx):,}")
print("="*80)

DATA LOADED
Nodes: 19,765
Edges: 1,554,790
Labels: 305
Train: 5,046 | Test: 3,365


## 2. Train/Val Split

In [3]:
# 80/20 split
train_size = int(0.8 * len(train_idx))
perm = torch.randperm(len(train_idx))
train_indices = train_idx[perm[:train_size]]
val_indices = train_idx[perm[train_size:]]

# Create masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_idx] = True

print(f"Train: {train_mask.sum()} | Val: {val_mask.sum()} | Test: {test_mask.sum()}")

Train: 4036 | Val: 1010 | Test: 3365


## 3. Train Node2Vec Embeddings

**Configuration:**
- Embedding dims: 64, 128 (test both)
- Walk length: 20
- Context size: 10
- Walks per node: 10
- p=1, q=1 (balanced exploration)

**Note:** Using num_workers=0 for Windows compatibility

In [4]:
def train_node2vec(edge_index, num_nodes, embedding_dim=64, epochs=100):
    """
    Train Node2Vec embeddings.
    
    Args:
        edge_index: Graph edges [2, num_edges]
        num_nodes: Number of nodes
        embedding_dim: Embedding dimension
        epochs: Training epochs
    
    Returns:
        Embeddings tensor [num_nodes, embedding_dim]
    """
    print(f"\n{'='*80}")
    print(f"TRAINING NODE2VEC ({embedding_dim}D)")
    print(f"{'='*80}")
    
    # Initialize Node2Vec
    model = Node2Vec(
        edge_index=edge_index,
        embedding_dim=embedding_dim,
        walk_length=20,
        context_size=10,
        walks_per_node=10,
        p=1.0,  # Return parameter
        q=1.0,  # In-out parameter
        num_negative_samples=1,
        sparse=True
    ).to(device)
    
    # Create data loader (num_workers=0 for Windows)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=0)
    
    # Optimizer (sparse optimizer for sparse embeddings)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
    
    # Training loop
    model.train()
    total_loss = 0
    
    for epoch in tqdm(range(1, epochs + 1), desc=f"Node2Vec {embedding_dim}d"):
        epoch_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        total_loss += epoch_loss / len(loader)
        
        if epoch % 20 == 0 or epoch == 1:
            avg_loss = epoch_loss / len(loader)
            print(f"  Epoch {epoch:3d} | Loss: {avg_loss:.4f}")
    
    # Extract embeddings
    model.eval()
    with torch.no_grad():
        embeddings = model(torch.arange(num_nodes, device=device))
    
    print(f"\n‚úì Node2Vec {embedding_dim}d training complete!")
    print(f"  Final loss: {total_loss / epochs:.4f}")
    print(f"  Embeddings shape: {embeddings.shape}")
    print(f"{'='*80}")
    
    return embeddings.cpu()

In [5]:
# Train both 64d and 128d embeddings
embeddings_64d = train_node2vec(edge_index, num_nodes, embedding_dim=64, epochs=100)
embeddings_128d = train_node2vec(edge_index, num_nodes, embedding_dim=128, epochs=100)

# Save for reuse
torch.save(embeddings_64d, '../data/node2vec_64d_draft8.pt')
torch.save(embeddings_128d, '../data/node2vec_128d_draft8.pt')
print("\n‚úì Embeddings saved for reuse in Draft8_GAT_Enhanced")


TRAINING NODE2VEC (64D)


Node2Vec 64d:   0%|          | 0/100 [00:00<?, ?it/s]

  Epoch   1 | Loss: 4.2412
  Epoch  20 | Loss: 1.0898
  Epoch  40 | Loss: 1.0871
  Epoch  60 | Loss: 1.0868
  Epoch  80 | Loss: 1.0865
  Epoch 100 | Loss: 1.0855

‚úì Node2Vec 64d training complete!
  Final loss: 1.1306
  Embeddings shape: torch.Size([19765, 64])

TRAINING NODE2VEC (128D)


Node2Vec 128d:   0%|          | 0/100 [00:00<?, ?it/s]

  Epoch   1 | Loss: 6.7013
  Epoch  20 | Loss: 1.0912
  Epoch  40 | Loss: 1.0892
  Epoch  60 | Loss: 1.0886
  Epoch  80 | Loss: 1.0881
  Epoch 100 | Loss: 1.0871

‚úì Node2Vec 128d training complete!
  Final loss: 1.1760
  Embeddings shape: torch.Size([19765, 128])

‚úì Embeddings saved for reuse in Draft8_GAT_Enhanced


---

### After this, proceed to train simple MLP classifier on the embeddings;
### Resutls, were not satisfactory, so further tuning or alternative methods may be needed.

---

## 4. Simple MLP Classifier

**Architecture:**
- Input: Node2Vec embeddings (64d or 128d)
- Hidden: 256 ‚Üí 256 (2 layers)
- Output: 305 disease labels
- Activation: ReLU
- Dropout: 0.3

In [6]:
class SimpleMLP(nn.Module):
    """
    Simple 2-layer MLP for structure-only baseline.
    """
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, out_dim)
        self.dropout = dropout
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.fc3(x)
        return x

print("‚úì SimpleMLP defined")

‚úì SimpleMLP defined


## 5. Training Function

In [7]:
def evaluate_ap(y_true, y_pred, mask):
    """Compute micro-averaged Average Precision."""
    y_true_np = y_true[mask].cpu().numpy().ravel()
    y_pred_np = y_pred[mask].cpu().detach().numpy().ravel()
    return average_precision_score(y_true_np, y_pred_np, average='micro')


def train_mlp(embeddings, y, train_mask, val_mask, embedding_name="64d", epochs=200):
    """
    Train MLP on Node2Vec embeddings.
    
    Args:
        embeddings: Node2Vec embeddings [num_nodes, embedding_dim]
        y: Labels [num_nodes, num_labels]
        train_mask: Training node mask
        val_mask: Validation node mask
        embedding_name: Name for logging
        epochs: Training epochs
    
    Returns:
        trained model, best_val_ap, final predictions
    """
    print(f"\n{'='*80}")
    print(f"TRAINING MLP ON NODE2VEC {embedding_name}")
    print(f"{'='*80}")
    
    # Move to device
    x = embeddings.to(device)
    y_device = y.to(device)
    
    # Initialize model
    in_dim = embeddings.shape[1]
    hidden_dim = 256
    out_dim = num_labels
    
    model = SimpleMLP(in_dim, hidden_dim, out_dim, dropout=0.3).to(device)
    
    print(f"\nModel: SimpleMLP")
    print(f"  Input dim: {in_dim}")
    print(f"  Hidden dim: {hidden_dim}")
    print(f"  Output dim: {out_dim}")
    print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Compute class weights (same as Draft5)
    pos_count = y[train_mask].sum(dim=0).float()
    neg_count = train_mask.sum() - pos_count
    pos_weight = (neg_count / pos_count.clamp(min=1)).clamp(max=50).to(device)
    
    # Setup training
    optimizer = Adam(model.parameters(), lr=0.003, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=10
    )
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    
    # Training loop
    best_val_ap = 0
    best_state = None
    patience = 20
    patience_counter = 0
    
    print(f"\nTraining for up to {epochs} epochs...\n")
    
    for epoch in tqdm(range(1, epochs + 1), desc=f"Training {embedding_name}"):
        # Train
        model.train()
        optimizer.zero_grad()
        
        out = model(x)
        loss = criterion(out[train_mask], y_device[train_mask].float())
        
        loss.backward()
        optimizer.step()
        
        # Evaluate
        if epoch % 10 == 0 or epoch == 1:
            model.eval()
            with torch.no_grad():
                out = model(x)
                probs = torch.sigmoid(out)
                
                train_ap = evaluate_ap(y_device, probs, train_mask)
                val_ap = evaluate_ap(y_device, probs, val_mask)
            
            scheduler.step(val_ap)
            
            if epoch % 20 == 0 or epoch == 1:
                print(f"Epoch {epoch:3d} | Loss: {loss.item():.4f} | "
                      f"Train AP: {train_ap:.4f} | Val AP: {val_ap:.4f}")
            
            # Save best
            if val_ap > best_val_ap:
                best_val_ap = val_ap
                best_state = {k: v.cpu() for k, v in model.state_dict().items()}
                patience_counter = 0
            else:
                patience_counter += 1
            
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch}")
                break
    
    # Load best model
    model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
    
    # Final predictions
    model.eval()
    with torch.no_grad():
        final_out = model(x)
        final_probs = torch.sigmoid(final_out)
    
    print(f"\n‚úì MLP training complete!")
    print(f"‚úì Best validation AP: {best_val_ap:.4f}")
    print(f"{'='*80}")
    
    return model, best_val_ap, final_probs

## 6. Train MLPs on Both Embeddings

In [8]:
# Train on 64d embeddings
mlp_64d, val_ap_64d, probs_64d = train_mlp(
    embeddings_64d, y, train_mask, val_mask, 
    embedding_name="64d", epochs=200
)

# Train on 128d embeddings
mlp_128d, val_ap_128d, probs_128d = train_mlp(
    embeddings_128d, y, train_mask, val_mask,
    embedding_name="128d", epochs=200
)


TRAINING MLP ON NODE2VEC 64d

Model: SimpleMLP
  Input dim: 64
  Hidden dim: 256
  Output dim: 305
  Parameters: 161,841

Training for up to 200 epochs...



Training 64d:   0%|          | 0/200 [00:00<?, ?it/s]

Epoch   1 | Loss: 1.3090 | Train AP: 0.0359 | Val AP: 0.0341
Epoch  20 | Loss: 1.0408 | Train AP: 0.1143 | Val AP: 0.0782
Epoch  40 | Loss: 0.9174 | Train AP: 0.1905 | Val AP: 0.0816
Epoch  60 | Loss: 0.8344 | Train AP: 0.2732 | Val AP: 0.0840
Epoch  80 | Loss: 0.7828 | Train AP: 0.3214 | Val AP: 0.0853
Epoch 100 | Loss: 0.7475 | Train AP: 0.3603 | Val AP: 0.0844
Epoch 120 | Loss: 0.7196 | Train AP: 0.3859 | Val AP: 0.0829
Epoch 140 | Loss: 0.6873 | Train AP: 0.4105 | Val AP: 0.0839
Epoch 160 | Loss: 0.6700 | Train AP: 0.4313 | Val AP: 0.0832
Epoch 180 | Loss: 0.6514 | Train AP: 0.4522 | Val AP: 0.0844
Epoch 200 | Loss: 0.6433 | Train AP: 0.4698 | Val AP: 0.0845

‚úì MLP training complete!
‚úì Best validation AP: 0.0854

TRAINING MLP ON NODE2VEC 128d

Model: SimpleMLP
  Input dim: 128
  Hidden dim: 256
  Output dim: 305
  Parameters: 178,225

Training for up to 200 epochs...



Training 128d:   0%|          | 0/200 [00:00<?, ?it/s]

Epoch   1 | Loss: 1.2977 | Train AP: 0.0378 | Val AP: 0.0353
Epoch  20 | Loss: 1.0002 | Train AP: 0.1138 | Val AP: 0.0676
Epoch  40 | Loss: 0.8215 | Train AP: 0.2687 | Val AP: 0.0742
Epoch  60 | Loss: 0.7247 | Train AP: 0.3698 | Val AP: 0.0736
Epoch  80 | Loss: 0.6664 | Train AP: 0.4297 | Val AP: 0.0722
Epoch 100 | Loss: 0.6339 | Train AP: 0.4760 | Val AP: 0.0728
Epoch 120 | Loss: 0.5946 | Train AP: 0.5064 | Val AP: 0.0725
Epoch 140 | Loss: 0.5743 | Train AP: 0.5279 | Val AP: 0.0722
Epoch 160 | Loss: 0.5560 | Train AP: 0.5536 | Val AP: 0.0748
Epoch 180 | Loss: 0.5393 | Train AP: 0.5745 | Val AP: 0.0710
Epoch 200 | Loss: 0.5236 | Train AP: 0.5882 | Val AP: 0.0698

‚úì MLP training complete!
‚úì Best validation AP: 0.0758


## 7. Results & Analysis

In [9]:
print("\n" + "="*80)
print("STRUCTURE-ONLY BASELINE RESULTS")
print("="*80)

print("\nüìä Validation Micro-AP:")
print(f"  Node2Vec 64d + MLP:  {val_ap_64d:.4f}")
print(f"  Node2Vec 128d + MLP: {val_ap_128d:.4f}")

print("\nüìà Comparison to Existing Methods (from previous drafts):")
print(f"  GAT (Draft4):        ~0.051 (Kaggle)")
print(f"  LP+C&S (Draft5):     ~0.056 (Kaggle)")
print(f"  Best on leaderboard: ~0.064")

# Determine best embedding
if val_ap_128d > val_ap_64d:
    best_embedding_dim = 128
    best_val_ap = val_ap_128d
    best_probs = probs_128d
    best_embeddings = embeddings_128d
else:
    best_embedding_dim = 64
    best_val_ap = val_ap_64d
    best_probs = probs_64d
    best_embeddings = embeddings_64d

print(f"\nüèÜ BEST: Node2Vec {best_embedding_dim}d + MLP ‚Üí {best_val_ap:.4f}")

print("\nüí° INTERPRETATION:")
if best_val_ap >= 0.055:
    print("  ‚úì Structure-only baseline is STRONG (‚â•0.055)")
    print("  ‚Üí Graph structure carries most of the signal")
    print("  ‚Üí Focus on methods that exploit structure:")
    print("    ‚Ä¢ GNNs with structural embeddings")
    print("    ‚Ä¢ Concatenate Node2Vec + bio features in GAT")
    print("    ‚Ä¢ Apply C&S post-processing")
elif best_val_ap >= 0.045:
    print("  ‚úì Structure-only baseline is MODERATE (0.045-0.055)")
    print("  ‚Üí Structure + features both important")
    print("  ‚Üí Recommended approach:")
    print("    ‚Ä¢ GAT with [bio features || Node2Vec]")
    print("    ‚Ä¢ Keep LP+C&S as strong baseline")
else:
    print("  ‚úì Structure-only baseline is WEAK (<0.045)")
    print("  ‚Üí LP+C&S is the main hero")
    print("  ‚Üí Focus on label smoothing:")
    print("    ‚Ä¢ Optimize C&S hyperparameters")
    print("    ‚Ä¢ Try label reuse tricks")

print("\n" + "="*80)


STRUCTURE-ONLY BASELINE RESULTS

üìä Validation Micro-AP:
  Node2Vec 64d + MLP:  0.0854
  Node2Vec 128d + MLP: 0.0758

üìà Comparison to Existing Methods (from previous drafts):
  GAT (Draft4):        ~0.051 (Kaggle)
  LP+C&S (Draft5):     ~0.056 (Kaggle)
  Best on leaderboard: ~0.064

üèÜ BEST: Node2Vec 64d + MLP ‚Üí 0.0854

üí° INTERPRETATION:
  ‚úì Structure-only baseline is STRONG (‚â•0.055)
  ‚Üí Graph structure carries most of the signal
  ‚Üí Focus on methods that exploit structure:
    ‚Ä¢ GNNs with structural embeddings
    ‚Ä¢ Concatenate Node2Vec + bio features in GAT
    ‚Ä¢ Apply C&S post-processing



## 8. Generate Test Predictions (If Good Enough)

In [10]:
# Only generate submission if validation AP suggests it's competitive
SUBMIT_THRESHOLD = 0.053  # Only if better than current GAT

if best_val_ap >= SUBMIT_THRESHOLD:
    print(f"\n{'='*80}")
    print("GENERATING SUBMISSION (Val AP ‚â• 0.053)")
    print(f"{'='*80}")
    
    # Test predictions
    test_probs = best_probs[test_mask].cpu().numpy()
    
    # Create submission
    submission = pd.DataFrame(test_probs)
    submission.to_csv('../Submissions/submission_Draft8_Node2Vec_MLP.csv', index=False)
    
    print(f"\n‚úì Submission saved: submission_Draft8_Node2Vec_MLP.csv")
    print(f"  Embedding: Node2Vec {best_embedding_dim}d")
    print(f"  Validation AP: {best_val_ap:.4f}")
    print(f"  Mean prediction: {test_probs.mean():.4f}")
    print(f"\nüí° Consider submitting if:")
    print(f"  ‚Ä¢ Val AP > your current best ({best_val_ap:.4f})")
    print(f"  ‚Ä¢ Mean prediction looks reasonable (~0.02-0.03)")
    print(f"{'='*80}")
else:
    print(f"\n{'='*80}")
    print("NOT GENERATING SUBMISSION")
    print(f"{'='*80}")
    print(f"  Validation AP: {best_val_ap:.4f} < {SUBMIT_THRESHOLD:.3f}")
    print(f"  ‚Üí Structure-only is weaker than GAT")
    print(f"  ‚Üí But we'll use Node2Vec embeddings in Draft8_GAT_Enhanced!")
    print(f"{'='*80}")


GENERATING SUBMISSION (Val AP ‚â• 0.053)

‚úì Submission saved: submission_Draft8_Node2Vec_MLP.csv
  Embedding: Node2Vec 64d
  Validation AP: 0.0854
  Mean prediction: 0.3111

üí° Consider submitting if:
  ‚Ä¢ Val AP > your current best (0.0854)
  ‚Ä¢ Mean prediction looks reasonable (~0.02-0.03)


## 9. Summary & Next Steps

In [11]:
print("\n" + "="*80)
print("DRAFT8 NODE2VEC BASELINE - SUMMARY")
print("="*80)

print("\n‚úÖ COMPLETED:")
print("  1. Trained Node2Vec embeddings (64d, 128d)")
print("  2. Trained SimpleMLP on structure-only")
print("  3. Evaluated validation micro-AP")
print(f"  4. Best result: {best_val_ap:.4f} (Node2Vec {best_embedding_dim}d)")

print("\nüìÅ FILES SAVED:")
print("  ‚Ä¢ node2vec_64d_draft8.pt")
print("  ‚Ä¢ node2vec_128d_draft8.pt")
if best_val_ap >= SUBMIT_THRESHOLD:
    print("  ‚Ä¢ submission_Draft8_Node2Vec_MLP.csv")

print("\n‚û°Ô∏è NEXT STEPS:")
print("  1. Open Draft8_GAT_Enhanced.ipynb")
print("  2. Test GAT with 3 input configurations:")
print("     a) Bio features + log-degree (baseline)")
print("     b) Node2Vec only")
print("     c) [Bio features || log-degree || Node2Vec] (recommended)")
print("  3. Pick best variant on validation")
print("  4. Apply C&S in Draft8_BestModel_CS.ipynb")

print("\n" + "="*80)
print("Ready to proceed to Draft8_GAT_Enhanced! üöÄ")
print("="*80)


DRAFT8 NODE2VEC BASELINE - SUMMARY

‚úÖ COMPLETED:
  1. Trained Node2Vec embeddings (64d, 128d)
  2. Trained SimpleMLP on structure-only
  3. Evaluated validation micro-AP
  4. Best result: 0.0854 (Node2Vec 64d)

üìÅ FILES SAVED:
  ‚Ä¢ node2vec_64d_draft8.pt
  ‚Ä¢ node2vec_128d_draft8.pt
  ‚Ä¢ submission_Draft8_Node2Vec_MLP.csv

‚û°Ô∏è NEXT STEPS:
  1. Open Draft8_GAT_Enhanced.ipynb
  2. Test GAT with 3 input configurations:
     a) Bio features + log-degree (baseline)
     b) Node2Vec only
     c) [Bio features || log-degree || Node2Vec] (recommended)
  3. Pick best variant on validation
  4. Apply C&S in Draft8_BestModel_CS.ipynb

Ready to proceed to Draft8_GAT_Enhanced! üöÄ
