# 🤖 VEP Model Setup Tutorial

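This tutorial demonstrates how to initialize and configure a genomic foundation model for Variant Effect Prediction (VEP): importing the dependencies, defining the model configuration, loading the tokenizer and model, and running a quick sanity check on sample sequences.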

## 1. Imports and Model Configuration

In [None]:
# Import libraries
import torch
from omnigenbench import (
    OmniTokenizer,
    OmniModelForSequenceClassification
)
from autocuda import auto_cuda

# Reaching this point means the imports above succeeded. Also report the
# installed PyTorch version and whether PyTorch can see a CUDA device.
print("✅ Libraries imported successfully!")
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🎯 CUDA available: {torch.cuda.is_available()}")

In [None]:
# Model configuration
class VEPModelConfig:
    """Settings used when loading and sanity-checking the VEP model.

    Every value is a plain class attribute, so it can be read either from
    the class itself or from an instance.
    """

    # Checkpoint identifier handed to ``from_pretrained``.
    MODEL_NAME: str = "yangheng/OmniGenome-52M"

    # Binary classification head: functional vs. neutral.
    NUM_LABELS: int = 2

    # Extra forward-pass outputs requested from the model.
    OUTPUT_HIDDEN_STATES: bool = True  # Needed for embedding extraction
    OUTPUT_ATTENTIONS: bool = False  # Optional; only for attention analysis

    # Device chosen by autocuda; evaluated once, when this class body runs.
    DEVICE = auto_cuda()

# Instantiate the configuration. All settings are class attributes, so the
# instance simply exposes the values defined on VEPModelConfig.
config = VEPModelConfig()
print(f"⚙️ Model configuration: {config.MODEL_NAME}")
print(f"📱 Target device: {config.DEVICE}")

## 2. Load Tokenizer and Model

In [None]:
# --- Tokenizer ---------------------------------------------------------------
print("🔄 Loading tokenizer...")
tokenizer = OmniTokenizer.from_pretrained(
    config.MODEL_NAME,
    trust_remote_code=True,
)
print("✅ Tokenizer loaded!")

# --- Genomic foundation model ------------------------------------------------
print("🔄 Loading genomic foundation model...")

# Keyword arguments forwarded to from_pretrained, taken from the config.
model_kwargs = {
    "num_labels": config.NUM_LABELS,
    "trust_remote_code": True,
    "output_hidden_states": config.OUTPUT_HIDDEN_STATES,
    "output_attentions": config.OUTPUT_ATTENTIONS,
}
model = OmniModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    **model_kwargs,
)

# Place the model on the configured device and switch to evaluation mode.
model.to(config.DEVICE)
model.eval()

# Summarize what was loaded; the parameter count is reported in millions.
print("✅ Model loaded successfully!")
param_count_millions = sum(p.numel() for p in model.parameters()) / 1e6
print(f"  🧠 Model: {config.MODEL_NAME}")
print(f"  📊 Parameters: {param_count_millions:.1f}M")
print(f"  📱 Device: {config.DEVICE}")
print(f"  🎯 Labels: {config.NUM_LABELS} (Binary classification)")

## 3. Model Testing

In [None]:
# Sanity-check the loaded model on a reference/variant sequence pair.
# The two sequences differ by one base (A->G at 0-based position 12).
test_sequences = [
    "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG",  # Reference
    "ATCGATCGATCGGTCGATCGATCGATCGATCGATCGATCG",  # Variant (A->G)
]

print("🧪 Testing model with sample sequences...")

# Tokenizer options applied identically to every sequence.
tokenize_kwargs = {
    "padding": True,
    "truncation": True,
    "max_length": 512,
    "return_tensors": "pt",
}

for seq_number, sequence in enumerate(test_sequences, start=1):
    # Encode the sequence, then move the encoded batch to the target device.
    encoded = tokenizer(sequence, **tokenize_kwargs)
    inputs = encoded.to(config.DEVICE)

    # Forward pass with gradient tracking disabled.
    with torch.no_grad():
        outputs = model(**inputs)

    # Class scores, their softmax over the last dimension, and the final
    # entry of the returned hidden states.
    logits = outputs.logits
    probabilities = logits.softmax(dim=-1)
    hidden_states = outputs.hidden_states[-1]  # Last layer

    # Show only the first 30 bases of the sequence to keep the output short.
    print(f"\n  Sequence {seq_number}: {sequence[:30]}...")
    print(f"    🎯 Logits: {logits.cpu().numpy().flatten()}")
    print(f"    📊 Probabilities: {probabilities.cpu().numpy().flatten()}")
    print(f"    🧠 Hidden state shape: {hidden_states.shape}")

print("\n✅ Model testing complete!")
print("🎉 Model is ready for variant effect prediction!")