# 📊 VEP Data Preparation Tutorial

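This tutorial focuses on data preparation for Variant Effect Prediction (VEP) analysis using OmniGenBench's enhanced dataset loading capabilities. It walks through three steps: defining the dataset and model configuration, loading the tokenizer and the VEP dataset (with a local cache directory), and inspecting individual samples and batches.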

## 1. Setup and Configuration

In [None]:
# Import libraries
# NOTE(review): os, pandas (pd) and Path are not referenced in the cells
# visible in this part of the tutorial — confirm whether later cells need them.
import os
import pandas as pd
from pathlib import Path
# OmniGenBench names used by the cells below: the tokenizer wrapper and the
# sequence-classification dataset class through which the VEP data is loaded.
from omnigenbench import (
    OmniTokenizer,
    OmniDatasetForSequenceClassification
)

# Only reached when every import above succeeded.
print("✅ Libraries imported successfully!")

In [None]:
# Settings shared by the cells of the VEP data-preparation walkthrough
class VEPDataConfig:
    """Constants that drive tokenizer and dataset loading in this tutorial."""

    # Model identifier handed to OmniTokenizer.from_pretrained
    MODEL_NAME = "yangheng/OmniGenome-52M"
    # Dataset identifier handed to the dataset loader as dataset_name
    DATASET_NAME = "yangheng/variant_effect_prediction"

    # Values forwarded to the dataset loader as max_length and cache_dir
    MAX_LENGTH = 512
    CACHE_DIR = "vep_data_cache"

    # Quick testing parameters: subset size for a fast analysis run
    SAMPLE_SIZE = 100


config = VEPDataConfig()
print("⚙️ Configuration set for VEP data preparation")

## 2. Enhanced Data Loading

In [None]:
# Step 1: obtain the tokenizer for the configured model.
# NOTE(review): trust_remote_code=True presumably permits running code shipped
# with the model repository (as in Hugging Face Transformers) — confirm, and
# only point MODEL_NAME at repositories you trust.
print("🔄 Loading tokenizer...")
tokenizer = OmniTokenizer.from_pretrained(
    config.MODEL_NAME,
    trust_remote_code=True,
)

# Step 2: load the VEP dataset through the OmniDataset Hugging Face loader.
# cache_dir is set to config.CACHE_DIR (presumably so repeated runs reuse
# already-downloaded data — confirm against the OmniGenBench docs).
print("📊 Loading VEP dataset with automatic caching...")
datasets = OmniDatasetForSequenceClassification.from_huggingface(
    dataset_name=config.DATASET_NAME,
    tokenizer=tokenizer,
    max_length=config.MAX_LENGTH,
    cache_dir=config.CACHE_DIR,
)

# Step 3: report how many entries each returned split contains.
print("📋 VEP Dataset loaded:")
for split, dataset in datasets.items():
    print(
        f"  📊 {split.title()}: {len(dataset)} variants"
    )

print("✅ Data preparation complete!")

## 3. Data Exploration

In [None]:
# Look at one element of the test split to see which fields a variant carries
test_dataset = datasets["test"]
sample_item = test_dataset[0]

print("🔍 Sample variant structure:")
for key, value in sample_item.items():
    # Show the Python type plus the first 100 characters of the value's text form
    print(
        f"  {key}: {type(value)} - {str(value)[:100]}..."
    )

# Build a DataLoader over the test split and pull its first batch
dataloader = test_dataset.get_dataloader(
    batch_size=8,
    shuffle=False,
)
sample_batch = next(iter(dataloader))

print("\n📦 Sample batch structure:")
# assumes every batch entry exposes a .shape attribute — TODO confirm
for key, tensor in sample_batch.items():
    print(
        f"  {key}: {tensor.shape}"
    )

print("✅ Data exploration complete!")