## 1. Setup and Installation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Import UNIDQ components
from unidq import UNIDQ, UNIDQConfig, MultiTaskDataset, UNIDQTrainer
from unidq.utils import set_seed, get_device, create_synthetic_errors
from unidq.evaluation import evaluate_all_tasks

# Seed through the UNIDQ helper before any random data is generated below.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

# Report the environment this run is using.
print("Setup complete!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {get_device()}")

## 2. Load and Prepare Data

In [None]:
# Build a small, fully synthetic employee table that serves as the "clean"
# data for the rest of the tutorial.
# The row count is defined once so every column is guaranteed the same length.
n_rows = 100
employee_ids = range(1, n_rows + 1)

data = {
    'employee_id': employee_ids,
    'name': [f'Employee_{i}' for i in employee_ids],
    # np.random.randint excludes the upper bound: ages 22-64, salaries
    # 40000-119999, experience 0-29 years.
    'age': np.random.randint(22, 65, n_rows),
    'department': np.random.choice(['IT', 'Sales', 'HR', 'Finance'], n_rows),
    'salary': np.random.randint(40000, 120000, n_rows),
    'years_experience': np.random.randint(0, 30, n_rows),
}

df = pd.DataFrame(data)

print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

## 3. Create Synthetic Quality Issues

For demonstration, we'll introduce synthetic data quality issues into the clean dataset using `create_synthetic_errors` with an error rate of 0.1.

In [None]:
# Produce the corrupted version of the table with the UNIDQ helper
# (error_rate=0.1 is passed straight through to it).
df_corrupted = create_synthetic_errors(df, error_rate=0.1)

print("Introduced synthetic errors")
# Total missing cells = per-column NaN counts, summed over all columns.
missing_per_column = df_corrupted.isna().sum()
print(f"Missing values: {missing_per_column.sum()}")
print("\nCorrupted data sample:")
df_corrupted.head(10)

## 4. Create Task Labels

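Create labels for each data quality task. In this tutorial the labels are randomly generated placeholders, so they only illustrate the expected format; use real ground-truth labels for your own data.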

In [None]:
# Placeholder labels for the demo: one random 0/1 label per row for each task.
# Re-seeding NumPy here makes the labels the same every time this cell runs.
np.random.seed(42)

num_rows = len(df)
task_names = ('error_detection', 'duplicate_detection', 'outlier_detection')

# Labels are drawn in the order of task_names, one draw of num_rows per task.
task_labels = {
    name: pd.Series(np.random.randint(0, 2, num_rows))
    for name in task_names
}

print("Task label distributions:")
for task in task_labels:
    print(f"\n{task}:")
    print(task_labels[task].value_counts())

## 5. Create Dataset and DataLoaders

In [None]:
# Wrap the corrupted table and the per-task labels in the UNIDQ dataset.
dataset = MultiTaskDataset(
    data=df_corrupted,
    task_labels=task_labels,
    max_length=256,
)

# 80/20 train/validation split. The validation size is the remainder, so the
# two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split identical every time this cell
# runs, regardless of how much global RNG state earlier cells consumed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Both loaders use the same batch size and the dataset's own collate function.
batch_size = 8

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=MultiTaskDataset.collate_fn,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=MultiTaskDataset.collate_fn,
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Batches per epoch: {len(train_loader)}")

## 6. Initialize UNIDQ Model

In [None]:
# Hyperparameters for the tutorial-sized model.
config = UNIDQConfig(
    d_model=256,
    n_heads=8,
    n_layers=4,
    d_ff=1024,
    dropout=0.1,
    max_seq_length=256,
    vocab_size=256,
)

# Instantiate the model and look up the compute device.
model = UNIDQ(config)
device = get_device()

# Walk the parameters once, recording each tensor's element count and whether
# it requires gradients, then derive both totals from that list.
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_info)
trainable_params = sum(count for count, needs_grad in param_info if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Device: {device}")

## 7. Train the Model

In [None]:
# Hand the model, both data loaders and the target device to the UNIDQ trainer.
trainer = UNIDQTrainer(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    device=device,
)

# Training settings for this tutorial run.
num_epochs = 5
checkpoint_dir = './tutorial_checkpoints'

# `history` is used by the plotting cell below (train/validation loss lists).
history = trainer.train(
    num_epochs=num_epochs,
    save_dir=checkpoint_dir,
    save_best=True,
)

print("\nTraining completed!")

## 8. Visualize Training Progress

In [None]:
# Draw the loss curves on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(history['train_loss'], label='Training Loss', marker='o')

# Validation loss is only drawn when the trainer actually recorded some.
val_losses = history['val_loss']
if val_losses:
    ax.plot(val_losses, label='Validation Loss', marker='s')

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Progress')
ax.legend()
ax.grid(True)
plt.show()

## 9. Evaluate Model Performance

In [None]:
# Score the trained model on the held-out validation batches.
metrics = evaluate_all_tasks(model=model, dataloader=val_loader, device=device)

# Print one aligned line per metric, in alphabetical order of metric name.
print("Evaluation Metrics:")
print("=" * 50)
for metric_name in sorted(metrics):
    value = metrics[metric_name]
    print(f"{metric_name:30s}: {value:.4f}")

## 10. Make Predictions

In [None]:
# Make sure the model lives on the same device the batch is moved to below;
# calling .to() again is harmless if the model is already there.
model.to(device)
model.eval()

# Take the first validation batch and move its tensors to the device.
# Non-tensor entries (if the collate function produces any) are kept as-is
# instead of failing on a missing .to() method.
sample_batch = next(iter(val_loader))
sample_batch = {
    key: value.to(device) if isinstance(value, torch.Tensor) else value
    for key, value in sample_batch.items()
}

# Forward pass without gradient tracking; `outputs` maps task name -> tensor.
with torch.no_grad():
    outputs = model(
        sample_batch['input_ids'],
        sample_batch['attention_mask']
    )

# Show prediction shapes
print("Prediction outputs:")
for task_name, task_output in outputs.items():
    print(f"{task_name:25s}: {task_output.shape}")

## 11. Task-Specific Predictions

In [None]:
def _sequence_level_prediction(logits):
    """Average logits over dim 1, then return (class probabilities, argmax class).

    NOTE(review): assumes logits are laid out as (batch, positions, classes),
    so that dim 1 is the axis to pool over - confirm against the model.
    """
    pooled = logits.mean(dim=1)
    probs = torch.softmax(pooled, dim=-1)
    return probs, probs.argmax(dim=-1)


# Error detection: probability column 1 is reported as the positive class.
error_logits = outputs['error_detection']
error_probs, error_preds = _sequence_level_prediction(error_logits)

print("Error Detection Results (first 5 samples):")
print(f"Predictions: {error_preds[:5].cpu().numpy()}")
print(f"Probabilities: {error_probs[:5, 1].cpu().numpy()}")

# Duplicate detection: same pooling and reporting as above.
duplicate_logits = outputs['duplicate_detection']
duplicate_probs, duplicate_preds = _sequence_level_prediction(duplicate_logits)

print("\nDuplicate Detection Results (first 5 samples):")
print(f"Predictions: {duplicate_preds[:5].cpu().numpy()}")
print(f"Probabilities: {duplicate_probs[:5, 1].cpu().numpy()}")

## 12. Save and Load Model

In [None]:
# Save model
save_path = './tutorial_model'
model.save_pretrained(save_path)
print(f"Model saved to {save_path}")

# Load model back from disk and place it on the same device as the batch.
loaded_model = UNIDQ.from_pretrained(save_path)
loaded_model.to(device)
print("Model loaded successfully!")

# Verify the round trip: run the original and the reloaded model on the same
# batch, both in evaluation mode, so their outputs are directly comparable.
model.eval()
loaded_model.eval()
with torch.no_grad():
    reference_outputs = model(
        sample_batch['input_ids'],
        sample_batch['attention_mask']
    )
    test_outputs = loaded_model(
        sample_batch['input_ids'],
        sample_batch['attention_mask']
    )

print("Loaded model produces output successfully!")

# A successful forward pass alone does not prove the weights were restored,
# so compare every task head against the original model's output.
mismatched_tasks = [
    task_name
    for task_name, reference in reference_outputs.items()
    if task_name not in test_outputs
    or not torch.allclose(test_outputs[task_name], reference, atol=1e-5)
]
if mismatched_tasks:
    print(f"WARNING: loaded model outputs differ for: {', '.join(mismatched_tasks)}")
else:
    print("Loaded model outputs match the original model.")

## Summary

This tutorial covered:

1. ✅ Setting up UNIDQ
2. ✅ Preparing data and creating datasets
3. ✅ Configuring and initializing the model
4. ✅ Training with multi-task learning
5. ✅ Evaluating performance
6. ✅ Making predictions
7. ✅ Saving and loading models

## Next Steps

- Try with your own datasets
- Experiment with different model configurations
- Fine-tune on specific data quality tasks
- Integrate into your data pipeline
- Check the API documentation for advanced usage