# Training Experiments and Model Fine-tuning

This notebook configures and runs YOLOv8 fine-tuning with Ultralytics, then reads the logged results to plot training metrics, manage checkpoints, and validate the best model.

## 1. Environment Setup and Imports

In [None]:
# Standard library imports
import json
import os
import shutil
import warnings
from datetime import datetime
from pathlib import Path
warnings.filterwarnings('ignore')

# Data processing
import numpy as np
import pandas as pd
import yaml

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

# YOLO and training utilities
from ultralytics import YOLO
from ultralytics.utils import yaml_load, yaml_save

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Check environment
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

## 2. Configuration Setup

In [None]:
# Define paths.
# PROJECT_ROOT keeps its original default but can be redirected with the
# QONTINUI_PROJECT_ROOT environment variable, so the notebook is not tied to
# one machine's home directory.
PROJECT_ROOT = Path(os.environ.get('QONTINUI_PROJECT_ROOT', '/home/user/qontinui-finetune'))
DATASET_PATH = PROJECT_ROOT / 'data' / 'dataset'  # Update with actual dataset path
RUNS_DIR = PROJECT_ROOT / 'runs'
CHECKPOINTS_DIR = RUNS_DIR / 'checkpoints'
LOGS_DIR = RUNS_DIR / 'logs'

# Create directories.
# parents=True also creates PROJECT_ROOT (and any missing ancestor); without it
# the first mkdir raises FileNotFoundError on a machine where the root is absent.
for output_dir in (RUNS_DIR, CHECKPOINTS_DIR, LOGS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Project structure:")
print(f"  Project Root: {PROJECT_ROOT}")
print(f"  Dataset: {DATASET_PATH}")
print(f"  Runs: {RUNS_DIR}")
print(f"  Checkpoints: {CHECKPOINTS_DIR}")
print(f"  Logs: {LOGS_DIR}")

## 3. Model Configuration

In [None]:
# Hyperparameters for the fine-tuning run, collected in a single mapping.
training_config = dict(
    # --- model and schedule ---
    model='yolov8n',          # yolov8n, yolov8s, yolov8m, yolov8l, yolov8x
    epochs=100,
    batch_size=16,
    imgsz=640,
    device=0 if torch.cuda.is_available() else 'cpu',  # first GPU when CUDA is available
    workers=4,
    # --- optimizer ---
    optimizer='SGD',          # SGD, Adam, AdamW
    lr0=0.01,                 # Initial learning rate
    lrf=0.01,                 # Final learning rate ratio
    momentum=0.937,
    weight_decay=0.0005,
    warmup_epochs=3.0,
    warmup_momentum=0.8,
    warmup_bias_lr=0.1,
    # --- augmentation ---
    augmentation=True,
    hsv_h=0.015,              # HSV-Hue augmentation
    hsv_s=0.7,                # HSV-Saturation augmentation
    hsv_v=0.4,                # HSV-Value augmentation
    degrees=10.0,             # Rotation range
    translate=0.1,            # Translation ratio
    scale=0.5,                # Scale ratio
    flipud=0.0,               # Flip up-down
    fliplr=0.5,               # Flip left-right
    mosaic=1.0,               # Mosaic augmentation
    # --- stopping and checkpointing ---
    patience=20,              # Early stopping patience
    save_period=10,           # Save checkpoint every N epochs
)

# Echo every setting, one per line, with the key padded to 20 characters.
print("Training Configuration:")
for setting in training_config:
    print(f"  {setting:20s}: {training_config[setting]}")

## 4. Create Dataset YAML Configuration

In [None]:
# Create dataset.yaml for YOLO.

# Class names keyed by class index - UPDATE accordingly for your dataset.
class_names = {0: 'class_0', 1: 'class_1'}

dataset_yaml = {
    'path': str(DATASET_PATH),
    'train': 'images/train',
    'val': 'images/val',
    'test': 'images/test',
    # Derived from class_names so the class count can never drift out of sync
    # with the name table when one of them is edited.
    'nc': len(class_names),
    'names': class_names,
}

# Serialize once and reuse the text, so what is printed below is exactly what
# was written to disk. safe_dump restricts output to plain YAML types.
dataset_yaml_text = yaml.safe_dump(dataset_yaml, default_flow_style=False)

# Save dataset YAML
dataset_yaml_path = PROJECT_ROOT / 'dataset.yaml'
with open(dataset_yaml_path, 'w', encoding='utf-8') as f:
    f.write(dataset_yaml_text)

print("Dataset YAML created:")
print(dataset_yaml_text)

## 5. Initialize Model

In [None]:
# Build the pretrained model named in the training configuration.
model_name = training_config['model']
print(f"Loading {model_name} model...")

weights_file = f'{model_name}.pt'
model = YOLO(weights_file)

# Let the model print its own summary.
print("\nModel Summary:")
model.info()

# Report the device of the first parameter tensor.
device = next(model.model.parameters()).device
print(f"\nModel device: {device}")

# Count the elements of every parameter tensor.
total_params = 0
for tensor in model.model.parameters():
    total_params += tensor.numel()
print(f"Model parameters: {total_params:,}")

## 6. Training Loop with Progress Tracking

In [None]:
# Settings that carry the same name in training_config and in the keyword
# arguments handed to model.train(); they are copied over unchanged.
shared_keys = (
    'imgsz', 'device', 'workers', 'optimizer',
    'lr0', 'lrf', 'momentum', 'weight_decay',
    'warmup_epochs', 'warmup_momentum', 'warmup_bias_lr',
    'hsv_h', 'hsv_s', 'hsv_v',
    'degrees', 'translate', 'scale', 'flipud', 'fliplr', 'mosaic',
    'patience', 'save_period',
)

# Timestamped experiment name, e.g. exp_20240131_235959.
experiment_name = 'exp_' + datetime.now().strftime("%Y%m%d_%H%M%S")

training_args = {
    'data': str(dataset_yaml_path),
    'epochs': training_config['epochs'],
    'batch': training_config['batch_size'],  # config key differs from the argument name
    **{key: training_config[key] for key in shared_keys},
    'project': str(RUNS_DIR),
    'name': experiment_name,
    'exist_ok': False,
    'verbose': True,
    'seed': 42,
    'deterministic': True,
}

print("Starting training...")
print(f"Experiment name: {training_args['name']}")

# Run training with the assembled keyword arguments.
results = model.train(**training_args)

print("\nTraining completed!")

## 7. Load Training Results

In [None]:
# Get the run directories under RUNS_DIR, most recently modified first.
# Only directories are kept: a stray file whose name starts with "exp_" would
# otherwise be picked as the "latest run".
run_dirs = sorted(
    (entry for entry in RUNS_DIR.glob('exp_*') if entry.is_dir()),
    key=os.path.getmtime,
    reverse=True,
)

# Reset the per-run names first so that re-running this cell when no run exists
# cannot leave values from an earlier execution lying around in the kernel.
latest_run = None
results_csv = None

if run_dirs:
    latest_run = run_dirs[0]
    print(f"Latest run: {latest_run}")

    # Load results CSV
    results_csv = latest_run / 'results.csv'
    if results_csv.exists():
        results_df = pd.read_csv(results_csv)
        # Header names may carry padding whitespace depending on the logger
        # version; stripping is a no-op when they do not.
        results_df.columns = results_df.columns.str.strip()
        print(f"\nResults shape: {results_df.shape}")
        print(f"\nLast 5 epochs:")
        print(results_df.tail())
    else:
        print(f"Results CSV not found at {results_csv}")
else:
    print(f"No experiment runs found in {RUNS_DIR}")

## 8. Loss and Metric Plotting (from `results.csv`)

In [None]:
def plot_training_metrics(results_df):
    """Plot losses and detection metrics from a training results dataframe.

    Draws a 2x3 grid: training loss, validation loss, mAP@0.5 on the first row;
    precision, recall, mAP@0.5:0.95 on the second.

    Column lookup is tolerant of the naming used by the results log:

    * header names are whitespace-stripped before matching;
    * a metric is taken from ``metrics/<name>`` or, failing that, from
      ``metrics/<name>(B)``;
    * a loss is taken from ``<split>/loss`` or, failing that, computed as the
      row-wise sum of every ``<split>/*_loss`` column (e.g. box/cls/dfl).

    A panel whose data cannot be found is labelled "No data in results"
    instead of being left as an unexplained blank axis.

    Args:
        results_df: DataFrame with one row per epoch. The input is not modified.

    Returns:
        The matplotlib figure holding the six panels.
    """
    # Work on a renamed copy so padded headers match and the caller's frame
    # stays untouched.
    frame = results_df.rename(columns=lambda col: str(col).strip())

    def _loss(split):
        """Total loss series for `split`, or None when no loss column exists."""
        direct = f'{split}/loss'
        if direct in frame.columns:
            return frame[direct]
        parts = [col for col in frame.columns
                 if col.startswith(f'{split}/') and col.endswith('_loss')]
        return frame[parts].sum(axis=1) if parts else None

    def _metric(name):
        """Metric series under its plain or '(B)'-suffixed name, or None."""
        for candidate in (f'metrics/{name}', f'metrics/{name}(B)'):
            if candidate in frame.columns:
                return frame[candidate]
        return None

    # Fall back to the row position when no epoch column was logged.
    epochs = frame['epoch'].values if 'epoch' in frame.columns else range(len(frame))

    fig, axes = plt.subplots(2, 3, figsize=(16, 10))

    # (axis, data, format string, extra plot kwargs, legend label, y label, title, clamp y to [0, 1])
    panels = [
        (axes[0, 0], _loss('train'), 'b-o', {}, 'Train Loss', 'Loss', 'Training Loss', False),
        (axes[0, 1], _loss('val'), 'r-o', {}, 'Val Loss', 'Loss', 'Validation Loss', False),
        (axes[0, 2], _metric('mAP50'), 'g-o', {}, 'mAP@0.5', 'mAP', 'mAP@0.5', False),
        (axes[1, 0], _metric('precision'), 'purple', {'marker': 'o'}, 'Precision', 'Precision', 'Precision', True),
        (axes[1, 1], _metric('recall'), 'orange', {'marker': 'o'}, 'Recall', 'Recall', 'Recall', True),
        (axes[1, 2], _metric('mAP50-95'), 'brown', {'marker': 'o'}, 'mAP@0.5:0.95', 'mAP', 'mAP@0.5:0.95', False),
    ]

    for ax, series, fmt, extra, label, ylabel, title, unit_range in panels:
        ax.set_title(title, fontsize=12, fontweight='bold')
        if series is None:
            # Say explicitly that the log had nothing for this panel.
            ax.text(0.5, 0.5, 'No data in results', ha='center', va='center',
                    transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            continue
        ax.plot(epochs, series, fmt, linewidth=2, label=label, **extra)
        ax.set_xlabel('Epoch', fontsize=11)
        ax.set_ylabel(ylabel, fontsize=11)
        if unit_range:
            ax.set_ylim([0, 1])
        ax.grid(True, alpha=0.3)
        ax.legend()

    plt.tight_layout()
    return fig

# Plot if results exist.
# `run_dirs` is tested first: `results_csv` is only assigned when at least one
# run was found, so checking it alone fails on a workspace without any runs.
if run_dirs and results_csv.exists():
    results_df = pd.read_csv(results_csv)
    plot_training_metrics(results_df)
    plt.show()

## 9. Checkpoint Management

In [None]:
def list_checkpoints(run_dir):
    """Return the ``*.pt`` files found in ``run_dir / 'weights'``.

    The files are ordered by ``os.path.getctime``, largest value first.
    An empty list is returned when the ``weights`` directory does not exist.
    """
    weights_dir = run_dir / 'weights'
    if not weights_dir.exists():
        return []

    found = list(weights_dir.glob('*.pt'))
    found.sort(key=os.path.getctime, reverse=True)
    return found

# List the checkpoints of the latest run and keep a copy of its best weights.
if run_dirs:
    latest_run = run_dirs[0]
    checkpoints = list_checkpoints(latest_run)

    print(f"Available checkpoints in {latest_run.name}:")
    if not checkpoints:
        print("  (none found)")
    for checkpoint in checkpoints:
        file_size = os.path.getsize(checkpoint) / (1024 * 1024)  # Convert to MB
        print(f"  - {checkpoint.name:30s} ({file_size:6.2f} MB)")

    # Copy best model to checkpoints directory.
    best_model = latest_run / 'weights' / 'best.pt'
    if best_model.exists():
        # Make sure the destination exists even if the setup cell was skipped
        # or the directory was removed since.
        CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
        dest = CHECKPOINTS_DIR / f'best_{latest_run.name}.pt'
        shutil.copy(best_model, dest)
        print(f"\nBest model copied to {dest}")
    else:
        print(f"\nNo best model found at {best_model}")

## 10. Model Validation

In [None]:
# Evaluate the best weights of the latest run against the dataset config.
if run_dirs:
    latest_run = run_dirs[0]
    best_model_path = latest_run.joinpath('weights', 'best.pt')

    if best_model_path.exists():
        print(f"Validating model: {best_model_path}")

        # Load the saved best weights as a separate model object.
        val_model = YOLO(str(best_model_path))

        # Reuse the image size, batch size and device from the training config.
        val_options = dict(
            data=str(dataset_yaml_path),
            imgsz=training_config['imgsz'],
            batch=training_config['batch_size'],
            device=training_config['device'],
            verbose=True,
        )
        val_results = val_model.val(**val_options)

        print("\nValidation completed!")

## 11. Training Summary

In [None]:
# Print training summary.
# Metric and loss columns are looked up tolerantly, because the results log may
# name them differently from the plain 'metrics/<name>' / '<split>/loss' form:
# headers can be padded, metrics can carry a '(B)' suffix, and the loss can be
# split into several '<split>/*_loss' components.
if run_dirs and results_csv.exists():
    results_df = pd.read_csv(results_csv)
    results_df.columns = results_df.columns.str.strip()

    def _metric_column(name):
        """Return the column holding metric `name` (plain or '(B)'-suffixed), or None."""
        for candidate in (f'metrics/{name}', f'metrics/{name}(B)'):
            if candidate in results_df.columns:
                return candidate
        return None

    def _total_loss(split):
        """Return the total-loss series for `split`, summing components if needed, or None."""
        direct = f'{split}/loss'
        if direct in results_df.columns:
            return results_df[direct]
        parts = [col for col in results_df.columns
                 if col.startswith(f'{split}/') and col.endswith('_loss')]
        return results_df[parts].sum(axis=1) if parts else None

    print("="*60)
    print("TRAINING SUMMARY".center(60))
    print("="*60)

    print(f"\nTotal Epochs: {len(results_df)}")

    # Find best metrics
    map_column = _metric_column('mAP50')
    if map_column is not None and results_df[map_column].notna().any():
        best_map_idx = results_df[map_column].idxmax()
        best_map = results_df.loc[best_map_idx, map_column]
        # Fall back to the row label when no epoch column was logged.
        if 'epoch' in results_df.columns:
            best_map_epoch = results_df.loc[best_map_idx, 'epoch']
        else:
            best_map_epoch = best_map_idx
        print(f"\nBest mAP@0.5: {best_map:.4f} (Epoch {int(best_map_epoch)})")

    precision_column = _metric_column('precision')
    if precision_column is not None:
        best_precision = results_df[precision_column].max()
        print(f"Best Precision: {best_precision:.4f}")

    recall_column = _metric_column('recall')
    if recall_column is not None:
        best_recall = results_df[recall_column].max()
        print(f"Best Recall: {best_recall:.4f}")

    train_loss = _total_loss('train')
    if train_loss is not None and len(train_loss) > 0:
        final_train_loss = train_loss.iloc[-1]
        print(f"\nFinal Training Loss: {final_train_loss:.4f}")

    val_loss = _total_loss('val')
    if val_loss is not None and len(val_loss) > 0:
        final_val_loss = val_loss.iloc[-1]
        print(f"Final Validation Loss: {final_val_loss:.4f}")

    print(f"\nRun Directory: {latest_run}")
    print("="*60)