# Exploración del Dataset - Cajas VR

Este notebook permite explorar y validar el dataset antes del entrenamiento.

## Objetivos
1. Verificar la estructura del dataset
2. Visualizar imágenes con sus anotaciones
3. Analizar la distribución de clases
4. Detectar posibles problemas en las anotaciones

---

## 1. Setup e Importaciones

In [None]:
# =============================================================================
# Importaciones necesarias
# =============================================================================

import os
import sys
from pathlib import Path
from collections import Counter

# Visualización
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Procesamiento de imágenes
from PIL import Image
import cv2

# Plot defaults: whitegrid seaborn theme, 12x8 figure size, 100 dpi.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 100,
})

# The project root is the parent of the current working directory; the
# dataset is looked up in its `data/` sub-folder.
# NOTE(review): this presumes the notebook is launched from a direct
# sub-folder of the project - confirm.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')

print(f"Directorio del proyecto: {PROJECT_ROOT}")
print(f"Directorio de datos: {DATA_DIR}")

## 2. Configuración del Dataset

Define las clases y verifica la estructura de carpetas.

In [None]:
# =============================================================================
# Class configuration
# IMPORTANT: adjust these names to match your Roboflow dataset
# =============================================================================

# Display name for each integer class id found in the label files.
CLASS_NAMES = {
    0: 'vr_box_type_a',  # class 0: rename as needed
    1: 'vr_box_type_b',  # class 1: rename as needed
}

# Plot colour (hex string) for each class id.
CLASS_COLORS = {
    0: '#FF6B6B',  # coral red
    1: '#4ECDC4',  # turquoise
}

print("Clases configuradas:")
for class_id in CLASS_NAMES:
    print(f"  {class_id}: {CLASS_NAMES[class_id]} ({CLASS_COLORS[class_id]})")

In [None]:
# =============================================================================
# Verificar estructura del dataset
# =============================================================================

def check_dataset_structure(data_dir=None):
    """Check that every split has its ``images/`` and ``labels/`` folders.

    For each of the ``train``, ``valid`` and ``test`` splits the function
    prints whether ``<root>/<split>/images`` and ``<root>/<split>/labels``
    exist, together with the number of image files (``.jpg``, ``.jpeg`` or
    ``.png``, any letter case) and ``.txt`` label files found.

    Args:
        data_dir: Dataset root to inspect. Defaults to the module-level
            ``DATA_DIR`` when omitted.

    Returns:
        True when all six folders exist, False otherwise.
    """
    root = DATA_DIR if data_dir is None else Path(data_dir)

    # Compare suffixes case-insensitively instead of using a character-class
    # glob: a pattern such as '*.[jJpP][pPnN][gG]' can only match
    # three-letter extensions and therefore silently ignores '.jpeg' files.
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    splits = ['train', 'valid', 'test']
    structure_ok = True

    print("Verificando estructura del dataset...\n")

    for split in splits:
        split_dir = root / split
        images_dir = split_dir / 'images'
        labels_dir = split_dir / 'labels'

        print(f"üìÅ {split}/")

        # Images folder
        if images_dir.exists():
            num_images = sum(
                1
                for entry in images_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_suffixes
            )
            print(f"   ‚îú‚îÄ‚îÄ images/: {num_images} im√°genes ‚úÖ")
        else:
            print(f"   ‚îú‚îÄ‚îÄ images/: NO ENCONTRADA ‚ùå")
            structure_ok = False

        # Labels folder
        if labels_dir.exists():
            num_labels = len(list(labels_dir.glob('*.txt')))
            print(f"   ‚îî‚îÄ‚îÄ labels/: {num_labels} archivos ‚úÖ")
        else:
            print(f"   ‚îî‚îÄ‚îÄ labels/: NO ENCONTRADA ‚ùå")
            structure_ok = False

        print()

    if structure_ok:
        print("‚úÖ Estructura del dataset correcta")
    else:
        print("‚ùå Hay problemas con la estructura del dataset")
        print("   Aseg√∫rate de exportar el dataset de Roboflow en formato YOLOv8")

    return structure_ok

# Run the structure check now and keep its boolean result in `dataset_ok`.
dataset_ok = check_dataset_structure()

## 3. Análisis de Distribución de Clases

Es importante verificar que las clases estén balanceadas para evitar sesgo en el modelo.

In [None]:
# =============================================================================
# Funciones de utilidad para parsear anotaciones YOLO
# =============================================================================

def parse_yolo_label(label_path):
    """Parse one YOLO label file into a list of annotation dicts.

    Each non-empty line is expected to look like
    ``class_id x_center y_center width height``. Lines with fewer than five
    whitespace-separated tokens are skipped; tokens after the fifth are
    ignored.

    Args:
        label_path: Path to the ``.txt`` label file.

    Returns:
        A list of dicts with the keys ``class_id`` (int) and ``x_center``,
        ``y_center``, ``width``, ``height`` (float), in file order. The list
        is empty when the file does not exist.

    Raises:
        ValueError: If a line with at least five tokens contains a value
            that is not numeric. The message names the file and the
            1-based line number so the offending label can be found.
    """
    annotations = []

    try:
        # 'utf-8-sig' transparently drops a leading BOM, which would
        # otherwise make the first class id fail to parse.
        handle = open(label_path, 'r', encoding='utf-8-sig')
    except FileNotFoundError:
        return annotations

    with handle:
        for line_no, line in enumerate(handle, start=1):
            parts = line.split()
            if len(parts) < 5:
                continue
            try:
                class_id = int(parts[0])
                x_center, y_center, width, height = map(float, parts[1:5])
            except ValueError as err:
                raise ValueError(
                    f"{label_path}:{line_no}: formato YOLO incorrecto: {line.strip()!r}"
                ) from err
            annotations.append({
                'class_id': class_id,
                'x_center': x_center,
                'y_center': y_center,
                'width': width,
                'height': height,
            })

    return annotations


def collect_all_annotations(split='train'):
    """Gather every annotation of one split into a single DataFrame.

    Each ``.txt`` file under ``DATA_DIR/<split>/labels`` is parsed with
    ``parse_yolo_label``. Every annotation row is tagged with the split name,
    the label file name and the name of the image sharing the label's stem
    (``None`` when no such image is found under ``DATA_DIR/<split>/images``).

    Args:
        split: 'train', 'valid' or 'test'.

    Returns:
        DataFrame with one row per annotation; an empty DataFrame when the
        labels directory does not exist.
    """
    split_root = DATA_DIR / split
    labels_dir = split_root / 'labels'
    images_dir = split_root / 'images'

    if not labels_dir.exists():
        print(f"‚ö†Ô∏è Directorio no encontrado: {labels_dir}")
        return pd.DataFrame()

    # Extensions are probed in this order; the first existing file wins.
    candidate_exts = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')

    rows = []
    for label_file in labels_dir.glob('*.txt'):
        parsed = parse_yolo_label(label_file)

        candidates = (images_dir / f"{label_file.stem}{ext}" for ext in candidate_exts)
        img_path = next((path for path in candidates if path.exists()), None)
        image_name = None if img_path is None else img_path.name

        rows.extend(
            {
                **ann,
                'split': split,
                'label_file': label_file.name,
                'image_file': image_name,
            }
            for ann in parsed
        )

    return pd.DataFrame(rows)

print("Funciones de utilidad cargadas ‚úÖ")

In [None]:
# =============================================================================
# Recolectar y analizar anotaciones de todos los splits
# =============================================================================

# Load the annotations of each split into its own DataFrame
df_train, df_valid, df_test = map(collect_all_annotations, ('train', 'valid', 'test'))

# Stack the three splits into one frame with a fresh 0..n-1 index
df_all = pd.concat([df_train, df_valid, df_test], ignore_index=True)

if len(df_all) == 0:
    print("‚ö†Ô∏è No se encontraron anotaciones. Aseg√∫rate de que el dataset est√© en data/")
else:
    print(f"\nüìä Resumen del Dataset:")
    print(f"   Total de anotaciones: {len(df_all)}")
    for split_label, split_frame in (('Train', df_train), ('Valid', df_valid), ('Test', df_test)):
        print(f"   - {split_label}: {len(split_frame)}")

In [None]:
# =============================================================================
# Visualize the class distribution
# =============================================================================

if len(df_all) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Chart 1: overall distribution, one bar per class id present in the data
    class_counts = df_all['class_id'].value_counts().sort_index()
    colors = [CLASS_COLORS.get(i, '#666666') for i in class_counts.index]

    ax1 = axes[0]
    bars = ax1.bar(class_counts.index, class_counts.values, color=colors, edgecolor='black')
    ax1.set_xlabel('Clase')
    ax1.set_ylabel('N√∫mero de instancias')
    ax1.set_title('Distribuci√≥n de Clases (Total)')
    # Tick positions and tick labels must come from the same sequence:
    # taking positions from CLASS_NAMES and labels from the observed ids
    # makes matplotlib raise as soon as a configured class is absent or an
    # unconfigured id shows up in the labels.
    ax1.set_xticks(list(class_counts.index))
    ax1.set_xticklabels([CLASS_NAMES.get(i, f'Clase {i}') for i in class_counts.index])

    # Write the count above each bar
    for bar, count in zip(bars, class_counts.values):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 str(count), ha='center', va='bottom', fontweight='bold')

    # Chart 2: distribution per split, grouped bars for the configured classes
    ax2 = axes[1]
    splits = ['train', 'valid', 'test']
    x = np.arange(len(CLASS_NAMES))
    width = 0.25

    for i, split in enumerate(splits):
        df_split = df_all[df_all['split'] == split]
        counts = [len(df_split[df_split['class_id'] == c]) for c in CLASS_NAMES.keys()]
        ax2.bar(x + i*width, counts, width, label=split.capitalize())

    ax2.set_xlabel('Clase')
    ax2.set_ylabel('N√∫mero de instancias')
    ax2.set_title('Distribuci√≥n de Clases por Split')
    ax2.set_xticks(x + width)
    ax2.set_xticklabels([CLASS_NAMES.get(i, f'Clase {i}') for i in CLASS_NAMES.keys()])
    ax2.legend()

    plt.tight_layout()
    plt.show()

    # Class balance: share of all annotations held by each configured class
    print("\nüìà Balance de clases:")
    total = len(df_all)
    class_totals = {c: len(df_all[df_all['class_id'] == c]) for c in CLASS_NAMES}
    for class_id, count in class_totals.items():
        percentage = (count / total) * 100
        print(f"   {CLASS_NAMES[class_id]}: {count} ({percentage:.1f}%)")

    # Imbalance warning. A class with no instances at all is the extreme
    # case and is tested first so the ratio never divides by zero.
    percentages = [(count / total) * 100 for count in class_totals.values()]
    smallest, largest = min(percentages), max(percentages)
    if smallest == 0 or largest / smallest > 3:
        print("\n‚ö†Ô∏è ADVERTENCIA: Las clases est√°n muy desbalanceadas")
        print("   Considera agregar m√°s datos de la clase minoritaria")
else:
    print("No hay datos para visualizar")

## 4. Análisis de Tamaños de Bounding Boxes

Entender el tamaño de los objetos ayuda a configurar mejor el modelo.

In [None]:
# =============================================================================
# Analyse bounding-box sizes
# =============================================================================

if len(df_all) == 0:
    print("No hay datos para analizar")
else:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    ax_width, ax_height, ax_scatter = axes

    # Rows of df_all grouped by configured class id, in CLASS_NAMES order
    frames_by_class = {cid: df_all[df_all['class_id'] == cid] for cid in CLASS_NAMES}

    # Charts 1 and 2: per-class histograms of box width and box height
    histogram_panels = (
        (ax_width, 'width', 'Ancho (normalizado)', 'Distribuci√≥n de Anchos'),
        (ax_height, 'height', 'Alto (normalizado)', 'Distribuci√≥n de Altos'),
    )
    for panel, column, x_label, title in histogram_panels:
        for cid, frame in frames_by_class.items():
            panel.hist(frame[column], bins=20, alpha=0.6,
                       label=CLASS_NAMES[cid], color=CLASS_COLORS[cid])
        panel.set_xlabel(x_label)
        panel.set_ylabel('Frecuencia')
        panel.set_title(title)
        panel.legend()

    # Chart 3: width against height, one colour per class
    for cid, frame in frames_by_class.items():
        ax_scatter.scatter(frame['width'], frame['height'], alpha=0.5,
                           label=CLASS_NAMES[cid], color=CLASS_COLORS[cid])
    ax_scatter.set_xlabel('Ancho (normalizado)')
    ax_scatter.set_ylabel('Alto (normalizado)')
    ax_scatter.set_title('Ancho vs Alto de Bounding Boxes')
    ax_scatter.legend()

    plt.tight_layout()
    plt.show()

    # Mean and standard deviation of width, height and area per class
    print("\nüìê Estad√≠sticas de tama√±o de bounding boxes:")
    for cid, frame in frames_by_class.items():
        if len(frame) == 0:
            continue
        widths = frame['width']
        heights = frame['height']
        areas = widths * heights
        print(f"\n   {CLASS_NAMES[cid]}:")
        print(f"      Ancho: {widths.mean():.3f} ¬± {widths.std():.3f}")
        print(f"      Alto:  {heights.mean():.3f} ¬± {heights.std():.3f}")
        print(f"      √Årea:  {areas.mean():.4f} ¬± {areas.std():.4f}")

## 5. Visualización de Imágenes con Anotaciones

Verificar visualmente que las anotaciones son correctas.

In [None]:
# =============================================================================
# Función para visualizar imagen con anotaciones
# =============================================================================

def visualize_image_with_annotations(image_path, label_path, figsize=(12, 8)):
    """Display an image with its annotated bounding boxes drawn on top.

    Boxes are read from ``label_path`` with ``parse_yolo_label``, scaled by
    the image size in pixels and drawn as coloured rectangles with the class
    name just above the top-left corner. Class ids missing from
    ``CLASS_NAMES`` / ``CLASS_COLORS`` fall back to ``'Clase <id>'`` and grey.

    Args:
        image_path: Path to the image file.
        label_path: Path to the matching label file.
        figsize: Figure size passed to ``plt.subplots``.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as the pixels have been handed to matplotlib; a bare Image.open
    # would leave one handle open for every image shown.
    with Image.open(image_path) as img:
        img_width, img_height = img.size
        fig, ax = plt.subplots(1, 1, figsize=figsize)
        ax.imshow(img)

    annotations = parse_yolo_label(label_path)

    for ann in annotations:
        # Scale the centre/size values by the image dimensions in pixels
        x_center = ann['x_center'] * img_width
        y_center = ann['y_center'] * img_height
        width = ann['width'] * img_width
        height = ann['height'] * img_height

        # Top-left corner, which is what patches.Rectangle is anchored on
        x1 = x_center - width / 2
        y1 = y_center - height / 2

        class_id = ann['class_id']
        color = CLASS_COLORS.get(class_id, '#666666')
        class_name = CLASS_NAMES.get(class_id, f'Clase {class_id}')

        rect = patches.Rectangle(
            (x1, y1), width, height,
            linewidth=2, edgecolor=color, facecolor='none'
        )
        ax.add_patch(rect)

        # Class-name tag placed 5 px above the box
        ax.text(x1, y1 - 5, class_name, color='white', fontsize=10,
                fontweight='bold', bbox=dict(boxstyle='round', facecolor=color, alpha=0.8))

    ax.set_title(f"{Path(image_path).name} - {len(annotations)} anotaciones")
    ax.axis('off')
    plt.tight_layout()
    plt.show()

print("Funci√≥n de visualizaci√≥n cargada ‚úÖ")

In [None]:
# =============================================================================
# Visualizar algunas imágenes de ejemplo
# =============================================================================

def show_random_samples(split='train', n_samples=4):
    """Show up to ``n_samples`` randomly chosen images of a split.

    Images are taken from ``DATA_DIR/<split>/images`` (``.jpg``, ``.jpeg``
    or ``.png``, any letter case) and each one is rendered with
    ``visualize_image_with_annotations`` using the label file of the same
    stem in ``DATA_DIR/<split>/labels``. A warning is printed and nothing is
    shown when the images folder is missing or contains no images.

    Args:
        split: 'train', 'valid' or 'test'.
        n_samples: Maximum number of images to show; capped at the number of
            images available.
    """
    images_dir = DATA_DIR / split / 'images'
    labels_dir = DATA_DIR / split / 'labels'

    if not images_dir.exists():
        print(f"‚ö†Ô∏è Directorio no encontrado: {images_dir}")
        return

    # Match suffixes case-insensitively rather than with the glob
    # '*.[jJpP][pPnN][gG]', which cannot match the four-letter '.jpeg'.
    # Sorting makes the candidate order stable, so a seeded NumPy RNG
    # selects the same images on every run.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    image_files = sorted(
        entry for entry in images_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )

    if not image_files:
        print(f"‚ö†Ô∏è No se encontraron im√°genes en {images_dir}")
        return

    # Draw distinct positions in the list, without replacement
    n_samples = min(n_samples, len(image_files))
    selected = np.random.choice(len(image_files), n_samples, replace=False)

    print(f"\nüñºÔ∏è Mostrando {n_samples} im√°genes de {split}:\n")

    for position in selected:
        img_path = image_files[position]
        label_path = labels_dir / f"{img_path.stem}.txt"
        visualize_image_with_annotations(img_path, label_path)

# Show examples from the training set
show_random_samples('train', n_samples=4)

## 6. Verificación de Calidad de Datos

Detectar posibles problemas en el dataset.

In [None]:
# =============================================================================
# Verificar calidad de datos
# =============================================================================

def check_data_quality(split='train'):
    """Print a data-quality report for one split.

    The report covers:
    - images without a label file
    - label files without an image
    - label files that yield no annotations
    - boxes whose centre lies outside [0, 1] or whose width/height lies
      outside (0, 1]
    - annotations whose class id is not a key of ``CLASS_NAMES``

    Args:
        split: 'train', 'valid' or 'test'.

    Returns:
        None. Only a warning is printed when either the images or the labels
        folder of the split is missing.
    """
    images_dir = DATA_DIR / split / 'images'
    labels_dir = DATA_DIR / split / 'labels'

    issues = []

    if not images_dir.exists() or not labels_dir.exists():
        print(f"‚ö†Ô∏è Directorios no encontrados para {split}")
        return

    # Match suffixes case-insensitively rather than with the glob
    # '*.[jJpP][pPnN][gG]': that pattern cannot match '.jpeg', so such
    # images were reported as labels without an image.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    image_stems = {
        entry.stem for entry in images_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    }
    label_stems = {f.stem for f in labels_dir.glob('*.txt')}

    # 1. Images without labels (first five names, sorted so the sample is stable)
    missing_labels = image_stems - label_stems
    if missing_labels:
        issues.append(f"‚ö†Ô∏è {len(missing_labels)} im√°genes sin archivo de label")
        for name in sorted(missing_labels)[:5]:
            issues.append(f"   - {name}")
        if len(missing_labels) > 5:
            issues.append(f"   ... y {len(missing_labels) - 5} m√°s")

    # 2. Labels without images
    missing_images = label_stems - image_stems
    if missing_images:
        issues.append(f"‚ö†Ô∏è {len(missing_images)} labels sin imagen correspondiente")

    # 3. Inspect the content of every label file
    invalid_bbox = 0
    invalid_class = 0
    empty_labels = 0

    for label_file in labels_dir.glob('*.txt'):
        annotations = parse_yolo_label(label_file)

        if len(annotations) == 0:
            empty_labels += 1
            continue

        for ann in annotations:
            center_ok = 0 <= ann['x_center'] <= 1 and 0 <= ann['y_center'] <= 1
            size_ok = 0 < ann['width'] <= 1 and 0 < ann['height'] <= 1
            # Count each faulty box once, even when both its centre and its
            # size are out of range, so the total is a number of boxes.
            if not (center_ok and size_ok):
                invalid_bbox += 1

            if ann['class_id'] not in CLASS_NAMES:
                invalid_class += 1

    if empty_labels > 0:
        issues.append(f"‚ÑπÔ∏è {empty_labels} im√°genes sin objetos (labels vac√≠os)")

    if invalid_bbox > 0:
        issues.append(f"‚ùå {invalid_bbox} bounding boxes con coordenadas inv√°lidas")

    if invalid_class > 0:
        issues.append(f"‚ùå {invalid_class} anotaciones con clase inv√°lida")

    # Report
    print(f"\nüîç Verificaci√≥n de calidad - {split.upper()}")
    print("=" * 50)

    if not issues:
        print("‚úÖ No se encontraron problemas")
    else:
        for issue in issues:
            print(issue)

    print(f"\n   Total im√°genes: {len(image_stems)}")
    print(f"   Total labels: {len(label_stems)}")

# Check every split
for split in ['train', 'valid', 'test']:
    check_data_quality(split)

## 7. Resumen y Recomendaciones

Basado en el análisis, aquí están las recomendaciones para el entrenamiento.

In [None]:
# =============================================================================
# Generar resumen y recomendaciones
# =============================================================================

print("\n" + "=" * 60)
print("üìã RESUMEN DEL DATASET")
print("=" * 60)

if len(df_all) > 0:
    print(f"\nüìä Estad√≠sticas generales:")
    print(f"   - Total de anotaciones: {len(df_all)}")
    print(f"   - Clases: {len(CLASS_NAMES)}")

    print(f"\nüìÅ Distribuci√≥n por split:")
    for split in ['train', 'valid', 'test']:
        count = len(df_all[df_all['split'] == split])
        print(f"   - {split}: {count} anotaciones")

    # Recommendations derived from the collected annotations
    print(f"\nüí° RECOMENDACIONES:")

    # 1. Dataset size, judged by the number of training annotations
    train_count = len(df_train)
    if train_count < 100:
        print(f"   ‚ö†Ô∏è Dataset peque√±o ({train_count} anotaciones). Considera:")
        print(f"      - Usar augmentation agresivo")
        print(f"      - Reducir √©pocas (50-100) para evitar overfitting")
        print(f"      - Agregar m√°s datos si es posible")
    elif train_count < 500:
        print(f"   ‚ÑπÔ∏è Dataset moderado ({train_count} anotaciones).")
        print(f"      - Augmentation est√°ndar deber√≠a funcionar bien")
        print(f"      - 100-200 √©pocas recomendadas")
    else:
        print(f"   ‚úÖ Dataset de buen tama√±o ({train_count} anotaciones).")
        print(f"      - Puedes reducir augmentation si es necesario")
        print(f"      - 100-300 √©pocas seg√∫n resultados")

    # 2. Class balance: ratio between the largest and the smallest class share
    total = len(df_all)
    percentages = {c: (len(df_all[df_all['class_id'] == c]) / total) * 100 for c in CLASS_NAMES.keys()}
    max_pct = max(percentages.values())
    min_pct = min(percentages.values())

    # A configured class with no annotations gives min_pct == 0; treat that
    # as an infinite ratio instead of dividing by zero.
    ratio = max_pct / min_pct if min_pct > 0 else float('inf')

    if ratio > 3:
        print(f"\n   ‚ö†Ô∏è Clases desbalanceadas (ratio {ratio:.1f}:1)")
        print(f"      - Considera usar class weights")
        print(f"      - O agregar m√°s datos de la clase minoritaria")
    else:
        print(f"\n   ‚úÖ Clases relativamente balanceadas")

    # 3. Object size: mean of width * height over all annotations.
    # Nothing is printed for a mean area between 0.01 and 0.1.
    mean_area = (df_all['width'] * df_all['height']).mean()
    if mean_area < 0.01:
        print(f"\n   ‚ÑπÔ∏è Objetos peque√±os (√°rea promedio: {mean_area:.4f})")
        print(f"      - Considera usar imgsz=1280 para mejor detecci√≥n")
        print(f"      - Pero requiere m√°s VRAM")
    elif mean_area > 0.1:
        print(f"\n   ‚úÖ Objetos de tama√±o mediano-grande (√°rea: {mean_area:.4f})")
        print(f"      - imgsz=640 deber√≠a funcionar bien")

    print("\n" + "=" * 60)
    print("Listo para entrenar: python scripts/train.py")
    print("=" * 60)

else:
    print("\n‚ö†Ô∏è No hay datos en el dataset")
    print("   Sigue las instrucciones en README.md para:")
    print("   1. Capturar screenshots VR")
    print("   2. Anotar con Roboflow")
    print("   3. Exportar en formato YOLOv8")
    print("   4. Colocar los archivos en data/")

---

## Próximos Pasos

1. Si el dataset se ve bien, procede al entrenamiento:
   ```bash
   python scripts/train.py
   ```

2. Si encontraste problemas, corrígelos en Roboflow y re-exporta.

3. Despu√©s del entrenamiento, documenta los resultados en `docs/benchmarks.md`