# 🏟️ Soccer Banner Segmentation - EDA & Data Preparation

Questo notebook prepara i dati per il training del modello YOLOv11:
- Download dataset da Kaggle e Roboflow
- Preprocessing e split (70/20/10 per Kaggle)
- Analisi esplorativa comparativa dei dataset

---

## 📦 1. Installazione Dipendenze

Decommentare ed eseguire questa cella solo al primo avvio.

In [None]:
# ============================================================================
# INSTALLAZIONE DIPENDENZE (eseguire solo se necessario)
# ============================================================================
# pip install "numpy<2.0" ultralytics opencv-python roboflow kaggle pyyaml matplotlib tqdm python-dotenv seaborn

## 📚 2. Import delle Librerie

In [None]:
# ============================================================================
# IMPORT LIBRERIE
# ============================================================================

# Librerie Standard Python
import sys
import os
import yaml
import shutil
import glob
import random

# Librerie Scientifiche
import numpy as np
import pandas as pd
import cv2

# Visualizzazione
import matplotlib.pyplot as plt
import seaborn as sns

# Deep Learning
import torch

# Dataset & Utilities
from roboflow import Roboflow
from dotenv import load_dotenv
from tqdm import tqdm
from pathlib import Path

# Load environment variables (python-dotenv); the credential check further down
# tells the user to provide them through a .env file
load_dotenv()

# Plot appearance shared by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Report the runtime: PyTorch version, CUDA availability and, when present,
# the name of the first GPU (device index 0)
print("‚úÖ Ambiente configurato correttamente.")
print(f"   PyTorch version: {torch.__version__}")
print(f"   CUDA disponibile: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

## 📁 3. Configurazione Directory di Lavoro

Creazione struttura cartelle: input, output, dataset, model.

In [None]:
# ============================================================================
# CONFIGURAZIONE DIRECTORY
# ============================================================================
# Struttura del progetto:
#   BASE_DIR/
#   ├── input/           # Video e immagini di input per inferenza
#   ├── output/          # Risultati del training e inferenza
#   ├── dataset/         # Dataset scaricati
#   │   ├── kaggle_raw/
#   │   ├── kaggle_dataset/
#   │   └── roboflow_dataset/
#   └── model/           # Modelli addestrati
# ============================================================================

# Project root: the working directory at the moment this cell is executed
BASE_DIR = Path(os.getcwd())

# Main subdirectories
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
DATASETS_DIR = BASE_DIR / "dataset"
MODEL_DIR = BASE_DIR / "model"

# Create the directories when missing; existing ones are left untouched
for directory in [INPUT_DIR, OUTPUT_DIR, DATASETS_DIR, MODEL_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the user can check where files will land
print("‚úÖ Directory configurate correttamente:")
print(f"   üìÇ Base:     {BASE_DIR}")
print(f"   üìÇ Input:    {INPUT_DIR}")
print(f"   üìÇ Output:   {OUTPUT_DIR}")
print(f"   üìÇ Dataset:  {DATASETS_DIR}")
print(f"   üìÇ Model:    {MODEL_DIR}")

## 🔐 4. Validazione Credenziali API

In [None]:
# ============================================================================
# VALIDAZIONE CREDENZIALI API
# ============================================================================

# Read the three credentials from the process environment.
KAGGLE_USERNAME = os.getenv('KAGGLE_USERNAME')
KAGGLE_KEY = os.getenv('KAGGLE_KEY')
ROBOFLOW_KEY = os.getenv('ROBOFLOW_KEY')

# Names of the credentials that are unset or empty, listed in a fixed order so
# the error message is stable.
missing_keys = [
    key_name
    for key_name, key_value in (
        ('KAGGLE_USERNAME', KAGGLE_USERNAME),
        ('KAGGLE_KEY', KAGGLE_KEY),
        ('ROBOFLOW_KEY', ROBOFLOW_KEY),
    )
    if not key_value
]

# Fail early with setup instructions instead of failing later inside a download.
if missing_keys:
    raise RuntimeError(
        f"‚ùå Chiavi mancanti nel file .env: {', '.join(missing_keys)}\n"
        f"   Crea un file .env con:\n"
        f"   KAGGLE_USERNAME=tuo_username\n"
        f"   KAGGLE_KEY=tua_chiave\n"
        f"   ROBOFLOW_KEY=tua_chiave"
    )

# Re-export the Kaggle credentials into the environment of this process.
os.environ['KAGGLE_USERNAME'] = str(KAGGLE_USERNAME)
os.environ['KAGGLE_KEY'] = str(KAGGLE_KEY)

print("‚úÖ Credenziali caricate e validate.")
print(f"   üë§ Kaggle User: {KAGGLE_USERNAME}")
# Do not echo any part of the API key: notebook outputs are routinely saved and
# shared, so even a prefix of the secret would leak with the file.
print("   üîë Roboflow Key: ******** (impostata)")

## 📥 5. Download Dataset Kaggle

In [None]:
# ============================================================================
# ACQUISIZIONE DATASET KAGGLE
# ============================================================================

def acquire_kaggle_dataset():
    """Download the Kaggle ad-board mask dataset and extract it.

    The archive is downloaded into the current working directory with the
    ``kaggle`` command-line tool, unpacked into ``DATASETS_DIR / "kaggle_raw"``
    and then deleted.

    Returns:
        The extraction directory as a ``Path``, or ``None`` when the download
        fails or the expected archive is not found afterwards.
    """
    # Local import keeps this cell self-contained.
    import subprocess

    print("üì• Download Dataset Kaggle...")

    dataset_slug = "swagatajana/football-match-adboards-mask-dataset"
    # Argument list instead of a shell string: no shell parsing is involved.
    kaggle_cmd = ["kaggle", "datasets", "download", "-d", dataset_slug, "--force"]

    try:
        exit_code = subprocess.run(kaggle_cmd, check=False).returncode
    except OSError:
        # The kaggle executable could not be started (e.g. not installed);
        # treat it like any other failed download.
        exit_code = 1

    if exit_code != 0:
        print("‚ö†Ô∏è  Download fallito. Verifica credenziali Kaggle.")
        return None

    zip_path = Path("football-match-adboards-mask-dataset.zip")
    kaggle_raw_dir = DATASETS_DIR / "kaggle_raw"

    if not zip_path.exists():
        # Report the problem instead of returning None silently.
        print(f"   Archivio {zip_path} non trovato dopo il download.")
        return None

    print("   üì¶ Estrazione archivio...")
    shutil.unpack_archive(str(zip_path), str(kaggle_raw_dir))
    zip_path.unlink()
    print(f"   ‚úÖ Dataset estratto: {kaggle_raw_dir}")
    return kaggle_raw_dir

# Run the download now. KAGGLE_DIR is the extraction directory, or None when
# the download did not succeed.
# NOTE(review): nothing visible in this notebook reads KAGGLE_DIR afterwards;
# the preprocessing step rebuilds the raw path itself, so a failed download is
# not detected there - confirm whether that is intended.
KAGGLE_DIR = acquire_kaggle_dataset()
print("="*60)

## 📥 6. Download Dataset Roboflow

In [None]:
# ============================================================================
# ACQUISIZIONE DATASET ROBOFLOW
# ============================================================================

def acquire_roboflow_dataset():
    """Download version 4 of the Roboflow project and report its split sizes.

    The dataset is stored in ``DATASETS_DIR / "roboflow_dataset"``. After the
    download, the files in each ``<split>/images`` folder are counted and the
    per-split share is printed.

    Returns:
        The path of the dataset's ``data.yaml`` file, as a string.
    """
    print("\nüì• Download Dataset Roboflow v4...")

    roboflow_dir = DATASETS_DIR / "roboflow_dataset"

    client = Roboflow(api_key=ROBOFLOW_KEY)
    workspace = client.workspace("workspace-tp24t")
    rf_project = workspace.project("soccer-banner-segmentation")
    rf_project.version(4).download(
        model_format="yolov11",
        location=str(roboflow_dir)
    )

    def _count_files(split_name):
        # Number of entries matching '*.*' in the split's images folder,
        # or 0 when that folder does not exist.
        images_dir = roboflow_dir / split_name / 'images'
        if not images_dir.exists():
            return 0
        return len(list(images_dir.glob('*.*')))

    train_imgs = _count_files('train')
    valid_imgs = _count_files('valid')
    test_imgs = _count_files('test')
    total = train_imgs + valid_imgs + test_imgs

    print(f"   ‚úÖ Dataset scaricato: {roboflow_dir}")
    print(f"   üìä Split (gi√† effettuato da Roboflow):")
    if total > 0:
        print(f"      Train: {train_imgs:3d} immagini ({train_imgs/total*100:.0f}%)")
        print(f"      Valid: {valid_imgs:3d} immagini ({valid_imgs/total*100:.0f}%)")
        print(f"      Test:  {test_imgs:3d} immagini ({test_imgs/total*100:.0f}%)")
    else:
        # No images at all: print plain zeros rather than dividing by zero.
        print("      Train: 0")
        print("      Valid: 0")
        print("      Test:  0")

    return str(roboflow_dir / 'data.yaml')

# Run the download now; ROBO_YAML_PATH is the string path of the dataset's data.yaml
ROBO_YAML_PATH = acquire_roboflow_dataset()
print("="*60)

## 🏷️ 7. Preprocessing Dataset Kaggle (Split 70/20/10)

Conversione maschere → annotazioni YOLO con split train/valid/test.

In [None]:
# ============================================================================
# PREPROCESSING DATASET KAGGLE CON SPLIT 70/20/10
# ============================================================================

def _mask_to_yolo_lines(mask):
    """Turn a grayscale mask array into YOLO segmentation label lines.

    The mask is thresholded at 127, its external contours are extracted and
    each contour with at least 3 points becomes one line
    ``"0 x1 y1 x2 y2 ..."`` with coordinates divided by the mask width/height.
    """
    _, binary = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    h, w = mask.shape
    lines = []
    for contour in contours:
        # A polygon needs at least three vertices.
        if len(contour) < 3:
            continue
        points = contour.reshape(-1, 2).astype(np.float32)
        points[:, 0] /= w
        points[:, 1] /= h
        coords_str = " ".join(f"{pt[0]:.6f} {pt[1]:.6f}" for pt in points)
        lines.append(f"0 {coords_str}")
    return lines


def prepare_kaggle_labels():
    """Convert the Kaggle binary masks into YOLO labels with a 70/20/10 split.

    Images (``*.jpg``) and masks (``*.png``) are read from
    ``DATASETS_DIR / "kaggle_raw"``, shuffled with a fixed seed (42) and written
    to ``DATASETS_DIR / "kaggle_dataset" / {train,valid,test} / {images,labels}``
    together with a ``kaggle_data.yaml`` dataset description.

    A pair whose mask cannot be read is skipped entirely, so no image is left
    without a label file. When no pair is found the YAML is still written and
    the summary reports zeros.

    Returns:
        The path of the generated YAML file, as a string.
    """
    # Source directories
    raw_base = DATASETS_DIR / "kaggle_raw"
    src_img_dir = raw_base / 'Tagged_Images' / 'Tagged Images'
    src_msk_dir = raw_base / 'Masks' / 'Masks'

    # Destination directory
    dest_base = DATASETS_DIR / 'kaggle_dataset'

    # Create the split folder structure
    splits = ['train', 'valid', 'test']
    for split in splits:
        (dest_base / split / 'images').mkdir(parents=True, exist_ok=True)
        (dest_base / split / 'labels').mkdir(parents=True, exist_ok=True)

    # Locate the source files.
    # NOTE(review): image i is paired with mask i purely by sorted position;
    # this presumably relies on matching file names in both folders - confirm.
    img_files = sorted(glob.glob(str(src_img_dir / "*.jpg")))
    msk_files = sorted(glob.glob(str(src_msk_dir / "*.png")))
    num_pairs = min(len(img_files), len(msk_files))

    print(f"üîÑ Elaborazione di {num_pairs} coppie immagine-maschera...")
    if len(img_files) != len(msk_files):
        # A count mismatch means the positional pairing may be misaligned.
        print(
            f"   Attenzione: {len(img_files)} immagini ma {len(msk_files)} maschere; "
            f"uso le prime {num_pairs} coppie."
        )

    # Reproducible shuffle for the random split
    indices = list(range(num_pairs))
    random.seed(42)
    random.shuffle(indices)

    # Boundaries of the 70/20/10 split
    train_end = int(0.70 * num_pairs)
    valid_end = int(0.90 * num_pairs)

    split_indices = {
        'train': indices[:train_end],
        'valid': indices[train_end:valid_end],
        'test': indices[valid_end:]
    }

    counts = {'train': 0, 'valid': 0, 'test': 0}
    skipped = 0

    for split, idx_list in split_indices.items():
        images_out = dest_base / split / 'images'
        labels_out = dest_base / split / 'labels'

        for i in tqdm(idx_list, desc=f"Processing {split}"):
            # Read the mask first: when it is unreadable the whole pair is
            # skipped, instead of copying an image that gets no label file.
            mask = cv2.imread(msk_files[i], cv2.IMREAD_GRAYSCALE)
            if mask is None:
                skipped += 1
                continue

            new_name = f"kaggle_{i:04d}"
            shutil.copy(img_files[i], images_out / f"{new_name}.jpg")

            polygons = _mask_to_yolo_lines(mask)
            with open(labels_out / f"{new_name}.txt", "w") as f:
                f.write("\n".join(polygons))

            counts[split] += 1

    # Write the dataset YAML
    kaggle_config = {
        'path': str(dest_base.absolute()),
        'train': 'train/images',
        'val': 'valid/images',
        'test': 'test/images',
        'nc': 1,
        'names': ['banner']
    }

    yaml_path = dest_base / 'kaggle_data.yaml'
    with open(yaml_path, 'w') as f:
        yaml.dump(kaggle_config, f, default_flow_style=False)

    total = sum(counts.values())
    # Percentages fall back to 0 when nothing was processed (no ZeroDivisionError).
    pct = {name: (n / total * 100 if total else 0.0) for name, n in counts.items()}

    print(f"\n‚úÖ Preprocessing completato con split 70/20/10:")
    print(f"   üìä Train: {counts['train']} ({pct['train']:.0f}%)")
    print(f"   üìä Valid: {counts['valid']} ({pct['valid']:.0f}%)")
    print(f"   üìä Test:  {counts['test']} ({pct['test']:.0f}%)")
    print(f"   üìÑ Config: {yaml_path}")
    if skipped:
        print(f"   Attenzione: {skipped} coppie saltate (maschera non leggibile).")
    if total == 0:
        print(f"   Attenzione: nessuna coppia elaborata; verifica il contenuto di {raw_base}.")

    return str(yaml_path)

# Run the preprocessing now; KAG_YAML_PATH is the string path of the generated dataset YAML
KAG_YAML_PATH = prepare_kaggle_labels()

## 🔧 8. Fix Configurazione Roboflow

In [None]:
# ============================================================================
# FIX CONFIGURAZIONE YAML ROBOFLOW
# ============================================================================

def fix_roboflow_yaml(yaml_path: str) -> bool:
    """Rewrite the paths in a Roboflow ``data.yaml`` as absolute paths.

    ``path``, ``train`` and ``val`` are pointed at the folder that contains the
    YAML file; ``test`` is rewritten the same way when a ``test/images`` folder
    exists. ``nc`` is forced to 1 and ``names`` to ``{0: 'banner'}``.

    Args:
        yaml_path: Location of the ``data.yaml`` file to fix in place.

    Returns:
        ``True`` when the file was rewritten, ``False`` when the file or the
        ``train``/``valid`` image folders are missing (the file is then left
        unchanged).
    """
    if not os.path.exists(yaml_path):
        print(f"‚ùå File non trovato: {yaml_path}")
        return False

    with open(yaml_path, 'r') as f:
        # An empty YAML document loads as None; start from an empty mapping.
        config = yaml.safe_load(f) or {}

    base_dir = Path(yaml_path).parent.absolute()
    train_path = base_dir / 'train' / 'images'
    val_path = base_dir / 'valid' / 'images'
    test_path = base_dir / 'test' / 'images'

    # Validate before touching the config so a failure leaves nothing half-done.
    if not train_path.exists() or not val_path.exists():
        print("‚ùå Directory train/ o valid/ non trovate")
        return False

    config['path'] = str(base_dir)
    config['train'] = str(train_path)
    config['val'] = str(val_path)
    # Keep the optional test split consistent with train/val instead of leaving
    # whatever (presumably relative) entry the downloaded file carried.
    if test_path.exists():
        config['test'] = str(test_path)
    config['nc'] = 1
    config['names'] = {0: 'banner'}

    n_train = len(list(train_path.glob('*.*')))
    n_val = len(list(val_path.glob('*.*')))

    with open(yaml_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"‚úÖ data.yaml Roboflow corretto")
    print(f"   üìä Train: {n_train} immagini")
    print(f"   üìä Valid: {n_val} immagini")

    return True

# Fix the downloaded Roboflow data.yaml in place (the boolean result is not checked here)
fix_roboflow_yaml(ROBO_YAML_PATH)

---
# 📊 Exploratory Data Analysis (EDA)

Analisi comparativa dei dataset Kaggle e Roboflow.

---

## 📈 9. Statistiche Dataset

In [None]:
# ============================================================================
# STATISTICHE GENERALI DEI DATASET
# ============================================================================

def get_dataset_stats(dataset_path: Path, name: str):
    """Collect per-split file counts and average image size for a dataset.

    For each of ``train``, ``valid`` and ``test`` that has an ``images`` folder,
    the result records the number of image files, the number of ``.txt`` label
    files and the mean width/height of the first 50 listed images that OpenCV
    can read (0 when none can be read).

    Args:
        dataset_path: Root folder containing the split sub-folders.
        name: Display name stored in the result.

    Returns:
        ``{'name': name, 'splits': {split: {...}}}``; splits without an
        ``images`` folder are omitted.
    """
    per_split = {}

    for split_name in ('train', 'valid', 'test'):
        images_dir = dataset_path / split_name / 'images'
        if not images_dir.exists():
            continue

        labels_dir = dataset_path / split_name / 'labels'
        image_files = list(images_dir.glob('*.*'))
        label_files = list(labels_dir.glob('*.txt')) if labels_dir.exists() else []

        # Measure only a sample of at most 50 images to keep the scan quick.
        widths = []
        heights = []
        for image_file in image_files[:50]:
            frame = cv2.imread(str(image_file))
            if frame is None:
                continue
            widths.append(frame.shape[1])
            heights.append(frame.shape[0])

        per_split[split_name] = {
            'n_images': len(image_files),
            'n_labels': len(label_files),
            'avg_width': np.mean(widths) if widths else 0,
            'avg_height': np.mean(heights) if heights else 0,
        }

    return {'name': name, 'splits': per_split}

# Compute the statistics for both prepared datasets
kaggle_stats = get_dataset_stats(DATASETS_DIR / 'kaggle_dataset', 'Kaggle')
roboflow_stats = get_dataset_stats(DATASETS_DIR / 'roboflow_dataset', 'Roboflow')

# Print, per dataset and split: image count, share of the dataset total and
# the average width x height reported by get_dataset_stats
print("üìä STATISTICHE DATASET")
print("="*60)

for stats in [kaggle_stats, roboflow_stats]:
    print(f"\nüóÇÔ∏è  {stats['name']}:")
    total = sum(s['n_images'] for s in stats['splits'].values())
    for split, data in stats['splits'].items():
        # Empty dataset: report 0% instead of dividing by zero
        pct = data['n_images']/total*100 if total > 0 else 0
        print(f"   {split.capitalize():6s}: {data['n_images']:4d} imgs ({pct:.0f}%) | {data['avg_width']:.0f}x{data['avg_height']:.0f}")

## 📊 10. Confronto Distribuzione Split

In [None]:
# ============================================================================
# VISUALIZZAZIONE CONFRONTO SPLIT
# ============================================================================

# One pie chart per dataset, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, stats in zip(axes, [kaggle_stats, roboflow_stats]):
    # Only the splits that exist for this dataset are plotted
    splits = list(stats['splits'].keys())
    counts = [stats['splits'][s]['n_images'] for s in splits]
    colors = ['#2ecc71', '#3498db', '#e74c3c']

    # NOTE(review): a dataset whose splits are all empty gives all-zero counts;
    # confirm how ax.pie behaves in that case
    ax.pie(counts, labels=splits, autopct='%1.0f%%', colors=colors[:len(splits)],
           explode=[0.02]*len(splits), shadow=True, startangle=90)
    ax.set_title(f"{stats['name']} Dataset\n(Total: {sum(counts)} images)", fontsize=12, fontweight='bold')

plt.suptitle('üìä Distribuzione Split Dataset', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
# Save the figure to the output folder before displaying it
plt.savefig(OUTPUT_DIR / 'eda_split_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 🖼️ 11. Visualizzazione Campioni

In [None]:
# ============================================================================
# VISUALIZZAZIONE CAMPIONI DA ENTRAMBI I DATASET
# ============================================================================

def show_samples(dataset_path: Path, title: str, n_samples=4):
    """Display random training images from a dataset in a single row.

    Up to ``n_samples`` files are drawn at random from
    ``dataset_path / 'train' / 'images'``. Files that OpenCV cannot decode are
    skipped, and the figure has exactly one axis per displayed image, so no
    empty frames appear when fewer images are available than requested.

    Args:
        dataset_path: Dataset root containing ``train/images``.
        title: Text appended to the figure title.
        n_samples: Maximum number of images to show.

    Returns:
        ``None``. A message is printed instead of a figure when there is
        nothing to show.
    """
    train_imgs = list((dataset_path / 'train' / 'images').glob('*.*'))

    if not train_imgs:
        print(f"‚ùå Nessuna immagine trovata in {dataset_path}")
        return

    samples = random.sample(train_imgs, min(n_samples, len(train_imgs)))

    # Decode before laying out the figure: imread returns None for unreadable
    # files, which would otherwise crash the colour conversion.
    loaded = []
    for img_path in samples:
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        loaded.append((img_path, cv2.cvtColor(img, cv2.COLOR_BGR2RGB)))

    if not loaded:
        print(f"‚ùå Nessuna immagine trovata in {dataset_path}")
        return

    n_shown = len(loaded)
    # squeeze=False always yields a 2-D array of axes, so a single image needs
    # no special case.
    _fig, axes = plt.subplots(1, n_shown, figsize=(4*n_shown, 4), squeeze=False)

    for ax, (img_path, img) in zip(axes[0], loaded):
        ax.imshow(img)
        ax.set_title(img_path.name[:20], fontsize=9)
        ax.axis('off')

    plt.suptitle(f'üñºÔ∏è {title}', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Show random training samples from each dataset (default number of samples)
show_samples(DATASETS_DIR / 'kaggle_dataset', 'Campioni Dataset Kaggle')
show_samples(DATASETS_DIR / 'roboflow_dataset', 'Campioni Dataset Roboflow')

## 📏 12. Analisi Annotazioni

In [None]:
# ============================================================================
# ANALISI ANNOTAZIONI (NUMERO E DIMENSIONE POLIGONI)
# ============================================================================

def analyze_annotations(dataset_path: Path, name: str):
    """Summarise the label files in a dataset's ``train/labels`` folder.

    Every non-blank line of a ``.txt`` label file counts as one annotation.
    For lines with more than one token, the number of polygon points is
    ``(tokens - 1) // 2`` (the first token is the class id).

    Args:
        dataset_path: Dataset root containing ``train/labels``.
        name: Display name stored in the result.

    Returns:
        ``None`` when the labels folder does not exist; otherwise a dict with
        ``name``, ``polygon_counts`` (annotations per file), ``polygon_sizes``
        (points per polygon), ``avg_polygons`` and ``avg_points`` (0 when the
        corresponding list is empty).
    """
    label_dir = dataset_path / 'train' / 'labels'

    if not label_dir.exists():
        return None

    polygon_counts = []
    polygon_sizes = []

    # Sorted so the order of the returned lists does not depend on the filesystem.
    for lbl_path in sorted(label_dir.glob('*.txt')):
        with open(lbl_path, 'r') as f:
            # Drop blank lines: they are not annotations and would otherwise
            # inflate the per-image count.
            rows = [parts for parts in (line.split() for line in f) if parts]

        polygon_counts.append(len(rows))
        polygon_sizes.extend(
            (len(parts) - 1) // 2 for parts in rows if len(parts) > 1
        )

    return {
        'name': name,
        'polygon_counts': polygon_counts,
        'polygon_sizes': polygon_sizes,
        'avg_polygons': np.mean(polygon_counts) if polygon_counts else 0,
        'avg_points': np.mean(polygon_sizes) if polygon_sizes else 0
    }

# Analyse the training labels of both datasets (either result may be None)
kaggle_ann = analyze_annotations(DATASETS_DIR / 'kaggle_dataset', 'Kaggle')
roboflow_ann = analyze_annotations(DATASETS_DIR / 'roboflow_dataset', 'Roboflow')

# Plot: one histogram of annotations-per-image for each dataset
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, ann in zip(axes, [kaggle_ann, roboflow_ann]):
    # A dataset without analysis results leaves its axis empty
    if ann:
        ax.hist(ann['polygon_counts'], bins=20, color='steelblue', edgecolor='black', alpha=0.7)
        # Dashed vertical line marking the mean
        ax.axvline(ann['avg_polygons'], color='red', linestyle='--', linewidth=2, label=f"Media: {ann['avg_polygons']:.1f}")
        ax.set_xlabel('Numero Banner per Immagine')
        ax.set_ylabel('Frequenza')
        ax.set_title(f"{ann['name']} - Distribuzione Annotazioni")
        ax.legend()

plt.suptitle('üìè Analisi Annotazioni', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
# Save the figure to the output folder before displaying it
plt.savefig(OUTPUT_DIR / 'eda_annotations.png', dpi=150, bbox_inches='tight')
plt.show()

# Text summary of the same statistics
print("\nüìè STATISTICHE ANNOTAZIONI")
print("="*60)
for ann in [kaggle_ann, roboflow_ann]:
    if ann:
        print(f"\n{ann['name']}:")
        print(f"   Media banner/img: {ann['avg_polygons']:.2f}")
        print(f"   Media punti/polygon: {ann['avg_points']:.1f}")

## 📋 13. Riepilogo Finale

In [None]:
# ============================================================================
# RIEPILOGO FINALE
# ============================================================================

# Closing banner: where the dataset YAML files and the saved charts are, and
# which notebook to run next
print("="*60)
print("‚úÖ EDA & DATA PREPARATION COMPLETATA")
print("="*60)
print("\nüìÅ Path dei dataset configurati:")
print(f"   Kaggle YAML:   {KAG_YAML_PATH}")
print(f"   Roboflow YAML: {ROBO_YAML_PATH}")
print("\nüìä Grafici salvati in:")
print(f"   {OUTPUT_DIR}")
print("\nüöÄ Ora puoi procedere con il training notebook (SBS TRAINING.ipynb)")
print("="*60)