In [None]:
# =========================================================
# INSTALLATION
# =========================================================
# Notebook shell escape: installs ultralytics (imported below as YOLO) and
# pillow (imported below as PIL) into the notebook environment.
!pip install ultralytics pillow

print("‚úÖ Depend√™ncias instaladas!")

In [None]:
from ultralytics import YOLO
from google.colab import drive
from pathlib import Path
import shutil
import os
import yaml
import glob
import xml.etree.ElementTree as ET
from collections import Counter

# =========================================================
# 0. Mount Google Drive and declare project paths
# =========================================================
drive.mount("/content/drive")

# Persistent locations on Google Drive.
DRIVE_ROOT = "/content/drive/MyDrive/colab"
PROJECT_NAME = "cloud-arch-security-mvp"
DRIVE_PROJECT = "/".join((DRIVE_ROOT, PROJECT_NAME))
DRIVE_CHECKPOINTS = DRIVE_PROJECT + "/checkpoints"
DRIVE_DATASET_CACHE = DRIVE_PROJECT + "/kaggle_dataset_cache"

# Working directory on the notebook machine.
CONTENT_PROJECT = "/content/yolo-project"

# The checkpoint folder must exist before anything is copied into it.
Path(DRIVE_CHECKPOINTS).mkdir(parents=True, exist_ok=True)

# =========================================================
# 1. Load the dataset from Drive
# =========================================================
print("üì• Carregando dataset do Google Drive...")

# Wipe whatever a previous run left behind, then recreate the work folder.
_work_dir = Path(CONTENT_PROJECT)
if _work_dir.exists():
    shutil.rmtree(_work_dir)
_work_dir.mkdir(parents=True, exist_ok=True)

RAW_DATA_PATH = CONTENT_PROJECT + "/raw_data"

# Abort early when the dataset has not been uploaded to Drive.
if not Path(DRIVE_DATASET_CACHE).exists():
    for _message in (
        "‚ùå Dataset n√£o encontrado!",
        f"   Esperado em: {DRIVE_DATASET_CACHE}",
        "   Fa√ßa upload do dataset para essa pasta no Google Drive",
    ):
        print(_message)
    raise Exception("Dataset n√£o encontrado no Drive")

# Report how many entries the cache holds.
cache_files = os.listdir(DRIVE_DATASET_CACHE)
print(f"   üìÇ Encontrados {len(cache_files)} itens no cache")

# Copy the cache into the working directory.
print("   üìã Copiando para ambiente de trabalho...")
shutil.copytree(DRIVE_DATASET_CACHE, RAW_DATA_PATH, dirs_exist_ok=True)
_copied_entries = os.listdir(RAW_DATA_PATH)
print(f"‚úÖ Dataset carregado! ({len(_copied_entries)} arquivos)")

# Every relative path used from here on is resolved against the work folder.
os.chdir(CONTENT_PROJECT)

# =========================================================
# 2. Convert Pascal VOC (XML) to YOLO format
# =========================================================
print("\nüîÑ Convertendo Pascal VOC para YOLO format...")

RAW_DATA = Path("raw_data")

# Recursive search so annotations in nested folders are found too.
xml_files = list(RAW_DATA.glob("**/*.xml"))
print(f"   Encontrados {len(xml_files)} arquivos XML")

if not xml_files:
    # Show (part of) the folder contents to help diagnose a wrong upload.
    print("‚ùå Nenhum arquivo XML encontrado!")
    print(f"   Verificando conte√∫do de {RAW_DATA}:")
    for i, item in enumerate(RAW_DATA.iterdir()):
        print(f"      - {item.name}")
        if i > 20:
            print("      ... (mais arquivos)")
            break
    raise Exception("Dataset inv√°lido - sem arquivos XML")

# Collect every distinct class name used by the annotations.
all_classes = set()
unreadable_xml = []  # (path, error) for files that could not be parsed
for xml_file in xml_files:
    try:
        root = ET.parse(xml_file).getroot()
    except (ET.ParseError, OSError) as exc:
        # Keep going, but remember the failure so it can be reported
        # instead of being silently dropped.
        unreadable_xml.append((xml_file, exc))
        continue
    for obj in root.findall('object'):
        # findtext returns None for a missing <name> and '' for an empty
        # one; neither is a usable class (and None would break sorting).
        class_name = obj.findtext('name')
        if class_name:
            all_classes.add(class_name)

if unreadable_xml:
    print(f"   AVISO: {len(unreadable_xml)} arquivos XML ilegiveis (ignorados)")
    for bad_path, bad_error in unreadable_xml[:5]:
        print(f"      - {bad_path}: {bad_error}")

all_classes = sorted(all_classes)
print(f"   Total de classes encontradas: {len(all_classes)}")

# =========================================================
# 3. Map to the 14 STRIDE categories
#    (13 keyword groups below + the 'other' fallback)
# =========================================================
print("\nüìä Mapeando para categorias STRIDE...")

# Category -> keywords that identify it. Matching is case-insensitive.
CATEGORY_MAPPING = {
    'compute': ['EC2', 'Lambda', 'EKS', 'Fargate', 'Container', 'ECS', 
                'App Service', 'Virtual Machine', 'VM', 'Compute Engine',
                'Cloud Run', 'App Engine', 'GKE', 'AKS', 'Kubernetes',
                'Elastic Beanstalk', 'Batch', 'Lightsail', 'EMR'],
    
    'database': ['RDS', 'DynamoDB', 'Aurora', 'DocumentDB', 'ElastiCache',
                 'Cosmos DB', 'SQL Database', 'Cloud SQL', 'Firestore',
                 'BigQuery', 'Redshift', 'Neptune', 'Cloud Spanner',
                 'Managed Database', 'Database', 'DB', 'Redis', 'Memcached'],
    
    'storage': ['S3', 'EBS', 'EFS', 'Glacier', 'Storage', 'Blob Storage',
                'Cloud Storage', 'File Storage', 'Azure Storage', 'GCS',
                'Backup', 'Archive', 'Data Lake'],
    
    'network': ['VPC', 'Virtual Network', 'VNet', 'Subnet', 'Gateway',
                'Load Balancer', 'ALB', 'NLB', 'ELB', 'CloudFront',
                'CDN', 'Route 53', 'DNS', 'VPN', 'Direct Connect',
                'ExpressRoute', 'Cloud Interconnect', 'NAT', 'Firewall',
                'Network', 'Internet Gateway', 'Transit Gateway'],
    
    'security': ['IAM', 'Identity', 'Cognito', 'WAF', 'Shield', 'GuardDuty',
                 'Security Hub', 'Key Vault', 'KMS', 'Secrets Manager',
                 'Certificate', 'Azure AD', 'Cloud Identity', 'SSO'],
    
    'api_gateway': ['API Gateway', 'API Management', 'Apigee', 'AppSync',
                    'API', 'Gateway', 'Endpoints'],
    
    'messaging': ['SQS', 'SNS', 'EventBridge', 'Service Bus', 'Pub/Sub',
                  'Kinesis', 'Event Hub', 'MQ', 'Queue', 'Topic', 
                  'Notification', 'Event Grid'],
    
    'monitoring': ['CloudWatch', 'Monitor', 'Log Analytics', 'Stackdriver',
                   'Cloud Monitoring', 'X-Ray', 'Application Insights',
                   'Logging', 'Metrics', 'Trace', 'Grafana', 'Prometheus'],
    
    'identity': ['User', 'Client', 'Application', 'Service Principal',
                 'OAuth', 'OIDC', 'SAML', 'Directory', 'Active Directory'],
    
    'ml_ai': ['SageMaker', 'Machine Learning', 'AI Platform', 'Databricks',
              'Cognitive Services', 'Vertex AI', 'Rekognition', 'Comprehend',
              'Textract', 'Vision', 'Speech', 'Natural Language'],
    
    'devops': ['CodePipeline', 'CodeBuild', 'CodeDeploy', 'DevOps',
               'Cloud Build', 'Artifact Registry', 'Container Registry',
               'ECR', 'ACR', 'GCR', 'CI/CD', 'Pipeline', 'Build'],
    
    'serverless': ['Lambda', 'Functions', 'Azure Functions', 'Cloud Functions',
                   'Step Functions', 'Logic Apps', 'Workflows'],
    
    'analytics': ['Athena', 'BigQuery', 'Synapse', 'Data Factory',
                  'Glue', 'Dataflow', 'EMR', 'HDInsight', 'Dataproc',
                  'Analytics', 'Data Warehouse', 'ETL']
}

# Inverse lookup: lower-cased keyword -> category.
# NOTE: a keyword listed under several categories ('Lambda', 'Gateway',
# 'BigQuery', 'EMR') ends up in the category that appears LAST above,
# because later assignments overwrite earlier ones.
name_to_category = {}
for category, keywords in CATEGORY_MAPPING.items():
    for keyword in keywords:
        name_to_category[keyword.lower()] = category

def get_category(class_name):
    """Map a dataset class name to one of the simplified categories.

    Resolution order (all comparisons are case-insensitive):

    1. exact keyword match;
    2. the longest keyword contained in the class name, so a specific
       keyword ("container registry") beats a generic one ("container");
    3. the shortest keyword that contains the class name;
    4. ``'other'``.

    Args:
        class_name: Class label as written in the annotation. ``None``,
            empty and whitespace-only values are mapped to ``'other'``
            instead of accidentally matching the first keyword.

    Returns:
        A key of ``CATEGORY_MAPPING`` or ``'other'``.
    """
    if not class_name or not class_name.strip():
        return 'other'
    class_lower = class_name.strip().lower()

    # 1. Exact match.
    exact = name_to_category.get(class_lower)
    if exact is not None:
        return exact

    # 2. Keyword inside the class name; the longest (most specific) wins.
    #    max() returns the first maximum, so ties keep dictionary order.
    contained = [kw for kw in name_to_category if kw in class_lower]
    if contained:
        return name_to_category[max(contained, key=len)]

    # 3. Class name is a fragment of a keyword; the shortest is the closest.
    containing = [kw for kw in name_to_category if class_lower in kw]
    if containing:
        return name_to_category[min(containing, key=len)]

    return 'other'

# Resolve the category of every class found in the dataset.
class_to_category = dict(zip(all_classes, map(get_category, all_classes)))

# Show how many classes landed in each category, largest first.
category_counts = Counter(class_to_category.values())
print("\nüìä Distribui√ß√£o por categoria:")
for cat, count in category_counts.most_common():
    print(f"   {cat}: {count} classes")

# List (at most 20 of) the classes that no keyword matched.
other_classes = [
    name for name, assigned in class_to_category.items() if assigned == 'other'
]
if other_classes:
    print(f"\n‚ö†Ô∏è Classes em 'other' ({len(other_classes)}):")
    print("\n".join(f"   - {name}" for name in other_classes[:20]))

# =========================================================
# 4. Create the YOLO folder structure and convert annotations
# =========================================================
print("\nüìÅ Criando estrutura YOLO...")

# Class ids follow the order of CATEGORY_MAPPING, with 'other' last.
SIMPLIFIED_NAMES = [*CATEGORY_MAPPING, 'other']
category_to_id = dict(zip(SIMPLIFIED_NAMES, range(len(SIMPLIFIED_NAMES))))

# One images/ and one labels/ folder per split.
for split in ('train', 'valid', 'test'):
    for _kind in ('images', 'labels'):
        os.makedirs(f"dataset/{split}/{_kind}", exist_ok=True)

def convert_voc_to_yolo(xml_file, img_width, img_height,
                        class_map=None, id_map=None):
    """Convert one Pascal VOC annotation into YOLO label lines.

    Box corners are clipped to the image first and the centre/size are
    computed from the clipped corners, so a box that overhangs the image
    keeps its visible part instead of being distorted. Boxes with no area
    left after clipping are dropped; swapped min/max corners are reordered.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.
        img_width: Image width in pixels (must be > 0).
        img_height: Image height in pixels (must be > 0).
        class_map: Optional class-name -> category mapping. Defaults to the
            module-level ``class_to_category``.
        id_map: Optional category -> class-id mapping. Defaults to the
            module-level ``category_to_id``.

    Returns:
        A list of strings ``"<class_id> <x_center> <y_center> <w> <h>"`` with
        coordinates normalised to [0, 1] and six decimal places.

    Raises:
        ValueError: If the image size is not positive, an ``<object>`` lacks
            ``<name>`` or ``<bndbox>``, or a coordinate is not numeric.
        TypeError: If a ``<bndbox>`` is missing one of its four coordinates.
    """
    if img_width <= 0 or img_height <= 0:
        raise ValueError(f"invalid image size: {img_width}x{img_height}")
    if class_map is None:
        class_map = class_to_category
    if id_map is None:
        id_map = category_to_id

    root = ET.parse(xml_file).getroot()

    yolo_lines = []
    for obj in root.findall('object'):
        class_name = obj.findtext('name')
        bbox = obj.find('bndbox')
        if class_name is None or bbox is None:
            raise ValueError("<object> without <name> or <bndbox>")

        # Classes that were not seen while building the mapping go to 'other'.
        class_id = id_map[class_map.get(class_name, 'other')]

        x1, y1, x2, y2 = (
            float(bbox.findtext(tag))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )

        # Clip the corners to the image, then order them.
        xmin, xmax = sorted(min(max(v, 0.0), img_width) for v in (x1, x2))
        ymin, ymax = sorted(min(max(v, 0.0), img_height) for v in (y1, y2))

        # Nothing of the box is visible inside the image.
        if xmax <= xmin or ymax <= ymin:
            continue

        # YOLO format: normalised centre x, centre y, width, height.
        x_center = (xmin + xmax) / 2 / img_width
        y_center = (ymin + ymax) / 2 / img_height
        width = (xmax - xmin) / img_width
        height = (ymax - ymin) / img_height

        yolo_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")

    return yolo_lines

# Process every annotation file
from PIL import Image
import random

# Pair each XML with the image that sits next to it (same folder, same stem).
# Extensions are tried in this order and the first existing file wins.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG')

pairs = []
for xml_file in xml_files:
    _candidates = (xml_file.with_suffix(ext) for ext in _IMAGE_EXTENSIONS)
    img_path = next((c for c in _candidates if c.exists()), None)
    if img_path is not None:
        pairs.append((img_path, xml_file))

print(f"   Encontrados {len(pairs)} pares imagem/anota√ß√£o")

if not pairs:
    print("‚ùå Nenhum par imagem/anota√ß√£o encontrado!")
    raise Exception("Dataset inv√°lido")

# Deterministic shuffle, then split 80% train / 10% valid / remainder test.
random.seed(42)
random.shuffle(pairs)

_total_pairs = len(pairs)
n_train = int(_total_pairs * 0.8)
n_valid = int(_total_pairs * 0.1)
_valid_end = n_train + n_valid

train_pairs = pairs[:n_train]
valid_pairs = pairs[n_train:_valid_end]
test_pairs = pairs[_valid_end:]

print(f"   Split: {len(train_pairs)} train, {len(valid_pairs)} valid, {len(test_pairs)} test")

# Convert the annotations and copy the images into the YOLO structure.
label_counts = Counter()
errors = 0
failed_files = []  # (image path, exception) for every pair that was skipped

for split, split_pairs in [('train', train_pairs), ('valid', valid_pairs), ('test', test_pairs)]:
    images_dir = Path(f"dataset/{split}/images")
    labels_dir = Path(f"dataset/{split}/labels")

    for img_path, xml_path in split_pairs:
        try:
            # Image size is needed to normalise the box coordinates.
            with Image.open(img_path) as img:
                img_width, img_height = img.size

            yolo_lines = convert_voc_to_yolo(xml_path, img_width, img_height)

            # Pairs without any usable box are not added to the dataset.
            if not yolo_lines:
                continue

            shutil.copy(img_path, images_dir / img_path.name)
            with open(labels_dir / (img_path.stem + ".txt"), "w") as f:
                f.write("\n".join(yolo_lines))
        except Exception as e:
            # Best effort: one bad pair must not stop the whole conversion,
            # but it is recorded so the problem can be inspected afterwards.
            errors += 1
            failed_files.append((img_path, e))
            continue

        # Count labels only for pairs that really made it into the dataset.
        for line in yolo_lines:
            class_id = int(line.split()[0])
            label_counts[SIMPLIFIED_NAMES[class_id]] += 1

if errors > 0:
    print(f"   ‚ö†Ô∏è {errors} arquivos com erro (ignorados)")
    for failed_path, failed_error in failed_files[:10]:
        print(f"      - {failed_path}: {type(failed_error).__name__}: {failed_error}")

print("\nüìä Distribui√ß√£o de labels por categoria:")
total_labels = sum(label_counts.values())
for cat in SIMPLIFIED_NAMES:
    count = label_counts.get(cat, 0)
    pct = (count / total_labels * 100) if total_labels > 0 else 0
    # One '#' per 50 labels, capped at 50 characters.
    bar = "#" * min(50, count // 50)
    print(f"   {cat:12}: {count:5} ({pct:5.1f}%) {bar}")

print(f"\n   Total: {total_labels} labels")

# =========================================================
# 5. Write data.yaml
# =========================================================
# Dataset description passed to training and validation via data="dataset/data.yaml".
data_config = dict(
    path='/content/yolo-project/dataset',
    train='train/images',
    val='valid/images',
    test='test/images',
    nc=len(SIMPLIFIED_NAMES),
    names=SIMPLIFIED_NAMES,
)

_yaml_text = yaml.dump(data_config, default_flow_style=False)
with open("dataset/data.yaml", "w") as f:
    f.write(_yaml_text)

print(f"\n‚úÖ Dataset preparado com {len(SIMPLIFIED_NAMES)} categorias!")

# =========================================================
# 6. Check for previous checkpoints
# =========================================================
print("\nüîç Verificando checkpoints anteriores...")

# IMPORTANT: every *.pt already in the Drive checkpoint folder is deleted
# (the old model had different classes).
old_checkpoints = glob.glob(f"{DRIVE_CHECKPOINTS}/*.pt")
if len(old_checkpoints) > 0:
    print("‚ö†Ô∏è Encontrados checkpoints antigos - limpando...")
    for _stale_path in old_checkpoints:
        Path(_stale_path).unlink()
    print("‚úÖ Checkpoints antigos removidos")

# No checkpoint to resume from: training always starts from the base model.
last_checkpoint = None
print("üì≠ Iniciando treinamento do zero com novo dataset")

# =========================================================
# 7. Callback that saves checkpoints
# =========================================================
SAVE_EVERY_N_EPOCHS = 5

def save_checkpoint_to_drive(trainer):
    """Copy the trainer's current weight files to the Drive checkpoint folder.

    Does nothing unless the 1-based epoch number (``trainer.epoch + 1``) is a
    multiple of ``SAVE_EVERY_N_EPOCHS``. On those epochs, whatever
    ``last.pt`` exists under ``trainer.save_dir / "weights"`` is copied twice
    (as ``epoch_NNN.pt`` and as ``last.pt``) and ``best.pt``, if present, is
    copied as ``best.pt``.

    Args:
        trainer: Object exposing ``epoch`` (0-based int) and ``save_dir``
            (a path supporting the ``/`` operator).
    """
    completed = trainer.epoch + 1
    if completed % SAVE_EVERY_N_EPOCHS != 0:
        return

    weights = trainer.save_dir / "weights"
    latest = weights / "last.pt"
    best = weights / "best.pt"

    if latest.exists():
        epoch_name = f"epoch_{completed:03d}.pt"
        for target_name in (epoch_name, "last.pt"):
            shutil.copy(latest, f"{DRIVE_CHECKPOINTS}/{target_name}")
        print(f"\nüíæ Checkpoint salvo: {epoch_name}")

    if best.exists():
        shutil.copy(best, f"{DRIVE_CHECKPOINTS}/best.pt")

# =========================================================
# 8. Load the base model
# =========================================================
print("\nüì¶ Carregando modelo base: yolov8n.pt")
model = YOLO("yolov8n.pt")
# Register on "on_model_save", which the trainer fires right after it has
# written last.pt/best.pt for the epoch. "on_train_epoch_end" runs before
# validation and before the weights are saved, so the files copied at that
# point still belong to the previous epoch.
model.add_callback("on_model_save", save_checkpoint_to_drive)

# =========================================================
# 9. Training
# =========================================================
print("\nüöÄ Iniciando treinamento...")
print(f"   üìä {len(train_pairs)} imagens de treino")
print(f"   üìä {len(SIMPLIFIED_NAMES)} categorias")

# All training settings in one place; passed unchanged to model.train().
_train_settings = {
    "data": "dataset/data.yaml",

    # Main configuration
    "epochs": 100,
    "patience": 20,
    "batch": 16,
    "imgsz": 640,

    # Optimisation
    "optimizer": 'AdamW',
    "lr0": 0.001,
    "lrf": 0.01,
    "weight_decay": 0.0005,
    "warmup_epochs": 3,
    "cos_lr": True,

    # Augmentation (moderate - the dataset already contains augmentation)
    "hsv_h": 0.015,
    "hsv_s": 0.4,
    "hsv_v": 0.3,
    "degrees": 10,
    "translate": 0.1,
    "scale": 0.4,
    "fliplr": 0.5,
    "mosaic": 0.8,
    "mixup": 0.1,

    # Loss weights
    "cls": 1.0,
    "box": 7.5,
    "dfl": 1.5,

    # Infrastructure
    "cache": True,
    "workers": 4,
    "device": 0,
    "exist_ok": True,
    "plots": True,
    "save_period": 5,

    # Output location: runs/detect/train_kaggle
    "name": 'train_kaggle',
    "project": 'runs/detect',
}

results = model.train(**_train_settings)

# =========================================================
# 10. Save the final model
# =========================================================
DEST_WEIGHTS = f"{DRIVE_PROJECT}/weights_backup"
SOURCE_WEIGHTS = "runs/detect/train_kaggle/weights"

os.makedirs(DEST_WEIGHTS, exist_ok=True)

print("\nüíæ Salvando modelo final...")

_best_weights = f"{SOURCE_WEIGHTS}/best.pt"
if not os.path.exists(_best_weights):
    print("‚ö†Ô∏è best.pt n√£o encontrado")
else:
    # Two copies on Drive: the backup folder and the checkpoint folder.
    for _target in (f"{DEST_WEIGHTS}/best_kaggle.pt",
                    f"{DRIVE_CHECKPOINTS}/best_final.pt"):
        shutil.copy(_best_weights, _target)

    # Store the class mapping next to the weights.
    _mapping_info = {
        'simplified_names': SIMPLIFIED_NAMES,
        'category_mapping': CATEGORY_MAPPING,
        'original_classes': list(all_classes)
    }
    with open(f"{DEST_WEIGHTS}/class_mapping_kaggle.yaml", "w") as f:
        yaml.dump(_mapping_info, f)

    print(f"‚úÖ Modelo salvo: {DEST_WEIGHTS}/best_kaggle.pt")

# Prune per-epoch checkpoints, keeping only the last 3 in sorted name order.
checkpoint_files = sorted(glob.glob(f"{DRIVE_CHECKPOINTS}/epoch*.pt"))
for _obsolete in checkpoint_files[:-3]:
    os.remove(_obsolete)

_banner = "=" * 50
print("\n" + _banner)
print("‚úÖ TREINAMENTO CONCLU√çDO!")
print(_banner)
print(f"\nüìÅ Modelo final: {DEST_WEIGHTS}/best_kaggle.pt")
print("   Baixe esse arquivo e coloque em models/best.pt no seu PC")

In [None]:
# =========================================================
# 11. VALIDA√á√ÉO DO MODELO TREINADO
# =========================================================
from ultralytics import YOLO
import matplotlib.pyplot as plt
import random
from pathlib import Path
import os

# Configuration (same values as the training cell; presumably repeated so
# this cell can be run in a fresh session - confirm)
DRIVE_ROOT = "/content/drive/MyDrive/colab"
PROJECT_NAME = "cloud-arch-security-mvp"
DRIVE_PROJECT = f"{DRIVE_ROOT}/{PROJECT_NAME}"
DRIVE_CHECKPOINTS = f"{DRIVE_PROJECT}/checkpoints"

# Mount Drive if it is not mounted yet
from google.colab import drive
if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')

# Look for the trained model; the first existing path wins
print("\nüîç Procurando modelo treinado...")

model_paths = [
    "runs/detect/train_kaggle/weights/best.pt",
    f"{DRIVE_CHECKPOINTS}/best.pt",
    f"{DRIVE_CHECKPOINTS}/best_final.pt",
    f"{DRIVE_PROJECT}/weights_backup/best_kaggle.pt",
]

best_model_path = None
for path in model_paths:
    if os.path.exists(path):
        best_model_path = path
        print(f"‚úÖ Modelo encontrado: {path}")
        break

if not best_model_path:
    print("‚ùå Nenhum modelo encontrado!")
else:
    val_model = YOLO(best_model_path)
    print(f"üìä Modelo tem {len(val_model.names)} classes:")
    for idx, name in val_model.names.items():
        print(f"   {idx}: {name}")
    
    # Validation on the test split
    print("\nüß™ Validando modelo...")
    
    val_results = val_model.val(
        data="dataset/data.yaml",
        split="test",
        plots=True,
        save_json=True
    )
    
    print("\nüìä M√âTRICAS DE VALIDA√á√ÉO:")
    print(f"   mAP50: {val_results.box.map50:.4f}")
    print(f"   mAP50-95: {val_results.box.map:.4f}")
    print(f"   Precis√£o: {val_results.box.mp:.4f}")
    print(f"   Recall: {val_results.box.mr:.4f}")
    
    print("\nüìà mAP50 por categoria:")
    # ap50 holds one entry per class PRESENT in the evaluated split;
    # ap_class_index gives the class id of each entry. Indexing ap50 by the
    # position of the name would mislabel the values whenever a class is
    # missing from the split.
    for cls_idx, ap in zip(val_results.box.ap_class_index, val_results.box.ap50):
        print(f"   {val_model.names[int(cls_idx)]}: {ap:.4f}")

    # Visual check
    print("\nüñºÔ∏è Testando em imagem de exemplo...")
    
    # Accept the same extensions the dataset builder accepts, in any case.
    image_suffixes = {".png", ".jpg", ".jpeg"}
    test_images = sorted(
        p for p in Path("dataset/test/images").glob("*")
        if p.suffix.lower() in image_suffixes
    )
    
    if test_images:
        # Try 3 random images (or fewer if the split is smaller)
        for test_img in random.sample(test_images, min(3, len(test_images))):
            print(f"\n   üì∑ {test_img.name}")
            
            results = val_model(str(test_img), conf=0.25, verbose=False)
            
            detected = set()
            for r in results:
                for box in r.boxes:
                    cls_name = val_model.names[int(box.cls[0])]
                    conf = float(box.conf[0])
                    detected.add(f"{cls_name} ({conf:.2f})")
            
            if detected:
                # Sorted so the printed list is stable between runs.
                print(f"      Detectado: {', '.join(sorted(detected))}")
            else:
                print("      ‚ö†Ô∏è Nenhuma detec√ß√£o")
            
            # Display: plot() returns a BGR array, matplotlib expects RGB,
            # so the channel axis is reversed before showing the image.
            result_img = results[0].plot()
            plt.figure(figsize=(12, 8))
            plt.imshow(result_img[..., ::-1])
            plt.axis('off')
            plt.title(f"Detec√ß√µes em {test_img.name}")
            plt.show()
    else:
        print("   ‚ö†Ô∏è Nenhuma imagem de teste encontrada")


🧪 Validando modelo treinado...


NameError: name 'os' is not defined