In [None]:
from transformers import YolosImageProcessor, AutoModelForObjectDetection
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from PIL import Image
import json
import os
from torch.utils.data import  DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import get_scheduler

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Image processor matching the pretrained checkpoint.
processor = YolosImageProcessor.from_pretrained("hustvl/yolos-small")

# Pretrained detector reconfigured for a single label; `ignore_mismatched_sizes`
# is passed so that checkpoint weights whose shapes no longer match are skipped
# instead of raising. `Module.to` returns the module itself, so it can be chained.
model = AutoModelForObjectDetection.from_pretrained(
    "hustvl/yolos-small",
    num_labels=1,
    ignore_mismatched_sizes=True,
).to(device)

In [None]:
class CustomDataset(Dataset):
    """Detection dataset backed by a JSON annotation file and a folder of images.

    The JSON file is loaded once and must be an indexable sequence of records,
    each providing the keys ``"image_id"``, ``"image_name"`` and
    ``"annotations"``. The annotations are forwarded untouched to the processor
    (presumably COCO-style dictionaries -- confirm against the labelling tool).

    Args:
        json_file: Path to the JSON annotation file.
        images_dir: Directory that contains the files named by ``"image_name"``.
        processor: Callable invoked as
            ``processor(images=..., annotations=..., return_tensors="pt")``.
        transform: Optional callable applied to the PIL image before the
            processor runs.
    """

    def __init__(self, json_file, images_dir, processor, transform=None):
        # Explicit UTF-8 so that non-ASCII file names decode the same way on
        # every platform instead of depending on the locale default.
        with open(json_file, 'r', encoding='utf-8') as file:
            self.data = json.load(file)

        self.images_dir = images_dir
        self.transform = transform
        self.processor = processor

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at position ``idx``.

        Returns:
            dict with ``"pixel_values"`` (processor output with the leading
            batch dimension removed) and ``"labels"`` (first entry of the
            processor's ``"labels"`` output).
        """
        # Look up the record for this image.
        image_data = self.data[idx]
        image_id = image_data["image_id"]
        image_path = os.path.join(self.images_dir, image_data["image_name"])

        # PIL opens files lazily and keeps the handle alive; the context manager
        # closes it right after `convert` has produced an in-memory copy.
        # Converting to RGB also normalises grayscale/RGBA/palette files to
        # three channels.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Annotations that belong to this image.
        annotations = image_data["annotations"]

        # Optional user-supplied transform.
        if self.transform is not None:
            image = self.transform(image)

        # Run the processor on the image together with its annotations.
        inputs = self.processor(
            images=image,
            annotations={"image_id": image_id, "annotations": annotations},
            return_tensors="pt",
        )

        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),  # drop batch dimension
            "labels": inputs["labels"][0],
        }

In [None]:
# Batch size shared by the train/validation/test loaders.
BATCH_SIZE = 32


def collate_batch(items):
    """Merge a list of dataset samples into one batch dictionary.

    Args:
        items: Samples as returned by the dataset, each a dict with
            ``"pixel_values"`` and ``"labels"``.

    Returns:
        dict with ``"pixel_values"`` stacked along a new leading dimension and
        ``"labels"`` kept as a list with one entry per sample.
    """
    # NOTE(review): torch.stack requires every image tensor to have the same
    # shape; if the processor can emit different sizes for these frames, pad
    # them first -- confirm against the data.
    return {
        "pixel_values": torch.stack([item["pixel_values"] for item in items]),
        "labels": [item["labels"] for item in items],  # list of dictionaries
    }


dataset = CustomDataset(json_file="labels.json", images_dir="frames", processor=processor)

# Split by index so that the three subsets share the same underlying dataset.
indices = list(range(len(dataset)))

# First split: training (80%) versus the remainder (validation + test).
train_indices, temp_indices = train_test_split(indices, test_size=0.2, random_state=42)

# Second split: the remainder is halved into validation and test.
val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)

# Wrap each index list in a Subset view of the dataset.
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)
test_dataset = torch.utils.data.Subset(dataset, test_indices)

# One shared collate function instead of three copies of the same lambda;
# only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
print(len(train_dataset),len(val_dataset), len(test_dataset))

In [None]:
# Number of passes over the training set.
epocas = 100

# The schedule is stepped once per batch, so its length is
# (batches per epoch) x (number of epochs).
num_training_steps = epocas * len(train_loader)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear schedule with no warmup steps.
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch.
for epoch in range(epocas):
    model.train()
    running_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        # Move the images and every label tensor to the training device.
        pixel_values = batch["pixel_values"].to(device)
        labels = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]]

        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside its outputs.
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        # Backpropagation and parameter / learning-rate updates.
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than the last batch's value,
    # which is noisy and is undefined when the loader yields no batches.
    epoch_loss = running_loss / num_batches if num_batches else float("nan")
    print(f"Época {epoch + 1}, Pérdida: {epoch_loss}")

Otras cosas: pruebas rápidas con un solo lote (inspección de los datos y cálculo de la pérdida)

In [None]:
# NOTE(review): this rebinds `train_loader` to a batch size of 4; re-running the
# training cell after this one would train with the smaller loader.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: {
    "pixel_values": torch.stack([item["pixel_values"] for item in x]),
    "labels": [item["labels"] for item in x]  # list of dictionaries
})

# Only the first batch is inspected, so fetch it directly; looping until the
# second iteration would load and discard an extra batch just to stop.
datos = next(iter(train_loader), None)
if datos is not None:
    dp = datos
    print("Lote de imágenes para entrenamiento:", datos["pixel_values"].shape)
    print("Anotaciones:", datos["labels"])
    pixel_values = datos["pixel_values"]
    # Shallow copy of each label mapping; the tensors themselves are shared.
    labels = [dict(t.items()) for t in datos["labels"]]

In [None]:
# Loss of the current model on the inspected batch, in evaluation mode.
model.eval()

# The model was moved to `device`, so its inputs must live there too;
# otherwise the forward pass fails with a device mismatch on a GPU machine.
pixel_values_dev = pixel_values.to(device)
labels_dev = [{k: v.to(device) for k, v in t.items()} for t in labels]

# No backward pass follows, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(pixel_values=pixel_values_dev, labels=labels_dev)
loss = outputs.loss
print(loss)

In [None]:
# Bare expression: as the last line of a notebook cell, the current value of
# `loss` is shown as the cell output.
loss

In [None]:
# Take one batch from the loader: the body runs for the first batch and the
# loop exits immediately afterwards.
for batch in train_loader:
    # Shallow copy of each label mapping.
    labels = [dict(t.items()) for t in batch["labels"]]
    pixel_values = batch["pixel_values"]
    break

In [None]:
# Bare expression: displays the shape of the `pixel_values` tensor currently
# bound in the kernel.
pixel_values.shape