In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [7]:
LABEL_PATH = Path("../data/labels/labels_task2.csv")
FRAME_DIR = Path("../data/frames")

df = pd.read_csv(LABEL_PATH)
available_videos = {p.name for p in FRAME_DIR.iterdir() if p.is_dir() and any(p.glob("*.jpg"))}
df = df[df["VIDEO"].isin(available_videos)].reset_index(drop=True)
print(f"Number of videos with frames: {len(df)}")

Number of videos with frames: 30


In [8]:
class OSATSDataset(Dataset):
    def __init__(self, dataframe, frame_dir, transform=None, sequence_length=16):
        self.data = dataframe.copy()
        self.frame_dir = frame_dir
        self.transform = transform
        self.sequence_length = sequence_length
        self.osats_cols = [col for col in dataframe.columns if col.startswith("OSATS_")]

        for col in self.osats_cols:
            self.data[col] = self.data[col].clip(0, 4).astype(np.int64)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        video_id = row["VIDEO"]
        y = row[self.osats_cols].values.astype(np.int64)
        path = self.frame_dir / video_id

        frames = sorted(path.glob("*.jpg"))
        selected = frames[:self.sequence_length]
        if len(selected) == 0:
            raise IndexError(f"No frames for video {video_id}")
        while len(selected) < self.sequence_length:
            selected.append(selected[-1])

        images = [self.transform(Image.open(f).convert("RGB")) for f in selected]
        return torch.stack(images), torch.tensor(y)


In [9]:
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Transforms para imagens (compatível com CNNs e ResNet)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Colunas OSATS (as 8 que vais prever)
osats_cols = [col for col in df.columns if col.startswith("OSATS_")]

# Criar uma coluna auxiliar com a média arredondada das OSATS para estratificação
df["OSATS_MEAN_LABEL"] = df[osats_cols].mean(axis=1).round().astype(int)

# Divisão 70% treino, 30% temp (com estratificação)
train_df, temp_df = train_test_split(
    df, test_size=0.30, random_state=42, stratify=df["OSATS_MEAN_LABEL"]
)

# Divisão 15% validação, 15% teste (sem stratify para evitar erro)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Criação dos datasets
train_dataset = OSATSDataset(train_df, FRAME_DIR, transform)
val_dataset = OSATSDataset(val_df, FRAME_DIR, transform)
test_dataset = OSATSDataset(test_df, FRAME_DIR, transform)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)


In [10]:
from torch.nn.init import kaiming_uniform_, xavier_uniform_

class CNNModel_1(nn.Module):
    def __init__(self, num_classes=40, sequence_length=16, input_shape=(3,224,224)):
        super(CNNModel_1, self).__init__()
        self.sequence_length = sequence_length

        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, 3),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 32, 3),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )

        conv_out_size = self._get_conv_output((sequence_length, *input_shape))
        self.fc1 = nn.Linear(conv_out_size, 100)
        kaiming_uniform_(self.fc1.weight, nonlinearity='relu')
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(100, num_classes)
        xavier_uniform_(self.fc2.weight)
        self.act2 = nn.Softmax(dim=1)

    def _get_conv_output(self, shape):
        bs = 1
        input = torch.rand(bs, *shape)
        B, T, C, H, W = input.shape
        input = input.view(B * T, C, H, W)
        output_feat = self.layer1(input)
        output_feat = self.layer2(output_feat)
        output_feat = output_feat.view(output_feat.size(0), -1)
        output_feat = output_feat.view(bs, T, -1).mean(dim=1)
        return int(np.prod(output_feat.size()[1:]))

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = out.view(B, T, -1).mean(dim=1)
        out = self.fc1(out)
        out = self.act1(out)
        out = self.fc2(out)
        out = self.act2(out)
        return out


In [11]:
class CNNModel_2(nn.Module):
    def __init__(self, num_classes=40, sequence_length=16, input_shape=(3,224,224)):
        super(CNNModel_2, self).__init__()
        self.sequence_length = sequence_length

        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, 3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, 3),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        conv_out_size = self._get_conv_output((sequence_length, *input_shape))
        self.fc1 = nn.Linear(conv_out_size, num_classes)

    def _get_conv_output(self, shape):
        bs = 1
        input = torch.rand(bs, *shape)
        B, T, C, H, W = input.shape
        input = input.view(B * T, C, H, W)
        output_feat = self.layer1(input)
        output_feat = self.layer2(output_feat)
        output_feat = output_feat.view(output_feat.size(0), -1)
        output_feat = output_feat.view(bs, T, -1).mean(dim=1)
        return int(np.prod(output_feat.size()[1:]))

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = out.view(B, T, -1).mean(dim=1)
        out = self.fc1(out)
        return out


In [12]:
from torch.nn import BatchNorm2d, Dropout
import torch.nn.functional as F

class CNNModel_3(nn.Module):
    def __init__(self, num_classes=40, sequence_length=16, input_shape=(3,224,224)):
        super(CNNModel_3, self).__init__()
        self.sequence_length = sequence_length

        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, 3),
            BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        conv_out_size = self._get_conv_output((sequence_length, *input_shape))
        self.fc1 = nn.Linear(conv_out_size, 600)
        self.drop = Dropout(0.25)
        self.fc2 = nn.Linear(600, 120)
        self.fc3 = nn.Linear(120, num_classes)

    def _get_conv_output(self, shape):
        bs = 1
        input = torch.rand(bs, *shape)
        B, T, C, H, W = input.shape
        input = input.view(B * T, C, H, W)
        output_feat = self.layer1(input)
        output_feat = self.layer2(output_feat)
        output_feat = output_feat.view(output_feat.size(0), -1)
        output_feat = output_feat.view(bs, T, -1).mean(dim=1)
        return int(np.prod(output_feat.size()[1:]))

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = out.view(B, T, -1).mean(dim=1)
        out = F.relu(self.fc1(out))
        out = self.drop(out)
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


In [13]:
from torch.nn import Dropout2d
import torch.nn.functional as F

class CNNModel_4(nn.Module):
    def __init__(self, num_classes=40, sequence_length=16, input_shape=(3,224,224)):
        super(CNNModel_4, self).__init__()
        self.sequence_length = sequence_length

        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, 5),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            Dropout2d(0.2)
        )

        conv_out_size = self._get_conv_output((sequence_length, *input_shape))
        self.fc1 = nn.Linear(conv_out_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def _get_conv_output(self, shape):
        bs = 1
        input = torch.rand(bs, *shape)
        B, T, C, H, W = input.shape
        input = input.view(B * T, C, H, W)
        output_feat = self.layer1(input)
        output_feat = output_feat.view(output_feat.size(0), -1)
        output_feat = output_feat.view(bs, T, -1).mean(dim=1)
        return int(np.prod(output_feat.size()[1:]))

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        out = self.layer1(x)
        out = out.view(out.size(0), -1)
        out = out.view(B, T, -1).mean(dim=1)
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out


In [14]:
def train_model(model, dataloader, epochs):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            if targets.dim() > 1:
                targets = targets[:, 0]

            if outputs.shape[0] != targets.shape[0]:
                raise ValueError(f"Shape mismatch: outputs {outputs.shape} vs targets {targets.shape}")

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(dataloader):.4f}")

    return model


In [15]:
def evaluate_model(model, dataloader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            if targets.dim() > 1:
                targets = targets[:, 0]
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    acc = 100 * correct / total
    print(f"Accuracy: {acc:.2f}%")
    return acc

In [16]:
model1 = CNNModel_1(num_classes=40, sequence_length=16, input_shape=(3,224,224))
model1 = train_model(model1, train_loader, epochs=100)

print("Validação - Modelo 1")
evaluate_model(model1, val_loader)

print("Teste - Modelo 1")
evaluate_model(model1, test_loader)

# Guardar modelo com nome especificado
torch.save(model1.state_dict(), "../outputs/models/model_task2_cnn1.pt")


Epoch [1/100], Loss: 3.5637
Epoch [2/100], Loss: 3.3971
Epoch [3/100], Loss: 3.3976
Epoch [4/100], Loss: 3.2726
Epoch [5/100], Loss: 3.2726
Epoch [6/100], Loss: 3.2726


KeyboardInterrupt: 

In [None]:
model2 = CNNModel_2(num_classes=40, sequence_length=16, input_shape=(3,224,224))
model2 = train_model(model2, train_loader, epochs=100)

print("Validação - Modelo 2")
evaluate_model(model2, val_loader)

print("Teste - Modelo 2")
evaluate_model(model2, test_loader)

torch.save(model2.state_dict(), "../outputs/models/model_task2_cnn2.pt")


Epoch [1/100], Loss: 2.2780
Epoch [2/100], Loss: 1.1471
Epoch [3/100], Loss: 1.0207
Epoch [4/100], Loss: 0.8331
Epoch [5/100], Loss: 0.6566
Epoch [6/100], Loss: 0.5938
Epoch [7/100], Loss: 0.4651
Epoch [8/100], Loss: 0.5090
Epoch [9/100], Loss: 0.3157
Epoch [10/100], Loss: 0.3054
Epoch [11/100], Loss: 0.2842
Epoch [12/100], Loss: 0.1382
Epoch [13/100], Loss: 0.1478
Epoch [14/100], Loss: 0.1138
Epoch [15/100], Loss: 0.0969
Epoch [16/100], Loss: 0.0678
Epoch [17/100], Loss: 0.0536
Epoch [18/100], Loss: 0.0413
Epoch [19/100], Loss: 0.0315
Epoch [20/100], Loss: 0.0286
Epoch [21/100], Loss: 0.0262
Epoch [22/100], Loss: 0.0334
Epoch [23/100], Loss: 0.0216
Epoch [24/100], Loss: 0.0203
Epoch [25/100], Loss: 0.0154
Epoch [26/100], Loss: 0.0149
Epoch [27/100], Loss: 0.0125
Epoch [28/100], Loss: 0.0122
Epoch [29/100], Loss: 0.0099
Epoch [30/100], Loss: 0.0092
Epoch [31/100], Loss: 0.0084
Epoch [32/100], Loss: 0.0101
Epoch [33/100], Loss: 0.0070
Epoch [34/100], Loss: 0.0070
Epoch [35/100], Loss: 0

In [None]:
model3 = CNNModel_3(num_classes=40, sequence_length=16, input_shape=(3,224,224))
model3 = train_model(model3, train_loader, epochs=100)

print("Validação - Modelo 3")
evaluate_model(model3, val_loader)

print("Teste - Modelo 3")
evaluate_model(model3, test_loader)

torch.save(model3.state_dict(), "../outputs/models/model_task2_cnn3.pt")


Epoch [1/100], Loss: 4.2357
Epoch [2/100], Loss: 1.9221
Epoch [3/100], Loss: 1.1928
Epoch [4/100], Loss: 1.5472
Epoch [5/100], Loss: 0.6307
Epoch [6/100], Loss: 0.3455
Epoch [7/100], Loss: 0.5239
Epoch [8/100], Loss: 0.4486
Epoch [9/100], Loss: 0.0568
Epoch [10/100], Loss: 0.1511
Epoch [11/100], Loss: 0.0928
Epoch [12/100], Loss: 0.2287
Epoch [13/100], Loss: 0.0276
Epoch [14/100], Loss: 0.4409
Epoch [15/100], Loss: 0.0475
Epoch [16/100], Loss: 0.0026
Epoch [17/100], Loss: 0.0180
Epoch [18/100], Loss: 0.0077
Epoch [19/100], Loss: 0.0097
Epoch [20/100], Loss: 0.0014
Epoch [21/100], Loss: 0.0012
Epoch [22/100], Loss: 0.0001
Epoch [23/100], Loss: 0.0003
Epoch [24/100], Loss: 0.0009
Epoch [25/100], Loss: 0.0009
Epoch [26/100], Loss: 0.0002
Epoch [27/100], Loss: 0.0010
Epoch [28/100], Loss: 0.0002
Epoch [29/100], Loss: 0.0018
Epoch [30/100], Loss: 0.0002
Epoch [31/100], Loss: 0.0033
Epoch [32/100], Loss: 0.0013
Epoch [33/100], Loss: 0.0006
Epoch [34/100], Loss: 0.0001
Epoch [35/100], Loss: 0

In [None]:
model4 = CNNModel_4(num_classes=40, sequence_length=16, input_shape=(3,224,224))
model4 = train_model(model4, train_loader, epochs=100)

print("Validação - Modelo 4")
evaluate_model(model4, val_loader)

print("Teste - Modelo 4")
evaluate_model(model4, test_loader)

torch.save(model4.state_dict(), "../outputs/models/model_task2_cnn4.pt")


Epoch [1/100], Loss: 8.6049
Epoch [2/100], Loss: 8.4234
Epoch [3/100], Loss: 6.4934
Epoch [4/100], Loss: 3.9202
Epoch [5/100], Loss: 3.8293
Epoch [6/100], Loss: 0.8885
Epoch [7/100], Loss: 0.1558
Epoch [8/100], Loss: 0.0006
Epoch [9/100], Loss: 0.5408
Epoch [10/100], Loss: 0.0000
Epoch [11/100], Loss: 0.0001
Epoch [12/100], Loss: 0.0022
Epoch [13/100], Loss: 0.0641
Epoch [14/100], Loss: 0.0002
Epoch [15/100], Loss: 0.0006
Epoch [16/100], Loss: 0.0024
Epoch [17/100], Loss: 0.0018
Epoch [18/100], Loss: 0.0013
Epoch [19/100], Loss: 0.0001
Epoch [20/100], Loss: 0.0000
Epoch [21/100], Loss: 0.0000
Epoch [22/100], Loss: 0.0000
Epoch [23/100], Loss: 0.0001
Epoch [24/100], Loss: 0.0000
Epoch [25/100], Loss: 0.0000
Epoch [26/100], Loss: 0.0000
Epoch [27/100], Loss: 0.0000
Epoch [28/100], Loss: 0.0000
Epoch [29/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [31/100], Loss: 0.0000
Epoch [32/100], Loss: 0.0000
Epoch [33/100], Loss: 0.0000
Epoch [34/100], Loss: 0.0000
Epoch [35/100], Loss: 0

## ResNet18 + MLP

In [None]:
frame_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def load_video_frames(video_path, max_frames=16):
    frames = sorted(video_path.glob("*.jpg"))[:max_frames]
    video_tensor = torch.stack([frame_transform(Image.open(f).convert("RGB")) for f in frames])
    if len(frames) < max_frames:
        padding = torch.zeros((max_frames - len(frames), 3, 224, 224))
        video_tensor = torch.cat([video_tensor, padding], dim=0)
    return video_tensor

X = []
y = []

for _, row in df.iterrows():
    video_id = row["VIDEO"]
    video_dir = FRAME_DIR / video_id
    if not video_dir.exists():
        continue
    video_tensor = load_video_frames(video_dir, max_frames=16)
    labels = torch.tensor([
        row["OSATS_RESPECT"], row["OSATS_MOTION"], row["OSATS_INSTRUMENT"],
        row["OSATS_SUTURE"], row["OSATS_FLOW"], row["OSATS_KNOWLEDGE"],
        row["OSATS_PERFORMANCE"], row["OSATS_FINAL_QUALITY"]
    ], dtype=torch.long)
    X.append(video_tensor)
    y.append(labels)

X = torch.stack(X)
y = torch.stack(y)

# Corrigir labels se estiverem na escala 1–5 (passar para 0–4)
if torch.any(y > 4):
    print("Corrigindo escala de labels de 1–5 para 0–4...")
    y = y - 1

# Verificação de segurança
assert torch.all((y >= 0) & (y <= 4)), "Erro: targets fora do intervalo 0–4"


Corrigindo escala de labels de 1–5 para 0–4...


In [None]:
from torch.utils.data import random_split

class OSATSDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = OSATSDataset(X, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [22]:
class OSATSResNet(nn.Module):
    def __init__(self):
        super().__init__()
        resnet = models.resnet18(pretrained=True)
        resnet.fc = nn.Identity()
        self.backbone = resnet
        self.fc_shared = nn.Linear(512, 128)
        self.heads = nn.ModuleList([nn.Linear(128, 5) for _ in range(8)])

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        feat = self.backbone(x).view(B, T, -1)
        feat = feat.mean(dim=1)
        shared = F.relu(self.fc_shared(feat))
        return torch.stack([head(shared) for head in self.heads], dim=1)


In [None]:
loss_fn = nn.CrossEntropyLoss()

def compute_loss(preds, targets):
    loss = 0
    for i in range(8):
        loss += loss_fn(preds[:, i], targets[:, i])
    return loss / 8


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OSATSResNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    all_preds, all_targets = [], []

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = compute_loss(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * inputs.size(0)
        preds = torch.argmax(outputs, dim=2)
        all_preds.append(preds.cpu())
        all_targets.append(targets.cpu())

    avg_loss = train_loss / len(train_loader.dataset)
    print(f"Época {epoch+1}/{num_epochs} - Loss treino: {avg_loss:.4f}")

# Guardar o modelo após treino
torch.save(model.state_dict(), "../outputs/models/model_task2_resnet.pt")




Época 1/100 - Loss treino: 1.6255
Época 2/100 - Loss treino: 1.3395
Época 3/100 - Loss treino: 1.1353
Época 4/100 - Loss treino: 0.9684
Época 5/100 - Loss treino: 0.8264
Época 6/100 - Loss treino: 0.7052
Época 7/100 - Loss treino: 0.5988
Época 8/100 - Loss treino: 0.5056
Época 9/100 - Loss treino: 0.4282
Época 10/100 - Loss treino: 0.3620
Época 11/100 - Loss treino: 0.3048
Época 12/100 - Loss treino: 0.2565
Época 13/100 - Loss treino: 0.2155
Época 14/100 - Loss treino: 0.1806
Época 15/100 - Loss treino: 0.1512
Época 16/100 - Loss treino: 0.1267
Época 17/100 - Loss treino: 0.1064
Época 18/100 - Loss treino: 0.0895
Época 19/100 - Loss treino: 0.0755
Época 20/100 - Loss treino: 0.0640
Época 21/100 - Loss treino: 0.0545
Época 22/100 - Loss treino: 0.0467
Época 23/100 - Loss treino: 0.0402
Época 24/100 - Loss treino: 0.0348
Época 25/100 - Loss treino: 0.0304
Época 26/100 - Loss treino: 0.0266
Época 27/100 - Loss treino: 0.0234
Época 28/100 - Loss treino: 0.0208
Época 29/100 - Loss treino: 0

## ResNet18 + GRU

In [24]:
class OSATSResNetGRU(nn.Module):
    def __init__(self, hidden_size=128, num_layers=1):
        super().__init__()
        resnet = models.resnet18(pretrained=True)
        resnet.fc = nn.Identity()
        self.backbone = resnet  

        self.gru = nn.GRU(input_size=512, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.heads = nn.ModuleList([nn.Linear(hidden_size, 5) for _ in range(8)])  

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)
        features = self.backbone(x)  
        features = features.view(B, T, 512) 

        _, h_n = self.gru(features) 
        h_n = h_n.squeeze(0) 

        return torch.stack([head(h_n) for head in self.heads], dim=1)  


In [None]:
def compute_loss(preds, targets):
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0.0

    for i in range(preds.size(1)):  # 8 cabeças
        total_loss += loss_fn(preds[:, i], targets[:, i])
    
    return total_loss / preds.size(1)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OSATSResNetGRU().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = compute_loss(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * inputs.size(0)

    avg_loss = train_loss / len(train_loader.dataset)
    print(f"Época {epoch+1}/{num_epochs} - Loss treino: {avg_loss:.4f}")

# Guardar o modelo
torch.save(model.state_dict(), "../outputs/models/model_task2_resnet_gru.pt")


Época 1/100 - Loss treino: 1.6660
Época 2/100 - Loss treino: 1.1489
Época 3/100 - Loss treino: 0.9110
Época 4/100 - Loss treino: 0.7746
Época 5/100 - Loss treino: 0.6877
Época 6/100 - Loss treino: 0.6271
Época 7/100 - Loss treino: 0.5789
Época 8/100 - Loss treino: 0.5406
Época 9/100 - Loss treino: 0.5102
Época 10/100 - Loss treino: 0.4848
Época 11/100 - Loss treino: 0.4616
Época 12/100 - Loss treino: 0.4415
Época 13/100 - Loss treino: 0.4231
Época 14/100 - Loss treino: 0.4061
Época 15/100 - Loss treino: 0.3901
Época 16/100 - Loss treino: 0.3744
Época 17/100 - Loss treino: 0.3612
Época 18/100 - Loss treino: 0.3490
Época 19/100 - Loss treino: 0.3374
Época 20/100 - Loss treino: 0.3264
Época 21/100 - Loss treino: 0.3158
Época 22/100 - Loss treino: 0.3057
Época 23/100 - Loss treino: 0.2960
Época 24/100 - Loss treino: 0.2868
Época 25/100 - Loss treino: 0.2781
Época 26/100 - Loss treino: 0.2699
Época 27/100 - Loss treino: 0.2620
Época 28/100 - Loss treino: 0.2542
Época 29/100 - Loss treino: 0

## Vision Transformer (ViT) + MLP

In [17]:
class OSATSViT(nn.Module):
    def __init__(self):
        super().__init__()
        vit = models.vit_b_16(pretrained=True)
        vit.heads = nn.Identity()  # remover a cabeça original
        self.backbone = vit  # saída: [B*T, 768]

        self.shared = nn.Linear(768, 128)
        self.heads = nn.ModuleList([nn.Linear(128, 5) for _ in range(8)])

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B * T, C, H, W)             # [B*T, 3, 224, 224]
        features = self.backbone(x)            # [B*T, 768]
        features = features.view(B, T, -1)     # [B, T, 768]
        pooled = features.mean(dim=1)          # média temporal → [B, 768]

        shared = F.relu(self.shared(pooled))   # [B, 128]
        return torch.stack([head(shared) for head in self.heads], dim=1)  # [B, 8, 5]


In [None]:
def compute_loss(preds, targets):
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0.0

    for i in range(preds.size(1)):  # 8 cabeças
        total_loss += loss_fn(preds[:, i], targets[:, i])
    
    return total_loss / preds.size(1)


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OSATSViT().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = compute_loss(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * inputs.size(0)

    avg_loss = train_loss / len(train_loader.dataset)
    print(f"Época {epoch+1}/{num_epochs} - Loss treino: {avg_loss:.4f}")

# Guardar o modelo
torch.save(model.state_dict(), "../outputs/models/model_task2_vit.pt")




Época 1/100 - Loss treino: 1.5967
Época 2/100 - Loss treino: 1.4559
Época 3/100 - Loss treino: 1.4034
Época 4/100 - Loss treino: 1.3681
Época 5/100 - Loss treino: 1.3389
Época 6/100 - Loss treino: 1.3237
Época 7/100 - Loss treino: 1.3076
Época 8/100 - Loss treino: 1.3019
Época 9/100 - Loss treino: 1.3003
Época 10/100 - Loss treino: 1.2948
Época 11/100 - Loss treino: 1.2910
Época 12/100 - Loss treino: 1.2892
Época 13/100 - Loss treino: 1.2845
Época 14/100 - Loss treino: 1.2943
Época 15/100 - Loss treino: 1.2797
Época 16/100 - Loss treino: 1.2858
Época 17/100 - Loss treino: 1.2790
Época 18/100 - Loss treino: 1.2678
Época 19/100 - Loss treino: 1.2639
Época 20/100 - Loss treino: 1.2576
Época 21/100 - Loss treino: 1.2662
Época 22/100 - Loss treino: 1.2545
Época 23/100 - Loss treino: 1.3136
Época 24/100 - Loss treino: 1.2715
Época 25/100 - Loss treino: 1.2908
Época 26/100 - Loss treino: 1.2722
Época 27/100 - Loss treino: 1.2568
Época 28/100 - Loss treino: 1.2614
Época 29/100 - Loss treino: 1

## Evaluation

In [19]:
def validate_model_multihead(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_items = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)  # [B, 8, 5]
            loss = criterion(outputs.view(-1, 5), targets.view(-1))
            total_loss += loss.item() * inputs.size(0)

            preds = outputs.argmax(dim=2)  # [B, 8]
            total_correct += (preds == targets).sum().item()
            total_items += targets.numel()

    avg_loss = total_loss / len(dataloader.dataset)
    avg_acc = total_correct / total_items
    return avg_loss, avg_acc


In [25]:
model1 = CNNModel_1().to(device)
model1.load_state_dict(torch.load("../outputs/models/model_task2_cnn1.pt"))

model2 = CNNModel_2().to(device)
model2.load_state_dict(torch.load("../outputs/models/model_task2_cnn2.pt"))

model3 = CNNModel_3().to(device)
model3.load_state_dict(torch.load("../outputs/models/model_task2_cnn3.pt"))

model4 = CNNModel_4().to(device)
model4.load_state_dict(torch.load("../outputs/models/model_task2_cnn4.pt"))

model5 = OSATSResNet().to(device)
model5.load_state_dict(torch.load("../outputs/models/model_task2_resnet.pt"))

model6 = OSATSResNetGRU().to(device)
model6.load_state_dict(torch.load("../outputs/models/model_task2_resnet_gru.pt"))

model7 = OSATSViT().to(device)
model7.load_state_dict(torch.load("../outputs/models/model_task2_vit.pt"))

models = {
    "CNNModel_1": model1,
    "CNNModel_2": model2,
    "CNNModel_3": model3,
    "CNNModel_4": model4,
    "ResNet + MLP": model5,
    "ResNet + GRU": model6,
    "ViT + MLP": model7
}




In [30]:
def validate_model_multihead(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)  # Esperado: [B, 8, 5]
            B, N, C = outputs.shape  # B=batch, N=critérios, C=classes
            outputs = outputs.view(B * N, C)
            targets = targets.view(B * N)

            loss = criterion(outputs, targets)
            total_loss += loss.item() * inputs.size(0)

            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)

    avg_loss = total_loss / len(dataloader.dataset)
    accuracy = correct / total
    return avg_loss, accuracy


In [None]:
criterion = nn.CrossEntropyLoss()
val_results = {}

for name, model in models.items():
    val_loss, val_acc = validate_model_multihead(model, val_loader, criterion)
    val_results[name] = val_acc
    print(f"{name} - Val Acc: {val_acc:.4f}")


In [None]:
best_model_name = max(val_results, key=val_results.get)
best_model = models[best_model_name]
print(f"Melhor modelo: {best_model_name} com accuracy {val_results[best_model_name]:.4f}")

test_loss, test_acc = validate_model_multihead(best_model, test_loader, criterion)
print(f"Acurácia no Teste: {test_acc:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

osats_cols = [col for col in df.columns if col.startswith("OSATS_")]

best_model.eval()
all_true, all_pred = [], []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = best_model(inputs)
        preds = outputs.argmax(dim=2)

        all_true.append(targets.cpu())
        all_pred.append(preds.cpu())

true = torch.cat(all_true)
pred = torch.cat(all_pred)

for i, col in enumerate(osats_cols):
    cm = confusion_matrix(true[:, i], pred[:, i])
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Matriz de Confusão - {col}")
    plt.xlabel("Predito")
    plt.ylabel("Verdadeiro")
    plt.show()
