# Problema Abordado

# Implementação

## Importações

In [5]:
import torch
from torchvision import transforms, datasets
from torchvision import models
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from datasets import load_dataset
from PIL import Image
import os
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split


## Funções Auxiliares

In [6]:
def train_model(model, train_loader, criterion, optimizer, device, epochs=10):
    """Train a captioning model with teacher forcing and report the mean loss per epoch.

    Every batch must be a dict holding "image_tensor" and "tokenized_caption".
    The model is called as ``model(images, captions[:, :-1])`` and must return
    logits shaped (batch, positions, vocab).

    Targets are aligned to the number of positions the model returns, where
    ``T`` is the padded caption length:

    * ``T`` positions (one more than the ``T - 1`` input tokens, which is what
      ``ImageCaptionModel`` produces because its decoder prepends the image
      embedding): the targets are the whole caption, so the prefix position
      predicts the first token and token ``k - 1`` predicts token ``k``.
    * ``T - 1`` positions (no prefix): the targets are ``captions[:, 1:]``.

    The previous version always used ``captions[:, 1:]``, which does not match
    the number of logits rows produced by a prefix model and makes the loss
    call fail.

    Args:
        model: module called as ``model(images, input_tokens)``.
        train_loader: iterable of batch dicts (e.g. a ``DataLoader``).
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: optimizer already bound to the model's parameters.
        device: device the batch tensors are moved to.
        epochs: number of passes over ``train_loader``.

    Returns:
        list[float]: mean batch loss of each epoch, in order.

    Raises:
        ValueError: if ``train_loader`` yields no batches, or if the model
            returns a number of positions other than ``T`` or ``T - 1``.
    """
    model.train()
    history = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        num_batches = 0

        for batch in train_loader:
            images = batch["image_tensor"].to(device)
            captions = batch["tokenized_caption"].to(device)

            optimizer.zero_grad()

            # Teacher forcing: the model sees every token except the last one.
            outputs = model(images, captions[:, :-1])

            # 0 -> the model added a prefix position; 1 -> plain next-token shift.
            offset = captions.size(1) - outputs.size(1)
            if offset not in (0, 1):
                raise ValueError(
                    f"model returned {outputs.size(1)} positions for captions of "
                    f"length {captions.size(1)}; cannot align targets"
                )
            targets = captions[:, offset:]

            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)),
                targets.reshape(-1)
            )

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader produced no batches")

        avg_loss = epoch_loss / num_batches
        history.append(avg_loss)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    return history

In [7]:
#Função para ajustar dimensões dos tensores do dataset

def collate_fn(batch):
    """Merge a list of dataset samples into a single batch dict.

    Each sample is a dict with the keys "image_tensor", "caption",
    "tokenized_caption", "attention_mask" and "image_original".

    Image tensors are stacked along a new leading batch dimension, so they
    must all share one shape. Token id sequences are right-padded with 50256
    and attention masks with 0, up to the longest sequence in the batch.
    Captions and original images are passed through as plain Python lists.
    """
    field_names = (
        "image_tensor",
        "caption",
        "tokenized_caption",
        "attention_mask",
        "image_original",
    )

    # Turn the list of per-sample dicts into one list per field.
    columns = {name: [] for name in field_names}
    for sample in batch:
        for name in field_names:
            columns[name].append(sample[name])

    stacked_images = torch.stack(columns["image_tensor"])

    # 50256 is the value the rest of the notebook treats as padding
    # (it is the ignore_index of the loss and the masked id in the decoder).
    padded_ids = pad_sequence(
        columns["tokenized_caption"], batch_first=True, padding_value=50256
    )
    padded_mask = pad_sequence(
        columns["attention_mask"], batch_first=True, padding_value=0
    )

    return {
        "image_tensor": stacked_images,            # (B, ...) stacked image tensors
        "caption": columns["caption"],             # list of raw caption values
        "tokenized_caption": padded_ids,           # (B, T)
        "attention_mask": padded_mask,             # (B, T)
        "image_original": columns["image_original"]  # list of untransformed images
    }


In [8]:
def evaluate_model(model, test_loader, device, criterion=None):
    """Compute the mean batch loss of a captioning model without updating it.

    Every batch must be a dict holding "image_tensor" and "tokenized_caption".
    The model is called as ``model(images, captions[:, :-1])`` and must return
    logits shaped (batch, positions, vocab).

    Targets are aligned to the number of positions the model returns, where
    ``T`` is the padded caption length:

    * ``T`` positions (the model prepended one prefix position, as
      ``ImageCaptionModel`` does with the image embedding): the targets are
      the whole caption.
    * ``T - 1`` positions: the targets are ``captions[:, 1:]``.

    Args:
        model: module called as ``model(images, input_tokens)``.
        test_loader: iterable of batch dicts (e.g. a ``DataLoader``).
        device: device the batch tensors are moved to.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.
            Defaults to ``nn.CrossEntropyLoss(ignore_index=50256)``, the same
            loss the notebook builds for training, so the function no longer
            depends on a notebook-global ``criterion`` variable.

    Returns:
        float: mean of the per-batch losses.

    Raises:
        ValueError: if ``test_loader`` yields no batches, or if the model
            returns a number of positions other than ``T`` or ``T - 1``.
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss(ignore_index=50256)

    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in test_loader:
            images = batch["image_tensor"].to(device)
            captions = batch["tokenized_caption"].to(device)

            outputs = model(images, captions[:, :-1])

            # 0 -> the model added a prefix position; 1 -> plain next-token shift.
            offset = captions.size(1) - outputs.size(1)
            if offset not in (0, 1):
                raise ValueError(
                    f"model returned {outputs.size(1)} positions for captions of "
                    f"length {captions.size(1)}; cannot align targets"
                )
            targets = captions[:, offset:]

            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)),
                targets.reshape(-1)
            )

            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("test_loader produced no batches")

    avg_loss = total_loss / num_batches
    print(f"Test Loss: {avg_loss:.4f}")
    return avg_loss

## Dados

### Carregando o dataset


In [30]:


class DeepFashionDataset(Dataset):
    """Image/caption dataset backed by a label table and an image directory.

    Args:
        labelDataset: DataFrame with at least a "caption" column and a "path"
            column (image path relative to ``image_dir``). Its index is reset
            so samples are addressed by position.
        image_dir: directory the "path" values are resolved against.
        transform: optional callable applied to the RGB PIL image. When
            omitted, "image_tensor" holds the PIL image itself.
        tokenizer: optional tokenizer called as
            ``tokenizer(caption, return_tensors="pt")``. Defaults to the
            pretrained GPT-2 tokenizer. If the tokenizer has no pad token,
            its EOS token is used as pad token.
    """

    def __init__(self, labelDataset, image_dir, transform=None, tokenizer=None):
        self.df = labelDataset.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        if tokenizer is None:
            # Default tokenizer: GPT-2, which ships without a pad token.
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            tokenizer.pad_token = tokenizer.eos_token
        elif getattr(tokenizer, "pad_token", None) is None:
            # Honour a caller-supplied tokenizer, filling in the pad token if needed.
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns a dict with:
            "image_tensor": the transformed image (or the PIL image when no
                transform was given),
            "caption": the raw caption from the label table,
            "tokenized_caption": 1-D tensor of token ids (no padding/truncation),
            "attention_mask": 1-D tensor matching "tokenized_caption",
            "image_original": the untransformed RGB PIL image.
        """
        row = self.df.iloc[idx]
        caption = row["caption"]

        # The table may store Windows-style separators; normalise before joining.
        img_path = os.path.join(self.image_dir, row["path"].replace("\\", "/"))
        img_path = os.path.normpath(img_path)

        # convert() returns an independent in-memory image, so the file handle
        # can be closed as soon as the pixels have been read.
        with Image.open(img_path) as img_file:
            image_pil = img_file.convert("RGB")
        image_tensor = self.transform(image_pil) if self.transform else image_pil

        # No truncation here; padding is handled per batch by collate_fn.
        tokens = self.tokenizer(caption, return_tensors="pt")
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)

        return {
            "image_tensor": image_tensor,
            "caption": caption,
            "tokenized_caption": input_ids,
            "attention_mask": attention_mask,
            "image_original": image_pil
        }
    




# Channel statistics in the ImageNet convention, used to normalise the inputs.
_imagenet_mean = [0.485, 0.456, 0.406]
_imagenet_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize to 224x224 (the usual input size of
# ImageNet-pretrained models), convert to a tensor, then normalise per channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_imagenet_mean, std=_imagenet_std),
])


In [31]:
# Label table for the front-view images; .info() prints its columns and dtypes.
labels_csv_path = 'datasets/labels_front.csv'
labels_df = pd.read_csv(labels_csv_path)
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12278 entries, 0 to 12277
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   image_id      12278 non-null  object
 1   caption       12278 non-null  object
 2   path          12278 non-null  object
 3   gender        12278 non-null  object
 4   product_type  12278 non-null  object
 5   product_id    12278 non-null  object
 6   image_type    12278 non-null  object
dtypes: object(7)
memory usage: 671.6+ KB


### Data Loader

In [32]:
# Full dataset: label table + image folder, with the preprocessing pipeline applied.
dataset = DeepFashionDataset(labels_df, "datasets/selected_images", transform=transform)


In [33]:
# 70/30 train/test split.
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same samples land in train/test on every run;
# without a generator the partition changes each time the cell is executed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Number of samples processed together in one training step.
batch_size = 32

# Captions are not truncated, so their lengths differ; collate_fn pads every
# caption in a batch to the length of the longest one.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

## Rede Implementada

In [14]:
class Encoder(nn.Module):
    """Small convolutional image encoder.

    Four Conv(3x3, stride 1, padding 1) -> BatchNorm -> ReLU blocks with
    3 -> 32 -> 64 -> 128 -> 256 channels. The first three blocks end with a
    2x2 max-pool; the last one ends with a global average pool to 1x1. The
    resulting 256 features are projected linearly to ``output_dim``.

    Args:
        output_dim: size of the feature vector returned for each image.
    """

    def __init__(self, output_dim=256):
        super().__init__()

        channels = [3, 32, 64, 128, 256]
        final_block = len(channels) - 2

        # Layers are appended in the same order as a hand-written Sequential,
        # so module indices (and state-dict keys) stay features.0 ... features.15.
        layers = []
        for block_idx, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:])):
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1))
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.ReLU())
            if block_idx < final_block:
                layers.append(nn.MaxPool2d(2, 2))
            else:
                # Last block: collapse the spatial dimensions to 1x1.
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*layers)

        # Projection to the requested feature size.
        self.projection = nn.Linear(channels[-1], output_dim)

    def forward(self, x):
        """Encode a batch of images into (batch, output_dim) feature vectors."""
        pooled = self.features(x)
        flat = pooled.view(pooled.size(0), -1)  # (batch, 256, 1, 1) -> (batch, 256)
        return self.projection(flat)



In [15]:
class Gpt2Decoder(nn.Module):
    """GPT-2 language model conditioned on an image feature vector.

    The image features are projected to GPT-2's embedding size and placed in
    front of the caption token embeddings as a single prefix position.

    Only the last four transformer blocks, the LM head and the image
    projection are left trainable; every other GPT-2 parameter is frozen.

    Args:
        image_dim: size of the incoming image feature vector.
    """

    def __init__(self, image_dim=256):
        super().__init__()
        self.gpt = GPT2LMHeadModel.from_pretrained("gpt2")

        # Freeze the whole network first ...
        self.gpt.requires_grad_(False)
        # ... then re-enable gradients for the last four blocks and the LM head.
        # NOTE(review): if lm_head shares its weight with the token embedding
        # (as Hugging Face GPT-2 usually does), this also unfreezes wte — confirm.
        self.gpt.transformer.h[-4:].requires_grad_(True)
        self.gpt.lm_head.requires_grad_(True)

        self.image_proj = nn.Linear(image_dim, self.gpt.config.n_embd)

    def forward(self, textTokens, image_features):
        """Return next-token logits for the image prefix plus the caption tokens.

        Args:
            textTokens: (batch, seq) token ids; positions equal to 50256 are
                masked out of attention.
            image_features: (batch, image_dim) feature vectors.

        Returns:
            Logits with ``seq + 1`` positions along dim 1: index 0 belongs to
            the image prefix and index ``k + 1`` to input token ``k``.
        """
        prefix_embed = self.image_proj(image_features).unsqueeze(1)
        token_embeds = self.gpt.transformer.wte(textTokens)
        inputs_embeds = torch.cat([prefix_embed, token_embeds], dim=1)

        # Padding positions (id 50256) are hidden; the prefix is always visible.
        token_mask = (textTokens != 50256).float()
        prefix_mask = token_mask.new_ones((textTokens.size(0), 1))
        attention_mask = torch.cat([prefix_mask, token_mask], dim=1)

        result = self.gpt(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        return result.logits


In [16]:
class ImageCaptionModel(nn.Module):
    """Captioning model: a CNN ``Encoder`` feeding a ``Gpt2Decoder``.

    Args:
        encoder_output_dim: size of the image feature vector exchanged
            between the encoder and the decoder.
    """

    def __init__(self, encoder_output_dim=256):
        super().__init__()
        feature_dim = encoder_output_dim
        self.encoder = Encoder(output_dim=feature_dim)
        self.decoder = Gpt2Decoder(image_dim=feature_dim)

    def forward(self, images, captions):
        """Encode ``images`` and return the decoder's logits for ``captions``."""
        # The decoder takes the tokens first and the image features second.
        return self.decoder(captions, self.encoder(images))

In [17]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = ImageCaptionModel()
model = model.to(device)

# 50256 is the id collate_fn pads captions with, so padded positions are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=50256)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Treinamento da Rede

In [34]:
# Train for 10 epochs on the training split.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=10,
)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets\\selected_images\\WOMEN-Tees_Tanks-id_00001690-02_1_front.jpg'

# Qualidade dos Resultados

# Discussão Geral