In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, device, no_grad
import torch.cuda
from torch.optim import Adam
from sklearn.metrics import f1_score
import time
import os
from PIL import Image
import torchvision.transforms as transforms
import torch.nn.functional as F
import numpy as np


In [2]:
# Location of the training images referenced by the label table.
image_folder = "../data/Train/"

# Read the label table and replace the string class names in its "label"
# column with integer ids; the fitted encoder is kept so predictions can be
# mapped back to class names later.
labels = pd.read_csv("../data/train_labels.csv")
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels["label"])
labels["label"] = encoded_labels

# Number of distinct classes seen by the encoder.
num_classes = len(label_encoder.classes_)
print(num_classes)

9


In [3]:
def crop_pokemon(img_path, threshold=3, size=64, pad_ratio=0.05):
    """Crop an image to its dark silhouette and centre it on a transparent canvas.

    The silhouette is every pixel whose grayscale value is below ``threshold``.
    Its bounding box is grown by ``pad_ratio`` of its shorter side (clamped to
    the image borders), cropped out of the original image and pasted in the
    middle of a fully transparent ``size`` x ``size`` RGBA image.

    Args:
        img_path: Path of the image file to read.
        threshold: Grayscale cut-off for silhouette pixels. The original
            author's note says values of 3-5 worked best, with 3 giving good
            crops for most images but a few odd ones.
        size: Side length, in pixels, of the square output canvas.
        pad_ratio: Padding added around the bounding box, as a fraction of
            its shorter side.

    Returns:
        A new ``size`` x ``size`` RGBA ``PIL.Image.Image``.
    """
    # Everything that touches the source image stays inside the ``with`` block
    # so the file handle is released as soon as the pixels have been copied.
    with Image.open(img_path) as img:
        # Grayscale copy is used only to locate the silhouette.
        gray = np.array(img.convert("L"))
        ys, xs = np.nonzero(gray < threshold)

        if ys.size == 0:
            # No pixel is darker than the threshold: there is nothing to crop
            # to, so keep the whole frame instead of failing on an empty
            # min()/max() reduction.
            left, top, right, bottom = 0, 0, img.width, img.height
        else:
            # PIL crop boxes exclude their right/bottom edge, so the inclusive
            # max indices are bumped by one to keep the last row and column.
            x_min, x_max = int(xs.min()), int(xs.max()) + 1
            y_min, y_max = int(ys.min()), int(ys.max()) + 1

            pad = int(pad_ratio * min(x_max - x_min, y_max - y_min))
            left, top = max(0, x_min - pad), max(0, y_min - pad)
            right = min(img.width, x_max + pad)
            bottom = min(img.height, y_max + pad)

        cropped_img = img.crop((left, top, right, bottom))

        # Centre the crop on a fully transparent square canvas.
        crop_w, crop_h = cropped_img.size
        new_img = Image.new("RGBA", (size, size), (0, 0, 0, 0))
        new_img.paste(cropped_img, ((size - crop_w) // 2, (size - crop_h) // 2))

    return new_img

In [4]:
class PokemonDataset(Dataset):
    """Image dataset backed by a DataFrame of file names (and optional labels).

    Column 0 of ``df`` holds the image id or file name; ``.png`` is appended
    when it is missing. If ``df`` has more than one column, column 1 is
    returned as the label; otherwise the label is the placeholder ``-1``.
    The dataset itself performs no cropping: images are returned as read
    from disk, after ``transform`` if one is given.

    Args:
        df: DataFrame describing the samples (see above).
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``; label is -1 when unlabelled."""
        img_name = os.path.join(self.img_dir, str(self.df.iloc[idx, 0]))
        if not img_name.endswith('.png'):
            img_name += ".png"

        # Open the file exactly once, read the pixel data eagerly and release
        # the file handle; the loaded image stays usable after the block.
        with Image.open(img_name) as image:
            image.load()

        if self.transform:
            image = self.transform(image)

        if len(self.df.columns) > 1:  # Train set has labels, test set does not.
            return image, self.df.iloc[idx, 1]
        return image, -1  # Placeholder label for unlabelled data.

In [5]:

class CNN(nn.Module):
    """Four-stage convolutional classifier for 64x64 images.

    Each stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    with 32, 64, 128 and 256 output channels. The pooled feature map is
    flattened, passed through a 256-unit hidden layer with dropout, and
    projected to ``num_classes`` raw scores (no softmax is applied here).

    The first fully connected layer expects a 256 x 4 x 4 feature map, which
    is what four 2x2 poolings produce from a 64 x 64 input.

    Args:
        num_classes: Size of the output layer.
        dropout: Dropout probability applied after the hidden layer.
        in_channels: Number of channels of the input images. Defaults to 4,
            the value that used to be hard-coded.
    """

    def __init__(self, num_classes, dropout=0.3, in_channels=4):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)

        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.pool4 = nn.MaxPool2d(2, 2)

        # Spatial size after each pooling: 64x64 -> 32x32 -> 16x16 -> 8x8 -> 4x4
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(256 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Map a batch of images (N, in_channels, 64, 64) to scores (N, num_classes)."""
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.pool4(F.relu(self.bn4(self.conv4(x))))
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [6]:
# Preprocessing applied to every image of the labelled dataset.
# No augmentation is active; the variants tried earlier (random horizontal
# flip, rotation within +/-15 degrees, brightness/contrast jitter, flattening
# to a vector) are disabled.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        # Four-channel normalisation: the first three channels use mean 0.5
        # and std 0.5, the fourth uses mean 0 and std 1 (left unchanged).
        transforms.Normalize(
            mean=[0.5, 0.5, 0.5, 0.0],
            std=[0.5, 0.5, 0.5, 1.0],
        ),
    ]
)

In [7]:
# Split the labelled dataset into train / validation / held-out test
# (70% / 15% / 15%).
dataset = PokemonDataset(labels, image_folder, transform)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
# The test split takes the remainder so the three sizes always sum to len(dataset).
test_size = len(dataset) - train_size - val_size

# Fixed seed so that restarting the kernel reproduces the same split.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

In [8]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
# torch.device is referenced through the module on purpose: assigning the
# result to the name `device` would otherwise replace the imported class and
# make this cell fail when it is run a second time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
# Optimisation hyperparameters.
learning_rate = 0.001
n_epochs = 120
batch_size = 16

# Early-stopping settings; no active code in the visible cells reads them.
early_stop = False
early_stop_epochs = 30

In [10]:
# Build the network first, then the optimiser over its parameters and the
# classification loss used during training.
model = CNN(num_classes)
optimizer = Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [11]:
# Only the training batches are shuffled; the evaluation loaders keep a fixed
# order, since shuffling does not affect the metrics and only adds randomness.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
model.to(device)
start_time = time.time()

# Training loop
# NOTE: the batch variables are deliberately not called `labels`; that name
# holds the label DataFrame loaded earlier, and overwriting it with a tensor
# would break re-running the cell that builds the dataset.
for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    for batch_images, batch_labels in train_loader:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_images)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Validation loop: collect predictions for the whole validation split.
    model.eval()
    all_labels = []
    all_preds = []
    with no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)
            outputs = model(batch_images)
            _, preds = torch.max(outputs, 1)  # Get predicted class indices
            all_labels.extend(batch_labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # Macro-averaged F1 on the validation split, reported with the mean
    # training loss per batch for this epoch.
    f1 = f1_score(all_labels, all_preds, average="macro")
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}, F1 Score: {f1:.4f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

# Test loop: accuracy and macro F1 on the held-out split.
model.eval()
correct, total = 0, 0
all_labels = []
all_preds = []
with no_grad():
    for batch_images, batch_labels in test_loader:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)
        outputs = model(batch_images)
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
        all_labels.extend(batch_labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

accuracy = 100 * correct / total
f1 = f1_score(all_labels, all_preds, average="macro")
print(f"Test Accuracy: {accuracy:.2f}%")
print(f"Test F1 Score: {f1:.4f}")

Epoch [1/50], Loss: 2.0531, F1 Score: 0.1929
Epoch [2/50], Loss: 1.7590, F1 Score: 0.1842
Epoch [3/50], Loss: 1.6107, F1 Score: 0.2551
Epoch [4/50], Loss: 1.3322, F1 Score: 0.4248
Epoch [5/50], Loss: 1.1727, F1 Score: 0.2513
Epoch [6/50], Loss: 0.9732, F1 Score: 0.2774
Epoch [7/50], Loss: 0.8861, F1 Score: 0.5468
Epoch [8/50], Loss: 0.7637, F1 Score: 0.5690
Epoch [9/50], Loss: 0.6532, F1 Score: 0.6169
Epoch [10/50], Loss: 0.5830, F1 Score: 0.7328
Epoch [11/50], Loss: 0.4532, F1 Score: 0.8432
Epoch [12/50], Loss: 0.4519, F1 Score: 0.8803
Epoch [13/50], Loss: 0.3395, F1 Score: 0.9110
Epoch [14/50], Loss: 0.2682, F1 Score: 0.8073
Epoch [15/50], Loss: 0.2894, F1 Score: 0.9034
Epoch [16/50], Loss: 0.2228, F1 Score: 0.7708
Epoch [17/50], Loss: 0.2517, F1 Score: 0.8451
Epoch [18/50], Loss: 0.2608, F1 Score: 0.6175
Epoch [19/50], Loss: 0.1542, F1 Score: 0.8239
Epoch [20/50], Loss: 0.2463, F1 Score: 0.9771
Epoch [21/50], Loss: 0.1663, F1 Score: 0.7475
Epoch [22/50], Loss: 0.1641, F1 Score: 0.78

In [13]:
# Preprocessing for the unlabeled test images: tensor conversion followed by
# four-channel normalisation (mean/std 0.5 on the first three channels, the
# fourth channel left unchanged with mean 0 and std 1).
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.5, 0.5, 0.5, 0.0],
            std=[0.5, 0.5, 0.5, 1.0],
        ),
    ]
)

In [14]:
# Test folder path for predictions
test_folder = "../data/Test"

# Collect the PNG files to predict on. os.listdir returns entries in
# arbitrary order, so they are sorted to make the row order of the
# submission reproducible between runs and machines.
test_files = sorted(f for f in os.listdir(test_folder) if f.endswith('.png'))

# Single-column DataFrame of file names; with no label column the dataset
# yields the placeholder label -1 for every image.
test_df = pd.DataFrame({'Id': test_files})

# NOTE(review): these two names replace the `test_dataset` / `test_loader`
# built earlier from the labelled split. Re-running the training cell after
# this one would evaluate against the unlabeled images.
test_dataset = PokemonDataset(test_df, test_folder, transform=test_transform)

# Keep shuffle off so predictions stay aligned with `test_files`.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
# Run the model over the unlabeled test images and gather the predicted
# class index of every image, in loader order.
model.eval()
predictions = []
with torch.no_grad():
    for images, _ in test_loader:  # second element is the placeholder label
        logits = model(images.to(device))
        batch_preds = torch.max(logits, 1)[1]  # index of the highest score
        predictions.extend(batch_preds.cpu().numpy())

In [16]:
# Prepare the submission DataFrame
submission = pd.DataFrame({
    # Strip only the final extension, so an id that itself contains a dot is
    # not truncated at the first one.
    'Id': [os.path.splitext(file)[0] for file in test_files],
    # Map predicted class indices back to the original label strings.
    'Category': label_encoder.inverse_transform(predictions)
})

# Save the submission file
submission.to_csv("task2_submit.csv", index=False)