## Cargando los datasets

In [1]:
from datasets.classification_dataset import DermaClassificationDataset
from pathlib import Path
import pandas as pd

def get_dataset_image_root() -> Path:
    """Return the folder under which the classification images are stored.

    The location is relative to the current working directory.
    """
    image_root = Path("./images/classification//")
    return image_root

def get_classification_csv_path() -> Path:
    """Return the location of the cleaned classification CSV.

    The file lists the images together with their classifications; the path
    is relative to the current working directory.
    """
    csv_path = Path("./datasets/csv_files/clean_classification_dataset.csv")
    return csv_path



In [2]:
from sklearn.model_selection import train_test_split

# Load the cleaned image/label table that drives both datasets.
all_images_df = pd.read_csv(get_classification_csv_path())

# Hold out 20 % of the rows for validation; the fixed seed keeps the split
# identical between runs.
# NOTE(review): the split is not stratified by class — confirm that is intended.
train_files, val_files = train_test_split(
    all_images_df,
    random_state=42,
    test_size=0.2,
)

print(f"Size of training images: {len(train_files)}")
print(f"Size of validation images: {len(val_files)}")


Size of training images: 42180
Size of validation images: 10546


In [3]:
from preprocessing.dataset_transforms import (
    get_classification_train_transforms_v2,
    get_classification_evaluation_transform_v2,
)

# Training pipeline: 224x224 images, with the dull-razor step applied with
# probability 0.5.
train_transform = get_classification_train_transforms_v2(
    dull_razor_probability=0.5,
    image_size=(224, 224),
)
# Evaluation pipeline built with the helper's default arguments.
# NOTE(review): no image_size is passed here — presumably the default matches
# the 224x224 used for training; confirm.
val_transform = get_classification_evaluation_transform_v2()

# Both datasets read from the same image root and differ only in the rows they
# cover and the transform they apply (training first, then validation).
train_dataset, val_dataset = [
    DermaClassificationDataset(
        root_img_folder=get_dataset_image_root(),
        image_dataframe=split_frame,
        transform=split_transform,
    )
    for split_frame, split_transform in (
        (train_files, train_transform),
        (val_files, val_transform),
    )
]

### Creando los dataloaders

In [4]:
# Loader settings are named once so the training and validation loaders cannot
# drift apart when one of the values is tuned.
BATCH_SIZE = 32
NUM_WORKERS = 8

train_dl = train_dataset.get_balanced_dataloader(
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS
)

# NOTE(review): validation also goes through get_balanced_dataloader; if that
# method resamples classes, validation metrics will not reflect the real class
# distribution — confirm this is intended.
val_dl = val_dataset.get_balanced_dataloader(
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS
)

100%|[32m██████████[0m| 42180/42180 [00:00<00:00, 629866.86it/s]
100%|[32m██████████[0m| 10546/10546 [00:00<00:00, 932519.50it/s]


In [5]:
from tqdm import tqdm

def get_image_counts_by_category(dataloader):
    """
    Tally how many samples of each label a DataLoader yields in one full pass.

    Args:
        dataloader (DataLoader): Iterable of two-element batches whose second
            element holds the labels; every label must expose ``.item()``.

    Returns:
        dict: Maps each label value (as returned by ``.item()``) to the number
            of times it was seen. Keys appear in first-seen order.
    """
    tally = {}

    # One complete pass over the loader; the inputs are ignored, only the
    # labels of each batch are inspected.
    for _, batch_labels in tqdm(dataloader, colour="blue", leave=True):
        for raw_label in batch_labels:
            key = raw_label.item()  # unwrap the label into a plain Python value
            tally[key] = tally.get(key, 0) + 1

    return tally

In [6]:
# Count how many samples of each label the training dataloader yields; this
# iterates over the whole loader once.
first = get_image_counts_by_category(train_dl)

# Shows the raw dict_items view of (label, count) pairs.
print(f"The dataloader have the following distribution:\n\t{first.items()}")


100%|██████████| 1319/1319 [05:20<00:00,  4.12it/s]

The first dataloader have the following distribution:
	dict_items([(1, 10384), (0, 10600), (3, 10579), (2, 10617)])





In [None]:
# for index, batch in tqdm(enumerate(train_dl), desc="Checking speed of the dataloader", colour="blue"):
#     continue

## Entrenamiento

In [7]:
from models.resnet_transfer_learning import get_resnet50
import torchsummary
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# get_resnet50 returns the model together with its loss and optimizer.
# NOTE(review): the argument 4 is presumably the number of output classes — confirm.
model, loss, optimizer = get_resnet50(4)


In [None]:
# Print a summary of the model for the given input description.
# NOTE(review): (3, 224, 224) is presumably (channels, height, width) of one
# image, matching the 224x224 training transform — confirm.
torchsummary.summary(
    model,
    input_data=(3,224,224)
)

### Funciones de utilidad para el entrenamiento

In [17]:
def train_batch(x, y, model, optimizer, criteria):
    """Run one optimisation step on a single batch.

    Puts the model in training mode, clears the gradients, does the forward
    and backward pass and applies one optimizer step.

    Args:
        x: Batch of inputs passed to ``model``.
        y: Target labels, compared against the argmax over dim 1 of the output.
        model: Network being trained.
        optimizer: Optimizer that updates the model parameters.
        criteria: Loss callable invoked as ``criteria(prediction, y)``.

    Returns:
        tuple: ``(loss, correct)`` — the batch loss as a Python number and the
        number of correctly predicted samples in the batch (a count, not a ratio).
    """
    model.train()
    optimizer.zero_grad()

    logits = model(x)
    loss_tensor = criteria(logits, y)
    loss_tensor.backward()
    optimizer.step()

    # Predicted class is the index of the largest output along dim 1.
    predicted = logits.argmax(dim=1)
    hits = (predicted == y).sum().item()
    return loss_tensor.item(), hits

def validate_batch(x, y, model, criteria):
    """Evaluate a single batch without tracking gradients.

    Puts the model in evaluation mode and runs a forward pass only.

    Args:
        x: Batch of inputs passed to ``model``.
        y: Target labels, compared against the argmax over dim 1 of the output.
        model: Network being evaluated.
        criteria: Loss callable invoked as ``criteria(prediction, y)``.

    Returns:
        tuple: ``(loss, correct)`` — the batch loss as a Python number and the
        number of correctly predicted samples in the batch.
    """
    # Gradient tracking is disabled for the whole body.
    with torch.no_grad():
        model.eval()
        logits = model(x)
        loss_tensor = criteria(logits, y)
        predicted = logits.argmax(dim=1)
        hits = (predicted == y).sum().item()
        return loss_tensor.item(), hits
    

In [18]:
def train_epochs(
    model, optimizer, criteria, train_dl, val_dl, epochs=10, device="cuda"
):
    """Train the model for a number of epochs, validating after each one.

    For every epoch the function prints the mean training loss per batch, the
    training accuracy, and the same two metrics for the validation loader.

    Args:
        model: Network to train; it is moved to ``device`` first.
        optimizer: Optimizer forwarded to ``train_batch``.
        criteria: Loss callable forwarded to ``train_batch``/``validate_batch``.
        train_dl: Sized iterable of ``(x, y)`` training batches.
        val_dl: Sized iterable of ``(x, y)`` validation batches.
        epochs (int): Number of passes over ``train_dl``.
        device: Device the model and every batch are moved to.

    Returns:
        None. Metrics are only printed.
    """
    model.to(device)
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        # Accumulators are reset at the start of every epoch so the printed
        # numbers describe this epoch alone, not a running total.
        train_loss, train_correct = 0.0, 0
        train_batches, train_samples = 0, 0
        for x, y in tqdm(train_dl, total=len(train_dl), desc=f"Training epoch {epoch+1}", colour="red"):
            x, y = x.to(device), y.to(device)
            batch_loss, batch_correct = train_batch(x, y, model, optimizer, criteria)
            train_loss += batch_loss
            train_correct += batch_correct
            train_batches += 1
            train_samples += len(y)

        # Loss is averaged per batch; accuracy is correct predictions divided
        # by samples seen (the last batch may be smaller than the others).
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Training Loss: {train_loss / max(train_batches, 1)}")
        print(f"Training Accuracy: {train_correct / max(train_samples, 1)}")

        val_loss, val_correct = 0.0, 0
        val_batches, val_samples = 0, 0
        for x, y in tqdm(val_dl, total=len(val_dl), desc=f"Validation on {epoch+1}", colour="green"):
            x, y = x.to(device), y.to(device)
            batch_loss, batch_correct = validate_batch(x, y, model, criteria)
            val_loss += batch_loss
            val_correct += batch_correct
            val_batches += 1
            val_samples += len(y)

        print(f"Validation Loss: {val_loss / max(val_batches, 1)}")
        print(f"Validation Accuracy: {val_correct / max(val_samples, 1)}")

In [19]:
# Pass the detected device explicitly: without it train_epochs falls back to
# its "cuda" default, which fails on machines without a GPU.
train_epochs(
    model,
    optimizer,
    loss,
    train_dl,
    val_dl,
    epochs=3,
    device=device,
)

Epoch 1/3


Training epoch 0:  72%|[31m███████▏  [0m| 951/1319 [12:57<05:00,  1.22it/s]


KeyboardInterrupt: 