In [3]:
import sys
import numpy as np
import timm
import torch
from torch import tensor
import torch.nn as nn
from torchvision.transforms import InterpolationMode, transforms
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from PIL import Image
import os
from tqdm import tqdm
import getpass
import socket

print("Host name: ", socket.gethostname())  # Retrieve the hostname of the current system to determine the environment
print("User name: ", getpass.getuser())  # Retrieve the current user's username

# Set the PyTorch device (GPU/cuda or CPU)
# Check if CUDA (NVIDIA GPU acceleration) is available
if torch.cuda.is_available():
    dev = "cuda"
    gpu_name = torch.cuda.get_device_name(torch.device("cuda"))
    _, max_memory = torch.cuda.mem_get_info()
    max_memory = max_memory / (1000**3)
    print(f"GPU name: {gpu_name}")
    print(f"Max GPU memory: {max_memory} GiB")
else:
    dev = "cpu"
    print("No GPU available.")

# Set PyTorch device based on the chosen device (cuda or cpu)
device = torch.device(dev)

# If the notebook is running on the JASMIN GPU cluster, select the GPU with the most free memory
if socket.gethostname() == "gpuhost001.jc.rl.ac.uk":

    def select_gpu_with_most_free_memory():
        max_memory_available = 0
        gpu_id_with_max_memory = 0
        for i in range(torch.cuda.device_count()):
            torch.cuda.set_device(i)
            mem_free = torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_allocated(i)
            if mem_free > max_memory_available:
                max_memory_available = mem_free
                gpu_id_with_max_memory = i
        return gpu_id_with_max_memory

    best_gpu = select_gpu_with_most_free_memory()
    torch.cuda.set_device(best_gpu)
    print(f"Using GPU: {best_gpu}")

gpu_override = 2
torch.cuda.set_device(gpu_override)
print(f"Using GPU: {gpu_override}")

CROP_SIZE = 182
BACKBONE = "vit_large_patch14_dinov2"
weight_path = "../models/fine-tuned-deepfaune-vit_large_patch14_dinov2.lvd142m.pt"

jasmin = True

if jasmin:
    train_path = "../data/split_data/train"
    val_path = "../data/split_data/val"
    test_path = "../data/split_data/test"
else:
    train_path = "/media/tom-ratsakatika/CRUCIAL 4TB/FCC Camera Trap Data/split_data/train"
    val_path = "/media/tom-ratsakatika/CRUCIAL 4TB/FCC Camera Trap Data/split_data/val"
    test_path = "/media/tom-ratsakatika/CRUCIAL 4TB/FCC Camera Trap Data/split_data/test"

ANIMAL_CLASSES = ["badger", "ibex", "red deer", "chamois", "cat", "goat", "roe deer", "dog", "squirrel", "equid", "genet",
                  "hedgehog", "lagomorph", "wolf", "lynx", "marmot", "micromammal", "mouflon",
                  "sheep", "mustelid", "bird", "bear", "nutria", "fox", "wild boar", "cow"]

class AnimalDataset(Dataset):
    def __init__(self, directory, transform=None, preload_to_gpu=False):
        self.directory = directory
        self.transform = transform
        self.images = []
        self.labels = []
        self.preload_to_gpu = preload_to_gpu

        for label in os.listdir(directory):
            label_dir = os.path.join(directory, label)
            if os.path.isdir(label_dir):
                for image in os.listdir(label_dir):
                    image_path = os.path.join(label_dir, image)
                    self.images.append(image_path)
                    self.labels.append(ANIMAL_CLASSES.index(label))

        if self.preload_to_gpu:
            self.preload_images()

    def preload_images(self):
        self.loaded_images = []
        for image_path in tqdm(self.images, desc="Preloading images to GPU"):
            image = Image.open(image_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            self.loaded_images.append(image.to(device))
        self.labels = torch.tensor(self.labels, device=device)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        if self.preload_to_gpu:
            return self.loaded_images[idx], self.labels[idx]
        else:
            image_path = self.images[idx]
            label = self.labels[idx]
            image = Image.open(image_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, label

class Classifier(nn.Module):
    def __init__(self, freeze_up_to_layer=16):
        super(Classifier, self).__init__()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = timm.create_model(BACKBONE, pretrained=False, num_classes=len(ANIMAL_CLASSES), dynamic_img_size=True)
        state_dict = torch.load(weight_path, map_location=torch.device(device))['state_dict']
        self.model.load_state_dict({k.replace('base_model.', ''): v for k, v in state_dict.items()})

        # Freeze layers up to the specified layer
        if freeze_up_to_layer is not None:
            for name, param in self.model.named_parameters():
                if self._should_freeze_layer(name, freeze_up_to_layer):
                    param.requires_grad = False

        self.transforms = transforms.Compose([
            transforms.Resize(size=(CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC, max_size=None, antialias=None),
            transforms.ToTensor(),
            transforms.Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
        ])

    def _should_freeze_layer(self, name, freeze_up_to_layer):
        if 'blocks' in name:
            block_num = int(name.split('.')[1])
            if block_num <= freeze_up_to_layer:
                return True
        return False

    def forward(self, x):
        return self.model(x)

    def predict(self, image):
        img_tensor = self.transforms(image).unsqueeze(0)
        with torch.no_grad():
            output = self.forward(img_tensor)
            probabilities = torch.nn.functional.softmax(output, dim=1)
            top_p, top_class = probabilities.topk(1, dim=1)
            return ANIMAL_CLASSES[top_class.item()], top_p.item()

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for images, labels in tqdm(dataloader, desc="Training"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Validation"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return running_loss / len(dataloader), accuracy

def test(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Testing"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy

def main():
    num_epochs = 10  # Set the number of epochs
    batch_size = 32  # Set the batch size
    learning_rate = 1e-4  # Reduced learning rate for fine-tuning

    transform = transforms.Compose([
        transforms.Resize((CROP_SIZE, CROP_SIZE), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
    ])

    print('Loading training data...')
    train_dataset = AnimalDataset(train_path, transform=transform, preload_to_gpu=True)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = Classifier(freeze_up_to_layer=16).to(device)  # Freeze up to the 16th layer

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print('Training started...')
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, criterion, optimizer, device)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss}')

    # Load validation data only when needed
    print('Calculating validation loss...')
    val_dataset = AnimalDataset(val_path, transform=transform)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    val_loss, val_accuracy = validate(model, val_loader, criterion, device)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}%')

    # Load test data only when needed
    print('Testing the model...')
    test_dataset = AnimalDataset(test_path, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    test_accuracy = test(model, test_loader, device)
    print(f'Test Accuracy: {test_accuracy}%')

if __name__ == '__main__':
    main()


Host name:  gpuhost001.jc.rl.ac.uk
User name:  trr26
GPU name: NVIDIA A100-SXM4-40GB
Max GPU memory: 42.285268992 GiB
Using GPU: 0
Using GPU: 2
Loading training data...


Preloading images to GPU: 100%|██████████| 17336/17336 [1:00:29<00:00,  4.78it/s]


Training started...


Training: 100%|██████████| 542/542 [04:31<00:00,  1.99it/s]


Epoch 1, Train Loss: 1.9487264836516327


Training: 100%|██████████| 542/542 [04:31<00:00,  2.00it/s]


Epoch 2, Train Loss: 1.2281544784778158


Training: 100%|██████████| 542/542 [04:31<00:00,  2.00it/s]


Epoch 3, Train Loss: 0.9185599512609609


Training: 100%|██████████| 542/542 [04:31<00:00,  2.00it/s]


Epoch 4, Train Loss: 0.7142946846590711


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 5, Train Loss: 0.5972439425265437


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 6, Train Loss: 0.5107054693067645


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 7, Train Loss: 0.4359423902944225


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 8, Train Loss: 0.38702892430978947


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 9, Train Loss: 0.34668582951395716


Training: 100%|██████████| 542/542 [04:30<00:00,  2.00it/s]


Epoch 10, Train Loss: 0.2914618310205831
Calculating validation loss...


Validation: 100%|██████████| 115/115 [15:21<00:00,  8.01s/it]


Validation Loss: 1.500282765860143, Validation Accuracy: 63.69582992641047%
Testing the model...


Testing: 100%|██████████| 112/112 [06:54<00:00,  3.70s/it]

Test Accuracy: 62.327804329491144%





In [4]:
import torch

# Assuming 'model' is your trained model
# Save the model weights
torch.save(model.state_dict(), 'updated_model_weights.pth')
print("Model weights saved successfully.")



NameError: name 'model' is not defined

## add code that asks, how many more epochs? 0-XX
## how do i keep data loaded on GPU for further experiments?