In [2]:
import os
import json
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.models as models
from torch.optim import Adam
from tqdm import tqdm

# Root folder of the preprocessed dataset. The trailing slash is required:
# later cells build paths by plain string concatenation
# (e.g. data_path + "category.json", data_path + "images/").
data_path = "./processed_data/"

In [3]:
# Build one multi-hot label vector per image file from the annotation files.
images = []
labels = []

with open(data_path + "category.json", "r") as f:
    categories = json.load(f)
with open(data_path + "metadata.json", "r") as f:
    metadata = json.load(f)

# Map each category name to its position in the categories list;
# that position is also its slot in the label vector.
category_indices = {entry['name']: position for position, entry in enumerate(categories)}

# filename -> label vector; entries sharing a filename are merged into one vector.
labels_by_file = {}
for image, values in metadata.items():
    label_vector = labels_by_file.setdefault(values["filename"], [0] * len(categories))
    for cat in values["categories"]:
        label_vector[category_indices[cat]] = 1

# Final form: list of (filename, label_vector) pairs.
image_labels = list(labels_by_file.items())

In [3]:
# image_labels[:5]

In [4]:
# len(image_labels)

In [6]:
# create Torch Dataset object

class MultiLabelDataset(Dataset):
    """Dataset yielding (image, multi-hot label tensor) pairs read from disk."""

    def __init__(self, image_labels, root_dir, transform=None):
        """
        Args:
            image_labels (list of tuples): List of tuples (image_path, label_vector).
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.image_labels = image_labels
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of (image_path, label_vector) entries."""
        return len(self.image_labels)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return it with its labels.

        Returns:
            tuple: ``(image, labels)`` where ``image`` is the RGB PIL image
            (or whatever ``transform`` returns for it) and ``labels`` is a
            float32 tensor built from the entry's label vector.
        """
        entry = self.image_labels[idx]
        img_path = os.path.join(self.root_dir, entry[0])

        # Force three channels so grayscale / RGBA / palette files yield the
        # same channel count as RGB ones, and close the file handle as soon
        # as the pixel data has been read (convert() loads the image).
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)
        return image, torch.tensor(entry[1], dtype=torch.float32)

In [7]:
# For the project update, keep only the first 10000 entries of image_labels
# (slicing keeps everything if there are fewer than 10000).
image_labels = image_labels[:10000]

In [13]:
# Preprocessing, train/validation split, and DataLoader construction.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# 80/20 split with a fixed seed so the partition is repeatable.
train_data, val_data = train_test_split(image_labels, test_size=0.2, random_state=42)

images_dir = data_path + "images/"
train_dataset = MultiLabelDataset(train_data, images_dir, transform)
val_dataset = MultiLabelDataset(val_data, images_dir, transform)

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [9]:
# Model, device, optimizer and loss for multi-label classification.

model_vgg16 = models.vgg16(pretrained=True)

# Freeze the `features` sub-module; only the replacement classifier is trained.
for param in model_vgg16.features.parameters():
    param.requires_grad = False

# Input width of the original classifier's first layer is reused for the new head.
num_features = model_vgg16.classifier[0].in_features
num_categories = len(categories)
hidden_units = 4096

# New head: two hidden layers, then one sigmoid output per category.
classifier_layers = [
    nn.Linear(num_features, hidden_units),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(hidden_units, hidden_units),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(hidden_units, num_categories),
    nn.Sigmoid(),
]
model_vgg16.classifier = nn.Sequential(*classifier_layers)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_vgg16 = model_vgg16.to(device)

# The optimizer is given the classifier parameters only.
optimizer = Adam(model_vgg16.classifier.parameters(), lr=0.001)
# BCELoss matches the Sigmoid output layer above (expects probabilities).
loss_function = nn.BCELoss()

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to C:\Users\hrida/.cache\torch\hub\checkpoints\vgg16-397923af.pth
100%|██████████| 528M/528M [00:34<00:00, 16.0MB/s] 


In [15]:
# Train for 5 epochs, validating after each one and recording the metrics.

history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

for epoch in range(5):
    # ---- training pass ----
    model_vgg16.train()
    running_loss = 0.0

    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = loss_function(model_vgg16(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass (no gradients, eval mode) ----
    model_vgg16.eval()
    val_running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for inputs, labels in tqdm(val_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model_vgg16(inputs)
            val_running_loss += loss_function(outputs, labels).item()

            # Element-wise accuracy: every (sample, category) cell counts as
            # one prediction, thresholded at 0.5.
            predicted = outputs > 0.5
            correct_preds += (predicted == labels.byte()).sum().item()
            total_preds += labels.numel()

    # Losses are averaged over the number of batches, not the number of samples.
    epoch_train_loss = running_loss / len(train_loader)
    epoch_val_loss = val_running_loss / len(val_loader)
    epoch_val_accuracy = correct_preds / total_preds

    print(f"Epoch {epoch+1}, Training Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}, Validation Accuracy: {epoch_val_accuracy}")

    # Keep this epoch's metrics for plotting later.
    for metric_name, metric_value in (
        ('train_loss', epoch_train_loss),
        ('val_loss', epoch_val_loss),
        ('val_accuracy', epoch_val_accuracy),
    ):
        history[metric_name].append(metric_value)

print('Finished Training')



100%|██████████| 250/250 [03:23<00:00,  1.23it/s]
100%|██████████| 63/63 [00:48<00:00,  1.30it/s]


Epoch 1, Training Loss: 0.20136496406793594, Validation Loss: 0.19831654642309463, Validation Accuracy: 0.9294


100%|██████████| 250/250 [02:35<00:00,  1.60it/s]
100%|██████████| 63/63 [00:38<00:00,  1.65it/s]


Epoch 2, Training Loss: 0.19371293687820434, Validation Loss: 0.19579832823503585, Validation Accuracy: 0.93102


100%|██████████| 250/250 [02:39<00:00,  1.57it/s]
100%|██████████| 63/63 [00:38<00:00,  1.62it/s]


Epoch 3, Training Loss: 0.19058597487211226, Validation Loss: 0.18565417329470316, Validation Accuracy: 0.92944


100%|██████████| 250/250 [02:41<00:00,  1.55it/s]
100%|██████████| 63/63 [00:39<00:00,  1.61it/s]


Epoch 4, Training Loss: 0.1853734034001827, Validation Loss: 0.18514088245611343, Validation Accuracy: 0.93048


100%|██████████| 250/250 [02:58<00:00,  1.40it/s]
100%|██████████| 63/63 [00:39<00:00,  1.59it/s]

Epoch 5, Training Loss: 0.18152281159162523, Validation Loss: 0.19107283438955033, Validation Accuracy: 0.92926
Finished Training





In [None]:
# plotting accuracy and loss curve
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Draw loss and accuracy curves from a per-epoch metrics dictionary.

    Args:
        history (dict): Must contain the keys 'train_loss', 'val_loss' and
            'val_accuracy', each holding one value per epoch.
    """
    epoch_axis = range(1, len(history['train_loss']) + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: training vs. validation loss.
    loss_ax.plot(epoch_axis, history['train_loss'], label='Training Loss')
    loss_ax.plot(epoch_axis, history['val_loss'], label='Validation Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Right panel: validation accuracy.
    acc_ax.plot(epoch_axis, history['val_accuracy'], label='Validation Accuracy')
    acc_ax.set_title('Validation Accuracy')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()


# Plot the per-epoch metrics stored in `history` by the training loop.
plot_training_history(history)

In [2]:
# plotting sample predicted outputs vs true labels from validation set

import matplotlib.pyplot as plt
import numpy as np


def tensor_to_image(tensor, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Undo per-channel normalization and return an image array for plotting.

    Args:
        tensor (torch.Tensor): Channel-first image tensor ``(C, H, W)`` that
            was normalized as ``(x - mean) / std``. May live on any device
            and may require grad.
        mean (sequence of float): Per-channel mean used for normalization.
            Defaults to the values used by this notebook's transform.
        std (sequence of float): Per-channel standard deviation used for
            normalization. Defaults to the values used by this notebook's
            transform.

    Returns:
        numpy.ndarray: Channel-last array ``(H, W, C)`` clipped to ``[0, 1]``.
    """
    # Move to CPU *before* the arithmetic: the constants below are CPU
    # tensors, so mixing them with a CUDA input would raise a device error.
    tensor = tensor.detach().to('cpu')
    std_t = torch.tensor(std).view(-1, 1, 1)
    mean_t = torch.tensor(mean).view(-1, 1, 1)

    image = (tensor * std_t + mean_t).numpy().transpose((1, 2, 0))
    # De-normalized values can fall slightly outside [0, 1]; clip for imshow.
    return np.clip(image, 0, 1)

def show_images_with_predictions(dataloader, model, device, categories, num_images=6):
    """Plot images from ``dataloader`` titled with true and predicted labels.

    Images are laid out in a two-column grid. A category counts as predicted
    when the model output for it is above 0.5.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        model (torch.nn.Module): Model called on each input batch.
        device: Device the batches are moved to before the forward pass.
        categories (list): Sequence indexed by category position; each item
            provides the display name under the ``"name"`` key.
        num_images (int): Maximum number of images to draw. Nothing is drawn
            when this is zero or negative.

    Returns:
        None. The model's train/eval mode is restored to what it was on entry.
    """
    if num_images <= 0:
        return

    # Remember the caller's mode so it can be restored on exit instead of
    # unconditionally switching the model to training mode.
    was_training = model.training
    model.eval()

    # Ceil division so an odd num_images still fits in the 2-column grid.
    num_rows = (num_images + 1) // 2
    images_so_far = 0
    plt.figure(figsize=(30, 30))

    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # tensor_to_image works on CPU data; copy the batch over once.
                inputs_cpu = inputs.cpu()

                for j in range(inputs.size(0)):
                    images_so_far += 1
                    ax = plt.subplot(num_rows, 2, images_so_far)
                    ax.axis('off')

                    predicted_labels = (outputs[j] > 0.5).int()
                    pred_labels_text = [categories[idx]["name"] for idx, label in enumerate(predicted_labels) if label == 1]
                    true_labels_text = [categories[idx]["name"] for idx, label in enumerate(labels[j]) if label == 1]

                    ax.set_title(f"True: {true_labels_text}\nPred: {pred_labels_text}")
                    ax.imshow(tensor_to_image(inputs_cpu[j]))

                    if images_so_far == num_images:
                        return
    finally:
        # Runs on early return and on errors alike.
        model.train(was_training)

# Show validation images (default: 6) with their true and predicted category names.
show_images_with_predictions(val_loader, model_vgg16, device, categories)
