# Training the classifier
This notebook loads the pretrained ResNet-50 classifier and performs feature extraction using the data generated in the first notebook.
To predict only pedestrians, the output classes are reduced from 1000 (original ImageNet output size) to two.

In [14]:
# imports
import torch
from torch import nn
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
from torchvision import datasets, transforms, models
import os
import time
import copy
from pathlib import Path

In [2]:
# Root folder of the prepared images; it is expected to follow the
# torchvision ImageFolder layout (one sub-folder per split and per class)
data_dir = "./data/classifierImages/preparedForTrainTestVal"

# How many classes the replaced output layer distinguishes
num_classes = 2

# Samples per mini-batch during training
batch_size = 8

# Total number of training epochs
num_epochs = 15

# True  -> feature extraction: only the newly created output layer is trained
# False -> fine-tuning: every parameter of the network is updated
feature_extract = True

In [22]:
# Define the transforms: every split gets its own (identical) pipeline that
# converts the image to a tensor and normalizes it with the given per-channel
# mean / standard deviation values.
data_transforms = {
    split: transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    for split in ('train', 'val', 'test')
}

# Use the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
print("Initializing Datasets and Dataloaders...")
# One ImageFolder dataset per phase, each read from its own sub-directory of data_dir
image_datasets = {
    phase: datasets.ImageFolder(os.path.join(data_dir, phase), data_transforms[phase])
    for phase in ('train', 'val')
}
# One shuffling DataLoader per phase, wrapping the dataset created above
dataloaders_dict = {
    phase: torch.utils.data.DataLoader(phase_dataset, batch_size=batch_size, shuffle=True)
    for phase, phase_dataset in image_datasets.items()
}
print('Done.')

Initializing Datasets and Dataloaders...
Done.


In [5]:
# Inspect the class-name -> label-index mapping of the training dataset
# (the ImageFolder dataset exposes it as `class_to_idx`)
image_datasets['train'].class_to_idx

{'neg_images': 0, 'pos_images': 1}

In [6]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze all parameters of ``model`` when doing feature extraction.

    Args:
        model: module whose ``parameters()`` are (optionally) frozen in place.
        feature_extracting: if truthy, ``requires_grad`` is set to ``False`` on
            every parameter; if falsy, the model is left untouched.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

In [7]:
# Instantiate the pretrained model and (for feature extraction) freeze its weights.
# NOTE(review): newer torchvision releases replace `pretrained=` with `weights=` —
# confirm against the installed version before changing this call.
model_ft = models.resnet50(pretrained=True)
set_parameter_requires_grad(model_ft, feature_extract)

# Remove the original output layer and replace it with a `num_classes`-dimensional
# layer that decides whether an image patch contains a pedestrian or not.
number_output_features = model_ft.fc.out_features
number_input_features  = model_ft.fc.in_features

# The freshly created layer is the only one left trainable in feature-extraction mode
# (freezing above happened before this layer existed).
model_ft.fc = nn.Linear(number_input_features, num_classes)

# Report the configured class count instead of a hard-coded "2"
print(f'The number of output classes was reduced from {number_output_features} to {num_classes}.')

# resnet50 uses an adaptive average pooling layer at the end;
# this means the size of the input images does not matter, since the feature planes
# are always pooled to the same size before being flattened
print(f'Since resnet50 uses "{model_ft.avgpool}" the size of the input image can be any size ...')


Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to C:\Users\peter/.cache\torch\checkpoints\resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))


The number of output layers was reduced from 1000 to 2.
Since resnet50 uses "AdaptiveAvgPool2d(output_size=(1, 1))" the size of the input image can be any size ...


In [8]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, device=None):
    """Train ``model`` and restore the weights of the best validation epoch.

    Every epoch runs a 'train' phase followed by a 'val' phase. The weights of
    the epoch with the highest validation accuracy are kept and loaded back
    into the model before returning.

    Args:
        model: ``torch.nn.Module`` to train (modified in place).
        dataloaders: mapping with the keys ``'train'`` and ``'val'``; each value
            yields ``(inputs, labels)`` batches and exposes ``.dataset`` with a length.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer holding the parameters to update.
        num_epochs: number of epochs to run.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple ``(model, val_acc_history)`` where ``val_acc_history`` holds one
        validation accuracy (0-dim tensor) per epoch.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradients are only tracked in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # predicted class = index of the largest output
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is weighted by batch size so that the
                # epoch loss below is a per-sample average
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    # '.4f' = four decimals (the former '4f' only set a minimum field width)
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [9]:
# Move the model to the selected device (GPU if one is available)
model_ft = model_ft.to(device)

# Collect the parameters handed to the optimizer.
#  - feature extraction: only the parameters that still have requires_grad
#    set, i.e. the ones of the freshly initialized output layer
#  - fine-tuning: all parameters of the model
# In both cases the names of the trainable parameters are listed.
print("Params to learn:")
params_to_update = [] if feature_extract else model_ft.parameters()
for name, param in model_ft.named_parameters():
    if not param.requires_grad:
        continue
    if feature_extract:
        params_to_update.append(param)
    print("\t", name)

# SGD with momentum over the parameters selected above
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

Params to learn:
	 fc.weight
	 fc.bias


In [10]:
# Cross-entropy over the class scores serves as the training loss
criterion = nn.CrossEntropyLoss()

# Run training and validation; keep the trained model and the
# per-epoch validation accuracies
model_ft, hist = train_model(
    model=model_ft,
    dataloaders=dataloaders_dict,
    criterion=criterion,
    optimizer=optimizer_ft,
    num_epochs=num_epochs,
)

Epoch 0/14
----------
train Loss: 0.4139 Acc: 0.8182
val Loss: 0.2128 Acc: 0.9524

Epoch 1/14
----------
train Loss: 0.2357 Acc: 0.9130
val Loss: 0.1090 Acc: 0.9702

Epoch 2/14
----------
train Loss: 0.1964 Acc: 0.9289
val Loss: 0.0909 Acc: 0.9762

Epoch 3/14
----------
train Loss: 0.1933 Acc: 0.9269
val Loss: 0.1014 Acc: 0.9643

Epoch 4/14
----------
train Loss: 0.1629 Acc: 0.9526
val Loss: 0.1097 Acc: 0.9643

Epoch 5/14
----------
train Loss: 0.1987 Acc: 0.9289
val Loss: 0.0713 Acc: 0.9762

Epoch 6/14
----------
train Loss: 0.1563 Acc: 0.9447
val Loss: 0.0801 Acc: 0.9762

Epoch 7/14
----------
train Loss: 0.2333 Acc: 0.9111
val Loss: 0.0941 Acc: 0.9702

Epoch 8/14
----------
train Loss: 0.1441 Acc: 0.9368
val Loss: 0.0701 Acc: 0.9821

Epoch 9/14
----------
train Loss: 0.1555 Acc: 0.9387
val Loss: 0.0551 Acc: 0.9762

Epoch 10/14
----------
train Loss: 0.1785 Acc: 0.9427
val Loss: 0.0569 Acc: 0.9821

Epoch 11/14
----------
train Loss: 0.1546 Acc: 0.9387
val Loss: 0.0546 Acc: 0.9821

Ep

In [15]:
# Persist the current model
model_dir = './saved_models/'
model_name = 'pedestrianClassifier15Epochs.pt'
# Create the target directory (and any missing parents) if it is not there yet
Path(model_dir).mkdir(parents=True, exist_ok=True)
# Only the parameters (state_dict) are written, into the dir 'saved_models'
torch.save(obj=model_ft.state_dict(), f=f'{model_dir}{model_name}')

In [58]:
print("Initializing Datasets and Dataloaders for test set ...")
# Test dataset: read from the 'test' sub-directory with the 'test' transforms
image_dataset_test = datasets.ImageFolder(
    root=os.path.join(data_dir, 'test'),
    transform=data_transforms['test'],
)
# Test dataloader: one image per batch, shuffled
dataloader_test = torch.utils.data.DataLoader(
    dataset=image_dataset_test,
    batch_size=1,
    shuffle=True,
)
print('Done.')

Initializing Datasets and Dataloaders for test set ...
Done.


In [59]:
def evaluate(dataloader_test, model):
    """Print the classification accuracy of ``model`` on the test data.

    Args:
        dataloader_test: iterable of ``(images, labels)`` pairs. Both forms are
            supported: a DataLoader yielding batches (4-D image tensor, label
            tensor) and a dataset yielding single samples (3-D image tensor,
            integer label).
        model: ``torch.nn.Module`` to evaluate; it is switched to eval mode.

    Returns:
        None. The accuracy (truncated to whole percent) is printed.
    """
    # Run on whatever device the model lives on (CPU for parameter-free models)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for images, labels in dataloader_test:
            # A single un-batched image needs a leading batch dimension;
            # batches coming from a DataLoader already have one.
            if images.dim() == 3:
                images = torch.unsqueeze(images, 0)
            images = images.to(device)
            # Labels may be a plain int (dataset) or a tensor (DataLoader)
            labels = torch.as_tensor(labels, device=device).reshape(-1)

            outputs = model(images)
            _, predicted = torch.max(outputs, 1)

            # Count per sample so that any batch size is handled correctly
            correct += int((predicted == labels).sum())
            total += labels.numel()

    if total == 0:
        # Avoid a ZeroDivisionError when there is nothing to evaluate
        print('No test images to evaluate.')
        return
    print(f'Accuracy of the network on the {total} test images: {int(100 * correct / total)} %')


In [60]:
# Run the evaluation; note that the ImageFolder dataset itself is passed here
# (not `dataloader_test`), so the samples are iterated one at a time.
evaluate(image_dataset_test, model_ft)

Accuracy of the network on the 172 test images: 96 %
