In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np 
import matplotlib.pyplot as plt

import copy
import time
import sys

In [43]:
# Make the local `data_ingestion` directory importable, then pull in the two
# project data-pipeline modules used by the rest of the notebook.
# NOTE(review): the recorded output of this cell prints positive/negative
# sample counts, so importing these modules appears to build the datasets as
# an import-time side effect -- confirm against the modules themselves.
# NOTE(review): this cell's execution count is higher than most cells below
# it, so the notebook was run out of order; re-check with Restart & Run All.
sys.path.append('data_ingestion')
from data_ingestion import data_pipeline_pytorch_smaller_dataset, data_ingestion_for_big_dataset

number of positive samples:  430509
number of negative samples:  222442
number of positive samples:  3999
number of negative samples:  4057
number of positive samples:  3877
number of negative samples:  4049


In [30]:
# Destination file for the fine-tuned model; it is written by torch.save and
# read back by torch.load in the final cell of the notebook.
INFERENCE_PATH = './trained_pytorch_model/resnet_finetuned_smaller_dataset.pth'

In [3]:
# Pretrained ResNet-18 fetched through torch hub.
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

# Swap the final fully connected layer for a fresh two-class head.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=2)

# Use the GPU whenever one is visible to PyTorch.
model = model.cuda() if torch.cuda.is_available() else model

# Classification loss.
criterion = nn.CrossEntropyLoss()

# Every parameter of the network is handed to the optimizer (full fine-tune).
optimizer = optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 once every 7 scheduler steps (epochs).
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

Using cache found in /Users/pyuvraj/.cache/torch/hub/pytorch_vision_v0.10.0


In [6]:
# check if CUDA is available
use_cuda = torch.cuda.is_available()
# set device to be cuda if available, otherwise it will be set to cpu
device = torch.device("cuda" if use_cuda else "cpu")

In [33]:
# Generators - Small Nudity Dataset
training_set = data_pipeline_pytorch_smaller_dataset.training_dataset
training_generator = data_pipeline_pytorch_smaller_dataset.train_dataloader
validation_set = data_pipeline_pytorch_smaller_dataset.val_dataset
validation_generator = data_pipeline_pytorch_smaller_dataset.val_dataloader
dataloaders = {'train': training_generator, 'val': validation_generator}

In [42]:
# Generators - Large Nudity Dataset
training_set = data_ingestion_pipeline_smaller_dataset.training_dataset
training_generator = data_ingestion_pipeline_smaller_dataset.train_dataloader
validation_set = data_ingestion_pipeline_smaller_dataset.val_dataset
validation_generator = data_ingestion_pipeline_smaller_dataset.val_dataloader
dataloaders = {'train': training_generator, 'val': validation_generator}

AttributeError: module 'data_ingestion.data_ingestion_pipeline_smaller_dataset' has no attribute 'training_dataset'

In [7]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25,
                loaders=None, sizes=None, run_device=None):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase (gradients enabled, optimizer stepped per
    batch, scheduler stepped once at the end of the phase) followed by a 'val'
    phase (gradients disabled). The state dict with the highest validation
    accuracy seen so far is kept and restored into ``model`` before returning.

    Args:
        model: Network to train; modified in place.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        scheduler: Learning-rate scheduler; ``step()`` is called once per epoch.
        num_epochs: Number of train/val epochs to run.
        loaders: Mapping with 'train' and 'val' keys to iterables of
            ``(inputs, labels)`` batches. Defaults to the notebook-level
            ``dataloaders``.
        sizes: Mapping with 'train' and 'val' keys to the number of samples in
            each split, used to average loss and accuracy. Defaults to the
            notebook-level ``dataset_sizes``.
        run_device: Device that batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        The same ``model`` object, holding the best-validation weights.
    """
    # Fall back to the notebook globals so existing five-argument calls work.
    if loaders is None:
        loaders = dataloaders
    if sizes is None:
        sizes = dataset_sizes
    if run_device is None:
        run_device = device

    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in loaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward; track history only while training
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()
                # statistics: weight the batch loss by the batch size so the
                # division by sizes[phase] below yields a per-sample figure.
                running_loss += loss.item() * inputs.size(0)
                # Accumulate as a plain int so no tensor is kept per batch and
                # the accuracy below is an ordinary float.
                running_corrects += torch.sum(preds == labels).item()
            if is_train:
                scheduler.step()
            epoch_loss = running_loss / sizes[phase]
            epoch_acc = running_corrects / sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # deep copy the model whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
# Train and evaluate
# Sample count per split; train_model divides its running totals by these.
dataset_sizes = dict(train=len(training_set), val=len(validation_set))
# Short two-epoch fine-tuning run; the returned model is kept as model_ft.
model_ft = train_model(
    model,
    criterion,
    optimizer,
    exp_lr_scheduler,
    num_epochs=2,
)



Epoch 0/1
----------
train Loss: 0.2045 Acc: 0.9099
val Loss: 0.1427 Acc: 0.9454

Epoch 1/1
----------
train Loss: 0.0794 Acc: 0.9711
val Loss: 0.1367 Acc: 0.9486

Training complete in 30m 43s
Best val Acc: 0.948594


### Testing and evaluation

In [25]:
# Run the fine-tuned model over the held-out test loader, collecting the
# summed loss plus every prediction and label for the metrics cell below.
test_loader = data_pipeline_pytorch_smaller_dataset.test_dataloader
model_ft.eval()  # inference mode for dropout / batch-norm layers
test_loss = 0.0
test_preds = []
test_labels = []
with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Use model_ft (the model put in eval mode above) rather than relying
        # on `model` still being the same object in the kernel.
        outputs = model_ft(inputs)
        loss = criterion(outputs, labels)
        # Weight the batch loss by the batch size; the metrics cell divides
        # the total by the test-set length.
        test_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        test_preds.extend(preds.cpu().tolist())
        test_labels.extend(labels.cpu().tolist())

In [26]:

# 3. Summarise test performance: confusion matrix, accuracy and mean loss
conf_mat = confusion_matrix(test_labels, test_preds)
test_acc = accuracy_score(test_labels, test_preds)
# test_loss is a batch-size-weighted sum, so divide by the number of samples.
print(
    "Test Loss: {:.4f}".format(
        test_loss / len(data_pipeline_pytorch_smaller_dataset.test_dataset)
    )
)
print("Test Accuracy: {:.4f}".format(test_acc))
print("Confusion Matrix:\n", conf_mat)

Test Loss: 0.0294
Test Accuracy: 0.9921
Confusion Matrix:
 [[3186    9]
 [  40 2969]]


### Save the model for inference

In [32]:
# Persist the fine-tuned network (model_ft, the object evaluated above) and
# reload it immediately to confirm the saved file is usable for inference.
# NOTE: saving a whole module uses pickle, so torch.load should only ever be
# pointed at files from a trusted source (unpickling can run arbitrary code).
# NOTE(review): the target directory of INFERENCE_PATH must already exist.
torch.save(model_ft, INFERENCE_PATH)
inference_model = torch.load(INFERENCE_PATH)
inference_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  