In [15]:
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, datasets, models
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

In [16]:
import matplotlib.pyplot as plt
import numpy as np
import copy
import time
import os

In [17]:
import utils
from models import ResNet

In [18]:
data_loaders, test_loader = utils.get_data_loaders()

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [19]:
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#net = ResNet.ResNet18()
net = ResNet.ResNet50()
#net = ResNet.ResNet152()

print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.device_count())

0
<torch.cuda.device object at 0x7fd5c5eda390>
1


In [21]:
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    net = net.cuda()
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True
    criterion = nn.CrossEntropyLoss().cuda()
else:
    print('CPU')
    criterion = nn.CrossEntropyLoss()

Tesla K80


In [22]:
#optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)
optimizer = optim.Adam(net.parameters())
#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 32)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[81, 122], gamma=0.1)

Size of CIFAR-10

In [23]:
dataset_size = {'train': 4000,'val': 1000,'test': 1000}

Load Trained Model

In [24]:
old = False

In [25]:
SAVE_PATH = './trained-models/resnet50-net.pth'

if old:
    old_epochs = utils.load_checkpoint(net, optimizer, scheduler, SAVE_PATH)

Implement SWATS

In [26]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_accuracy = 0.0
    
    for epoch in range(num_epochs):
        
        print(str(epoch) + "/" + str(num_epochs))
        
        if type(scheduler) is torch.optim.lr_scheduler.MultiStepLR:
            scheduler.step()
        
        for phase in ['train', 'val']:
            
            print(phase)
            
            if phase == 'train':
                model.train()
                
            else:
                model.eval()
            
            running_loss = 0.0
            running_corrects = 0
            total = 0
            
            start = time.time()
            
            for index, (inputs, targets) in enumerate(data_loaders[phase]):
                
                inputs = inputs.to(device)
                targets = targets.to(device)

                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = outputs.max(1)
                    loss = criterion(outputs, targets)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                running_loss += loss.item()
                running_corrects += preds.eq(targets).sum().item()
                total += targets.size(0)
                
            epoch_loss = running_loss / total
            epoch_acc = running_corrects / total
                
            print('Loss: ' + str(epoch_loss) + ", Epoch Accuracy: " + str(epoch_acc))
            
            print('Time: ' + str((time.time() - start) / 60))
            
            if phase == 'val' and type(scheduler) is torch.optim.lr_scheduler.ReduceLROnPlateau:
                scheduler.step(epoch_loss)
            
            if phase == 'val' and epoch_acc > best_accuracy:
                best_accuracy = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                
                if not os.path.isdir('trained-models'):
                    os.mkdir('trained-models')
                
                state = {
                    
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                    
                }

                if os.path.exists(SAVE_PATH):
                    os.remove(SAVE_PATH)
                
                torch.save(state, SAVE_PATH)
    
    print('Best Accuracy: ' + str(best_accuracy))
    
    model.load_state_dict(best_model_wts)
    return model

In [27]:
base_epochs = 25

if old:
    epochs = base_epochs - old_epochs
else:
    epoch = base_epochs

print(epochs)

net = train_model(net, criterion, optimizer, scheduler, epochs)

200
0/200
train
Loss: 0.1774344204083085, Epoch Accuracy: 0.332475
Time: 8.053499134381612
val
Loss: 0.1392613341510296, Epoch Accuracy: 0.4734
Time: 0.5503010431925456
1/200
train
Loss: 0.13380854953676463, Epoch Accuracy: 0.51365
Time: 8.113314584891002
val
Loss: 0.11063577057421208, Epoch Accuracy: 0.5941
Time: 0.5501384417215983
2/200
train
Loss: 0.11051312655732036, Epoch Accuracy: 0.6056
Time: 8.125343203544617
val
Loss: 0.08935387621670961, Epoch Accuracy: 0.6771
Time: 0.5525282780329387
3/200
train
Loss: 0.0962516548551619, Epoch Accuracy: 0.6601
Time: 8.13117868900299
val
Loss: 0.08174375787973404, Epoch Accuracy: 0.715
Time: 0.5507693131764729
4/200
train
Loss: 0.0858205490451306, Epoch Accuracy: 0.700125
Time: 8.12622828880946
val
Loss: 0.07458807299248874, Epoch Accuracy: 0.7411
Time: 0.5502122004826864
5/200
train
Loss: 0.07674619561061263, Epoch Accuracy: 0.730525
Time: 8.113006242116292
val
Loss: 0.06851362332180143, Epoch Accuracy: 0.7593
Time: 0.5495468179384867
6/200


Process Process-278:
Process Process-279:
Process Process-277:
Process Process-280:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
  

KeyboardInterrupt: 

In [None]:
sgd_epochs = 10
utils.load_checkpoint(net, optimizer, scheduler, SAVE_PATH)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.2, weight_decay=5e-4, nesterov=True)
net = train_model(net, criterion, optimizer, scheduler, sgd_epochs)

0/10
train
Loss: 0.010130136712417153, Epoch Accuracy: 0.968025
Time: 7.771473491191864
val
Loss: 0.024848540477099595, Epoch Accuracy: 0.9202
Time: 0.5503161509831747
1/10
train
Loss: 0.009947656334062048, Epoch Accuracy: 0.9686
Time: 7.778614072004954
val
