In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils import data as dt
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import numpy as np
import warnings
import mlflow
import requests
warnings.simplefilter("ignore")

In [2]:
import pathlib
# BASE_DIR will be like '/home/jovyan/DemoExample/'
BASE_DIR = pathlib.Path().absolute()
print(f"Working dir: {BASE_DIR}")

## Download dataset

In [3]:
def save_file(url, filename):
    # Download file and place it on local storage
    try:
        r = requests.get(url, timeout=10)

        with open(filename, 'wb') as f:
            f.write(r.content)
        print(f"{filename} downloaded from {url}")
    except requests.exceptions.Timeout:
        print(f"No internet connection")

In [4]:
save_file("https://github.com/sbercloud-ai/aicloud-examples/raw/master/quick-start/notebooks_gpu/mnist.npz", BASE_DIR.joinpath("mnist.npz"))

/home/jovyan/DemoExample/mnist.npz downloaded from https://github.com/sbercloud-ai/aicloud-examples/raw/master/quick-start/mnist.npz


## Load dataset

In [5]:
data = np.load(BASE_DIR.joinpath('mnist.npz'))
mnist_images_train = np.expand_dims(data['x_train'], 1)
mnist_labels_train = data['y_train']

mnist_images_test = np.expand_dims(data['x_test'], 1)
mnist_labels_test = data['y_test']
data.close()

dataset_train = dt.TensorDataset(torch.Tensor(mnist_images_train), torch.Tensor(mnist_labels_train).long())
dataset_test = dt.TensorDataset(torch.Tensor(mnist_images_test), torch.Tensor(mnist_labels_test).long())

train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=50)
test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=50)

## Define model

In [6]:
class CNNClassifier(nn.Module):
    """Custom module for a simple convnet classifier"""

    def __init__(self):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.dropout = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.dropout(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x)

In [7]:
clf = CNNClassifier()
device = torch.device(f'cuda:0')

## DataParallel if several GPUs

In [8]:
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    clf = nn.DataParallel(clf)

In [9]:
clf.to(device)

CNNClassifier(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

In [11]:
current_time = datetime.now().strftime("%Y%m%d-%H_%M")
writer = SummaryWriter(log_dir=BASE_DIR.joinpath('logs/log_' + current_time))

In [12]:
optimizer = optim.SGD(clf.parameters(), lr=0.01, momentum=0.5)

In [13]:
def train(epoch, clf, optimizer, writer):
    clf.train()  # set model in training mode (need this because of dropout)

    # dataset API gives us pythonic batching
    for batch_id, (data, target) in enumerate(train_loader):

        data = data.to(device)
        target = target.to(device)
        # forward pass, calculate loss and backprop!
        optimizer.zero_grad()
        preds = clf(data)
        loss = F.nll_loss(preds, target)
        loss.backward()

        optimizer.step()

        if batch_id % 100 == 0:
            print(f'train loss = {loss.item()}')
            writer.add_scalar('Train', loss.item(), epoch * len(train_loader) + batch_id)

In [14]:
def test(epoch, clf, writer):
    clf.eval()  # set model in inference mode (need this because of dropout)
    test_loss = 0
    correct = 0

    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = clf(data)
        test_loss += F.nll_loss(output, target).item()
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()
    
    test_loss = test_loss
    test_loss /= len(test_loader)  # loss function already averages over batch size
    accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))
    
    
    mlflow.log_metric("Test loss", test_loss)  # add mlflow metrics
    mlflow.log_metric("Accuracy", np.round(accuracy.item(),1)) # add mlflow metrics

In [15]:
num_epochs = 3
print(f'Start train with num epoch = {num_epochs}')

mlflow.set_tracking_uri('file:/home/jovyan/mlruns')
mlflow.set_experiment("pytorch_tensorboard_mlflow.ipynb")
with mlflow.start_run(nested=True) as run:
    for epoch in range(num_epochs):
        print("Epoch %d" % epoch)
        train(epoch, clf, optimizer, writer)
        test(epoch, clf, writer)
        torch.save(clf.state_dict(), BASE_DIR.joinpath('logs/log_' + current_time + f"/model_epoch_{epoch}.bin"))
        writer.close()

Start train with num epoch = 3
Epoch 0
train loss = 22.238218307495117
train loss = 2.276198148727417
train loss = 2.2093944549560547
train loss = 1.3249439001083374
train loss = 1.3222427368164062
train loss = 0.6162148118019104
train loss = 0.7235971093177795
train loss = 0.3437134623527527
train loss = 0.48278796672821045
train loss = 0.513648509979248
train loss = 0.24412333965301514
train loss = 0.5696728229522705

Test set: Average loss: 0.2380, Accuracy: 9256/10000 (93%)

Epoch 1
train loss = 0.3267053961753845
train loss = 0.3192499876022339
train loss = 0.4160289764404297
train loss = 0.08153068274259567
train loss = 0.6905174851417542
train loss = 0.15559092164039612
train loss = 0.3709448277950287
train loss = 0.2520889639854431
train loss = 0.3144308030605316
train loss = 0.32689231634140015
train loss = 0.09587522596120834
train loss = 0.4002344608306885

Test set: Average loss: 0.1533, Accuracy: 9526/10000 (95%)

Epoch 2
train loss = 0.15486745536327362
train loss = 0.318