# CIFAR10 Low Precision Training Example
In this notebook, we present a quick example of how to simulate training a deep neural network in low precision with QPyTorch.

In [51]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as transforms

from qtorch.quant import Quantizer, quantizer
from qtorch.optim import OptimLP
from torch.optim import SGD
from qtorch import FloatingPoint, FixedPoint
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston, load_diabetes

We first load the data. In this example, we will experiment with regression datasets, `boston` and `diabetes` from `scikit-learn.datasets`.

We have a dataset class.

In [52]:
class RegressionDataset(Dataset):
    def __init__(self, X, y):
        self.y = y
        self.X = X
        
        # we mide need some transforms like sklearn scalers
        # self.transform = transforms.ToTensor()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        # features = self.X[idx]
        # target = self.y[idx]
        """
        if self.transform:
            features = self.transform(features)
            target = self.transform(target)
        """
        
        return features, target

In [53]:
class LinearLP(nn.Module):
    """
    a low precision Logistic Regression model
    """
    def __init__(self, input_size: int=5, quant: Quantizer=None):
        super(LinearLP, self).__init__()
        self.W = nn.Linear(input_size, 1)
        self.quant = quant
    
    def forward(self, x):
        out = self.W(x)
        if self.quant:
            out = self.quant(out)
        return out

In [42]:
DATA = 'boston'
BATCH_SIZE = 32

if DATA == 'boston':
    data = load_boston()
elif DATA == 'diabetes':
    data = load_diabetes()

X, y = data['data'], data['target']

INPUT_SIZE = X[0].shape[0]

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create custom datasets with transforms
training_data = RegressionDataset(X_train, y_train)
test_data = RegressionDataset(X_test, y_test)

train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

We then define the quantization setting we are going to use. In particular, here we follow the setting reported in the paper "Training Deep Neural Networks with 8-bit Floating Point Numbers", where the authors propose to use specialized 8-bit and 16-bit floating point format.

In [43]:
bit_16 = FloatingPoint(exp=6, man=9)
Q = Quantizer(forward_number=bit_16, backward_number=bit_16,
              forward_rounding="nearest", backward_rounding="nearest")

model = LinearLP(input_size=INPUT_SIZE, quant=None)

In [44]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device=device)

Next, we define a low-precision ResNet. In the definition, we recursively insert quantization module after every convolution layer. Note that the quantization of weight, gradient, momentum, and gradient accumulator are not handled here.

We now use the low-precision optimizer wrapper to help define the quantization of weight, gradient, momentum, and gradient accumulator.

In [45]:
optimizer = SGD(model.parameters(), lr=0.05, momentum=0.9, weight_decay=5e-4)

We can reuse common training scripts without any extra codes to handle quantization.

In [46]:
def run_epoch(loader, model, criterion, optimizer=None, phase="train"):
    assert phase in ["train", "eval"], "invalid running phase"
    loss_sum = 0.0
    correct = 0.0

    if phase=="train": model.train()
    elif phase=="eval": model.eval()

    ttl = 0
    with torch.autograd.set_grad_enabled(phase=="train"):
        for i, (input, target) in tqdm(enumerate(loader), total=len(loader)):
            input = input.to(device=device)
            target = target.to(device=device)
            output = model(input)
            loss = criterion(output, target)
            loss_sum += loss.cpu().item() * input.size(0)
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
            ttl += input.size()[0]

            if phase=="train":
                loss = loss * 1000 # do loss scaling
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    correct = correct.cpu().item()
    return {
        'loss': loss_sum / float(ttl),
        'accuracy': correct / float(ttl) * 100.0,
    }

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

Begin the training process just as usual. Enjoy!

In [48]:
for epoch in range(10):
    train_res = run_epoch(train_dataloader, model, F.mse_loss,
                                optimizer=optimizer, phase="train")
    test_res = run_epoch(test_dataloader, model, F.mse_loss,
                                optimizer=optimizer, phase="eval")

  loss = criterion(output, target)
  loss = criterion(output, target)
100%|██████████| 12/12 [00:00<00:00, 498.73it/s]
  loss = criterion(output, target)
100%|██████████| 5/5 [00:00<00:00, 952.95it/s]
100%|██████████| 12/12 [00:00<00:00, 730.74it/s]
100%|██████████| 5/5 [00:00<00:00, 933.15it/s]
100%|██████████| 12/12 [00:00<00:00, 1034.40it/s]
100%|██████████| 5/5 [00:00<00:00, 897.37it/s]
100%|██████████| 12/12 [00:00<00:00, 1339.28it/s]
100%|██████████| 5/5 [00:00<00:00, 1460.72it/s]
100%|██████████| 12/12 [00:00<00:00, 1555.80it/s]
100%|██████████| 5/5 [00:00<00:00, 1501.94it/s]
100%|██████████| 12/12 [00:00<00:00, 1277.52it/s]
100%|██████████| 5/5 [00:00<00:00, 1488.08it/s]
100%|██████████| 12/12 [00:00<00:00, 1419.91it/s]
100%|██████████| 5/5 [00:00<00:00, 1510.59it/s]
100%|██████████| 12/12 [00:00<00:00, 1522.25it/s]
100%|██████████| 5/5 [00:00<00:00, 1104.05it/s]
100%|██████████| 12/12 [00:00<00:00, 1531.00it/s]
100%|██████████| 5/5 [00:00<00:00, 1551.38it/s]
100%|██████████| 1

In [49]:
train_res

{'loss': nan, 'accuracy': 0.0}

In [50]:
test_res

{'loss': nan, 'accuracy': 0.0}

In [None]:
# First define number formats used in forward and backward quantization
from qtorch import FixedPoint, FloatingPoint
# Create a quantizer
from qtorch.quant import Quantizer

# forward_num = FixedPoint(wl=4, fl=2)
# backward_num = FloatingPoint(exp=5, man=2)
bit_8 = FloatingPoint(exp=5, man=2)
bit_16 = FloatingPoint(exp=6, man=9)

# define quantization functions
weight_quant = quantizer(forward_number=bit_16,
                        forward_rounding="nearest")
grad_quant = quantizer(forward_number=bit_16,
                        forward_rounding="nearest")
momentum_quant = quantizer(forward_number=bit_16,
                        forward_rounding="nearest")
acc_quant = quantizer(forward_number=bit_16,
                        forward_rounding="nearest")

# define a lambda function so that the Quantizer module can be duplicated easily
act_error_quant = lambda : Quantizer(forward_number=bit_16, backward_number=bit_16,
                        forward_rounding="nearest", backward_rounding="nearest")


Q = Quantizer(forward_number=bit_16, backward_number=bit_16,
              forward_rounding="nearest", backward_rounding="stochastic")

optimizer = SGD(model.parameters(), lr=0.05, momentum=0.9, weight_decay=5e-4)

Q_optimizer = OptimLP(optimizer,
                    weight_quant=weight_quant,
                    grad_quant=grad_quant,
                    momentum_quant=momentum_quant,
                    acc_quant=acc_quant,
                    grad_scaling=1/1000 # do loss scaling
)

model = PreResNet(act_error_quant)

{'loss': 1.5749474658966065, 'accuracy': 43.63}