In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
from torchvision.transforms import RandomCrop
import matplotlib.pyplot as plt
import torch.nn.functional as F


import util #For XNOR NIN https://github.com/jiecaoyu/XNOR-Net-PyTorch

from binarized_modules import  BinarizeLinear,BinarizeConv2d #For BNN VGG https://github.com/itayhubara/BinaryNet.pytorch
from vgg_cifar10_binary import VGG_Cifar10 #For BNN VGG https://github.com/itayhubara/BinaryNet.pytorch



def get_accuracy(model, data_loader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

import torch.nn.functional as F

class BinarizedConv2d(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0,):
        super().__init__(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        self.binary_weight = None

    def binarize(self):
        self.binary_weight = torch.sign(self.weight)

    def forward(self, x):
        if self.binary_weight is None:
            self.binarize()
        return F.conv2d(x, self.binary_weight, bias=self.bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups)
    
class BinarizedActivation(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return torch.sign(x)
    
class BinActive(torch.autograd.Function):
    '''
    Binarize the input activations and calculate the mean across channel dimension.
    '''
    def forward(self, input):
        self.save_for_backward(input)
        size = input.size()
        mean = torch.mean(input.abs(), 1, keepdim=True)
        input = input.sign()
        return input, mean

    def backward(self, grad_output, grad_output_mean):
        input, = self.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input.ge(1)] = 0
        grad_input[input.le(-1)] = 0
        return grad_input
    
class XNORConv2d(nn.Module):
    def __init__(self, in_channels, out_channels,
            kernel_size=-1, stride=-1, padding=-1, dropout=0):
        super(XNORConv2d, self).__init__()
        self.layer_type = 'BinConv2d'
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dropout_ratio = dropout

        self.bn = nn.BatchNorm2d(in_channels, eps=1e-4, momentum=0.1, affine=True)
        self.bn.weight.data = self.bn.weight.data.zero_().add(1.0)
        if dropout!=0:
            self.dropout = nn.Dropout(dropout)
        self.conv = nn.Conv2d(in_channels, out_channels,
                kernel_size=kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU(inplace=True)
    
    def forward(self, x):
        x = self.bn(x)
        x, mean = BinActive.apply(x)
        if self.dropout_ratio!=0:
            x = self.dropout(x)
        x = self.conv(x)
        x = self.relu(x)
        return x
    
class XnorNet(nn.Module):
    def __init__(self):
        super(XnorNet, self).__init__()
        self.xnor = nn.Sequential(
                nn.Conv2d(3, 192, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm2d(192, eps=1e-4, momentum=0.1, affine=False),
                nn.ReLU(inplace=True),
                XNORConv2d(192, 160, kernel_size=1, stride=1, padding=0),
                XNORConv2d(160,  96, kernel_size=1, stride=1, padding=0),
                nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

                XNORConv2d( 96, 192, kernel_size=5, stride=1, padding=2, dropout=0.5),
                XNORConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                XNORConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                nn.AvgPool2d(kernel_size=3, stride=2, padding=1),

                XNORConv2d(192, 192, kernel_size=3, stride=1, padding=1, dropout=0.5),
                XNORConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(192, eps=1e-4, momentum=0.1, affine=False),
                nn.Conv2d(192,  10, kernel_size=1, stride=1, padding=0),
                nn.ReLU(inplace=True),
                nn.AvgPool2d(kernel_size=8, stride=1, padding=0),
                )

    def forward(self, x):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                if hasattr(m.weight, 'data'):
                    m.weight.data.clamp_(min=0.01)
        x = self.xnor(x)
        x = x.view(x.size(0), 10)
        return x

class FPBlinkNet(nn.Module):
    def __init__(self,dropout_prob=0.5,l1_reg=0.0082):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.res1 = SimpleResidualBlock(32, 32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.relu2 = nn.ReLU()
        self.res2 = SimpleResidualBlock(64, 64)
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.relu3 = nn.ReLU()
        self.res3 = SimpleResidualBlock(128, 128)
        self.fc = nn.Linear(8192, 10)
        self.max_train_acc = 0.0
        self.l1_reg=l1_reg
        #self.lr = lr
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
    def update_max_train_acc(self, acc):
        if acc > self.max_train_acc:
            self.max_train_acc = acc
            
    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.res1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.res2(out)
        out = self.dropout1(out)
        out = self.conv3(out)
        out = self.bn3(out)
        out = self.relu3(out)
        out = self.res3(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out 
    
class BlinkNet(nn.Module):
    def __init__(self,dropout_prob=0.45,l1_reg=0.000345, lr=0.00008):
        super().__init__()
        self.conv1 = BinarizedConv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.res1 = SimpleResidualBlock(32, 32)
        self.conv2 = BinarizedConv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.relu2 = nn.ReLU()
        self.res2 = SimpleResidualBlock(64, 64)
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.conv3 = BinarizedConv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.relu3 = nn.ReLU()
        self.res3 = SimpleResidualBlock(128, 128)
        self.fc = nn.Linear(8192, 10)
        self.max_train_acc = 0.0
        self.l1_reg=l1_reg
        self.lr = lr
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
    def update_max_train_acc(self, acc):
        if acc > self.max_train_acc:
            self.max_train_acc = acc
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.res1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.res2(out)
        out = self.dropout1(out)
        out = self.conv3(out)
        out = self.bn3(out)
        out = self.relu3(out)
        out = self.res3(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
class SimpleResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.relu1(out)
        out = self.conv2(out)
        return self.relu2(out) + x # ReLU can be applied before or after adding the input

class BinaryBlinkNet(nn.Module):
    def __init__(self,dropout_prob=0.45,l1_reg=0.000345, lr=0.00008):
        super().__init__()
        self.conv1 = BinarizedConv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.res1 = BinarizedResidualBlock(32, 32)
        self.conv2 = BinarizeConv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.relu2 = nn.ReLU()
        self.res2 = BinarizedResidualBlock(64, 64)
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.conv3 = BinarizedConv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.relu3 = nn.ReLU()
        self.res3 = BinarizedResidualBlock(128, 128)
        self.fc = nn.Linear(8192, 10)
        self.max_train_acc = 0.0
        self.l1_reg=l1_reg
        self.lr = lr
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
    def update_max_train_acc(self, acc):
        if acc > self.max_train_acc:
            self.max_train_acc = acc
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.res1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.res2(out)
        out = self.dropout1(out)
        out = self.conv3(out)
        out = self.bn3(out)
        out = self.relu3(out)
        out = self.res3(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
    
class BinarizedResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = BinarizedConv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = BinarizedConv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.relu1(out)
        out = self.conv2(out)
        return self.relu2(out) + x # ReLU can be applied before or after adding the input



#Training function    
def train(train_dl, val_dl, model, loss_fn, optimizer, epochs, history=6, file_name='best_model.pth'):
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    BVE=[]
    epoch_mem_usage=[]
    memory_usage=[]
    
    for epoch in range(epochs):
        model.train()
        #print("Model Lr:", model.lr)
        train_loss = 0
        correct = 0
        total = 0
        best_val_acc=0
        best_epoch=0
        t=0
        for i, (images, labels) in enumerate(train_dl):
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            current_mem_usage = torch.cuda.memory_allocated() / 1024 / 1024
            memory_usage.append(current_mem_usage)
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            
            
        train_loss /= len(train_dl.dataset)
        train_acc = correct / total
        val_loss, val_acc = evaluate(model, val_dl, loss_fn)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), file_name)
            BVE=[val_acc,epoch]
        epoch_mem_usage.append(sum(memory_usage))
        memory_usage=[]
        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Epoch Memory Usage: {epoch_mem_usage[epoch]:.4f} MB')      
        
    return train_losses, val_losses, train_accs, val_accs,BVE,epoch_mem_usage


def evaluate(model, data_loader, loss_fn):
    model.eval()
    with torch.no_grad():
        val_loss = 0
        correct = 0
        total = 0
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        val_loss /= len(data_loader.dataset)
        val_acc = correct / total
    return val_loss, val_acc



Below are cells for running the models. Here we've run just 1 epoch of each model just to show everything works

In [2]:
#Data setup

import time

#Epochs low just to make sure things work
num_epochs = 1

#Data augmentation
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
train_dataset =CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=test_transforms)

# Create data loaders
batch_size = 128
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Create the model and move it to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Files already downloaded and verified
Files already downloaded and verified
cuda


In [3]:
# Train the model
b_start_time = time.time()
#Change this depending on model
model = BlinkNet().to(device)

# Define the loss function and optimizer
binary_criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(model.parameters(), lr=0.0000778)

binary_train_losses, binary_train_val_losses, binary_train_accs, binary_train_val_accs,binary_BVE,binary_mem_usage = train(train_loader, test_loader, model, binary_criterion, binary_optimizer, num_epochs, file_name='BlinkNet.pth')
binary_train_acc = get_accuracy(model, train_loader)
binary_test_accuracy = get_accuracy(model, test_loader)
b_end_time = time.time()
print(f'Test accuracy: {binary_test_accuracy:.4f}')
print(f'Train accuracy: {binary_train_acc:.4f}')
print(f"Training process took {b_end_time - b_start_time} seconds")

Epoch 1/1, Training Loss: 0.0148, Training Accuracy: 0.3092, Validation Loss: 0.0129, Validation Accuracy: 0.3977, Epoch Memory Usage: 4095.0068 MB
Test accuracy: 0.3977
Train accuracy: 0.3896
Training process took 71.07526659965515 seconds


In [5]:
# Train the model
b_start_time = time.time()
#Change this depending on model
model = FPBlinkNet().to(device)

# Define the loss function and optimizer
binary_criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(model.parameters(), lr=0.0000778)

binary_train_losses, binary_train_val_losses, binary_train_accs, binary_train_val_accs,binary_BVE,binary_mem_usage = train(train_loader, test_loader, model, binary_criterion, binary_optimizer, num_epochs, file_name='FPBlinkNet.pth')
binary_train_acc = get_accuracy(model, train_loader)
binary_test_accuracy = get_accuracy(model, test_loader)
b_end_time = time.time()
print(f'Test accuracy: {binary_test_accuracy:.4f}')
print(f'Train accuracy: {binary_train_acc:.4f}')
print(f"Training process took {b_end_time - b_start_time} seconds")

Epoch 1/1, Training Loss: 0.0142, Training Accuracy: 0.3361, Validation Loss: 0.0118, Validation Accuracy: 0.4517, Epoch Memory Usage: 3956.2095 MB
Test accuracy: 0.4517
Train accuracy: 0.4292
Training process took 61.24358129501343 seconds


In [8]:
# Train the model
b_start_time = time.time()
#Change this depending on model
model = BinaryBlinkNet().to(device)

# Define the loss function and optimizer
binary_criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(model.parameters(), lr=0.0000778)

binary_train_losses, binary_train_val_losses, binary_train_accs, binary_train_val_accs,binary_BVE,binary_mem_usage = train(train_loader, test_loader, model, binary_criterion, binary_optimizer, num_epochs, file_name='BinaryBlinkNet.pth')
binary_train_acc = get_accuracy(model, train_loader)
binary_test_accuracy = get_accuracy(model, test_loader)
b_end_time = time.time()
print(f'Test accuracy: {binary_test_accuracy:.4f}')
print(f'Train accuracy: {binary_train_acc:.4f}')
print(f"Training process took {b_end_time - b_start_time} seconds")

Epoch 1/1, Training Loss: 1.2008, Training Accuracy: 0.1382, Validation Loss: 0.5814, Validation Accuracy: 0.2204, Epoch Memory Usage: 4672.5337 MB
Test accuracy: 0.2204
Train accuracy: 0.2050
Training process took 59.75531601905823 seconds


In [11]:
# Train the model
b_start_time = time.time()
#Change this depending on model
model = XnorNet().to(device)
pretrained_model = torch.load("nin.best.pth.tar")
best_acc = pretrained_model['best_acc']
model.load_state_dict(pretrained_model['state_dict'])

# Define the loss function and optimizer
binary_criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(model.parameters(), lr=0.0000778)

binary_train_losses, binary_train_val_losses, binary_train_accs, binary_train_val_accs,binary_BVE,binary_mem_usage = train(train_loader, test_loader, model, binary_criterion, binary_optimizer, num_epochs, file_name='XNORkNet.pth')
binary_train_acc = get_accuracy(model, train_loader)
binary_test_accuracy = get_accuracy(model, test_loader)
b_end_time = time.time()
print(f'Test accuracy: {binary_test_accuracy:.4f}')
print(f'Train accuracy: {binary_train_acc:.4f}')
print(f"Training process took {b_end_time - b_start_time} seconds")

Epoch 1/1, Training Loss: 0.0154, Training Accuracy: 0.4839, Validation Loss: 0.0087, Validation Accuracy: 0.6737, Epoch Memory Usage: 8321.7397 MB
Test accuracy: 0.6737
Train accuracy: 0.5640
Training process took 66.16460490226746 seconds


In [12]:
# Train the model
b_start_time = time.time()
#Change this depending on model
model = VGG_Cifar10().to(device)

# Define the loss function and optimizer
binary_criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(model.parameters(), lr=0.0000778)

binary_train_losses, binary_train_val_losses, binary_train_accs, binary_train_val_accs,binary_BVE,binary_mem_usage = train(train_loader, test_loader, model, binary_criterion, binary_optimizer, num_epochs, file_name='BNNVGG.pth')
binary_train_acc = get_accuracy(model, train_loader)
binary_test_accuracy = get_accuracy(model, test_loader)
b_end_time = time.time()
print(f'Test accuracy: {binary_test_accuracy:.4f}')
print(f'Train accuracy: {binary_train_acc:.4f}')
print(f"Training process took {b_end_time - b_start_time} seconds")

Epoch 1/1, Training Loss: 0.0577, Training Accuracy: 0.0011, Validation Loss: 0.0586, Validation Accuracy: 0.0006, Epoch Memory Usage: 283589.7031 MB
Test accuracy: 0.0006
Train accuracy: 0.0010
Training process took 110.6307201385498 seconds


Below cell is for checking memory usage. We tried out multiple methods but max_memory_allocated() seemed the most reliable and accurate. Makes use of model trained above. If you just want to find memory usage without extensive model training, make sure to run model for at least a couple epochs to initialize everything.

In [7]:
import torch
import torchvision
import numpy as np

dummy_input = torch.randn(1, 3, 32, 32).to(device)

starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
#Repetitions used for official stats is 500
repetitions = 1
timings=np.zeros((repetitions,1))
memory_usage = np.zeros(repetitions)
# GPU warm-up
print("g1")
for _ in range(10):
    _ = model(dummy_input)

# Load CIFAR-10 dataset
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True, num_workers=2)
print("H2")
# Measure performance
with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        for data in dataloader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            #flops = FlopCountAnalysis(model, inputs)
        ender.record()
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time
        curr_mem = torch.cuda.max_memory_allocated()
        memory_usage[rep] = curr_mem / 1024 / 1024  # convert bytes to megabytes
        print(rep, memory_usage[rep], curr_time)

mean_time = np.sum(timings) / repetitions
std_time = np.std(timings)
mean_memory = np.sum(memory_usage) / repetitions
std_memory = np.std(memory_usage)

#Memory changes based on GPU instance
print("Average time per batch: {:.3f} ms".format(mean_time))
print("Average memory usage: {:.3f} MB".format(mean_memory))

g1
Files already downloaded and verified
H2
0 386.86279296875 2328.977294921875
Average time per batch: 2328.977 ms
Average memory usage: 386.863 MB


In [17]:
#Save to CSV
mem_usage_resnet=np.array(memory_usage)
np.savetxt('mem_usage.csv', mem_usage_resnet, delimiter=',')
time_usage_resnet=np.array(timings)
np.savetxt('time_usage.csv', timings, delimiter=',')