In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F

In [2]:
# Model definition
class CIFAR10_MLP(nn.Module):
    """Fully connected classifier operating on flattened inputs.

    Every hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` layer maps the last hidden width to ``num_classes`` logits.

    Args:
        input_dim: Number of features per sample after flattening
            (default ``3*32*32``).
        hidden_dims: Iterable with the width of each hidden layer, in order.
        num_classes: Number of output logits.
        dropout: Drop probability handed to every ``nn.Dropout`` layer.
    """

    # The default for ``hidden_dims`` is a tuple so that no mutable object is
    # shared between calls; any iterable of ints (including a list) is accepted.
    def __init__(self, input_dim=3*32*32, hidden_dims=(2048, 1024, 512, 256), num_classes=10, dropout=0.5):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for hdim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hdim))
            layers.append(nn.BatchNorm1d(hdim))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(dropout))
            prev_dim = hdim
        # Classification head: last hidden width -> logits.
        layers.append(nn.Linear(prev_dim, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten everything but the leading dimension and return the logits.

        ``reshape`` is used instead of ``view`` so that non-contiguous inputs
        (for example permuted batches) are accepted; for contiguous inputs the
        result is the same view as before.
        """
        x = x.reshape(x.size(0), -1)
        return self.net(x)

In [3]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Both splits get the same preprocessing: convert to a tensor, then apply
# per-channel normalization with the mean / std triples below.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.247, 0.243, 0.261)),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.247, 0.243, 0.261)),
])

# CIFAR-10 is fetched into ./data on first use (download=True).
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform,
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=test_transform,
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=100,
    shuffle=False,
    num_workers=4,
)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:08<00:00, 19.3MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [4]:
# Instantiate the MLP with its default sizes on `device`, then compile it
# with TorchScript; `model` refers to the scripted module from here on.
model = torch.jit.script(CIFAR10_MLP().to(device))

# Adam with its default hyperparameters over the scripted model's parameters,
# paired with a cross-entropy objective.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [5]:
from utils import train

# Train the scripted MLP with the project's `train` helper.
# NOTE(review): `train` lives in utils and is not visible here. The `None` in
# the third position is presumably the (absent) validation loader and `30`
# presumably the epoch count -- confirm against utils.train's signature.
# The helper returns three values, bound here as train/val/test metrics.
train_metrics, val_metrics, test_metrics = train(model, train_loader, None, test_loader, 30, optimizer, criterion)


Epoch: 1 Total_Time: 2.0300 Average_Time_per_batch: 0.0052 Train_Accuracy: 0.3367 Train_Loss: 1.8300 
Epoch: 2 Total_Time: 1.5213 Average_Time_per_batch: 0.0039 Train_Accuracy: 0.4234 Train_Loss: 1.6156 
Epoch: 3 Total_Time: 1.5940 Average_Time_per_batch: 0.0041 Train_Accuracy: 0.4572 Train_Loss: 1.5315 
Epoch: 4 Total_Time: 1.1509 Average_Time_per_batch: 0.0029 Train_Accuracy: 0.4774 Train_Loss: 1.4716 
Epoch: 5 Total_Time: 1.3900 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.4919 Train_Loss: 1.4303 
Epoch: 6 Total_Time: 1.3899 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.5060 Train_Loss: 1.3938 
Epoch: 7 Total_Time: 1.4106 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.5193 Train_Loss: 1.3605 
Epoch: 8 Total_Time: 1.4291 Average_Time_per_batch: 0.0037 Train_Accuracy: 0.5265 Train_Loss: 1.3329 
Epoch: 9 Total_Time: 1.5155 Average_Time_per_batch: 0.0039 Train_Accuracy: 0.5387 Train_Loss: 1.3030 
Epoch: 10 Total_Time: 1.5561 Average_Time_per_batch: 0.0040 Train_Accuracy: 0.549

In [6]:
from dpn_3.dpn import DPN as DPN_3

# Build the DPN on the same `device` the rest of the notebook uses, so the
# cell also runs on hosts without CUDA (a hard-coded .cuda() would raise there).
# NOTE(review): the positional arguments are presumably
# (input features, hidden size, number of classes, flag) -- confirm against
# dpn_3.dpn.DPN.
model_3 = DPN_3(3*32*32, 100, 10, True).to(device)

In [7]:
# Trace the DPN with an example batch created directly on `device`, so the
# cell does not require CUDA. The example uses 128 rows of 3*32*32 features,
# i.e. the training loader's batch size and the flattened image size.
# NOTE(review): tracing records the operations run on this one example; if the
# model's forward depends on batch size or on train/eval mode, verify that the
# traced module still behaves correctly for other batch sizes.
model_3 = torch.jit.trace(model_3, torch.randn(128, 3*32*32, device=device))

# Fresh optimizer and loss bound to the traced model's parameters.
optimizer = optim.Adam(model_3.parameters())
criterion = nn.CrossEntropyLoss()

In [8]:
# Train `model_3` with the same `train` helper and loaders used earlier.
# NOTE(review): as above, `None` is presumably the validation loader slot and
# `30` the epoch count -- confirm against utils.train.
train_metrics_3, val_metrics_3, test_metrics_3 = train(model_3, train_loader, None, test_loader, 30, optimizer, criterion)


Epoch: 1 Total_Time: 0.9108 Average_Time_per_batch: 0.0023 Train_Accuracy: 0.3902 Train_Loss: 1.7770 
Epoch: 2 Total_Time: 0.7040 Average_Time_per_batch: 0.0018 Train_Accuracy: 0.4514 Train_Loss: 1.6069 
Epoch: 3 Total_Time: 0.7692 Average_Time_per_batch: 0.0020 Train_Accuracy: 0.4854 Train_Loss: 1.5102 
Epoch: 4 Total_Time: 0.5112 Average_Time_per_batch: 0.0013 Train_Accuracy: 0.5008 Train_Loss: 1.4568 
Epoch: 5 Total_Time: 0.6651 Average_Time_per_batch: 0.0017 Train_Accuracy: 0.5215 Train_Loss: 1.3953 
Epoch: 6 Total_Time: 0.7553 Average_Time_per_batch: 0.0019 Train_Accuracy: 0.5359 Train_Loss: 1.3546 
Epoch: 7 Total_Time: 0.6576 Average_Time_per_batch: 0.0017 Train_Accuracy: 0.5438 Train_Loss: 1.3270 
Epoch: 8 Total_Time: 0.7431 Average_Time_per_batch: 0.0019 Train_Accuracy: 0.5603 Train_Loss: 1.2791 
Epoch: 9 Total_Time: 0.7719 Average_Time_per_batch: 0.0020 Train_Accuracy: 0.5730 Train_Loss: 1.2516 
Epoch: 10 Total_Time: 0.5528 Average_Time_per_batch: 0.0014 Train_Accuracy: 0.582

In [9]:
import torch
from torch import nn

hidden_dims = [2048, 1024, 512, 256]
total = sum(hidden_dims)

blocks = len(hidden_dims)
features = 3 * 32 * 32

# One randomly initialised weight matrix per hidden block. Block k has
# hidden_dims[k] rows and (3*32*32 + sum(hidden_dims[:k])) columns, because
# `features` grows by the block width after every iteration.
# NOTE(review): presumably each block consumes the raw input plus the outputs
# of all earlier blocks -- confirm against the DPN forward pass.
neural_blocks = []
for dim in hidden_dims:
    # Scale unit-normal samples by 1/sqrt(number of input columns).
    std_dev = torch.sqrt(torch.tensor(1 / features)).to(device)
    neural_blocks.append(torch.randn(dim, features).to(device) * std_dev)
    features += dim

# Regroup the weights by input-column range instead of by output block.
# feature_blocks[i] stacks, along dim 0, the columns
# [features_start, features_end) of every neural block j >= i, where
# features_end is the column count of neural block i. Its shape is therefore
# (sum(hidden_dims[i:]), features_end - features_start).
feature_blocks = []
features_start = 0
for i in range(len(neural_blocks)):
    features_end = neural_blocks[i].shape[1]
    # A single concatenation replaces repeated pairwise torch.cat calls and
    # always yields a fresh contiguous tensor, so no parameter aliases the
    # storage of `neural_blocks`.
    block = torch.cat(
        [neural_blocks[j][:, features_start:features_end] for j in range(i, len(neural_blocks))],
        dim=0,
    )
    feature_blocks.append(nn.Parameter(block))
    features_start = features_end

# Allocate the bias values on `device` *before* wrapping them in nn.Parameter.
# Calling .to(device) on an existing Parameter returns a plain, non-leaf
# Tensor whenever a device copy takes place, which would silently stop the
# biases from being a trainable parameter on CUDA.
biases = nn.Parameter(torch.empty(total, device=device).uniform_(0.0, 1.0))

In [10]:
# NOTE: this imports the DPN class from the `dpn_2` package but keeps the
# alias `DPN_3`, rebinding that name for the rest of the session.
from dpn_2.dpn import DPN as DPN_3

# Place the model on the notebook's `device` instead of hard-coding CUDA, so
# the cell also runs on CPU-only hosts.
# NOTE(review): the positional arguments are presumably
# (input features, hidden size, number of classes, flag) -- confirm against
# dpn_2.dpn.DPN.
model_3 = DPN_3(3*32*32, 500, 10, True).to(device)

# Append the hand-built weight groups to the model's existing `weights`
# container and replace its `biases` attribute with the hand-built tensor.
# NOTE(review): whether these end up in model_3.parameters() depends on how
# DPN stores `weights` / `biases` -- verify before relying on them being trained.
model_3.weights.extend(feature_blocks)
model_3.biases = biases

In [11]:
# JIT tracing is disabled for this model (line kept for reference), so it is
# trained as a regular eager-mode module:
#model_3 = torch.jit.trace(model_3, torch.randn(128, 3*32*32).cuda())
# Create the optimizer only now, after the model's weights/biases were edited,
# so that it is built from the model's current parameter list.
optimizer = optim.Adam(model_3.parameters())
criterion = nn.CrossEntropyLoss()

In [12]:
from utils import train
# Train the current `model_3` with the shared `train` helper.
# NOTE(review): `None` is presumably the validation loader slot and `50` the
# epoch count -- confirm against utils.train.
# This rebinds train_metrics_3 / val_metrics_3 / test_metrics_3, replacing the
# results of any earlier run stored under those names.
train_metrics_3, val_metrics_3, test_metrics_3 = train(model_3, train_loader, None, test_loader, 50, optimizer, criterion)


Epoch: 1 Total_Time: 1.3111 Average_Time_per_batch: 0.0034 Train_Accuracy: 0.3866 Train_Loss: 2.1423 
Epoch: 2 Total_Time: 1.3928 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.4515 Train_Loss: 1.7034 
Epoch: 3 Total_Time: 1.5970 Average_Time_per_batch: 0.0041 Train_Accuracy: 0.4809 Train_Loss: 1.6308 
Epoch: 4 Total_Time: 1.6079 Average_Time_per_batch: 0.0041 Train_Accuracy: 0.5034 Train_Loss: 1.5838 
Epoch: 5 Total_Time: 1.4312 Average_Time_per_batch: 0.0037 Train_Accuracy: 0.5257 Train_Loss: 1.5342 
Epoch: 6 Total_Time: 1.3885 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.5498 Train_Loss: 1.4823 
Epoch: 7 Total_Time: 1.3523 Average_Time_per_batch: 0.0035 Train_Accuracy: 0.5723 Train_Loss: 1.4059 
Epoch: 8 Total_Time: 1.4013 Average_Time_per_batch: 0.0036 Train_Accuracy: 0.5877 Train_Loss: 1.4131 
Epoch: 9 Total_Time: 1.4345 Average_Time_per_batch: 0.0037 Train_Accuracy: 0.6071 Train_Loss: 1.4018 
Epoch: 10 Total_Time: 1.2307 Average_Time_per_batch: 0.0031 Train_Accuracy: 0.620