In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from rigl_torch.RigL import RigLScheduler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

In [4]:
def train(args, model, device, train_loader, optimizer, epoch, pruner):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
    
        if pruner():
            optimizer.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


In [5]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss, correct / len(test_loader.dataset)

In [6]:
def ed(param_name, default=None):
    return os.environ.get(param_name, default)

In [7]:
class Args:
    dense_allocation = 0.3 #'percentage of dense parameters allowed. if None, pruning will not be used. must be on the interval (0, 1]')
    delta = ed('DELTA', 100)
    grad_accumulation_n = ed('GRAD_ACCUMULATION_N', 1)
    alpha = ed('ALPHA', 0.3)
    static_topo = 1
    batch_size = ed('BATCH_SIZE', 64)
    test_batch_size = ed('TEST_BATCH_SIZE', 1000)
    epochs = 100
    lr = ed('LR', 1)
    gamma = ed('GAMMA', 0.7)
    no_cuda= False
    dry_run=False,
    seed=1
    log_interval=10
    save_model=1

args = Args()

In [8]:
if args.dense_allocation is None:
    print('-------------------------------------------------------------------')
    print('heads up, RigL will not be used unless `--dense-allocation` is set!')
    print('-------------------------------------------------------------------')
    

use_cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")

train_kwargs = {'batch_size': args.batch_size}
test_kwargs = {'batch_size': args.test_batch_size}
if use_cuda:
    cuda_kwargs = {'num_workers': 1,
                    'pin_memory': True,
                    'shuffle': True}
    train_kwargs.update(cuda_kwargs) 
    test_kwargs.update(cuda_kwargs)

transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])
dataset1 = datasets.MNIST('../data', train=True, download=True,
                    transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
                    transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

model = Net().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 40631158.70it/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 56605464.40it/s]

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 10484946.07it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 5507524.94it/s]


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw



In [9]:
pruner = lambda: True
if args.dense_allocation is not None:
    T_end = int(0.75 * args.epochs * len(train_loader))
    pruner = RigLScheduler(model, optimizer, dense_allocation=args.dense_allocation, alpha=args.alpha, delta=args.delta, static_topo=args.static_topo, T_end=T_end, ignore_linear_layers=False, grad_accumulation_n=args.grad_accumulation_n)

writer = SummaryWriter(log_dir='./graphs')

print(model)
for epoch in range(1, args.epochs + 1):
    print(pruner)
    train(args, model, device, train_loader, optimizer, epoch, pruner=pruner)
    loss, acc = test(model, device, test_loader)
    scheduler.step()

    writer.add_scalar('loss', loss, epoch)
    writer.add_scalar('accuracy', acc, epoch)

if args.save_model:
    torch.save(model.state_dict(), "mnist_cnn.pt")

Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=9216, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)
RigLScheduler(
layers=4,
nonzero_params=[288/288, 5530/18432, 353895/1179648, 384/1280],
nonzero_percentages=[100.00%, 30.00%, 30.00%, 30.00%],
total_nonzero_params=360097/1199648 (30.02%),
total_CONV_nonzero_params=5818/18720 (31.08%),
step=0,
num_rigl_steps=0,
ignoring_linear_layers=False,
sparsity_distribution=uniform,
)

Test set: Average loss: 2.2922, Accuracy: 958/10000 (10%)

RigLScheduler(
layers=4,
nonzero_params=[288/288, 5530/18432, 353895/1179648, 384/1280],
nonzero_percentages=[100.00%, 30.00%, 30.00%, 30.00%],
total_nonzero_params=360097/1199648 (30.02%),
total_CONV_nonzero_params=5818/18720 (31.08%),
step=1,
num_rigl_steps=0,
ignoring_l




Test set: Average loss: 2.2828, Accuracy: 3542/10000 (35%)

RigLScheduler(
layers=4,
nonzero_params=[288/288, 5530/18432, 353895/1179648, 384/1280],
nonzero_percentages=[100.00%, 30.00%, 30.00%, 30.00%],
total_nonzero_params=360097/1199648 (30.02%),
total_CONV_nonzero_params=5818/18720 (31.08%),
step=2,
num_rigl_steps=0,
ignoring_linear_layers=False,
sparsity_distribution=uniform,
)

Test set: Average loss: 2.2689, Accuracy: 3912/10000 (39%)

RigLScheduler(
layers=4,
nonzero_params=[288/288, 5530/18432, 353895/1179648, 384/1280],
nonzero_percentages=[100.00%, 30.00%, 30.00%, 30.00%],
total_nonzero_params=360097/1199648 (30.02%),
total_CONV_nonzero_params=5818/18720 (31.08%),
step=3,
num_rigl_steps=0,
ignoring_linear_layers=False,
sparsity_distribution=uniform,
)

Test set: Average loss: 2.2566, Accuracy: 3007/10000 (30%)

RigLScheduler(
layers=4,
nonzero_params=[288/288, 5530/18432, 353895/1179648, 384/1280],
nonzero_percentages=[100.00%, 30.00%, 30.00%, 30.00%],
total_nonzero_params=

In [10]:
def get_abs_sps(model):
    nonzero = total = 0
    # print(f"TYPE: {type(model)}")

    for name, param in model.named_parameters():
        # print(name)
        tensor = param.detach().clone()
        # nz_count.append(torch.count_nonzero(tensor))
        nz_count = torch.count_nonzero(tensor).item()
        total_params = tensor.numel()
        nonzero += nz_count
        total += total_params
    
    # print(f"TOTAL: {total}")
    abs_sps = 100 * (total-nonzero) / total

    return abs_sps, total, (total-nonzero)

In [11]:
get_abs_sps(model)

(69.96946366392696, 1199882, 839551)