In [None]:
# NORTH setup
# from pytorch.neurops import *

# import copy
# import numpy as np
# from torchvision import datasets, transforms

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### All the following cells are runable

In [1]:
# Our setup
import torch
from torch.functional import F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import copy
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

from sad_nns.uncertainty import *
from neurops import *

In [2]:
model = ModSequential(
        ModConv2d(in_channels=1, out_channels=8, kernel_size=7, masked=True, padding=1, learnable_mask=True),
        ModConv2d(in_channels=8, out_channels=16, kernel_size=7, masked=True, padding=1, prebatchnorm=True, learnable_mask=True),
        ModConv2d(in_channels=16, out_channels=16, kernel_size=5, masked=True, prebatchnorm=True, learnable_mask=True),
        ModLinear(64, 32, masked=True, prebatchnorm=True, learnable_mask=True),
        ModLinear(32, 10, masked=True, prebatchnorm=True, nonlinearity=""),
        track_activations=True,
        track_auxiliary_gradients=True,
        input_shape = (1, 14, 14)
    ).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

print("This model has {} effective parameters.".format(model.parameter_count(masked = True)))
print("The conversion factor of this model is {} after layer {}.".format(model.conversion_factor, model.conversion_layer))

This model has 15634 effective parameters.
The conversion factor of this model is 4 after layer 2.


In [3]:
dataset = datasets.MNIST('../data', train=True, download=True,
                     transform=transforms.Compose([ 
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307,), (0.3081,)),
                            transforms.Resize((14,14))
                        ]))
train_set, val_set = torch.utils.data.random_split(dataset, lengths=[int(0.9*len(dataset)), int(0.1*len(dataset))])
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307,), (0.3081,)),
                            transforms.Resize((14,14))
                        ])),
    batch_size=128, shuffle=True)

def train(model, train_loader, optimizer, criterion, epochs=10, val_loader=None, verbose=True):
    model.train()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0 and verbose:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
        if val_loader is not None:
            print("Validation: ", end = "")
            test(model, val_loader, criterion)

def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    
    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [4]:
train(model, train_loader, optimizer, criterion, epochs=5, val_loader=val_loader)





Validation: Average loss: 0.0021, Accuracy: 5691/6000 (94.85%)


In [6]:
modded_model = copy.deepcopy(model)
modded_optimizer = torch.optim.SGD(modded_model.parameters(), lr=0.01)
modded_optimizer.load_state_dict(optimizer.state_dict())

for i in range(len(model)-1):
    scores = weight_sum(modded_model[i].weight)
    # scores = weight_sum(modded_model[i].weight) +  weight_sum(modded_model[i+1].weight, fanin=False, conversion_factor=model.conversion_factor if i == model.conversion_layer else -1)
    # scores = activation_variance(modded_model.activations[str(i)])
    # scores = svd_score(modded_model.activations[str(i)])
    # scores = nuclear_score(modded_model.activations[str(i)], average=i<3)
    # scores = modded_model[i+1].batchnorm.weight.abs() if i != modded_model.conversion_layer else modded_model[i+1].batchnorm.weight.abs().reshape(modded_model.conversion_factor,-1).sum(0) 
    # Before trying this line, run the following block: # scores = fisher_info(mask_grads[i])
    print("Layer {} scores: mean {:.3g}, std {:.3g}, min {:.3g}, smallest 25%:".format(i, scores.mean(), scores.std(), scores.min()), end=" ")
    to_prune = np.argsort(scores.detach().cpu().numpy())[:int(0.25*len(scores))]
    print(to_prune)
    modded_model.prune(i, to_prune, optimizer=modded_optimizer)
print("The pruned model has {} effective parameters.".format(modded_model.parameter_count(masked = True)))
print("Validation after pruning: ", end = "")
test(modded_model, val_loader, criterion)
train(modded_model, train_loader, modded_optimizer, criterion, epochs=2, val_loader=val_loader)

Layer 0 scores: mean 3.93, std 0.338, min 3.29, smallest 25%: [1 6]
Layer 1 scores: mean 7.86, std 0.222, min 7.54, smallest 25%: [15 10  8  4]
Layer 2 scores: mean 7.91, std 0.343, min 7.56, smallest 25%: [ 3  9  4 10]
Layer 3 scores: mean 3.1, std 0.211, min 2.72, smallest 25%: [ 1 22 17 26 16  5 15 11]
The pruned model has 9058 effective parameters.
Validation after pruning: Average loss: 0.0081, Accuracy: 4045/6000 (67.42%)
Validation: Average loss: 0.0013, Accuracy: 5817/6000 (96.95%)
Validation: Average loss: 0.0007, Accuracy: 5823/6000 (97.05%)


In [6]:
def collect_mask_grads(model):
    mask_grads = []
    for i in range(len(model.activations)-1):
        mask_grads.append(torch.empty(0, *model[i].mask_vector.shape))
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        for i in range(len(model)-1):
            mask_grads[i] = torch.cat([mask_grads[i], model[i].mask_vector.grad.detach().cpu().unsqueeze(0)])
    return mask_grads

#mask_grads = collect_mask_grads(model)

In [7]:
modded_model_iterative = copy.deepcopy(model)
modded_optimizer_iterative = torch.optim.SGD(modded_model_iterative.parameters(), lr=0.01)
modded_optimizer_iterative.load_state_dict(optimizer.state_dict())

for iter in range(5):
    for i in range(len(modded_model_iterative)-1):
        scores = weight_sum(modded_model_iterative[i].weight)
        # scores = weight_sum(modded_model_iterative[i].weight) +  weight_sum(modded_model_iterative[i+1].weight, fanin=False, conversion_factor=modded_model_iterative.conversion_factor if i == modded_model_iterative.conversion_layer else -1)
        # scores = activation_variance(modded_model_iterative.activations[str(i)])
        # scores = svd_score(modded_model_iterative.activations[str(i)])
        # scores = nuclear_score(modded_model_iterative.activations[str(i)], average=i<3)
        # scores = modded_model_iterative[i+1].batchnorm.weight.abs() if i != modded_model_iterative.conversion_layer else modded_model_iterative[i+1].batchnorm.weight.abs().reshape(modded_model_iterative.conversion_factor,-1).sum(0) 
        print("Layer {} scores: mean {:.3g}, std {:.3g}, min {:.3g}, smallest 15%:".format(i, scores.mean(), scores.std(), scores.min()), end=" ")
        to_prune = np.argsort(scores.cpu().detach().numpy())[:int(0.15*len(scores))]
        print(to_prune)
        modded_model_iterative.prune(i, to_prune, optimizer=modded_optimizer_iterative, clear_activations=True)
    print("The pruned model now has {} effective parameters.".format(modded_model_iterative.parameter_count(masked = True)))
    print("Validation after pruning: ", end = "")
    test(modded_model_iterative, val_loader, criterion)
    train(modded_model_iterative, train_loader, modded_optimizer_iterative, criterion, epochs=2, val_loader=val_loader)

Layer 0 scores: mean 3.87, std 0.285, min 3.57, smallest 15%: [1]
Layer 1 scores: mean 9.2, std 0.308, min 8.52, smallest 15%: [5 6]
Layer 2 scores: mean 9.33, std 0.425, min 8.68, smallest 15%: [6 5]
Layer 3 scores: mean 3.65, std 0.229, min 3.04, smallest 15%: [ 3 25 16 13]
The pruned model now has 12176 effective parameters.
Validation after pruning: Average loss: 0.0014, Accuracy: 5688/6000 (94.80%)
Validation: Average loss: 0.0010, Accuracy: 5856/6000 (97.60%)
Validation: Average loss: 0.0006, Accuracy: 5869/6000 (97.82%)
Layer 0 scores: mean 4.06, std 0.258, min 3.81, smallest 15%: [2]
Layer 1 scores: mean 8.12, std 0.35, min 7.51, smallest 15%: [7 6]
Layer 2 scores: mean 8.36, std 0.43, min 7.54, smallest 15%: [10 13]
Layer 3 scores: mean 3.22, std 0.166, min 2.82, smallest 15%: [ 6 17 24 13]
The pruned model now has 9058 effective parameters.
Validation after pruning: Average loss: 0.0025, Accuracy: 5463/6000 (91.05%)
Validation: Average loss: 0.0009, Accuracy: 5864/6000 (97.73

In [5]:
modded_model_grow = copy.deepcopy(model)
modded_optimizer_grow = torch.optim.SGD(modded_model_grow.parameters(), lr=0.01)
modded_optimizer_grow.load_state_dict(optimizer.state_dict())

for iter in range(5):
    for i in range(len(modded_model_grow)-1):
        #score = orthogonality_gap(modded_model_grow.activations[str(i)])
        max_rank = modded_model_grow[i].width()
        score = effective_rank(modded_model_grow.activations[str(i)])
        to_add = max(score-int(0.95*max_rank), 0)
        print("Layer {} score: {}/{}, neurons to add: {}".format(i, score, max_rank, to_add))
        modded_model_grow.grow(i, to_add, fanin_weights="iterative_orthogonalization", 
                               optimizer=modded_optimizer_grow)
    print("The grown model now has {} effective parameters.".format(modded_model_grow.parameter_count(masked = True)))
    print("Validation after growing: ", end = "")
    test(modded_model_grow, val_loader, criterion)
    train(modded_model_grow, train_loader, modded_optimizer_grow, criterion, epochs=2, val_loader=val_loader)

Layer 0 score: 8/8, neurons to add: 1
Layer 1 score: 16/16, neurons to add: 1
Layer 2 score: 16/16, neurons to add: 1
Layer 3 score: 32/32, neurons to add: 2
The grown model now has 16731 effective parameters.
Validation after growing: Average loss: 0.0021, Accuracy: 5691/6000 (94.85%)
Validation: Average loss: 0.0012, Accuracy: 5810/6000 (96.83%)
Validation: Average loss: 0.0007, Accuracy: 5850/6000 (97.50%)
Layer 0 score: 9/9, neurons to add: 1
Layer 1 score: 17/17, neurons to add: 1
Layer 2 score: 17/17, neurons to add: 1
Layer 3 score: 34/34, neurons to add: 2
The grown model now has 19217 effective parameters.
Validation after growing: Average loss: 0.0007, Accuracy: 5850/6000 (97.50%)
Validation: Average loss: 0.0008, Accuracy: 5872/6000 (97.87%)
Validation: Average loss: 0.0006, Accuracy: 5858/6000 (97.63%)
Layer 0 score: 10/10, neurons to add: 1
Layer 1 score: 18/18, neurons to add: 1
Layer 2 score: 18/18, neurons to add: 1
Layer 3 score: 36/36, neurons to add: 2
The grown mode

In [9]:
modded_model_masked = copy.deepcopy(model)
modded_optimizer_masked = torch.optim.SGD(modded_model_masked.parameters(), lr=0.01)
modded_optimizer_masked.load_state_dict(optimizer.state_dict())

for i in range(len(modded_model_masked)-1):
    neurons = modded_model_masked[i].width()
    modded_model_masked.grow(i, neurons, fanin_weights="kaiming", fanout_weights="kaiming", optimizer=modded_optimizer_masked)
    modded_model_masked.mask(i, list(range(neurons, 2*neurons)))

for iter in range(5):
    for i in range(len(modded_model_masked)-1):
        scores = weight_sum(modded_model_masked[i].get_weights())
        print("Layer {} scores: mean {:.3g}, std {:.3g}, min {:.3g}, smallest 25% to mask:".format(i, scores[scores != 0].mean(), scores[scores != 0].std(), scores[scores != 0].min()), end=" ")
        to_mask = np.argsort(scores.detach().numpy())[sum(scores == 0):sum(scores == 0)+int(0.25*sum(scores != 0))]
        print(to_mask, end=", ")
        modded_model_masked.mask(i, to_mask)
        to_unmask = np.argsort(scores.detach().numpy())[:sum(scores == 0)]
        to_unmask = np.random.choice(to_unmask, size=len(to_mask), replace=False)
        print("random neurons to unmask:", to_unmask)
        modded_model_masked.unmask(i, to_unmask, optimizer=modded_optimizer_masked)
    print("The masked model now has {} effective parameters.".format(modded_model_masked.parameter_count(masked = True)))
    print("Validation after growing: ", end = "")
    test(modded_model_masked, val_loader, criterion)
    train(modded_model_masked, train_loader, modded_optimizer_masked, criterion, epochs=2, val_loader=val_loader, verbose=False)

Layer 0 scores: mean 3.87, std 0.285, min 3.57, smallest 25% to mask: [1 4], random neurons to unmask: [14 13]
Layer 1 scores: mean 17.7, std 0.428, min 16.4, smallest 25% to mask: [ 5  1 13  6], random neurons to unmask: [28 18 17 26]
Layer 2 scores: mean 17.8, std 0.615, min 16.8, smallest 25% to mask: [6 7 5 8], random neurons to unmask: [17 18 23 26]
Layer 3 scores: mean 6.95, std 0.318, min 6.16, smallest 25% to mask: [ 3 25 16 20 14 19  5  9], random neurons to unmask: [63 38 58 48 50 56 47 36]
The masked model now has 28358 effective parameters.
Validation after growing: Average loss: 0.0130, Accuracy: 2576/6000 (42.93%)
Validation: Average loss: 0.0011, Accuracy: 5828/6000 (97.13%)
Validation: Average loss: 0.0007, Accuracy: 5848/6000 (97.47%)
Layer 0 scores: mean 3.88, std 0.428, min 3.06, smallest 25% to mask: [14 13], random neurons to unmask: [ 1 12]
Layer 1 scores: mean 17, std 1.88, min 13.4, smallest 25% to mask: [28 26 17 18], random neurons to unmask: [30 22 21  6]
Lay

In [1]:
modded_model_grow.activations

NameError: name 'modded_model_grow' is not defined