In [3]:
import itertools
import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import os
import time
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import IterableDataset, DataLoader

import random

import copy
import math
import pickle
import argparse

from torch.optim.lr_scheduler import StepLR

from utils import PartialDataset, validation_split

from mup_nets import ReLU_FC_net

from data_loaders import get_cifar10_loaders, get_cifar100_loaders

def imshow(img):
    """Display an image tensor with matplotlib.

    The values are first mapped through ``img / 2 + 0.5`` (the "unnormalize"
    step), then axis 0 of the 3-D array is moved to the last position before
    the array is handed to ``plt.imshow``.

    Args:
        img: a 3-D torch tensor. It may live on any device and may be part of
            an autograd graph; it is detached and copied to the CPU before
            the NumPy conversion.
    """
    img = img / 2 + 0.5     # unnormalize
    # Tensor.numpy() refuses CUDA tensors and tensors that require grad, so
    # detach and move to host memory first (both are no-ops for plain CPU tensors).
    npimg = img.detach().cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

def test_accuracy():
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            images = images.to(device)
            labels = labels.to(device)
            # calculate outputs by running images through the network
            outputs = net(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy of the network on the {total} test images: {100 * correct // total} %')
    return 100 * correct / total


def fully_connected_net(base_net=None):
    """Build a new ``ReLU_FC_net`` and a plain SGD optimizer for it.

    All sizes and hyperparameters (``input_length``, ``num_layers``, ``width``,
    ``output_width``, ``weight_std``, ``bias_std``, ``lr``, ``weight_decay``,
    ``device``) are read from notebook-level globals at call time.

    Args:
        base_net: accepted but not used by this function.

    Returns:
        tuple: ``(net, optimizer)`` where ``net`` has been moved to ``device``
        and ``optimizer`` is ``optim.SGD`` over ``net.parameters()``.
    """
    model = ReLU_FC_net(
        input_length,
        num_layers,
        width,
        output_width=output_width,
        weight_std=weight_std,
        bias_std=bias_std,
    )
    model = model.to(device)
    sgd = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    return model, sgd


def _run_config_entry(**extra):
    """Return the hyperparameter record shared by every saved entry, extended with ``extra``."""
    entry = {'input_length' : input_length,
             'num_layers' : num_layers,
             'width' : width,
             'output_length' : output_width,
             'weight_std' : weight_std,
             'bias_std' : bias_std,
             'dataset' : dataset,
             'batch_size' : batch_size,
             'lr' : lr}
    entry.update(extra)
    return entry


def run_test(identification=None):
    """Train the global ``net`` for ``num_epochs`` epochs and record the results.

    Everything except ``identification`` is read from notebook-level globals
    (``net``, ``optimizer``, ``criterion``, ``train_loader``, the interval
    settings, ``data_filename``, ``learned_nets`` and the hyperparameters).

    During training:
      * every ``test_interval`` iterations ``test_accuracy()`` is evaluated and
        ``(iteration, accuracy)`` is appended to the test history;
      * every ``train_interval`` batches within an epoch the mean loss is
        printed and ``(iterations completed, mean loss)`` is appended to the
        training history;
      * if ``save_interval > 0``, every ``save_interval`` iterations (including
        iteration 0) a deep copy of the net is stored in
        ``learned_nets[(identification, iteration)]``.

    After training, if ``data_filename`` is not None the run summary is
    appended to the pickled list stored in that file.

    Args:
        identification: key used for the ``learned_nets`` snapshots; required
            whenever ``save_interval > 0``.

    Returns:
        dict: the hyperparameter record plus ``'net'`` (a deep copy of the
        trained network). It is also stored in ``learned_nets`` when
        ``save_interval > 0``.

    Raises:
        AssertionError: if ``save_interval > 0`` and ``identification`` is None.
    """
    # Check up front so a missing identifier fails before any compute is spent.
    if save_interval > 0:
        assert identification is not None

    curr_training_iter = 0

    training_losses = []
    test_losses = []

    for epoch in range(num_epochs):

        print(f'Epoch {epoch}')
        running_loss = 0.0
        for i, data in enumerate(train_loader):

            if curr_training_iter % test_interval == 0:
                test_losses.append((curr_training_iter, test_accuracy()))
                print(test_losses[-1])

            if save_interval > 0 and curr_training_iter % save_interval == 0:
                learned_nets[(identification, curr_training_iter)] = \
                    _run_config_entry(net=copy.deepcopy(net))

            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % train_interval == train_interval-1:
                mean_loss = running_loss / train_interval
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {mean_loss:.3f}')
                # Previously this history was never filled, so an empty list was saved.
                training_losses.append((curr_training_iter + 1, mean_loss))
                running_loss = 0.0

            curr_training_iter += 1

    if data_filename is not None:
        curr_list = []
        if os.path.isfile(data_filename):
            # NOTE: pickle.load can execute arbitrary code; only point
            # data_filename at files produced by this notebook.
            with open(data_filename, 'rb') as history_file:
                curr_list = pickle.load(history_file)
        # 'num_layers' now records the real depth (it used to be hard-coded to 2).
        curr_list.append(_run_config_entry(training_losses=training_losses,
                                           test_losses=test_losses))
        with open(data_filename, 'wb') as history_file:
            pickle.dump(curr_list, history_file)

    # Always hand the final network back so callers never receive (and pickle) None.
    curr_entry = _run_config_entry(net=copy.deepcopy(net))
    if save_interval > 0:
        learned_nets[(identification, curr_training_iter)] = curr_entry

    return curr_entry

In [4]:
# Registry of network snapshots written by run_test, keyed by
# (identification, training iteration). Re-running this cell discards them.
learned_nets = dict()

In [6]:
# Model / optimisation hyperparameters. fully_connected_net and run_test read
# these as globals, so they must be set before either is called.
width = 100
lr = 0.1
batch_size=64
num_epochs = 40
weight_std=1
bias_std=weight_std
num_layers=10
weight_decay=0.00

# Logging / checkpoint cadence, all measured in training iterations (batches).
test_interval=1000
train_interval=100
save_interval=100000
data_filename=None

no_cuda = False
use_cuda = not no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


dataset = 'cifar10'
# dataset name -> (number of output classes, loader factory)
_dataset_specs = {
    'cifar100': (100, get_cifar100_loaders),
    'cifar10': (10, get_cifar10_loaders),
}
if dataset not in _dataset_specs:
    # An explicit error survives `python -O` and names the offending value.
    raise ValueError(f'unknown dataset {dataset!r}; expected one of {sorted(_dataset_specs)}')
output_width, _make_loaders = _dataset_specs[dataset]
# Same flattened input size for both datasets (presumably 3 x 32 x 32 images; verify against data_loaders).
input_length=3072
train_loader, test_loader = _make_loaders(batch_size=batch_size)

criterion = nn.CrossEntropyLoss()

0.035660011452152474
Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Sweep depth x width x repetition. The loop variables deliberately rebind the
# globals `num_layers`, `width`, `net` and `optimizer`, because
# fully_connected_net and run_test read them from the notebook namespace.
# NOTE: each file stores the whole pickled network object taken from run_test's
# return value, so loading it later needs the same class definitions importable.
net_folder = 'saved_nets/'
os.makedirs(net_folder, exist_ok=True)
for num_layers in [1,2,3,4,5]:
    for width in [200,400,600,800,1000]:
        for rep in range(5):
            identification = 'fc_cifar_{}_{}_{}'.format(num_layers,width,rep)
            curr_filename = net_folder + identification + '.pkl'
            print(num_layers,width,rep)
            # Resume support: a finished configuration already has its file.
            if os.path.isfile(curr_filename):
                continue
            net, optimizer = fully_connected_net()
            print(identification)
            final_net_entry = run_test(identification)
            # Write to a temporary name and move it into place, so an interrupted
            # run never leaves a truncated .pkl that the check above would skip.
            partial_filename = curr_filename + '.tmp'
            with open(partial_filename, 'wb') as out_file:
                pickle.dump(final_net_entry, out_file)
            os.replace(partial_filename, curr_filename)

1 200 0
1 200 1
fc_cifar_1_200_1
Epoch 0
Accuracy of the network on the 10000 test images: 10 %
(0, 10.0)
[1,   100] loss: 2.329
[1,   200] loss: 2.210
[1,   300] loss: 2.149
[1,   400] loss: 2.090
[1,   500] loss: 2.053
[1,   600] loss: 2.009
[1,   700] loss: 1.994
Epoch 1
[2,   100] loss: 1.939
[2,   200] loss: 1.931
Accuracy of the network on the 10000 test images: 32 %
(1000, 32.67)
[2,   300] loss: 1.927
[2,   400] loss: 1.899
[2,   500] loss: 1.869
[2,   600] loss: 1.878
[2,   700] loss: 1.866
Epoch 2
[3,   100] loss: 1.834
[3,   200] loss: 1.836
[3,   300] loss: 1.825
[3,   400] loss: 1.813
Accuracy of the network on the 10000 test images: 36 %
(2000, 36.15)
[3,   500] loss: 1.828
[3,   600] loss: 1.811
[3,   700] loss: 1.806
Epoch 3
[4,   100] loss: 1.797
[4,   200] loss: 1.791
[4,   300] loss: 1.772
[4,   400] loss: 1.770
[4,   500] loss: 1.753
[4,   600] loss: 1.759
Accuracy of the network on the 10000 test images: 38 %
(3000, 38.93)
[4,   700] loss: 1.754
Epoch 4
[5,   100] 