# Rough project/code architecture

1. Load MNIST into some kind of native dataformat (i.e. not with a oneliner that gives you a pytorch dataset)
1. Load entire MNIST into a dataset, then make that into a dataloader
1. Train MNIST with pytorch.  make a note of loss and accuracy. You’ll need to choose an architecture, batch size and number of epochs to train for. 
    * Get architecture from pytorch docs. Should be simple. 
    * Number of epochs is “until results get as good as they are going to get”, i.e. by inspection.
    * Batch size I got no idea. 1 is probably fine, but feel free to try others.
1. Split MNIST into N groups randomly. Create N datasets. Train with federated.py. You should get roughly the final loss and accuracy you saw previously. 
    * Use the same architecture as before.
    * Choose number of rounds same way you chose number of epochs in previous step, i.e. until it converges.
    * Number of epochs per round. Maybe try 1, 5 and 10? Go with 1 unless you get very different results.
    * Batch size should probably be the same as before.
1. Repeat, but split MNIST into N groups with the deck stacked (possibly as extremely as 10 groups each containing data from only one class).

### Snippets

#### Visualize an array
See cell 109 here: https://github.com/williamsmj/pytorch-notes/blob/master/pytorch-60-minute-blitz.ipynb

#### Change native data to dataset, change dataset to native data, and loop over dataset doing something other than training
See https://github.com/williamsmj/pytorch-notes/blob/master/dataset_manipulation.ipynb 


In [1]:
data_path = './MNIST-data/raw'

# location of data and labels
test_labels_file = data_path + '/' + 't10k-labels-idx1-ubyte'
test_data_file = data_path + '/' + 't10k-images-idx3-ubyte'
train_labels_file = data_path + '/' + 'train-labels-idx1-ubyte'
train_data_file = data_path + '/' + 'train-images-idx3-ubyte'

In [2]:
#import torch
#import torchvision.datasets as dsets

# pytorch datasets that download MNIST set as needed; used only to download files
#train_set = dsets.MNIST(root=data_path, download=True, train=True)
#test_set = dsets.MNIST(root=data_path, download=True, test=True)

In [3]:
# Load MNIST data into numpy arrays
import numpy as np
import struct

def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)
    
# read data and label files into numpy arrays
test_data = read_idx(test_data_file)
test_labels = read_idx(test_labels_file)
train_data = read_idx(train_data_file)
train_labels = read_idx(train_labels_file)

In [4]:
# Take a look at the data in the arrays

# pick a random sample from the sets of observations
import random
test_sample_num = random.randint(1, len(test_labels) + 1)
train_sample_num = random.randint(1, len(train_labels) + 1)

"""
# print some arrays to confirm we have data
print(test_labels)
print(test_data[test_sample_num])
print(train_labels)
print(train_data[train_sample_num])
"""

# visualize some test samples
import matplotlib.pyplot as plt

img = test_data[test_sample_num]
label = test_labels[test_sample_num]
print('Test sample index: ', test_sample_num)
print('Sample value: ', label)
plt.imshow(img)
plt.show()

# how is the test data distributed?
print('Test sample histogram:')
plt.hist(test_labels, bins=10)
plt.show()

# visualize a training sample
img = train_data[train_sample_num]
label = train_labels[train_sample_num]
print('Training sample index: ', train_sample_num)
print('Sample value: ', label)
plt.imshow(img)
plt.show()

# how is the training data distributed?
print('Training data histogram:')
plt.hist(train_labels, bins=10)
plt.show()

Test sample index:  1297
Sample value:  0


<Figure size 640x480 with 1 Axes>

Test sample histogram:


<Figure size 640x480 with 1 Axes>

Training sample index:  16475
Sample value:  1


<Figure size 640x480 with 1 Axes>

Training data histogram:


<Figure size 640x480 with 1 Axes>

In [5]:
# load numpy arrays into pytorch Datasets, then DataLoaders

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# create tensors from np arrays
test_data_tensor = torch.from_numpy(test_data)
test_labels_tensor = torch.from_numpy(test_labels)
train_data_tensor = torch.from_numpy(train_data)
train_labels_tensor = torch.from_numpy(train_labels)

# load tensors into datasets
test_dset = TensorDataset(test_data_tensor, test_labels_tensor)
train_dset = TensorDataset(train_data_tensor, train_labels_tensor)

#import torchvision.datasets as dsets
#train_dset = dsets.MNIST(root=data_path, train=True)
#test_dset = dsets.MNIST(root=data_path, train=False)

# create dataloaders
batch_size = 1
test_dloader = DataLoader(test_dset, batch_size=batch_size, shuffle=False)
train_dloader = DataLoader(train_dset, batch_size=batch_size, shuffle=True)

In [6]:
# set up network

import torch.nn as nn

class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
    def name(self):
        return "LeNet"

In [7]:


import torch.nn.functional as F
from torch.autograd import Variable

model = LeNet()

learning_rate = 0.01
momentum = 0.9
num_epochs = 2

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.CrossEntropyLoss

for epoch in range(num_epochs):
    avg_loss = 0

    for batch_idx, (x, target) in enumerate(train_dloader):
        optimizer.zero_grad()
        x, target = Variable(x), Variable(target)
        out = model(x)
        loss = criterion(out, target)
        avg_loss = avg_loss * 0.9 + loss.item() * 0.1
        loss.backward()
        optimizer.step()
        if (batch_idx+1) % 100 == 0 or (batch_idx+1) == len(train_loader):
            print ('==>>> epoch: {}, batch index: {}, train loss: {:.6f}'.format(
                epoch, batch_idx+1, avg_loss))
    

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [20, 1, 5, 5], but got input of size [1, 28, 28] instead