### Importing libraries

In [1]:
import os
import torch
import optparse
import numpy as np
import torch.nn as nn
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
class Logger(object):
    """Logging in tensorboard without tensorflow ops."""

    def __init__(self, log_dir):
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.
        Parameter
        ----------
        tag : Name of the scalar
        value : value itself
        step :  training iteration
        """
        # Notice we're using the Summary "class" instead of the "tf.summary" public API.
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def log_histogram(self, tag, values, step, bins=1000):
        """Logs the histogram of a list/vector of values."""
        # Convert to a numpy array
        values = np.array(values)
        
        # Create histogram using numpy        
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill fields of histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1]
        # See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30
        # Thus, we drop the start of the first bin
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()

### Loading data

In [3]:
def pad_array_temporally(X, padding):
    right_shift = lambda X, i : np.pad(X[:-i], [(i,0),(0,0)], 'constant', constant_values=0)
    left_shift = lambda X, i : np.pad(X[i:], [(0,i),(0,0)], 'constant', constant_values=0)
    before = [right_shift(X, i) for i in range(padding, 0, -1)]
    rest = [left_shift(X,i) for i in range(0, padding+1)]
    return np.concatenate(before + rest, axis=1)

In [4]:
def load_training_data():
    # Getting the labels
    trainy = np.load('data/train_labels.npy', encoding='bytes')
    valy = np.load('data/dev_labels.npy', encoding='bytes')
    trainy = np.concatenate(trainy.tolist())
    valy = np.concatenate(valy.tolist())
    # PCA
    if os.path.exists('data/trainx_pca.npy'):
        trainx = np.load('data/trainx_pca.npy')
        testx = np.load('data/testx_pca.npy')
        valx = np.load('data/valx_pca.npy')
    else:
        trainx = np.load('data/train.npy', encoding='bytes')
        testx = np.load('data/test.npy', encoding='bytes')
        valx = np.load('data/dev.npy', encoding='bytes')
        trainx = np.concatenate(trainx.tolist())
        testx = np.concatenate(testx.tolist())
        valx = np.concatenate(valx.tolist())
        trainx = PCA(n_components=10).fit_transform(trainx)
        testx = PCA(n_components=10).fit_transform(testx)
        valx = PCA(n_components=10).fit_transform(valx)
        np.save('data/trainx_pca.npy', trainxs)
        np.save('data/testx_pca.npy', testxs)
        np.save('data/valx_pca.npy', valxs)
    # add context
    padding = 20
    trainx = pad_array_temporally(trainx, padding)
    valx = pad_array_temporally(valx, padding)
    # Turn into tensors
    trainx = torch.from_numpy(trainx).float()
    trainy = torch.from_numpy(trainy.astype(int))
    testx = torch.from_numpy(testx).float()
    valx = torch.from_numpy(valx).float()
    valy = torch.from_numpy(valy.astype(int))
    # return
    return trainx, trainy, valx, valy

In [5]:
trainx, trainy, valx, valy = load_training_data()

### Training

In [6]:
def training_routine(name, net, dataset, epochs, lr, batch_size=5000, decay=True, logging=False):

    if logging:
        vLog = Logger('./logs/val_acc_{}'.format(name))
        tLog = Logger('./logs/train_acc_{}'.format(name))
    
    train_data, train_labels, val_data, val_labels = dataset
    
    criterion=nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    if decay:
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)
    
    gpu = torch.cuda.is_available()
    print('Using GPU' if gpu else 'Not using GPU')
    
    net = net.cuda() if gpu else net
    
    for epoch in range(epochs):
        
        scheduler.step()

        train_correct = 0
        train_loss = []

        for batch_n in range(0, train_data.shape[0], batch_size):
            
            if (batch_n // batch_size) % 10 == 0:
                print('\rEpoch {:4} Batch {:6} ({:.2%})'.format(epoch + 1, batch_n // batch_size, batch_n / train_data.shape[0]), end='')
            
            a, b = batch_n, batch_n + batch_size
            batch_data, batch_labels = train_data[a:b], train_labels[a:b]

            if gpu:
                batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()

            # forward pass
            batch_output = net(batch_data)
            batch_loss = criterion(batch_output, batch_labels)
            
            # backward pass
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            batch_prediction = batch_output.cpu().detach().argmax(dim=1)
            batch_correct = (batch_prediction.numpy() == batch_labels.cpu().detach().numpy()).sum()
            
            train_correct += batch_correct
            
            if batch_loss:
                train_loss.append(batch_loss.cpu().detach())
        
        train_accuracy = train_correct / train_data.shape[0]
        train_loss = torch.FloatTensor(train_loss)
        
        if logging:
            tLog.log_scalar('accuracy', train_accuracy, epoch + 1)
            for tag, value in net.named_parameters():
                tag = tag.replace('.', '/')
                tLog.log_histogram(tag, value.data.cpu().numpy(), epoch + 1)
                tLog.log_histogram(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)
        
        # Once every 10 iterations, print statistics
        if True: #if epoch == 0 or epoch+1 % 10 == 0:
            
            print("\rStatistics for epoch", epoch + 1)
            
            val_loss = 0
            val_correct = 0
            count = 0
            
            for batch_n in range(0, val_data.shape[0], batch_size):
                # create batches
                a, b = batch_n, batch_n + batch_size
                batch_data = val_data[a:b]
                batch_labels = val_labels[a:b]
                
                # use GPU if possible 
                if gpu:
                    batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()
                
                # Now for the validation set
                batch_output = net(batch_data)
                batch_loss = criterion(batch_output, batch_labels)
                
                # compute the accuracy of the prediction
                val_prediction = batch_output.cpu().detach().argmax(dim=1)
                val_correct += (val_prediction.numpy() == batch_labels.cpu().detach().numpy()).sum()
                
                # sum up to get mean later
                val_loss += val_loss
                count += 1
                
            # compute mean validation loss and accuracy for all batches
            val_loss = val_loss / count
            val_accuracy = val_correct / val_data.shape[0]
                
            print("Training loss :",train_loss.cpu().detach().numpy())
            print("Training accuracy :",train_accuracy)
            print("Validation loss :",val_loss)
            print("Validation accuracy :",val_accuracy)
            print()
            
            if logging:
                vLog.log_scalar('accuracy', val_accuracy, epoch + 1) 
                for tag, value in net.named_parameters():
                    tag = tag.replace('.', '/')
                    tLog.log_histogram(tag, value.data.cpu().numpy(), epoch + 1)
                    tLog.log_histogram(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)

            # try to save the neural network
            try:
                torch.save(net, 'neural_net_{}'.format(name))
            except:
                print('Could not save neural network.')

    net = net.cpu()

## Creating the model

In [7]:
def generate_net(layers, batch_norm=True):
    model = []
    for i in range(len(layers)-1):
        model.append(nn.Linear(layers[i], layers[i+1]))
        if i < (len(layers)-2):
            if batch_norm:
                model.append(nn.BatchNorm1d(layers[i+1]))
            model.append(nn.LeakyReLU())
    net = nn.Sequential(*tuple(model))
    return net

In [8]:
name = 'id1'
layers = [410, 400, 400, 400, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 5000
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

Using GPU
Statistics for epoch 1
Training loss : [4.95939   4.767304  4.6081405 ... 1.4083661 1.3654275 1.4095609]
Training accuracy : 0.4309052817069839
Validation loss : 0.0
Validation accuracy : 0.3773842287544786

Statistics for epoch 2
Training loss : [2.4697251 2.3843856 1.9720542 ... 1.2989067 1.27318   1.302018 ]
Training accuracy : 0.4980918418317179
Validation loss : 0.0
Validation accuracy : 0.3982136400445843

Statistics for epoch 3
Training loss : [2.3138921 2.1974685 1.8101026 ... 1.2426203 1.2258229 1.2467799]
Training accuracy : 0.5173215866125288
Validation loss : 0.0
Validation accuracy : 0.4086948336605438

Statistics for epoch 4
Training loss : [2.2237425 2.0920994 1.7310332 ... 1.2111076 1.1970118 1.2112907]
Training accuracy : 0.5286180357275666
Validation loss : 0.0
Validation accuracy : 0.41660017869576

Statistics for epoch 5
Training loss : [2.1533859 2.0199068 1.6712006 ... 1.1848785 1.1710526 1.1881064]
Training accuracy : 0.5365221389262389
Validation loss 

In [9]:
name = 'id2'
layers = [410, 400, 400, 400, 400, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 5000
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

Using GPU
Statistics for epoch 1
Training loss : [4.963709  4.828221  4.6629767 ... 1.3878098 1.3414114 1.3750061]
Training accuracy : 0.43582884048750514
Validation loss : 0.0
Validation accuracy : 0.37997501845227954

Statistics for epoch 2
Training loss : [2.4626937 2.321765  1.9038656 ... 1.2710763 1.2457176 1.2751831]
Training accuracy : 0.5052981091372357
Validation loss : 0.0
Validation accuracy : 0.4007386888273315

Statistics for epoch 3
Training loss : [2.287515  2.1489248 1.7500489 ... 1.2096221 1.1967684 1.2189827]
Training accuracy : 0.5251892477735566
Validation loss : 0.0
Validation accuracy : 0.4127199705958816

Statistics for epoch 4
Training loss : [2.1888752 2.0498128 1.6771473 ... 1.1685017 1.1617665 1.1794478]
Training accuracy : 0.537032068540029
Validation loss : 0.0
Validation accuracy : 0.42158752356961215

Statistics for epoch 5
Training loss : [2.1127245 1.9751824 1.6209257 ... 1.1409633 1.1362598 1.1508292]
Training accuracy : 0.5453615661816855
Validation l

In [None]:
name = 'id3'
layers = [410, 400, 400, 400, 400, 400, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 5000
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

Using GPU
Statistics for epoch 1
Training loss : [4.949481  4.782579  4.615255  ... 1.3419937 1.3199406 1.3531142]
Training accuracy : 0.44084903863250835
Validation loss : 0.0
Validation accuracy : 0.3827316545494207

Statistics for epoch 2
Training loss : [2.4268448 2.3098204 1.9035733 ... 1.2247621 1.2167226 1.2491956]
Training accuracy : 0.511005009906344
Validation loss : 0.0
Validation accuracy : 0.4033459137538959

Statistics for epoch 3
Training loss : [2.2400296 2.1146753 1.7216992 ... 1.1656901 1.1667331 1.1914376]
Training accuracy : 0.5311135062023636
Validation loss : 0.0
Validation accuracy : 0.4159771341144549

Epoch    4 Batch    720 (23.30%)

In [None]:
name = 'id4'
layers = [410, 2000, 1000, 1000, 500, 500, 250, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 5000
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

In [None]:
name = 'id6'
layers = [410, 400, 400, 400, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 500
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

In [None]:
name = 'id7'
layers = [410, 800, 800, 400, 400, 400, 200, 200, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 5000
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

In [None]:
name = 'id8'
layers = [410, 800, 800, 400, 400, 400, 200, 200, 138]
batch_norm = True
epochs = 40
lrate = 3e-1
batch_size = 64
decay = True

net = generate_net(layers, batch_norm)
training_routine(name, net, (trainx, trainy, valx, valy), epochs, lrate, batch_size=batch_size, decay=True, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 138)
)
name = 'cont10_400x400x400_1e-2_bn'
epochs = 20
learning_rate = 1e-2
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 138)
)
name = 'cont10_400x400x400_1e-3_bn'
epochs = 20
learning_rate = 1e-3
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 138)
)
name = 'cont10_400x400x400_1e-4_bn'
epochs = 20
learning_rate = 1e-4
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 800),
    nn.BatchNorm1d(800),
    nn.LeakyReLU(),
    nn.Linear(800, 600),
    nn.BatchNorm1d(600),
    nn.LeakyReLU(),
    nn.Linear(600, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 200),
    nn.BatchNorm1d(200),
    nn.LeakyReLU(),
    nn.Linear(200, 138)
)
name = 'cont10_800x600x400x200_1e-2_bn'
epochs = 20
learning_rate = 1e-2
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 800),
    nn.BatchNorm1d(800),
    nn.LeakyReLU(),
    nn.Linear(800, 600),
    nn.BatchNorm1d(600),
    nn.LeakyReLU(),
    nn.Linear(600, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 200),
    nn.BatchNorm1d(200),
    nn.LeakyReLU(),
    nn.Linear(200, 138)
)
name = 'cont10_800x600x400x200_5e-2_bn'
epochs = 20
learning_rate = 5e-2
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 138)
)
name = 'cont10_400x400x400_5e-2_bn'
epochs = 20
learning_rate = 5e-2
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)

In [None]:
net = nn.Sequential(
    nn.Linear(210, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 400),
    nn.BatchNorm1d(400),
    nn.LeakyReLU(),
    nn.Linear(400, 138)
)
name = 'cont10_400x400x400_bn_dlr'
epochs = 20
learning_rate = 0.1
training_routine(name, net, (trainx, trainy, valx, valy), epochs, learning_rate, batch_size=5000, logging=True)