### How to use this file
This file shows the vanishing gradient problem. 
Execute it with LR 0.1 and LR 0.01 and have a look at the final plots when the conv-layers change.
Note that the network uses the original loop approach (TIMESTEPS, RGB_CHANNELS, HEIGHT, WIDTH) and the gradients do change (if the learning rate is sufficiently high).



In [1]:
print("No configuration needed")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import time
import os
import copy
import matplotlib.pyplot as plt

#DESIGN PARAMETERS FOR NEURAL NETWORK
NR_LSTM_UNITS = 1
IMAGE_INPUT_SIZE = 550
# Similar input size to "normal" data: 130x130 -> 15x15 with 3 conv layers
# image input of 54 yields output after conv layers of 5
IMAGE_AFTER_CONV_SIZE = 15

#for 3x3 kernels, n=num_layers: len_in = 2^n*len_out + sum[i=1..n](2^i)
#CONV_LAYER_LENGTH = 5

LSTM_INPUT_SIZE = IMAGE_AFTER_CONV_SIZE*IMAGE_AFTER_CONV_SIZE
LSTM_HIDDEN_SIZE = 100

RGB_CHANNELS = 3
TIMESTEPS = 10
BATCH_SIZE = 1 #until now just batch_size = 1
NR_EPOCHS = 50
LEARNING_RATE = 0.01


FORMAT_TIMESTEPS = True

No configuration needed


In [2]:
# Generate dataset
if TIMESTEPS == 10:
    lab = [
        [0,0,0,1,1,2,2,2,1,1],
        [0,1,1,2,2,2,1,1,0,0],
        [1,2,2,2,1,1,0,0,0,1],
        [2,2,1,1,0,0,0,1,1,2],
        [1,1,0,0,0,1,1,2,2,2],
        [0,0,1,1,2,2,2,1,1,0],
        [1,1,2,2,2,1,1,0,0,1],
        [2,2,2,1,1,0,0,0,1,1],
        [2,1,1,0,0,0,1,1,2,2],
        [1,0,0,0,1,1,2,2,2,1]
    ]
elif TIMESTEPS == 20:
        lab = [
        [0,0,0,1,1,2,2,2,1,1,0,0,0,1,1,2,2,2,1,1],
        [0,1,1,2,2,2,1,1,0,0,0,1,1,2,2,2,1,1,0,0],
        [1,2,2,2,1,1,0,0,0,1,1,2,2,2,1,1,0,0,0,1],
        [2,2,1,1,0,0,0,1,1,2,2,2,1,1,0,0,0,1,1,2],
        [1,1,0,0,0,1,1,2,2,2,1,1,0,0,0,1,1,2,2,2],
        [0,0,1,1,2,2,2,1,1,0,0,0,1,1,2,2,2,1,1,0],
        [1,1,2,2,2,1,1,0,0,1,1,1,2,2,2,1,1,0,0,1],
        [2,2,2,1,1,0,0,0,1,1,2,2,2,1,1,0,0,0,1,1],
        [2,1,1,0,0,0,1,1,2,2,2,1,1,0,0,0,1,1,2,2],
        [1,0,0,0,1,1,2,2,2,1,1,0,0,0,1,1,2,2,2,1]
        ]
std_dev = 1
training_set_size = len(lab)
arr = np.full((training_set_size,TIMESTEPS,BATCH_SIZE,RGB_CHANNELS,IMAGE_INPUT_SIZE,IMAGE_INPUT_SIZE),0)
noise_arr = np.random.normal(0,std_dev,arr.shape)
# print(noise_arr.shape)

sequences = lab
for i,ll in enumerate(lab):
    for j,l in enumerate(ll):
        sequences[i][j] = (l-1)*0.5
# print(sequences)

for i, sequence in enumerate(sequences):
    for j, image in enumerate(sequence):
        noise_arr[i][j] = noise_arr[i][j] + np.full(noise_arr[i][j].shape,image)
# print(noise_arr)

dataset = torch.from_numpy(noise_arr)
labelset = torch.tensor(lab)
# print(labels)
# print(dataset)
print("Dataset created")

Dataset created


In [3]:
def accuracy(model):
    print("To be implemented")

#     n_batches_test = len(test_loader)

#     #Time for printing
#     testing_start_time = time.time()

#     print('Start testing...')
#     correct = 0 
#     total = 0
#     with torch.no_grad():
#         for i, batch in enumerate(train_loader):
#             inputs, labels = batch
            
#             data_in = [s.to(device) for s in inputs['flows']]
#             labels = labels.to(device)
#             if not labels.size()[0] == BATCH_SIZE:
#                 # skip uncompleted batch size NN is fixed to BATCHSIZE
#                 continue
#             outputs = model(data_in)
# #             print("Out:", len(outputs), outputs.size())
# #             print("Labels:", len(labels), labels.size())
#             _, predicted = torch.max(outputs.data, 1)
# #             print('predicted:',len(predicted),predicted.size())
#             n_errors = torch.nonzero(torch.abs(labels.long() - predicted)).size(0)
#             total += predicted.numel()
#             # print('predicted',predicted)
#             correct += predicted.numel() - n_errors
#             # print('labels',labels)
#     print('Accuracy {:.2f}%'.format(100*correct/total))
#     print('...testing finished')
# print("Definition done")

In [4]:
#USE RANDOM IMAGES TO SET UP WORKING EXAMPLE
class TEST_CNN_LSTM(nn.Module):
    def __init__(self):
        super(TEST_CNN_LSTM, self).__init__()
                                        # in 54x54
        self.conv1 = nn.Conv2d(3,6,3) #out 52x52
        self.pool1 = nn.MaxPool2d(2,2) #out 26x26
        self.conv2= nn.Conv2d(6,3,3) #out 24x24
        self.pool2 = nn.MaxPool2d(2,2) #out 12x12
        self.conv3 = nn.Conv2d(3,1,3) #out 10x10
        self.pool3 = nn.MaxPool2d(2,2) #out 5x5
        self.conv4= nn.Conv2d(1,1,3) #out 24x24
        self.pool4 = nn.MaxPool2d(2,2) #out 12x12
        self.conv5 = nn.Conv2d(1,1,3) #out 10x10
        self.pool5 = nn.MaxPool2d(2,2) #out 5x5     
        
#         self.lstm = nn.LSTM(LSTM_INPUT_SIZE,
#                             LSTM_HIDDEN_SIZE,
#                             NR_LSTM_UNITS)
        self.fc1 = nn.Linear(LSTM_INPUT_SIZE,100)
        self.fc2 = nn.Linear(100,20)
        self.fc3 = nn.Linear(20,3)
        
        #initialize hidden states of normal LSTM
        self._hidden = (torch.randn(NR_LSTM_UNITS, BATCH_SIZE, LSTM_HIDDEN_SIZE),
                        torch.randn(NR_LSTM_UNITS, BATCH_SIZE, LSTM_HIDDEN_SIZE))

        print("Hidden:", len(self._hidden))
    def forward(self,x):
        #print("Input:", x.size())
        x = x.float()
        
#         print("X arr size", x_arr.size())
#         print("x shape",x.shape)
#         print("x[0]",x[0].shape)
        x_arr = torch.zeros(TIMESTEPS,BATCH_SIZE,1,IMAGE_AFTER_CONV_SIZE,IMAGE_AFTER_CONV_SIZE)
    
        for i in range(TIMESTEPS):#parallel convolutions which are later concatenated for LSTM
            x_tmp_c1 = self.pool1(F.relu(self.conv1(x[i])))
            x_tmp_c2 = self.pool2(F.relu(self.conv2(x_tmp_c1)))
            x_tmp_c3 = self.pool3(F.relu(self.conv3(x_tmp_c2)))
            x_tmp_c4 = self.pool4(F.relu(self.conv4(x_tmp_c3)))
            x_tmp_c5 = self.pool5(F.relu(self.conv5(x_tmp_c4)))
            x_arr[i] = x_tmp_c5
    
        x = torch.cat(tuple(x for x in x_arr),0)
        
#         print("x before LSTM",x.view(TIMESTEPS,BATCH_SIZE,-1).shape) 
#         x, _hidden = self.lstm(x.view(TIMESTEPS,BATCH_SIZE,-1), self._hidden)
#         print("x after LSTM",x.shape) 
        x = x.view(-1,LSTM_INPUT_SIZE)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
print("Class defined")

#TRAINING
test_net = TEST_CNN_LSTM()
criterion = nn.CrossEntropyLoss()
#criterion = nn.BCELoss()
optimizer = optim.SGD(test_net.parameters(), lr=LEARNING_RATE, momentum=0.9)



Class defined
Hidden: 2


In [5]:
parameters_conv1_init = list(test_net.conv1.parameters())
bias_conv1_init = parameters_conv1_init[1].detach().numpy()
weights_conv1_init = parameters_conv1_init[0].detach().numpy().flatten()

parameters_fc1_init = list(test_net.fc1.parameters())
bias_fc1_init = parameters_fc1_init[1].detach().numpy()
weights_fc1_init = parameters_fc1_init[0].detach().numpy().flatten()

print("FC1: Mean",np.mean(weights_fc1_init),"Variance",np.var(weights_fc1_init))
print("Saved initial weights and biases from net")

FC1: Mean 0.00014501961 Variance 0.0014719596
Saved initial weights and biases from net


In [6]:
loss_array = []
conv_var_array = []
fc_var_array = []
print('Start training...')
for epoch in range(NR_EPOCHS): 
    loss = 0.0
    optimizer.zero_grad() 
    
    for data_in, labels in zip(dataset, labelset):
        outputs = test_net(data_in)
        single_loss = criterion(outputs, labels.long())
        loss += single_loss
    
    loss.backward() 
    optimizer.step()
    
    print("Epoch:", epoch, "Loss",loss.data.item())
    parameters_conv1 = list(test_net.conv1.parameters())
    conv_weight_diff = parameters_conv1[0].detach().numpy().flatten()-weights_conv1_init
    parameters_fc1 = list(test_net.fc1.parameters())
    fc_weight_diff = parameters_fc1[0].detach().numpy().flatten()-weights_fc1_init
    conv_var_array.append(np.var(conv_weight_diff))
    fc_var_array.append(np.var(fc_weight_diff))
    loss_array.append(loss.data.item())
print('...Training finished')


Start training...
Forward pass
Loss defined
Optimizer step
Epoch: 0 Loss 11.617700576782227
Forward pass
Loss defined
Optimizer step
Epoch: 1 Loss 10.844573974609375
Forward pass
Loss defined
Optimizer step
Epoch: 2 Loss 9.455743789672852
Forward pass
Loss defined
Optimizer step
Epoch: 3 Loss 7.654208183288574
Forward pass
Loss defined
Optimizer step
Epoch: 4 Loss 5.666996955871582
Forward pass
Loss defined
Optimizer step
Epoch: 5 Loss 3.7423577308654785
Forward pass
Loss defined
Optimizer step
Epoch: 6 Loss 2.071416139602661
Forward pass
Loss defined
Optimizer step
Epoch: 7 Loss 0.8419156074523926
Forward pass
Loss defined
Optimizer step
Epoch: 8 Loss 0.22957563400268555
Forward pass
Loss defined
Optimizer step
Epoch: 9 Loss 0.04066944122314453
Forward pass
Loss defined
Optimizer step
Epoch: 10 Loss 0.00484466552734375
Forward pass
Loss defined
Optimizer step
Epoch: 11 Loss 0.0004100799560546875
Forward pass
Loss defined
Optimizer step
Epoch: 12 Loss 2.86102294921875e-05
Forward pass


KeyboardInterrupt: 

In [None]:
NR_EPOCHS = 100
print("done")

In [None]:
parameters_fc1 = list(test_net.fc1.parameters())
bias_fc1 = parameters_fc1[1].detach().numpy()
weights_fc1 = parameters_fc1[0].detach().numpy().flatten()

parameters_conv1 = list(test_net.conv1.parameters())
bias_conv1 = parameters_conv1[1].detach().numpy()
weights_conv1 = parameters_conv1[0].detach().numpy().flatten()

weights_fc1_m = np.mean(weights_fc1-weights_fc1_init)
weights_fc1_v = np.var(weights_fc1-weights_fc1_init)

print("Conv: Mean",np.mean(weights_conv1_init),"Variance",np.var(weights_conv1_init))
print("Conv: Mean",np.mean(weights_conv1),"Var",np.var(weights_conv1))
print("FC1 Diff: Var",np.var(weights_fc1_init-weights_fc1))

# plt.figure()
# plt.title("Difference in FC-Layers")
# plt.plot(range(len(weights_fc1)),weights_fc1_init-weights_fc1)

# plt.figure()
# plt.title("Difference in Conv-Layers")
# plt.plot(range(len(weights_conv1)),weights_conv1_init-weights_conv1)#,range(len(weights_conv5)),weights_conv5)

plt.figure()
plt.title("Loss")
plt.plot(loss_array)
plt.figure()
plt.title("Variance of difference of conv layer")
plt.plot(conv_var_array)
plt.figure()
plt.title("Variance of difference of fc layer")
plt.plot(fc_var_array)
print("With LR 0.1 the network converges within 100 epochs")
print("With LR 0.01 the network does converge too, but only after around 500 epochs")
print("With LR 0.01 we experience a vanishing gradient problem, the conv layers do not change at all in the beginning")