LSTM Model make using reference code from https://medium.com/@erikhallstrm/hello-world-rnn-83cd7105b767

In [1]:
import tensorflow as tf
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

import time
import random

from tensorflow.python.framework import graph_util
from tensorflow.python.framework import ops

from pandas_ml import ConfusionMatrix

## load all npz files from npz_files directory.

Note:- All files in the "npz_files_directory_train" directory and "npz_files_directory_test" directory ending with .npz files are used for training and testing respectively.

In [2]:
import os

npz_files_directory_train = './npz-files/npz-files/train'
npz_files_directory_test = './npz-files/npz-files/test'
# collect all files from npz directory.
train_files = list()
for f in os.listdir(npz_files_directory_train):
    train_files.append(f)
    
test_files = list()
for f in os.listdir(npz_files_directory_test):
    test_files.append(f)

In [3]:
train_files = [file for file in train_files if file[-3:] == 'npz']
test_files = [file for file in test_files if file[-3:] == 'npz']

In [4]:
print('number of train files', len(train_files))
print('number of test files', len(test_files))

number of train files 6
number of test files 3


## load all numpy arrays into training data x and y.
structure of the npz files = ['c1', 'c2', 'c3', 'Z', 'labels'].
c1, c2, c3 => numpy array containing all class 1, 2, 3 bounding boxes respectively.
Z => the raw PSD files.
labels => pixel wise labels for psd files. same dimensions as the PSD files.

In [5]:
def load_data(directory, files_list):
    """
    Loads all the psd files from the directory in a list.
    directory: string: path where the psds are located. (same as npz_files_directory_train or npz_files_directory_test).
    files_list: list of all file names: train_files or test_files.
    
    returns:
    training_data_x: list of psds(numpy arrays):each element in the list is a single psd.
    training_data_y: list of psd labels(numpy arrays):each element in the list is a single psd label(pixel wise label).
    """
    training_data_x = list()
    training_data_y = list()

    count = 0
    for file_name in files_list:
        data1 = np.load(directory + '/' + file_name)
        training_data_x.append(data1['Z'])
        training_data_y.append(data1['labels'])
        count += 1
    print('files loaded', count)
    return training_data_x, training_data_y 

## convert training_data_x to shape (size of all timesteps, 512)

In [6]:
def processFrequencies(freq, num_steps):
    """
    converts a 512 sized array into a (512, num_steps, 1) sized array.
    e.g [1 2 3 4 5 6 7 8 9 10 ... 512] is converted into 
    [[1 2 3 4 5 6 7 8]
     [2 3 4 5 6 7 8 9]
     [3 4 5 6 7 8 9 10]
     ...
     [512 512 512 512 512 ...]
    ]
    freq: numpy array: each time-slice in a psd.
    num_steps: num of frequencies to consider in each step.
    
    returns:
    returns a numpy array as shown in the example above.
    """
    frequencies = np.zeros((512, num_steps), dtype=np.float32)
    assert freq.shape[0] == 512
    freq = np.reshape(freq, (512, 1))
    for i in range(freq.shape[0]):
        if freq[i:i+num_steps, 0].shape[0] == num_steps:
            frequencies[i, :] = freq[i:i+num_steps, 0]
        else:
#             print(freq[i:i+num_steps, 0].shape[0])
            frequencies[i, :] = np.pad(freq[i:i+num_steps, 0], (0, num_steps-freq[i:i+num_steps, 0].shape[0]), 'edge')
    frequencies = np.reshape(frequencies, (512, num_steps, 1))
    return frequencies

In [7]:
def processLabels(labels, num_classes):
    """
    one-hot encodes labels i
    labels: numpy array of size 512.
    """
    labels_reshaped = np.zeros((512, num_classes))
    for i in range(labels.shape[0]):
        labels_reshaped[i, labels[i]] = 1
    return labels_reshaped

## Convert all x_train data to serial x_train data.

In [8]:
def convertToSerialList(x, y, x_copy, y_copy, num_classes):
    assert len(x) == len(y)
    for i in range(len(x)):
        for j in range(x[i].shape[0]):
            x_copy.append(processFrequencies(x[i][j,:] , num_steps))
            y_copy.append(processLabels(y[i][j,:], num_classes))

In [9]:
def normalizePSD(psd, new_min=-1, new_max=1):
    """
    each PSD is normalized independently.
    normalization formula used = 
    f(x) = ((new_max - new_min) / (psd_max - psd_min)) * f(x) + new_min
    psd: numpy array with shape (1950, 512).
    """
    psd_max = np.max(psd)
    psd_min = np.min(psd)
    scaling_factor = (new_max - new_min) / (psd_max - psd_min)
    psd *= scaling_factor
    psd += new_min
    return psd

In [10]:
def plotPSD(psd):
    plt.plot(psd[0])
    plt.show()

## Creating the RNN model.

In [11]:
# Global model parameters.
# located in a single cell for model tuning.

# num of time steps to consider for each step.
num_steps = 8

# hidden layer dimensions, required parameter for creating LSTM cells.
# no. of LSTM cells in each layer.
hidden_layer_dimension = 16

# num of LSTM layers in the network.
number_layers = 2

# set use_dropout to true to include a dropout layer
use_dropout = False
dropout = 0.4

# sample size parameters.
batch_size = 128
num_epochs = 200
step_size = 512 // batch_size

# learning rate.
learning_rate = 0.001

# no. of examples, values populated when dataset is loaded.
num_samples_train = None
num_samples_test = None

In [12]:
class Input:
    def __init__(self, is_train=True):
        """
        creates two run able objects -> inputs for feeding inputs and labels.
        """
        self.is_train = is_train
        self.frequencies, self.labels = self.getData()
        self.dataset = tf.data.Dataset.from_tensor_slices((self.frequencies, self.labels))
        if self.is_train:
            self.dataset = self.dataset.shuffle(buffer_size=self.frequencies.shape[0])
        self.dataset = self.dataset.apply(tf.contrib.data.unbatch())
        self.dataset = self.dataset.batch(batch_size)
        self.dataset = self.dataset.prefetch(512 // batch_size)
        
        self.iterator = self.dataset.make_initializable_iterator()
        
    def getOutputType(self):
        return self.dataset.output_types
    
    def getOutputShape(self):
        return self.dataset.output_shapes
    
    def getData(self):
        global num_samples_train, num_samples_test
        if self.is_train:
            # load all the psds, labels into a training directory.
            training_x, training_y = load_data(npz_files_directory_train, train_files)
            # normalize each PSD.
            for index in range(len(training_x)):
                training_x[index] = normalizePSD(training_x[index])
            # unpack all psds into a single list, 
            # makes it easier to iterate over the all the psds.
            train_x, train_y = list(), list()
            convertToSerialList(training_x, training_y, train_x, train_y, num_classes)
            # train_x => contains all sequences for all timesteps.
            num_samples_train = len(train_x)
            return np.array(train_x), np.array(train_y)
        else:
            testing_x, testing_y = load_data(npz_files_directory_test, test_files)
            for index in range(len(testing_x)):
                testing_x[index] = normalizePSD(testing_x[index])
            test_x, test_y = list(), list()
            convertToSerialList(testing_x, testing_y, test_x, test_y, num_classes)
            num_samples_test = len(test_x)
            return np.array(test_x), np.array(test_y)

In [13]:
# test = Input()

In [14]:
class Model:
    def __init__(self):
        """
        two layer LSTM model:
                Input
                  |
                LSTM (hidden layer no. of cells)
                  |
                LSTM (hidden layer no. of cells)
                  |
                SOftmax
                  |
        Output Prediction Class.
        """
        # train and test dataset is loaded using tf data.
        self.train_dataset = Input(is_train=True)
        self.test_dataset = Input(is_train=False)
        
        self.handle = tf.placeholder(tf.string, [], name='dataset_handler')
        iterator = tf.data.Iterator.from_string_handle(self.handle, 
                                               self.train_dataset.dataset.output_types,
                                               self.train_dataset.dataset.output_shapes)
        input_data, label_data = iterator.get_next()
        self.input_data = input_data
        self.label_data = label_data
        
        # placeholder for holding an initial-state.
        self.init_state = tf.placeholder(tf.float32, [number_layers, 2, batch_size, hidden_layer_dimension])
        state_per_layer_list = tf.unstack(self.init_state, axis=0)
        rnn_state_tuples = tuple([tf.nn.rnn_cell.LSTMStateTuple(state[0], state[1]) for state in state_per_layer_list])

        # if dropout is enabled, then add a dropout layer.
        if use_dropout:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)
            
        # maintaining state for each sequence.
        self.state = current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))

        if number_layers > 1:
            cell_list = [self.createLSTMCells() for _ in range(number_layers)]
            cell = tf.nn.rnn_cell.MultiRNNCell(cell_list, state_is_tuple=True)
        elif number_layers == 1:
            cell = self.createLSTMCells()

        # dynamic rnn for performing unrolling of LSTM cell in time.
        self.output, self.state = tf.nn.dynamic_rnn(cell, 
                                          input_data,
                                          dtype=tf.float32, 
                                          initial_state=rnn_state_tuples)
        
        # extract the last output for time=num_seq from the output.
        self.output = tf.transpose(self.output, [1, 0, 2])
        self.output = tf.gather(self.output, int(self.output.shape[0]-1))
        
        # softmax layer
        dense_layer = tf.layers.dense(self.output, units=4, activation=tf.nn.relu)
        self.logits = tf.layers.dense(dense_layer, units=4, activation=None)
        
        # cross entropy loss.
        self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=label_data,
                                                                        logits=self.logits)
        self.loss = tf.reduce_mean(self.cross_entropy)
        
        # optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
        
        # measuring accuracy.
        prediction = tf.nn.softmax(self.logits, name='label_prediction')
        self.prediction_class = tf.argmax(prediction, axis=1)
        output_labels_class = tf.argmax(label_data, axis=1)
        matching_prediction = tf.equal(output_labels_class, self.prediction_class)
        self.accuracy = tf.reduce_mean(tf.cast(matching_prediction, tf.float32), name='prediction_accuracy')
        
        # for recording test accuracy.
        self.accuracy_dict = dict()
        
    def plotAccuracy(self):
        """
        plot test accuracy for each epoch.
        """
        lists = sorted(self.accuracy_dict.items()) # sorted by epoch, return a list of tuples
        x, y = zip(*lists) # unpack a list of pairs into two tuples
        plt.plot(x, y)
        plt.show()
        
    def saveModel(self, epoch):
        """
        saves currently trained model into .pb format.
        """
        output_node_names = "label_prediction"
        output_graph_definition = graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        )
        model_save_path = ('./saved_models/model_'+str(epoch)+'.pb')
        with tf.gfile.GFile(model_save_path, "wb") as f:
            f.write(output_graph_definition.SerializeToString())
        
    def createLSTMCells(self, cell_type='LSTM'):
        # for creating independent LSTM cells.
        cell = tf.contrib.rnn.LSTMCell(hidden_layer_dimension, forget_bias=1.0)
        return cell
    
    def calculateTestAccuracy(self, epoch):
        """
        calculates test accuracy on the test set.
        """
        # restart test dataset ietrator.
        sess.run(self.test_dataset.iterator.initializer)
        
        # collect test accuracy and predictions.
        test_accuracy_list = list()
        label_list = list()
        current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))
        counter = 0
        # looping till all values are consumed
        while True:
            try:
                accuracy, prediction_class, current_state, = sess.run([m.accuracy, m.prediction_class, m.state], 
                                                    feed_dict={m.init_state: current_state, m.handle: testing_handle})
                test_accuracy_list.append(accuracy)
                label_list.append(prediction_class)
                counter += 1
                # reset current state after each psd time slice is processed. 
                if counter != 0 and counter % step_size == 0:
                    current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))
            # stop when all samples from the test dataset are processed.
            except tf.errors.OutOfRangeError:
                break
        print('test accuracy at epoch #', epoch, 'is ', (sum(test_accuracy_list) / len(test_accuracy_list)))
        self.accuracy_dict[epoch] = (sum(test_accuracy_list) / len(test_accuracy_list) )
        print(len(label_list))
        return label_list
    
    def printConfusionMatrix(self, prediction):
        """
        prints the confusion matrix for the current output predictions.
        prediction array: each samples contains output prediction for each batch_size.
        """
        prediction_np = np.array(prediction)
        # convert prediction_np into (test samples, 512) sized array.
        pred_np = np.zeros((num_samples_test, 512))
        row, col = 0, 0
        for i in range(prediction_np.shape[0]):
            pred_np[row, col:col+batch_size] = prediction_np[i]
            col += batch_size
            if col == 512:
                col = 0
                row += 1
        
        # convert actual output labels back from one hot encoded labels to class labels.
        y_test_np = np.argmax(self.test_dataset.labels, axis=2)
        
        # reshape into format required for confusion matrix from pandas_ml.
        y_test_np = np.reshape(y_test_np, [y_test_np.shape[0] * y_test_np.shape[1]])
        pred_np = np.reshape(pred_np, [pred_np.shape[0] * pred_np.shape[1]])
        cm = ConfusionMatrix(y_test_np, pred_np)
        print(cm)

In [15]:
m = Model()

files loaded 6
files loaded 3


In [None]:
cell_start_time = time.time()
prediction = None
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    counter = 0
    
    # initialize train and test handle
    training_handle = sess.run(m.train_dataset.iterator.string_handle())
    testing_handle = sess.run(m.test_dataset.iterator.string_handle())
    
    for epoch in range(num_epochs):
        print('epoch #', epoch, 'started')
        
        # variables.
        epoch_time = time.time()
        counter = 0
        current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))
        train_accuracy_list = list()
        
        # initializing the train dataset.
        sess.run(m.train_dataset.iterator.initializer)
        # looping till all values are consumed
        while True:
            try:
                if epoch % 10 == 0:
                    optimizer_ops, accuracy, current_state = sess.run([m.optimizer, m.accuracy, m.state], 
                                                                   feed_dict={m.init_state: current_state, 
                                                                              m.handle: training_handle})
                    train_accuracy_list.append(accuracy)
                    counter += 1
                    if counter != 0 and counter % step_size == 0:
                        current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))   
                else:
                    parameter1, current_state = sess.run([m.optimizer, m.state], 
                                                             feed_dict={m.init_state: current_state, 
                                                                        m.handle: training_handle})
                    counter += 1
                    if counter != 0 and counter % step_size == 0:
                        current_state = np.zeros((number_layers, 2, batch_size, hidden_layer_dimension))
            except tf.errors.OutOfRangeError:
                break
            
        # after training, if epoch number is multiple of 10.
        if epoch % 10 == 0:
            print('train set accuracy at epoch #', epoch, 'is ', (sum(train_accuracy_list) / len(train_accuracy_list)))
            saver.save(sess, './checkpoints/model_checkpoint_'+str(epoch))
            m.saveModel(epoch)
            prediction = m.calculateTestAccuracy(epoch)
            m.printConfusionMatrix(prediction)
        end_time = time.time()
        print('epoch #', epoch, 'ended - ',  end_time - epoch_time)
    saver.save(sess, './checkpoints/model_checkpoint_final')
    prediction = m.calculateTestAccuracy(epoch)
    m.printConfusionMatrix(prediction)
#     m.saveModel(epoch)
cell_end_time = time.time()
print('total time to train the network is ', cell_end_time - cell_start_time)

epoch # 0 started
train set accuracy at epoch # 0 is  0.9303600430427548
INFO:tensorflow:Froze 8 variables.
INFO:tensorflow:Converted 8 variables to const ops.
test accuracy at epoch # 0 is  0.9342165628200751
23432
Predicted      0.0     1.0   2.0  3.0  __all__
Actual                                        
0.0        2688296   58282  2699  213  2749490
1.0          44362  108483   118   89   153052
2.0          69351    1971  5134  173    76629
3.0           6176   13651   219   79    20125
__all__    2808185  182387  8170  554  2999296
epoch # 0 ended -  346.5949547290802
epoch # 1 started
epoch # 1 ended -  254.78855347633362
epoch # 2 started
epoch # 2 ended -  248.08222198486328
epoch # 3 started
epoch # 3 ended -  249.53509283065796
epoch # 4 started
epoch # 4 ended -  247.68801403045654
epoch # 5 started
epoch # 5 ended -  258.2335960865021
epoch # 6 started
epoch # 6 ended -  245.02291822433472
epoch # 7 started
epoch # 7 ended -  246.84036993980408
epoch # 8 started
epoch # 8

In [None]:
m.printConfusionMatrix(prediction)

In [None]:
m.plotAccuracy()

Predicted      0.0     1.0    2.0    3.0  __all__
Actual                                           
0.0        2591457  122997  23270  11766  2749490
1.0         103267   45613   2025   2147   153052
2.0          60231    4473   9225   2700    76629
3.0          14268    2793   1423   1641    20125
__all__    2769223  175876  35943  18254  2999296

Predicted      0.0     1.0    2.0    3.0  __all__
Actual                                           
0.0        2688045   44755  13187   3503  2749490
1.0          37150  110823   1467   3612   153052
2.0          55988    2004  18186    451    76629
3.0           3436   12181    307   4201    20125
__all__    2784619  169763  33147  11767  2999296

In [None]:
Predicted      0.0     1.0    2.0    3.0  __all__
Actual                                           
0.0        2669628   43145  31922   4795  2749490
1.0          36345  109998   3738   2971   153052
2.0          47221     957  27029   1422    76629
3.0           3085    9655    749   6636    20125
__all__    2756279  163755  63438  15824  2999296