## Agenda
[AlexNet](#AlexNet)  
[simple ConvNet](#Simple_ConvNet)  
[Training](#Training)  
[Testing](#Testing)  

In [3]:
import tensorflow as tf
import numpy as np
import os
import re
import io

# Start from an empty default graph so that re-running the notebook does not
# pile new ops on top of the ones created by an earlier execution.
tf.reset_default_graph()

# Flattened input features: 3920 floats per example (the model functions
# reshape each row to 98 x 40 x 1). The batch dimension is left open.
input = tf.placeholder(dtype=tf.float32, shape=(None, 3920), name='input')

## AlexNet
[back to top](#Agenda)  
The famous AlexNet [paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
<img src="assets/AlexNet_architecture.png" width="800" />
<img src="assets/cs231n_alexNet.png" width="800" />

[TensorFlow implementation](https://www.cs.toronto.edu/~guerzhoy/tf_alexnet/myalexnet_forward_newtf.py)  
in the following architecture:  
.conv(11, 11, 96, 4, 4, padding='VALID', name='conv1')  
.lrn(2, 2e-05, 0.75, name='norm1')  
.max_pool(3, 3, 2, 2, padding='VALID', name='pool1')  
.conv(5, 5, 256, 1, 1, group=2, name='conv2')  
.lrn(2, 2e-05, 0.75, name='norm2')  
.max_pool(3, 3, 2, 2, padding='VALID', name='pool2')  
.conv(3, 3, 384, 1, 1, name='conv3')  
.conv(3, 3, 384, 1, 1, group=2, name='conv4')  
.conv(3, 3, 256, 1, 1, group=2, name='conv5')  
.max_pool(3, 3, 2, 2, padding='VALID', name='pool3')  
.fc(4096, name='fc6')  
.fc(4096, name='fc7')  
.fc(1000, relu=False, name='fc8')  
.softmax(name='prob')) 

In [5]:
def AlexNet(input, is_training=True):
    """Build an AlexNet-style classifier scaled down for 98 x 40 features.

    Layer names (conv_2, norm_3, ..., fc_15, prob_16) are kept stable because
    they become the variable/op names in the graph and in checkpoints.

    Differences from the previous version of this function:
      * The function returns the unscaled logits (fc_15). The training graph
        passes the result to tf.losses.sparse_softmax_cross_entropy, which
        applies softmax itself; feeding it probabilities applies softmax
        twice and flattens the gradients.
      * Every convolution is followed by a ReLU, as in the referenced AlexNet
        implementation. Without it, conv_8..conv_10 are a purely linear stack.
      * conv_10 has 256 filters, matching the architecture description.
      * `is_training` has a default so that `AlexNet(input)` works.

    Args:
        input: float tensor that can be reshaped to (-1, 98, 40, 1).
        is_training: accepted for interface compatibility; currently unused
            because the network contains no training-only layers.

    Returns:
        Logits tensor with 12 values per example. The softmax of the logits
        is also added to the graph under the name 'prob_16'.
    """
    # Reshape the flat feature vector to a single-channel 98 x 40 image.
    reshape_input_1 = tf.reshape(input, [-1, 98, 40, 1], 'reshape_input_1')

    # 96 filters, 6 x 3 kernel, stride 2, 'same' padding.
    conv_2 = tf.layers.conv2d(reshape_input_1, 96, (6, 3), (2, 2), 'same',
                              activation=tf.nn.relu, name='conv_2')
    # Local response normalisation (depth radius 2, default bias of 1).
    norm_3 = tf.nn.local_response_normalization(
        conv_2, depth_radius=2, alpha=2e-05, beta=0.75, name='norm_3')
    # 3 x 2 max-pool, stride 1, 'same' padding (spatial size unchanged).
    pool_4 = tf.layers.max_pooling2d(norm_3, (3, 2), (1, 1), 'same',
                                     'channels_last', 'pool_4')

    # 256 filters, 3 x 2 kernel, stride 1, 'same' padding.
    conv_5 = tf.layers.conv2d(pool_4, 256, (3, 2), (1, 1), 'same',
                              activation=tf.nn.relu, name='conv_5')
    norm_6 = tf.nn.local_response_normalization(
        conv_5, depth_radius=2, alpha=2e-05, beta=0.75, name='norm_6')
    pool_7 = tf.layers.max_pooling2d(norm_6, (3, 2), (1, 1), 'same',
                                     'channels_last', 'pool_7')

    # Three stacked 3 x 2 convolutions with 'valid' padding: 384, 384, 256.
    conv_8 = tf.layers.conv2d(pool_7, 384, (3, 2), (1, 1), 'valid',
                              activation=tf.nn.relu, name='conv_8')
    conv_9 = tf.layers.conv2d(conv_8, 384, (3, 2), (1, 1), 'valid',
                              activation=tf.nn.relu, name='conv_9')
    conv_10 = tf.layers.conv2d(conv_9, 256, (3, 2), (1, 1), 'valid',
                               activation=tf.nn.relu, name='conv_10')

    # 3 x 2 max-pool, stride 2, 'valid' padding.
    pool_11 = tf.layers.max_pooling2d(conv_10, (3, 2), (2, 2), 'valid',
                                      'channels_last', 'pool_11')

    # Flatten everything except the batch dimension for the dense layers.
    height, width, channels = pool_11.get_shape().as_list()[1:]
    reshape_input_12 = tf.reshape(
        pool_11, (-1, height * width * channels), 'reshape_input_12')

    # Two ReLU hidden layers of 1000 units, then a linear 12-way output.
    fc_13 = tf.layers.dense(reshape_input_12, 1000, activation=tf.nn.relu, name='fc_13')
    fc_14 = tf.layers.dense(fc_13, 1000, activation=tf.nn.relu, name='fc_14')
    fc_15 = tf.layers.dense(fc_14, 12, activation=None, name='fc_15')

    # Keep the probability op in the graph so it can still be fetched by
    # name, but do not return it: the loss expects unscaled logits.
    tf.nn.softmax(fc_15, name='prob_16')

    return fc_15


training result for AlexNet:
```
Step 0.000000, loss: 2.484012, accurancy: 0.120000.
Step 1.000000, loss: 2.485092, accurancy: 0.120000.
Step 2.000000, loss: 2.484010, accurancy: 0.060000.
Step 3.000000, loss: 2.485292, accurancy: 0.080000.
Step 4.000000, loss: 2.484657, accurancy: 0.140000.
Step 5.000000, loss: 2.485170, accurancy: 0.050000.
Step 6.000000, loss: 2.484865, accurancy: 0.080000.
Step 7.000000, loss: 2.484811, accurancy: 0.090000.
Step 8.000000, loss: 2.485102, accurancy: 0.110000.
Step 9.000000, loss: 2.486204, accurancy: 0.070000.
Step 10.000000, loss: 2.484865, accurancy: 0.120000.
Step 11.000000, loss: 2.484856, accurancy: 0.070000.
Step 12.000000, loss: 2.485339, accurancy: 0.060000.
Step 13.000000, loss: 2.483910, accurancy: 0.120000.
Step 14.000000, loss: 2.485012, accurancy: 0.130000.
Step 15.000000, loss: 2.484593, accurancy: 0.110000.
Step 16.000000, loss: 2.484843, accurancy: 0.070000.
Step 17.000000, loss: 2.485991, accurancy: 0.090000.
Step 18.000000, loss: 2.483060, accurancy: 0.070000.
Step 19.000000, loss: 2.485357, accurancy: 0.040000.
```
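The loss never moves away from about 2.485 = ln 12 and the accuracy stays around 1/12, i.e. chance level for 12 classes, so the network learned nothing in this run.
The original conclusion was that AlexNet is too deep for features of shape (98, 40). That explanation is unverified: a loss pinned at ln 12 is also what you get when already-normalised softmax outputs are fed to a loss that applies softmax again.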

## Simple_ConvNet 
[back to top](#Agenda)  
Inspired by [this](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/models.py#L273)

In [7]:
def simpleConvNet(input):
    """Single-convolution network followed by three dense layers.

    Graph layout, top to bottom (names are the TensorFlow op/layer names):

        reshape_input_1 : input reshaped to (-1, 98, 40, 1)
        conv_2          : 186 filters, 98 x 8 kernel, stride 1, 'valid' padding
        relu_3          : ReLU
        dropout         : tf.nn.dropout controlled by the `dropout_prob` placeholder
        reshape_relu_3  : flatten to (-1, features)
        fc_4, fc_5      : dense, 128 units each, no activation
        logits          : dense, 12 units, no activation

    Args:
        input: float tensor that can be reshaped to (-1, 98, 40, 1).

    Returns:
        A `(logits, dropout_prob)` pair. `logits` holds 12 values per example.
        `dropout_prob` is the scalar placeholder passed as the second
        positional argument of tf.nn.dropout; the notebook feeds 0.5 while
        training and 1.0 for evaluation.
    """
    images = tf.reshape(input, [-1, 98, 40, 1], 'reshape_input_1')
    conv = tf.layers.conv2d(
        images,
        filters=186,
        kernel_size=(98, 8),
        strides=(1, 1),
        padding='valid',
        name='conv_2')
    activated = tf.nn.relu(conv, 'relu_3')

    dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')
    dropped = tf.nn.dropout(activated, dropout_prob, name='dropout')

    # Collapse height, width and channels into one feature axis.
    height, width, channels = dropped.get_shape().as_list()[1:]
    hidden = tf.reshape(dropped, (-1, height * width * channels), 'reshape_relu_3')

    # Two linear 128-unit layers, then the 12-way linear output layer.
    for layer_name in ('fc_4', 'fc_5'):
        hidden = tf.layers.dense(hidden, 128, name=layer_name)
    logits = tf.layers.dense(hidden, 12, name='logits')

    return logits, dropout_prob

In [8]:
# Choose the architecture that provides `logits` for the training graph.
# Both branches must also define `dropout_prob`, because every feed_dict in
# the training and evaluation cells supplies a value for it.
model_name = "simpleConvNet"
if model_name == "AlexNet":
    # AlexNet takes `is_training` as its second parameter; pass it explicitly.
    logits = AlexNet(input, True)
    # AlexNet does not return a dropout placeholder. Create a stand-alone one
    # with a default value so the shared feed_dicts keep working.
    dropout_prob = tf.placeholder_with_default(1.0, shape=(), name='dropout_prob')
elif model_name == "simpleConvNet":
    logits, dropout_prob = simpleConvNet(input)
else:
    # Fail here rather than with a NameError on `logits` in a later cell.
    raise ValueError("Unknown model_name: %r" % (model_name,))

## Training
[back to top](#Agenda)

In [9]:
# Training graph: loss, optimiser step and evaluation ops.

# Integer class labels, one per example in the batch.
ground_truth_input = tf.placeholder(
    dtype=tf.int64, shape=[None], name='groundtruth_input')
# Scalar learning rate, fed on every training step.
learning_rate_input = tf.placeholder(
    dtype=tf.float32, shape=[], name='learning_rate_input')

# Softmax cross-entropy between the integer labels and the model output.
cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
    labels=ground_truth_input, logits=logits)
# Plain gradient descent on that loss.
train_step = tf.train.GradientDescentOptimizer(
    learning_rate=learning_rate_input).minimize(cross_entropy_mean)

# Predicted class = index of the largest output along axis 1.
predicted_indices = tf.argmax(logits, axis=1)
correct_prediction = tf.equal(predicted_indices, ground_truth_input)
confusion_matrix = tf.confusion_matrix(
    labels=ground_truth_input, predictions=predicted_indices, num_classes=12)
# Accuracy = fraction of examples whose prediction equals the label.
evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

In [8]:
# Load the pre-computed feature ("fingerprints") and label ("ground_truth")
# arrays for the train, validation and test splits.
#
# The directory can be overridden with the SPEECH_DATA_DIR environment
# variable; the default is the location used originally. os.path.join(..., "")
# guarantees a trailing separator, which other cells rely on when they build
# paths as `data_dir + file_name`.
data_dir = os.path.join(
    os.environ.get("SPEECH_DATA_DIR", "/home/maikfangogoair/tmp/save/"), "")

train_fingerprints = np.load(os.path.join(data_dir, "train_fingerprints.npy"))
train_ground_truth = np.load(os.path.join(data_dir, "train_ground_truth.npy"))
validate_fingerprints = np.load(os.path.join(data_dir, "validation_fingerprints.npy"))
validate_ground_truth = np.load(os.path.join(data_dir, "validation_ground_truth.npy"))
test_fingerprints = np.load(os.path.join(data_dir, "test_fingerprints.npy"))
test_ground_truth = np.load(os.path.join(data_dir, "test_ground_truth.npy"))

In [14]:
# Sanity check: print the shape of each loaded array, features then labels,
# for the train, validation and test splits in that order.
for loaded_array in (train_fingerprints, train_ground_truth,
                     validate_fingerprints, validate_ground_truth,
                     test_fingerprints, test_ground_truth):
    print(loaded_array.shape)

(22246, 3920)
(22246,)
(3093, 3920)
(3093,)
(3081, 3920)
(3081,)


In [8]:
# Number of training examples; the training loop uses it to wrap batch offsets.
size = len(train_ground_truth)
# Checkpoint reader/writer covering every global variable in the graph.
saver = tf.train.Saver(var_list=tf.global_variables())
# Op that initialises all global variables; run once at the start of a session.
init = tf.global_variables_initializer()

In [12]:
# Training hyper-parameters.
batch_size = 100
learning_rate = 1e-3
learning_decay = 0.9            # multiplicative decay factor ...
learning_decay_period = 1000    # ... applied once every this many steps
eval_every_steps = 1000         # checkpoint + validation interval
training_steps = 50000

with tf.Session() as sess:
    sess.run(init)
    for i in range(1, training_steps + 1):
        # Indices of this step's batch. Taking them modulo `size` wraps around
        # the end of the training set, so every batch holds exactly
        # `batch_size` examples and no example is skipped when `size` is not
        # a multiple of `batch_size`.
        batch_idx = ((i - 1) * batch_size + np.arange(batch_size)) % size
        loss, _, accurancy = sess.run(
            [cross_entropy_mean, train_step, evaluation_step],
            feed_dict={
                input: train_fingerprints[batch_idx],
                ground_truth_input: train_ground_truth[batch_idx],
                # Step-wise exponential decay of the learning rate.
                learning_rate_input: learning_rate * (learning_decay ** (i // learning_decay_period)),
                dropout_prob: 0.5,
            })
        if i % 100 == 0:
            # The step is an integer, so print it with %d rather than %f.
            print("Step %d, loss: %f, accurancy: %f." % (i, loss, accurancy))
        if i % eval_every_steps == 0:
            # Checkpoint first, then measure accuracy on the whole validation
            # split with dropout disabled (value 1.0).
            saver.save(sess, data_dir + model_name + ".ckpt", global_step=i)
            accurancy = sess.run(
                evaluation_step,
                feed_dict={
                    input: validate_fingerprints,
                    ground_truth_input: validate_ground_truth,
                    dropout_prob: 1.0,
                })
            print("Validation accurancy is %f" % accurancy)

Step 100.000000, loss: 2.276662, accurancy: 0.260000.
Step 200.000000, loss: 2.336545, accurancy: 0.170000.
Step 300.000000, loss: 2.161933, accurancy: 0.230000.
Step 400.000000, loss: 2.096804, accurancy: 0.330000.
Step 500.000000, loss: 1.996582, accurancy: 0.340000.
Step 600.000000, loss: 1.951751, accurancy: 0.400000.
Step 700.000000, loss: 1.986819, accurancy: 0.410000.
Step 800.000000, loss: 1.857643, accurancy: 0.430000.
Step 900.000000, loss: 1.850478, accurancy: 0.360000.
Step 1000.000000, loss: 1.905994, accurancy: 0.350000.
Validation accurancy is 0.467184
Step 1100.000000, loss: 1.826856, accurancy: 0.410000.
Step 1200.000000, loss: 1.850357, accurancy: 0.420000.
Step 1300.000000, loss: 1.778159, accurancy: 0.380000.
Step 1400.000000, loss: 1.716518, accurancy: 0.430000.
Step 1500.000000, loss: 1.647582, accurancy: 0.480000.
Step 1600.000000, loss: 1.697900, accurancy: 0.490000.
Step 1700.000000, loss: 1.806420, accurancy: 0.450000.
Step 1800.000000, loss: 1.604199, accuran

In [13]:
# Evaluate the step-50000 checkpoint on the held-out test split.
# The checkpoint prefix is built from `data_dir` and `model_name`, the same
# pieces the training loop passes to saver.save, so this cell follows the
# selected model and data directory instead of a hard-coded absolute path.
checkpoint_path = data_dir + model_name + ".ckpt-50000"

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)
    accurancy = sess.run(
        evaluation_step,
        feed_dict={
            input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0,  # dropout disabled for evaluation
        })
    print("Test accurancy is %f" % accurancy)

INFO:tensorflow:Restoring parameters from /home/maikfangogoair/tmp/save/simpleConvNet.ckpt-50000


INFO:tensorflow:Restoring parameters from /home/maikfangogoair/tmp/save/simpleConvNet.ckpt-50000


Test accurancy is 0.721194


## Testing
[back to top](#Agenda)

In [21]:
# Index the real-test shard files in `data_dir`.
# The patterns are anchored and the dot before "npy" is escaped, so only
# names of the exact form "<prefix>_<something>.npy" are accepted (the old
# pattern also matched names such as "..._1.npy.bak").
_fingerprint_pattern = re.compile(r'real_test_fingerprints_.+\.npy$')
_ground_truth_pattern = re.compile(r'real_test_ground_truth_.+\.npy$')

# List the directory once and filter the same listing twice.
_data_dir_entries = os.listdir(data_dir)
input_list = [x for x in _data_dir_entries if _fingerprint_pattern.match(x)]
label_list = [x for x in _data_dir_entries if _ground_truth_pattern.match(x)]

# Key each file by the part of its name after the last underscore.
input_map = {x.split("_")[-1]: x for x in input_list}
label_map = {x.split("_")[-1]: x for x in label_list}

In [46]:
# Class index -> label word: entry i is the word written for predicted class i.
idx_to_word = [
    'silence', 'unknown',
    'yes', 'no',
    'up', 'down',
    'left', 'right',
    'on', 'off',
    'stop', 'go',
]

In [47]:
# Predict a label for every real-test shard and write one
# "<line from real_test_file_<shard>.txt>,<word>" row per prediction.
#
# The checkpoint prefix is built from `data_dir` and `model_name`, the same
# pieces the training loop passes to saver.save.
checkpoint_path = data_dir + model_name + ".ckpt-50000"

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)
    for key in input_map.keys():
        print("predicting:" + key)
        shard = key.replace(".npy", "")
        # `predicted_indices` is the argmax of the logits and does not depend
        # on the label placeholder, so no ground-truth file is loaded or fed.
        predicted_list = sess.run(
            predicted_indices,
            feed_dict={
                input: np.load(data_dir + input_map[key]),
                dropout_prob: 1.0,  # dropout disabled for inference
            })
        names_path = data_dir + "real_test_file_" + shard + ".txt"
        with io.open(data_dir + "real_test_predict_" + shard + ".txt", 'w') as f, \
                io.open(names_path, 'r') as r_f:
            for x in predicted_list:
                name_line = r_f.readline()
                if not name_line:
                    # readline() returns "" at end of file. Stop here instead
                    # of writing rows that have no file name.
                    raise ValueError(
                        "%s has fewer lines than there are predictions" % names_path)
                f.write(name_line.replace("\n", "") + "," + idx_to_word[x] + "\n")

INFO:tensorflow:Restoring parameters from /home/maikfangogoair/tmp/save/simpleConvNet.ckpt-50000


INFO:tensorflow:Restoring parameters from /home/maikfangogoair/tmp/save/simpleConvNet.ckpt-50000


predicting:47559.npy
predicting:0.npy
predicting:15853.npy
predicting:110971.npy
predicting:158530.npy
predicting:142677.npy
predicting:126824.npy
predicting:79265.npy
predicting:95118.npy
predicting:31706.npy
predicting:63412.npy


In [48]:
# Concatenate every real_test_predict_*txt file in the save directory into a
# single submission CSV; the files are joined in the order the shell expands the glob.
!cat /home/maikfangogoair/tmp/save/real_test_predict_*txt \
> /home/maikfangogoair/tensorflow-speech-recognition-challenge/submit_entry_2.csv