# Speech Recognition
The objective is to use the __Speech Commands Datasets__ to identify keywords like
* yes
* no
* up
* down
* left
* right
* on
* off
* stop
* go
* Everything else is considered either *unknown* or *silence*

In [1]:
# Import all necessary modules
import pandas as pd
import numpy as np
import tensorflow as tf
import cv2
import os
import scipy.io.wavfile
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

TRAIN_YES = './train/audio/yes/'
TRAIN_NO = './train/audio/no/'
TRAIN_UP = './train/audio/up/'
TRAIN_DOWN = './train/audio/down/'
TRAIN_LEFT = './train/audio/left/'
TRAIN_RIGHT = './train/audio/right/'
TRAIN_ON = './train/audio/on/'
TRAIN_OFF = './train/audio/off/'
TRAIN_STOP = './train/audio/stop/'
TRAIN_GO = './train/audio/go/'

# noise
TRAIN_UNKNOWN = [
    './train/audio/bird/', './train/audio/dog/', './train/audio/eight/', 
    './train/audio/four/', './train/audio/happy/', './train/audio/nine/',
    './train/audio/one/', './train/audio/seven/', './train/audio/six/',
    './train/audio/three/', './train/audio/two/', './train/audio/wow/',
    './train/audio/zero/', './train/audio/bed/', './train/audio/cat/',
    './train/audio/house/', './train/audio/marvin/', './train/audio/sheila/', 
    './train/audio/three/', './train/audio/_background_noise_/'
]

TEST_DIR = './test/audio/'

## Read training/testing data from respective directives

In [2]:
train = list()
labels = list()
test = list()

def read_data(train, labels, test, is_testing=False):
    if is_testing == False:
        print('Reading training data...')
        
        print('Reading sample audios for YES')
        for audio in tqdm(os.listdir(TRAIN_YES)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_YES + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in YES with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[0] = 1
                labels.append(label)
                
        print('Reading sample audios for NO')
        for audio in tqdm(os.listdir(TRAIN_NO)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_NO + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in NO with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[1] = 1
                labels.append(label)
                
        print('Reading sample audios for UP')
        for audio in tqdm(os.listdir(TRAIN_UP)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_UP + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in UP with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[2] = 1
                labels.append(label)
                
        print('Reading sample audios for DOWN')
        for audio in tqdm(os.listdir(TRAIN_DOWN)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_DOWN + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in DOWN with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[3] = 1
                labels.append(label)
                
        print('Reading sample audios for LEFT')
        for audio in tqdm(os.listdir(TRAIN_LEFT)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_LEFT + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in LEFT with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[4] = 1
                labels.append(label)
                
        print('Reading sample audios for RIGHT')
        for audio in tqdm(os.listdir(TRAIN_RIGHT)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_RIGHT + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in RIGHT with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[5] = 1
                labels.append(label)
                
        print('Reading sample audios for ON')
        for audio in tqdm(os.listdir(TRAIN_ON)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_ON + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in ON with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[6] = 1
                labels.append(label)
                
        print('Reading sample audios for OFF')
        for audio in tqdm(os.listdir(TRAIN_OFF)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_OFF + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in OFF with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[7] = 1
                labels.append(label)
                
        print('Reading sample audios for STOP')
        for audio in tqdm(os.listdir(TRAIN_STOP)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_STOP + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in STOP with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[8] = 1
                labels.append(label)
                
        print('Reading sample audios for GO')
        for audio in tqdm(os.listdir(TRAIN_GO)):
            ext = os.path.splitext(audio)[1]
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TRAIN_GO + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                if clip.shape[0] > 16000:
                    print('Found a clip in GO with shape more than 16000')
                train.append(clip)
                
                label = np.zeros(11)
                label[9] = 1
                labels.append(label)
                
        
        print('Reading sample audios for UNKNOWNS')
        for d in TRAIN_UNKNOWN:
            for audio in tqdm(os.listdir(d)):
                ext = os.path.splitext(audio)[1]
            
                if ext is not 'md' or ext is not 'MD':
                    
                    try:
                        rate, clip = scipy.io.wavfile.read(d + audio)
                        if clip.shape[0] < 16000:
                            clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                    
                        if clip.shape[0] > 16000:                            
                            print('Found a clip in UNKNOWN with shape more than 16000')
                            print(clip.shape)
                            continue
                        train.append(clip)
                    
                        label = np.zeros(11)
                        label[10] = 1
                        labels.append(label)
                        
                    except:
                        print(d + audio)
        
        train = np.array(train)
        labels = np.array(labels)
        
        np.save('train.npy', train)
        np.save('labels.npy', labels)
                    
    else:
        print('Reading testing data...')
        df = pd.DataFrame(columns=['fname', 'label'])
        i = 0
        
        for audio in tqdm(os.listdir(TEST_DIR)):
            ext = os.path.splitext(audio)[1]
            df['fname'][str(i)] = audio
            
            if ext is not 'md' or ext is not 'MD':
                rate, clip = scipy.io.wavfile.read(TEST_DIR + audio)
                if clip.shape[0] < 16000:
                    clip = np.pad(clip, (0, 16000 - len(clip)), 'constant')
                
                if clip.shape[0] > 16000:                            
                    print('Found a clip in TEST DIR with shape more than 16000')
                    print(clip.shape)
                
                test.append(clip)
        
        test = np.array(test)
        np.save('test.npy', test)
        
        df.to_csv('predictions.csv', sep='\t')

In [3]:
if os.path.isfile('./train.npy') == False or os.path.isfile('./labels.npy') == False:
    read_data(train, labels, test)
else:
    train = np.load('train.npy')
    labels = np.load('labels.npy')
    
if os.path.isfile('./test.npy') == False:
    read_data(train, labels, test, is_testing=True)

  1%|          | 969/158538 [00:00<00:16, 9679.74it/s]

Reading testing data...


100%|██████████| 158538/158538 [04:55<00:00, 536.81it/s]


In [4]:
# np.set_printoptions(threshold=np.nan)
train = np.load('train.npy')
print(train.shape)

labels = np.load('labels.npy')
print(labels.shape)
# train[0].shape
# print(train[1].shape)

(62987, 16000)
(62987, 11)


### Input

The neural network needs to read the audio data, one-hot encoded labels, and dropout keep probability. Implement the following functions
* Implement neural_net_input
    * Return a TF Placeholder
    * Set the shape using `shape` with batch size set to None.
    * Name the TensorFlow placeholder "x" using the TensorFlow name parameter in the TF Placeholder.
* Implement neural_net_label
    * Return a TF Placeholder
    * Set the shape using n_classes with batch size set to None.
    * Name the TensorFlow placeholder "y" using the TensorFlow name parameter in the TF Placeholder.
* Implement neural_net_keep_prob_input
    * Return a TF Placeholder for dropout keep probability.
    * Name the TensorFlow placeholder "keep_prob" using the TensorFlow name parameter in the TF Placeholder.
    * These names will be used at the end of the project to load your saved model.

Note: _None_ for shapes in TensorFlow allow for a dynamic size.

In [5]:
def neural_net_input(shape):
    return tf.placeholder(tf.float32, [None, shape[1]], 'x')

def neural_net_label(n_classes=11):
    return tf.placeholder(tf.float32, [None, n_classes], 'y')

def neural_net_keep_prob():
    return tf.placeholder(tf.float32, None, 'keep_prob')

### Input Layer
Implement the input_layer function whose output should be a layer with ___16000___ neurons. Shortcut option: you can use classes from the [TensorFlow Layers](https://www.tensorflow.org/api_docs/python/tf/layers) or [TensorFlow Layers (contrib)](https://www.tensorflow.org/api_guides/python/contrib.layers) packages for this layer. For more of a challenge, only use other TensorFlow packages.

In [6]:
def input_layer(x_tensor, n_neurons, keep_prob):
    features_count=16000
    weight = tf.Variable(tf.truncated_normal((features_count, n_neurons), 0, 0.1))
    bias = tf.Variable(tf.zeros(n_neurons))
    
    model = tf.add(tf.matmul(x_tensor, weight), bias)
    model = tf.nn.relu(model)
    model = tf.nn.dropout(model, keep_prob)
    return model

### Fully-Connected Layer
Implement the `fully_conn` function to create a fully connected layer which is also the hidden layer. Shortcut option: you can use classes from the [TensorFlow Layers](https://www.tensorflow.org/api_docs/python/tf/layers) or [TensorFlow Layers (contrib)] packages for this layer. For more of a challenge, only use other TensorFlow packages.

In [7]:
def fully_conn(x_tensor, n_neurons, keep_prob):
    x_dim = x_tensor.get_shape().as_list()
    
    weight = tf.Variable(tf.truncated_normal(list((x_dim[1],) + (n_neurons,)), 0, 0.1))
    bias = tf.Variable(tf.zeros(n_neurons))
    
    model = tf.add(tf.matmul(x_tensor, weight), bias)
    model = tf.nn.relu(model)
    model = tf.nn.dropout(model, keep_prob)
    return model

### Output Layer
Implement the output function to create the final output layer. Since we need to classify ___11___ labels, this layer will have 11 neurons. Shortcut option: you can use classes from the [TensorFlow Layers](https://www.tensorflow.org/api_docs/python/tf/layers) or [TensorFlow Layers (contrib)](https://www.tensorflow.org/api_guides/python/contrib.layers) packages for this layer. For more of a challenge, only use other TensorFlow packages.

__Note__: Activation, softmax, or cross entropy should __not__ be applied to this.

In [8]:
def output_layer(x_tensor, n_classes=11):
    x_dim = x_tensor.get_shape().as_list()
    weight = tf.Variable(tf.truncated_normal(list((x_dim[1],) + (n_classes,)), 0, 0.01))
    bias = tf.Variable(tf.zeros(n_classes))
    
    output = tf.add(tf.matmul(x_tensor, weight), bias)
    
    return output

### Create Neural Network Model
Implement the function `conv_net` to create a convolutional neural network model. The function takes in a batch of images, x, and outputs logits. Use the layers you created above to create this model:
* Create one input layer
* Apply 1, 2, or 3 hidden layers
* Apply an Output Layer
* Return the output

In [9]:
def neural_net(x, y, keep_prob):
    model = input_layer(x, 128, keep_prob)
#     model = fully_conn(model, 128, keep_prob)
    model = fully_conn(model, 64, keep_prob)
    model = fully_conn(model, 32, keep_prob)
    model = fully_conn(model, 16, keep_prob)
    model = output_layer(model)
    
    return model

##############################
## Build the Neural Network ##
##############################

# Remove previous weights, bias, inputs, etc..
tf.reset_default_graph()

# Inputs
x = neural_net_input(train.shape)
y = neural_net_label()
keep_prob = neural_net_keep_prob()

# Model
logits = neural_net(x, y, keep_prob)

# Name logits Tensor, so that is can be loaded from disk after training
logits = tf.identity(logits, name='logits')

# Loss and Optimization
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

# Accuracy
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')

## Train the Neural Network
### Single Optimization
Implement the function `train_neural_network` to do a single optimization. The optimization should use `optimizer` to optimize in session with a `feed_dict` of the following:
* x for image input
* y for labels
* keep_prob for keep probability for dropout
This function will be called for each batch, so `tf.global_variables_initializer()` has already been called.

Note: Nothing needs to be returned. This function is only optimizing the neural network.

In [10]:
def train_neural_network(session, optimizer, keep_probability, feature_batch, label_batch):
    session.run(optimizer, feed_dict={x:feature_batch, y:label_batch, keep_prob:keep_probability})

## Show Stats
Implement the function `print_stats` to print loss and validation accuracy. Use the global variables `valid_features` and `valid_labels` to calculate validation accuracy. Use a keep probability of 1.0 to calculate the loss and validation accuracy.

In [11]:
def print_stats(session, feature_batch, label_batch, cost, accuracy, val_features, val_labels):
    loss = session.run(cost, feed_dict={x:feature_batch, y:label_batch, keep_prob:1.0})
    valid_acc = session.run(accuracy, feed_dict={
        x: val_features,
        y: val_labels,
        keep_prob: 1.0
    })
    print('Loss: {:>10.4f} Validation Accuracy: {:.6f}'.format(loss, valid_acc))

## Hyperparameters
Tune the following parameters:
* Set epochs to the number of iterations until the network stops learning or start overfitting
* Set batch_size to the highest number that your machine has memory for. Most people set them to common sizes of memory:
    * 64
    * 128
    * 256
    * ...
* Set keep_probability to the probability of keeping a node using dropout

In [12]:
# TODO: Tune Parameters
epochs = 10
batch_size = 32
keep_probability = 0.7

## Train on a Single Batch
Instead of training the neural network on all the batches of data, let's use a single batch. This should save time while you iterate on the model to get a better accuracy. Once the final validation accuracy is 50% or greater, run the model on all the data in the next section.

In [None]:
print('Training...')
with tf.Session() as sess:
    # Initializing the variables
    sess.run(tf.global_variables_initializer())
    
    
    # Training cycle
    for epoch in range(epochs):
        # Loop over all batches
        i = 0
        index = batch_size

        batch_features = train[i:index]
        batch_labels = labels[i:index]
            
        batch_features, val_features, batch_labels, val_labels = train_test_split(batch_features, 
                                                                                     batch_labels, test_size=0.1)
        train_neural_network(sess, optimizer, keep_probability, batch_features, batch_labels)    
        print('Epoch {:>2}, Speech Batch {}:  '.format(epoch + 1, 0), end='')
        print_stats(sess, batch_features, batch_labels, cost, accuracy, val_features, val_labels)

## Fully Train the Model
Now that you got a good accuracy with a single batch, try it with all five batches.

In [13]:
save_model_path = './model'

print('Training...')
with tf.Session() as sess:
    # Initializing the variables
    sess.run(tf.global_variables_initializer())
    
    n_batches = train.shape[0] // batch_size        
    
    # Training cycle
    for epoch in range(epochs):
        i = 0
        index = batch_size        
        # Loop over all batches        
        for batch_i in range(1, n_batches + 1):
            batch_features = train[i:index]
            batch_labels = labels[i:index]
            
            batch_features, val_features, batch_labels, val_labels = train_test_split(batch_features, 
                                                                                     batch_labels, test_size=0.1)
            
            train_neural_network(sess, optimizer, keep_probability, batch_features, batch_labels)
            if batch_i == 1 or batch_i % 100 == 0 or batch_i == 984:
                print('Epoch {:>2}, Speech Batch {}:  '.format(epoch + 1, batch_i), end='')
                print_stats(sess, batch_features, batch_labels, cost, accuracy, val_features, val_labels)

            i += batch_size
            index += batch_size
            
    # Save Model
    saver = tf.train.Saver()
    save_path = saver.save(sess, save_model_path)

Training...
Epoch  1, Speech Batch 1:  Loss:    25.2190 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 100:  Loss:     0.1317 Validation Accuracy: 1.000000
Epoch  1, Speech Batch 200:  Loss:     1.5928 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 300:  Loss:     2.5106 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 400:  Loss:     2.3795 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 500:  Loss:     2.5137 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 600:  Loss:     2.6212 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 700:  Loss:     2.7324 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 800:  Loss:     2.7323 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 900:  Loss:     2.4750 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 984:  Loss:     2.3052 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 1000:  Loss:     2.2755 Validation Accuracy: 0.000000
Epoch  1, Speech Batch 1100:  Loss:     2.1010 Validation Accuracy: 1.000000


Epoch  6, Speech Batch 300:  Loss:     3.3896 Validation Accuracy: 0.000000
Epoch  6, Speech Batch 400:  Loss:     3.2092 Validation Accuracy: 0.000000
Epoch  6, Speech Batch 500:  Loss:     3.0739 Validation Accuracy: 0.000000
Epoch  6, Speech Batch 600:  Loss:     3.2460 Validation Accuracy: 0.000000
Epoch  6, Speech Batch 700:  Loss:     3.1219 Validation Accuracy: 0.000000
Epoch  6, Speech Batch 800:  Loss:     0.7245 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 900:  Loss:     0.6626 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 984:  Loss:     0.6158 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 1000:  Loss:     0.6074 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 1100:  Loss:     0.5579 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 1200:  Loss:     0.5134 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 1300:  Loss:     0.4733 Validation Accuracy: 1.000000
Epoch  6, Speech Batch 1400:  Loss:     0.4371 Validation Accuracy: 1.000000
Epoch  

## Checkpoint
The model has been saved to disk.
## Test Model
Test your model against the test dataset. This will be your final accuracy.

In [23]:
loaded_graph = tf.Graph()

with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(save_model_path + '.meta')
    loader.restore(sess, save_model_path)
    
    loaded_y = loaded_graph.get_tensor_by_name('y:0')
    predictor = tf.argmax(loaded_y,1)
    
    test = np.load('test.npy')
    
    for clip in test:
        predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": clip}, num_epochs=1, shuffle=False)
        predictions = list(classifier.predict(input_fn=predict_input_fn))
#         predicted_classes = [p["classes"] for p in predictions]
#         predictions = sess.run([predictor], feed_dict={x: clip})
        print(predictions)
#         print(sess.run(y, feed_dict={x: clip}))

INFO:tensorflow:Restoring parameters from ./model


NameError: name 'classifier' is not defined