# In [None]:
#################################
#   Frame Level Speech Recognition with Neural Networks
#################################


import os
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np


def loadRaw(path, name):
    """Load a dataset split and its labels from the directory ``path``.

    Reads ``<name>.npy`` and ``<name>_labels.npy`` and returns them as a
    ``(features, labels)`` tuple, features first.

    NOTE: ``allow_pickle=True`` lets NumPy unpickle object arrays, which can
    execute arbitrary code. Only call this on files from a trusted source.
    """
    features_file = os.path.join(path, '{}.npy'.format(name))
    labels_file = os.path.join(path, '{}_labels.npy'.format(name))

    features = np.load(features_file, encoding='bytes', allow_pickle=True)
    labels = np.load(labels_file, encoding='bytes', allow_pickle=True)
    return features, labels
# Load Data
# The dataset directory defaults to the location used when this script was
# written. Set the FRAME_DATA_DIR environment variable to load the 'dev'
# split from another directory without editing this file.
DATA_DIR = os.environ.get(
    "FRAME_DATA_DIR",
    "C:/Users/LENOVO/Downloads/1603866758-5e748a2d5fc288e9f69c5f86 (3)/d611ef4f2eccddb5581e0ac617ce38eb-fa231bbdb1390a35acb20526d7302f26b5f30ea2",
)
temp_X, temp_Y = loadRaw(DATA_DIR, 'dev')

#   One Frame - One Phoneme Model.
# In this model, we assume a 1 frame to 1 phoneme mapping. 
# This means that the input layer has a length of 40, while the output layer has a length of 138. 
# To handle the categorical labels of the data, we use Sparse Categorical Cross entropy as the loss function and the ADAM optimizer. 
# The activation function for all layers, except the last layer, is ReLU, while the last layer uses Softmax.
# We experimented with the number of layers and found that there was only a slight improvement in test accuracy between 2 and 3 layers. 
# However, to be cautious, we chose to use 4 layers. 
# The number of nodes in each layer was chosen randomly, but this can also be optimized.


# Concat and shuffle all frames.
# Every array in temp_X / temp_Y is joined end to end into a single pool, then
# one random permutation is applied to both so frame i keeps label i.
train_X = np.concatenate(temp_X)
train_Y = np.concatenate(temp_Y)
frame_order = np.random.permutation(len(train_X))
train_X, train_Y = train_X[frame_order], train_Y[frame_order]
# Naive model: four fully connected layers; the last one emits a softmax
# distribution over 138 output classes.
modelA = tf.keras.models.Sequential([
    tf.keras.layers.Dense(90, activation='relu'),
    tf.keras.layers.Dense(270, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(138, activation='softmax'),
])
# The final layer already applies softmax, so the loss receives probabilities,
# not logits. from_logits must therefore be False; with True the loss would
# apply a second softmax on top of the model output.
lossFunction = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
modelA.compile(optimizer='adam',
              loss=lossFunction,
              metrics=['accuracy'])
# NOTE(review): the evaluation rows (10000:25000) lie inside the rows used for
# fitting (0:100000), so both evaluate() calls score frames the model is
# trained on rather than held-out data.
naive_eval_rows = slice(10000, 25000)
naive_fit_rows = slice(None, 100000)

# Score of the freshly initialised network, before any training.
modelA.evaluate(train_X[naive_eval_rows], train_Y[naive_eval_rows])

historyA = modelA.fit(
    train_X[naive_fit_rows],
    train_Y[naive_fit_rows],
    epochs=50,
    verbose=2,
)

# Per-epoch training accuracy recorded by fit().
plt.plot(historyA.history['accuracy'])
plt.title("Naive model, accuracy vs epoch")
plt.show()

# Same rows as the pre-training evaluation, now scored after training.
modelA.evaluate(train_X[naive_eval_rows], train_Y[naive_eval_rows])



#   Context Based Model
# In this model, we consider adjacent frames to contain information relevant to the phoneme being predicted. 
# Since most of the sounds we produce last longer than a single frame of 10ms, we combine i-1, i, and i+1 frames to form a list of 120 elements, 
# where the ith phoneme is the corresponding label. 
# Due to the categorical nature of data labels, we use SparseCategoricalCrossentropy as the loss function and ADAM optimizer, 
# which is commonly used in similar models. 
# The activation function for all layers except the last one is ReLU, while softmax is used for the final layer.
# To determine the optimal number of layers, we increased the number of layers and checked whether the test accuracy increased.
# We found that there was only a slight improvement between 2 and 3 layers, so we chose 4 layers to err on the side of caution. 
# The number of nodes in each layer was randomly selected but can be optimized as well.


# Build context windows: each sample is the concatenation of frames
# i-CONTEXT_FRAMES .. i+CONTEXT_FRAMES and is labelled with the label of the
# centre frame i.
# Assumes every element of temp_X is a 2-D (frames, features) array and the
# matching element of temp_Y has one label per frame -- TODO confirm.
CONTEXT_FRAMES = 1  # neighbours taken on each side of the centre frame
WINDOW_FRAMES = 2 * CONTEXT_FRAMES + 1

window_parts = []
label_parts = []
for utterance, utterance_labels in zip(temp_X, temp_Y):
    # Every frame with a full set of neighbours is a valid centre: indices
    # CONTEXT_FRAMES .. len-1-CONTEXT_FRAMES inclusive. (The previous loop
    # bound, len-2, silently dropped the last valid centre of each array.)
    n_centres = len(utterance) - 2 * CONTEXT_FRAMES
    if n_centres <= 0:
        # Too short to hold even one complete window.
        continue
    # Shifted views of the same array placed side by side: row j holds frames
    # j, j+1, ..., j+WINDOW_FRAMES-1, i.e. the window centred on frame
    # j + CONTEXT_FRAMES, in the same left-to-right order as before.
    window_parts.append(np.concatenate(
        [utterance[offset:offset + n_centres] for offset in range(WINDOW_FRAMES)],
        axis=1,
    ))
    label_parts.append(utterance_labels[CONTEXT_FRAMES:CONTEXT_FRAMES + n_centres])
trainX_wc = np.concatenate(window_parts)
trainY_wc = np.concatenate(label_parts)
# Shuffle windows and labels with one shared random permutation so every
# window keeps its own label.
window_order = np.random.permutation(len(trainX_wc))
trainX_wc, trainY_wc = trainX_wc[window_order], trainY_wc[window_order]
# Context-based model: four fully connected layers; the last one emits a
# softmax distribution over 138 output classes.
modelB = tf.keras.models.Sequential([
    tf.keras.layers.Dense(180, activation='relu'),
    tf.keras.layers.Dense(270, activation='relu'),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dense(138, activation='softmax'),
])

# The final layer already applies softmax, so the loss receives probabilities,
# not logits. from_logits must therefore be False; with True the loss would
# apply a second softmax on top of the model output.
lossFunction = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

modelB.compile(optimizer='adam',
              loss=lossFunction,
              metrics=['accuracy'])

# NOTE(review): both evaluation ranges below lie inside the rows used for
# fitting (0:100000), so they score windows the model is trained on rather
# than held-out data. The pre- and post-training evaluations also use
# different rows (10000:25000 vs 25000:75000), so they are not directly
# comparable.
context_fit_rows = slice(None, 100000)
context_eval_before = slice(10000, 25000)
context_eval_after = slice(25000, 75000)

# Score of the freshly initialised network, before any training.
modelB.evaluate(trainX_wc[context_eval_before], trainY_wc[context_eval_before])

historyB = modelB.fit(
    trainX_wc[context_fit_rows],
    trainY_wc[context_fit_rows],
    epochs=50,
    verbose=2,
)

# Per-epoch training accuracy recorded by fit().
plt.plot(historyB.history['accuracy'])
plt.title("Context based model, accuracy vs epoch")
plt.show()

modelB.evaluate(trainX_wc[context_eval_after], trainY_wc[context_eval_after])



###   The Naive model yielded an accuracy of 32.65%, while the Context-based model achieved 42.36%. 
# Additionally, the graph shows that the Context-based model trains to the maximum value more rapidly than the Naive model. 
# This indicates that using the Context-based model has a significant advantage over the Naive model. 
# To determine the optimal amount of context, one can run the code with a higher number of context frames and identify the point of maximum accuracy. 
# It is essential to note that selecting an overly extended segment may result in phoneme overlaps, leading to a decrease in accuracy.