# Importing Libraries

In [1]:
# TensorFlow and Keras Imports
import tensorflow as tf
from tensorflow.keras import regularizers as rg
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed

# Garbage Collection and Pickle Imports
import gc
import pickle as pkl

# Visualization and Data Handling Imports
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd

# Scikit-learn Import
from sklearn.model_selection import train_test_split

# JSON Import
import json

# Time-related Import
import time

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Load Training Data

In [2]:
features_path = "/kaggle/input/timitpreprocessed/features.pkl"
labels_path = "/kaggle/input/timitpreprocessed/labels.pkl"

with open(features_path, 'rb') as pickle_file:
    features = pkl.load(pickle_file)
    
with open(labels_path, 'rb') as pickle_file:
    labels = pkl.load(pickle_file)

# Load Testing Data

In [3]:
test_features_path = "/kaggle/input/timitpreprocessed/test_features.pkl"
test_labels_path = "/kaggle/input/timitpreprocessed/test_labels.pkl"

with open(test_features_path, 'rb') as pickle_file:
    test_features = pkl.load(pickle_file)
    
with open(test_labels_path, 'rb') as pickle_file:
    test_labels = pkl.load(pickle_file)

## Create mapping functions for Phonemes

In [4]:
map_phonemes_61_to_39 = {
            'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
            'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
            'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
            'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
            'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
            'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
            'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
            'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#' 
        }

phonemes_list_61 = list(map_phonemes_61_to_39.keys())
phonemes_list_39 = list(set(map_phonemes_61_to_39.values()))

label_to_phoneme39 = {}
phoneme39_to_label = {}
for index,phoneme in enumerate(phonemes_list_39):
    label_to_phoneme39[phoneme] = index + 1
    phoneme39_to_label[index + 1] = phoneme

map_phonemes_39_to_61 = {}

for phoneme61,phoneme39 in map_phonemes_61_to_39.items():
    if not phoneme39 in map_phonemes_39_to_61:
        map_phonemes_39_to_61[phoneme39] = []
    map_phonemes_39_to_61[phoneme39].append(phoneme61)

# Build LSTM Model

In [5]:


def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    model = tf.keras.Sequential()

    model.add(LSTM(1000, input_shape=(None, input_dim), return_sequences=True))
    model.add(LSTM(1000, return_sequences=True))
    model.add(LSTM(1000, return_sequences=True))
    model.add(TimeDistributed(Dense(output_dim, activation='softmax')))
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss='categorical_crossentropy', run_eagerly=True, metrics=['accuracy'])
    return model

n_mels = 64
fft_length =  384
# Get the model
model = build_model(
    input_dim= n_mels*3,
    output_dim=len(phoneme39_to_label),
    rnn_units=512,
)
model.summary(line_length=110)

Model: "sequential"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 lstm (LSTM)                                     (None, None, 1000)                          4772000          
                                                                                                              
 lstm_1 (LSTM)                                   (None, None, 1000)                          8004000          
                                                                                                              
 lstm_2 (LSTM)                                   (None, None, 1000)                          8004000          
                                                                                                              
 time_distributed (TimeDistributed)              (None, None, 39)                           

## Create callback class to computer Phoneme Error Rate

In [6]:
performance_rnn = {}

import numpy as np
from tensorflow.keras.callbacks import Callback
import Levenshtein as lev

class PERCallback(Callback):
    def __init__(self, X_val, y_val, phoneme_mapping):
        self.X_val = X_val
        self.X_val = self.X_val.reshape(self.X_val.shape[0], 1, self.X_val.shape[1])
        self.y_val = y_val
        self.phoneme_mapping = phoneme_mapping

    def on_epoch_end(self, epoch, logs=None):
        # Get the model predictions
        predictions = self.model.predict(self.X_val)
        
        # Convert one-hot encoded vectors to phoneme sequences
        reference_phonemes = [self.one_hot_to_phoneme(vec) for vec in self.y_val]
        predicted_phonemes = [self.one_hot_to_phoneme(vec) for vec in predictions]

        # Calculate PER
        per = self.calculate_per(" ".join(reference_phonemes), " ".join(predicted_phonemes))
        performance_rnn[epoch] = per
        # Print PER
        print(f'\n Phoneme Error Rate after epoch {epoch}: {per}%')

    def one_hot_to_phoneme(self, one_hot_vector):
        index = np.argmax(one_hot_vector)
        return self.phoneme_mapping.get(index-1, "")
    
    import Levenshtein as lev
    
    def calculate_per(self, reference, hypothesis):
        distance = lev.distance(reference, hypothesis)
        per = distance / len(reference)
        return per * 100

In [7]:
def preprocess_features(features, labels):
    # Map needs a fuction. The features are already preprocessed
    return features, labels

# Train the model

In [9]:
# Define the number of epochs.
batch_size = 128

# This will split your data so that 70% is used for training and 30% for testing.
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.3, random_state=42)

X_train_reshaped = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_val_reshaped = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])

y_train_reshaped = y_train.reshape(y_train.shape[0], 1, y_train.shape[1])
y_val_reshaped = y_val.reshape(y_val.shape[0], 1, y_val.shape[1])

train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train_reshaped, y_train_reshaped)
)
train_dataset = (
    train_dataset.map(preprocess_features, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

validation_dataset = tf.data.Dataset.from_tensor_slices(
     (X_val_reshaped, y_val_reshaped)
)
validation_dataset = (
    validation_dataset.map(preprocess_features, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

epochs = 10

# Train the model
start_time = time.time()

history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[PERCallback(X_val, y_val, phoneme39_to_label)]
)

end_time = time.time()

# Calculate the training time
training_time = end_time - start_time
print(f"Training time: {training_time:.2f} seconds")

Epoch 1/10

 Phoneme Error Rate after epoch 0: 30.428978611665958%
Epoch 2/10

 Phoneme Error Rate after epoch 1: 29.1427160715488%
Epoch 3/10

 Phoneme Error Rate after epoch 2: 28.143097605314544%
Epoch 4/10

 Phoneme Error Rate after epoch 3: 27.43649707116727%
Epoch 5/10

 Phoneme Error Rate after epoch 4: 26.718226092420945%
Epoch 6/10

 Phoneme Error Rate after epoch 5: 26.279148057544273%
Epoch 7/10

 Phoneme Error Rate after epoch 6: 25.86242341270732%
Epoch 8/10

 Phoneme Error Rate after epoch 7: 25.600107727180916%
Epoch 9/10

 Phoneme Error Rate after epoch 8: 25.41490674865902%
Epoch 10/10

 Phoneme Error Rate after epoch 9: 25.15241151783109%
Training time: 9722.65 seconds


# Save training performance metrics

In [10]:

# Collect training history metrics
training_metrics = {
    'epochs': list(range(1, len(history.history['accuracy']) + 1)),
    'accuracy': history.history['accuracy'],
    'loss': history.history['loss'],
    'val_acc': history.history['val_accuracy'],
    'val_loss': history.history['val_loss']
}

# Save the training metrics to a JSON file
with open('/kaggle/working/history_rnn.json', 'w') as file:
    json.dump(training_metrics, file, indent=4)
    
# Save the PER metrics to a JSON file
with open('/kaggle/working/per_rnn.json', 'w') as file:
    json.dump(performance_rnn, file, indent=4)

# Testing block

In [11]:
def compute_per(X, y, model, phoneme_mapping):
    X = X.reshape(X.shape[0], 1, X.shape[1])
    # Get the model predictions
    predictions = model.predict(X)
    
    # Convert one-hot encoded vectors to phoneme sequences
    reference_phonemes = [one_hot_to_phoneme(vec, phoneme_mapping) for vec in y]
    predicted_phonemes = [one_hot_to_phoneme(vec, phoneme_mapping) for vec in predictions]

    # Calculate PER
    per = calculate_per(" ".join(reference_phonemes), " ".join(predicted_phonemes))
    
    return per

def one_hot_to_phoneme(one_hot_vector, phoneme_mapping):
    index = np.argmax(one_hot_vector)
    return phoneme_mapping.get(index-1, "")
    
def calculate_per(reference, hypothesis):
    distance = lev.distance(reference, hypothesis)
    per = distance / len(reference)
    return per * 100

Testing block# Testing

In [12]:
X_test = test_features
y_test = test_labels
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
y_test = y_test.reshape(y_test.shape[0], 1, y_test.shape[1])

test_dataset = tf.data.Dataset.from_tensor_slices(
    (X_test, y_test)
)

test_dataset = (
    test_dataset.map(preprocess_features, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

## Save the model

In [13]:
# Save the model as an HDF5 file
model.save("/kaggle/working/rnn_model.h5")


# Compute testing performance metrics

In [14]:
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)



In [15]:
# Compute PER on the test data
per = compute_per(test_features, test_labels, model, phoneme39_to_label)

print(f'Loss: {loss}, Accuracy: {accuracy}, Phoneme Error Rate: {per}%')

Loss: 1.3605653047561646, Accuracy: 0.5855926275253296, Phoneme Error Rate: 25.233395946059606%
