# Imports

### Imports

In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import librosa
import keras
import random
from tqdm import tqdm
import keras
from training_models import *


### Read the file paths

In [None]:
# Load the table of preprocessed file paths and split it 80/20 by row order:
# the leading 80% of rows is the training set, the remainder is validation.
df = pd.read_csv('preprocessed/file_paths.csv')
_split_at = int(len(df) * 0.8)
df_train = df[:_split_at]
df_val = df[_split_at:]

In [None]:
# Preview the first rows of the file-path table.
df.head()

# Data Preprocessing

### Batch pad the audio files

In [None]:
# Pad the audio feature arrays batch by batch: within each group of
# `batch_size` consecutive files, every (transposed) array is padded/truncated
# along axis 0 to the longest array of that group.
audio_paths = list(df['audio path'])
batch_size = 32

audio_data = []
for i in tqdm(range(0, len(audio_paths), batch_size)):
    batch_paths = audio_paths[i:i + batch_size]
    # Load every file exactly once; the same arrays are used both to find
    # the batch maximum and to build the padded output.
    batch_audio = [np.transpose(np.load(path)) for path in batch_paths]
    max_length = max(item.shape[0] for item in batch_audio)
    audio_data.extend(
        librosa.util.fix_length(item, size=max_length, axis=0)
        for item in batch_audio
    )

### Batch pad the text files

In [None]:
# Pad the encoded transcripts batch by batch: within each group of
# `batch_size` consecutive files, every (transposed) array is padded/truncated
# along axis 0 to the longest array of that group.
text_paths = list(df['text path'])
batch_size = 32

text_data = []
for i in tqdm(range(0, len(text_paths), batch_size)):
    batch_paths = text_paths[i:i + batch_size]
    # Load every file exactly once; the same arrays are used both to find
    # the batch maximum and to build the padded output.
    batch_text = [np.transpose(np.load(path)) for path in batch_paths]
    max_length = max(item.shape[0] for item in batch_text)
    text_data.extend(
        librosa.util.fix_length(item, size=max_length, axis=0)
        for item in batch_text
    )

### Save the batch padded data

In [None]:
# Items are padded per batch, so lengths differ between batches and the lists
# are ragged. Recent NumPy versions refuse to infer an object array from a
# ragged list, so build 1-D object arrays explicitly, one element per sample.
# Filling element by element avoids NumPy trying to broadcast the items.
audio_data_padded = np.empty(len(audio_data), dtype=object)
for idx, item in enumerate(audio_data):
    audio_data_padded[idx] = item

text_data_padded = np.empty(len(text_data), dtype=object)
for idx, item in enumerate(text_data):
    text_data_padded[idx] = item

# Object arrays are stored via pickle; the loading cells pass allow_pickle=True.
np.save("padded_data/audio data padded.npy", audio_data_padded)
np.save("padded_data/text data padded.npy", text_data_padded)

# Data Experimentation

### Load the batch padded data

In [None]:
# Reload the padded audio array written by the save cell.
# NOTE(review): allow_pickle=True unpickles file contents, which can execute
# arbitrary code -- only load files produced by this notebook.
audio_data = np.load("padded_data/audio data padded.npy", allow_pickle= True)

In [None]:
# Reload the padded transcript array written by the save cell.
# NOTE(review): allow_pickle=True unpickles file contents, which can execute
# arbitrary code -- only load files produced by this notebook.
text_data = np.load("padded_data/text data padded.npy", allow_pickle= True)

In [None]:
# Same batch size as the one used when the data was padded.
batch_size = 32

### Experimenting with data shapes

In [None]:
# Scratch 32 x 60 array of zeros used to try out row assignment below.
dummy = np.zeros((32, 60))

In [None]:
# First element of the first 32 transcripts (i.e. text_data[0]).
text_data[:32][0]

In [None]:
# Shape of the scratch array before assignment.
dummy.shape

In [None]:
# Copy the first transcript into row 0; this only works if text_data[0]
# broadcasts to the row length of `dummy` (60).
dummy[0] = text_data[0]

In [None]:
# Show row 0 after the assignment.
dummy[0]

In [None]:
# Shape of the scratch array after the row assignment.
dummy.shape

In [None]:
# Type of a single row of the scratch array.
type(dummy[0])

In [None]:
# Type of a single element of the loaded audio data.
type(audio_data[1])

# Model Training

### CTC Loss function

In [None]:
def CTCLoss(y_true, y_pred):
    """Return the per-sample CTC loss for a batch.

    Every sample is assigned the same input length (second dimension of
    ``y_pred``) and the same label length (second dimension of ``y_true``);
    both are passed to ``keras.backend.ctc_batch_cost`` as int64 column
    tensors with one row per sample.
    """
    n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
    pred_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Column of ones, one row per sample, used to repeat the scalar lengths.
    per_sample = tf.ones(shape=(n_samples, 1), dtype="int64")

    return keras.backend.ctc_batch_cost(
        y_true,
        y_pred,
        pred_steps * per_sample,
        label_steps * per_sample,
    )

### Vocabulary

In [None]:
# Characters allowed in a transcription: lowercase a-z and the space.
characters = list("abcdefghijklmnopqrstuvwxyz ")

# Forward lookup: character -> integer id.
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")

# Inverse lookup built from the forward layer's vocabulary: integer id -> character.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

In [None]:
def decode_batch_predictions(pred):
    """Greedy-decode a batch of predictions into a list of strings.

    Every sample is decoded with the full second dimension of ``pred`` as its
    input length. Decoded ids are mapped back to characters with
    ``num_to_char`` and joined into one UTF-8 string per sample.
    """
    n_samples, n_steps = pred.shape[0], pred.shape[1]
    seq_lengths = np.ones(n_samples) * n_steps

    # Greedy search; beam search is an option for harder tasks.
    decoded = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)[0][0]

    return [
        tf.strings.reduce_join(num_to_char(seq)).numpy().decode("utf-8")
        for seq in decoded
    ]

### Model 1

In [None]:
# Build the first model. input_dim=13 matches the 13-wide last axis of the
# audio batches assembled in the training loop.
# NOTE(review): output_dim=30 presumably covers the vocabulary plus the CTC
# blank -- confirm against the vocabulary size printed above.
model = simple_rnn_model(input_dim=13, output_dim=30)

In [None]:
# Train with the Adam optimizer and the CTCLoss function defined above.
model.compile(optimizer='adam', loss=CTCLoss)

In [None]:
# Running list of per-batch losses (appended to by the training loop).
loss_metrics = []
# Number of samples taken from the data per training step.
batch_size = 32

In [None]:
# Progress bars: one over the 20 epochs, one over the start offsets of each
# batch in the audio data.
epoch_loop = tqdm(list(range(20)))
batch_loop = tqdm(list(range(0, len(audio_data), batch_size)))

In [None]:
# Manual training loop: for every epoch, walk over the data in slices of
# `batch_size`, copy each slice into dense arrays and run one training step.
# NOTE(review): `batch_loop` is a single tqdm object reused for every epoch --
# confirm it re-iterates as intended after the first pass.
for epoch in epoch_loop:
    for batch_num, i in enumerate(batch_loop, start=1):
        audio = audio_data[i:i + batch_size]
        text = text_data[i:i + batch_size]

        # Size the buffers by the actual slice length: the final slice can be
        # shorter than `batch_size`, and a fixed 32-row buffer would pad it
        # with all-zero samples and labels that the model then trains on.
        n_samples = len(audio)
        audio_shape = audio[0].shape
        text_shape = text[0].shape

        audio_batch = np.zeros((n_samples, audio_shape[0], 13))
        text_batch = np.zeros((n_samples, text_shape[0]))

        for j, data in enumerate(zip(audio, text)):
            audio_batch[j] = data[0]
            text_batch[j] = data[1]

        z = model.train_on_batch(audio_batch, text_batch)
        loss_metrics.append(z)
        # Mean over every batch seen so far, across all epochs.
        avg_loss_metrics = np.mean(loss_metrics)
        epoch_loop.set_postfix_str("Epoch:= {}      Batch:= {}      Loss:= {}      Average Loss:= {}".format(epoch + 1, batch_num, round(z, 3), round(avg_loss_metrics, 3)))

In [None]:
# Persist the trained model to disk.
model.save('models/model1.h5')