# Import libraries

In [1]:
import numpy as np
import pandas as pd
import json

import random
from python_speech_features import mfcc
import librosa
import librosa.display
from IPython.display import Audio
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
%matplotlib inline




import _pickle as pickle
from numpy.lib.stride_tricks import as_strided

from keras.layers import (Input, Lambda)

from keras.callbacks import ModelCheckpoint   
import os
import sys
sys.path.append("../")
# Custom scripts
from scripts.generate_amharic_characters import GenerateCharacters
from scripts.metadata_loader import MetaDataLoader
from scripts.audio_generator import make_audio_gen
from scripts.model_arch import model_1, train

char_gen = GenerateCharacters()
md_loader = MetaDataLoader()


2022-06-04 15:16:57.595960: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-04 15:16:57.595995: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
amharic_root = "../data/AMHARIC"

# Generate the unique characters

In [3]:
train_corpus_path = "../data/AMHARIC/transcriptions_amharic.csv"
characters = char_gen.get_characters(md_path=train_corpus_path)
characters = sorted(characters)
characters = characters[1:]

In [4]:
char_map = {}
char_map[""] = 0
char_map["<SPACE>"] = 1
index = 2
for c in characters:
    char_map[c] = index
    index += 1
index_map = {v+1: k for k, v in char_map.items()}

# Visualize a sample audio

In [5]:
audio_path_root = '../data/AMHARIC/train/wav/'
audio_paths = os.listdir(audio_path_root)
audio_paths = list(map(lambda x: os.path.join(audio_path_root, x), audio_paths))


In [6]:
samples, sample_rate = librosa.load(audio_paths[100])
samples

array([-0.00433602, -0.00485682, -0.00401515, ..., -0.00366886,
       -0.00373237,  0.        ], dtype=float32)

In [7]:
Audio(audio_paths[555])
# samples

# Convert corpus to json

In [8]:
trian_corpus = pd.read_csv(train_corpus_path)
trian_corpus.key = trian_corpus.key.apply(lambda p: os.path.join(amharic_root , p.replace("data/", "")) )

In [9]:
train_len = int(len(trian_corpus) * 0.8)
train_set = trian_corpus.iloc[:train_len, :]
validation = trian_corpus.iloc[train_len:, :]

train_set = train_set.to_dict(orient='records')
validation = validation.to_dict(orient='records')

with open(os.path.join(amharic_root, "train_corpus.json"), "w") as f:
    json.dump(train_set, f, ensure_ascii=False)
with open(os.path.join(amharic_root, "valid_corpus.json"), "w") as f:
    json.dump(validation, f, ensure_ascii=False)


In [10]:
TRAIN_CORPUS = os.path.join(amharic_root, "train_corpus.json")
VALID_CORPUS = os.path.join(amharic_root, "valid_corpus.json")

MFCC_DIM = 13
SPECTOGRAM = True
EPOCHS = 1
MODEL_NAME = "RNN_model"

################ Reminder MINI_BATCH_SIZE=250 
MINI_BATCH_SIZE = 250

SORT_BY_DURATION=False
MAX_DURATION = 10.0

audio_gen = make_audio_gen(TRAIN_CORPUS, 
                        VALID_CORPUS, 
                        spectrogram=False, 
                        mfcc_dim=MFCC_DIM,
                        minibatch_size=MINI_BATCH_SIZE, 
                        sort_by_duration=SORT_BY_DURATION,
                        max_duration=MAX_DURATION, 
                        char_map=char_map)
# add the training data to the generator
audio_gen.load_train_data()
audio_gen.load_validation_data()

8700
2175


In [11]:
model = model_1(input_dim=13,
                units=5,
                activation='relu',
                output_dim=len(char_map)+1)

2022-06-04 15:17:06.003063: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-06-04 15:17:06.003112: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-31-60-46.ec2.internal): /proc/driver/nvidia/version does not exist
2022-06-04 15:17:06.003911: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 13)]        0         
                                                                 
 rnn (GRU)                   (None, None, 5)           300       
                                                                 
 batch_normalization (BatchN  (None, None, 5)          20        
 ormalization)                                                   
                                                                 
 time_distributed (TimeDistr  (None, None, 224)        1344      
 ibuted)                                                         
                                                                 
 softmax (Activation)        (None, None, 224)         0         
                                                                 
Total params: 1,664
Trainable params: 1,654
Non-trainable par

In [12]:

train(audio_gen, 
    input_to_softmax=model, 
    model_name=MODEL_NAME, 
    epochs=EPOCHS, 
    minibatch_size=MINI_BATCH_SIZE
    )

  callbacks=[checkpointer], verbose=verbose, use_multiprocessing=True)




# Predictions

In [22]:
from keras import backend as K
def predict(data_gen, index, partition, model, verbose=True):
    """ Print a model's decoded predictions
    Params:
        data_gen: Data to run prediction on
        index (int): Example to visualize
        partition (str): Either 'train' or 'validation'
        model (Model): The acoustic model
    """
    audio_path,data_point,transcr,prediction = predict_raw(data_gen, index, partition, model)
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
                prediction, output_length, greedy=False)[0][0])+1).flatten().tolist()
    predicted = ''.join(int_sequence_to_text(pred_ints)).replace("<SPACE>", " ")
    wer_val = wer(transcr, predicted)
    if verbose:
        display(Audio(audio_path, embed=True))
        print('Truth: ' + transcr)
        print('Predicted: ' + predicted)
        print("wer: %d" % wer_val)
    return wer_val

In [14]:
def predict_raw(data_gen, index, partition, model):
    """ Get a model's decoded predictions
    Params:
        data_gen: Data to run prediction on
        index (int): Example to visualize
        partition (str): Either 'train' or 'validation'
        model (Model): The acoustic model
    """

    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')
        
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    return (audio_path,data_point,transcr,prediction)

In [18]:
def int_sequence_to_text(int_sequence):
    """ Convert an integer sequence to text """
    text = []
    for c in int_sequence:
        ch = index_map[c]
        text.append(ch)
    return text
# Code adapted from https://martin-thoma.com/word-error-rate-calculation/
def wer(r, h):
    """
    Calculation of WER with Levenshtein distance.

    Works only for iterables up to 254 elements (uint8).
    O(nm) time ans space complexity.

    Parameters
    ----------
    r : list
    h : list

    Returns
    -------
    int

    Examples
    --------
    >>> wer("who is there".split(), "is there".split())
    1
    >>> wer("who is there".split(), "".split())
    3
    >>> wer("".split(), "who is there".split())
    3
    """
    # initialisation
    import numpy
    d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitution = d[i-1][j-1] + 1
                insertion    = d[i][j-1] + 1
                deletion     = d[i-1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)

    return d[len(r)][len(h)]
def calculate_wer(model, model_name, data_gen, partition, length):
    start = time.time()
    def wer_single(i):
        wer = predict(data_gen, i, partition, model, verbose=False)
        if (i%100==0) and i>0:
            print("processed %d in %d minutes" % (i, ((time.time() - start)/60)))
        return wer
    wer = list(map(lambda i: wer_single(i), range(1, length)))
    print("Total time: %f minutes" % ((time.time() - start)/60))
    filename = 'models/' + model_name + '_' + partition + '_wer.pickle'
    with open(filename, 'wb') as handle:
        pickle.dump(wer, handle)
    return wer
def load_wer(model_name, partition):
    filename = 'models/' + model_name + '_' + partition + '_wer.pickle'
    return pickle.load(open(filename, "rb"))
def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
    """
    Compute the spectrogram for a real signal.
    The parameters follow the naming convention of
    matplotlib.mlab.specgram

    Args:
        samples (1D array): input audio signal
        fft_length (int): number of elements in fft window
        sample_rate (scalar): sample rate
        hop_length (int): hop length (relative offset between neighboring
            fft windows).

    Returns:
        x (2D array): spectrogram [frequency x time]
        freq (1D array): frequency of each row in x

    Note:
        This is a truncating computation e.g. if fft_length=10,
        hop_length=5 and the signal has 23 elements, then the
        last 3 elements will be truncated.
    """
    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"

    window = np.hanning(fft_length)[:, None]
    window_norm = np.sum(window**2)

    # The scaling below follows the convention of
    # matplotlib.mlab.specgram which is the same as
    # matlabs specgram.
    scale = window_norm * sample_rate

    trunc = (len(samples) - fft_length) % hop_length
    x = samples[:len(samples) - trunc]

    # "stride trick" reshape to include overlap
    nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
    nstrides = (x.strides[0], x.strides[0] * hop_length)
    x = as_strided(x, shape=nshape, strides=nstrides)

    # window stride sanity check
    assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])

    # broadcast window, compute fft over columns and square mod
    x = np.fft.rfft(x * window, axis=0)
    x = np.absolute(x)**2

    # scale, 2.0 for everything except dc and fft_length/2
    x[1:-1, :] *= (2.0 / scale)
    x[(0, -1), :] /= scale

    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])

    return x, freqs
def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
                          eps=1e-14):
    """ Calculate the log of linear spectrogram from FFT energy
    Params:
        filename (str): Path to the audio file
        step (int): Step size in milliseconds between windows
        window (int): FFT window size in milliseconds
        max_freq (int): Only FFT bins corresponding to frequencies between
            [0, max_freq] are returned
        eps (float): Small value to ensure numerical stability (for ln(x))
    """
    with soundfile.SoundFile(filename) as sound_file:
        audio = sound_file.read(dtype='float32')
        sample_rate = sound_file.samplerate
        if audio.ndim >= 2:
            audio = np.mean(audio, 1)
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             " sample rate")
        if step > window:
            raise ValueError("step size must not be greater than window size")
        hop_length = int(0.001 * step * sample_rate)
        fft_length = int(0.001 * window * sample_rate)
        pxx, freqs = spectrogram(
            audio, fft_length=fft_length, sample_rate=sample_rate,
            hop_length=hop_length)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
    return np.transpose(np.log(pxx[:ind, :] + eps))

In [20]:
model = model_1(input_dim=13,
                units=5,
                activation='relu',
                output_dim=len(char_map)+1)
model.load_weights('../models/RNN_model.h5')

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 13)]        0         
                                                                 
 rnn (GRU)                   (None, None, 5)           300       
                                                                 
 batch_normalization_2 (Batc  (None, None, 5)          20        
 hNormalization)                                                 
                                                                 
 time_distributed_2 (TimeDis  (None, None, 224)        1344      
 tributed)                                                       
                                                                 
 softmax (Activation)        (None, None, 224)         0         
                                                                 
Total params: 1,664
Trainable params: 1,654
Non-trainable p

In [24]:
index_map[0]=''
predict(audio_gen,14, 'train', model)
_,_,_,raw_pred = predict_raw(audio_gen,14, 'train', model)
raw_pred_char = np.vstack([sorted(char_map.keys(), key=lambda k: char_map[k]) + ['BLANK'], raw_pred[0]])




Truth:  ኢትዮጵያ ውስጥ የ ነበሩት ኤርትራውያን ም ተመሳሳይ እድል ነበራቸው 
Predicted:  
wer: 43
