In [1]:
!pip install python_speech_features
!pip install jiwer
!pip install wandb



In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import tensorflow as tf
from scipy.io import wavfile
from tqdm import tqdm
import os
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import soundfile as sf
from scipy import signal
import pandas as pd
import python_speech_features
from tensorflow import keras
import itertools
from jiwer import wer
import wandb
from wandb.keras import WandbCallback
import pandas as pd

In [4]:
#!wget http://206.12.93.90:8080/LJSpeech-1.1/LJSpeech-1.1.tar.gz
#!tar -xzvf  LJSpeech-1.1.tar.gz 2> /dev/null
#!rm LJSpeech-1.1.tar.gz

In [5]:
df = pd.read_csv('LJSpeech-1.1/metadata.csv', sep='|', header=None)
df.columns = ["filename", "transcript", "col3"]
df = df[["filename", "transcript"]]
df.head(3)

Unnamed: 0,filename,transcript
0,LJ001-0001,"Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...


In [6]:
wandb.init(project='DeepSpeech2', entity='ngocdunghuynh')

[34m[1mwandb[0m: Currently logged in as: [33mngocdunghuynh[0m (use `wandb login --relogin` to force relogin)


In [7]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, dataset, batch_size=32, ds_type = "train"):
        'Initialization'
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_len_train = 0
        self.ds_type =  ds_type
        self.char_mapping = {' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10,
                'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20,
                'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, "'": 27,'': 28}
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.dataset) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        batch_data = [self.dataset.iloc[k] for k in indexes]
        audios, labels = self.__data_generation(batch_data)
        return audios, labels

        'text processing'
    def text_to_idx(self, text):
        text  = text.lower()
        idx = []
        for chr in text:
            if chr in self.char_mapping :
                idx.append(self.char_mapping[chr])
        return idx
          
        'normalize raw audio'
                
    def normalize(self, audio):
        gain = 1.0 / (np.max(np.abs(audio)) + 1e-5)
        return audio * gain
        
        'standardize FBANK'
    def standardize(self,features):
        mean = np.mean(features)
        std = np.std(features)
        return (features - mean) / std
        
        'FBAnk processing'
    def audio_to_features(self, audio):
        sf, audio = wavfile.read(f"./LJSpeech-1.1/wavs/{audio}.wav")
        audio = self.normalize(audio.astype(np.float32))
        audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
        feat, energy = python_speech_features.fbank(
            audio, nfilt=160, winlen=0.02,winstep=0.01, winfunc = np.hanning)
        features = np.log(feat)
        return  self.standardize(features)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.dataset))

    def __data_generation(self, batch_data):
        audios = []
        labels = []
        label_len = []
        audio_len = []

        for filename, transcript in batch_data:
            audio = self.audio_to_features(filename)
            audios.append(audio)
            audio_len.append(len(audio))

            label = self.text_to_idx(transcript)
            labels.append(label)
            label_len.append(len(label))
            
        max_audio_len = max(audio_len)
        max_label_len = max(label_len)
        audios = pad_sequences(audios, maxlen = max_audio_len, dtype='float32', value=0, padding='post')
        labels = pad_sequences(labels, maxlen = max_label_len, value=28, padding='post')
        return audios, labels
split = int(len(df)*0.99)
df_train = df[:split]
df_val = df[split:]

In [8]:
wandb.log({
        "audio features": "FBANK",
        "n_feature": 160,
        "vocal_size": 29,
        "winlen": 0.02,
        "winstep": 0.01,
        "winfunc" : "np.hanning"
     })

In [9]:
training_gen = DataGenerator(df_train)
valid_gen = DataGenerator(df_val)

In [10]:
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from typing import List, Callable, Tuple

In [11]:
def Deep_Speech(input_dim = 160, rnn_layers = 5, rnn_units = 128, output_dim = 29):
    input = layers.Input([None, input_dim], name='input')
    x = layers.Lambda(K.expand_dims,arguments=dict(axis=-1), name = "expand_dim")(input)
    x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding='same', use_bias=False, name='conv_1')(x)
    x = layers.BatchNormalization(name='conv_1_bn')(x)
    x = layers.ReLU(name='conv_1_relu')(x)
    x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding='same', use_bias=False, name='conv_2')(x)
    x = layers.BatchNormalization(name='conv_2_bn')(x)
    x = layers.ReLU(name='conv_2_relu')(x)
    x = layers.Conv2D(filters=64, kernel_size=[11, 21], strides=[1, 2], padding='same', use_bias=False, name='conv_3')(x)
    x = layers.BatchNormalization(name='conv_3_bn')(x)
    x = layers.ReLU(name='conv_3_relu')(x)
    x = layers.Reshape([-1, input_dim//4*32])(x)
    for i in range(1, rnn_layers+1):
        recurrent = layers.GRU(units = rnn_units, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=True, reset_after=True, name=f'gru_{i}')
        x = layers.Bidirectional(recurrent, name=f'bidirectional_{i}',merge_mode='concat')(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    x = layers.TimeDistributed(layers.Dense(units=rnn_units*2), name='dense_1')(x)
    x = layers.ReLU(name='dense_1_relu')(x)
    x = layers.Dropout(rate=0.5)(x)
    output = layers.TimeDistributed(layers.Dense(units=output_dim), name='output')(x)
    model = tf.keras.Model(input, output, name='DeepSpeech2')
    return model

In [12]:
wandb.log({
        "conv2d_filter": 32,
        "conv layers": 3,
        "conv strides": [[11, 41], [11, 21],[11, 21]],
        "conv strides": [[2, 2], [1, 2],[1, 2]],
        "RNN number": 5,
        "Bidirectional" : True,
        "Batch_size": 32,
        "Droprate": 0.5
     })

In [13]:
 def get_loss() -> Callable:
        """ The CTC loss using TensorFlow's `ctc_loss`. """
        def get_length(tensor):
            lengths = tf.math.reduce_sum(tf.ones_like(tensor), 1)
            return tf.cast(lengths, tf.int32)

        def ctc_loss(labels, logits):
            label_length = get_length(labels)
            logit_length = get_length(tf.math.reduce_max(logits, 2))
            labels = tf.cast(labels, tf.int32)
            return tf.nn.ctc_loss(labels, logits, label_length, logit_length,
                                  logits_time_major=False, blank_index=-1)
        return ctc_loss
loss = get_loss()
optimizer = tf.optimizers.Adam(
    learning_rate=1e-4,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-8
)

In [15]:
model = Deep_Speech(input_dim=160, output_dim=29,rnn_units=800)
import os.path
savemodel = "drive/MyDrive/savemodel2/"
if os.path.exists(savemodel):
    print("Load model")
    #model.load_weights(savemodel)
model.compile(optimizer, loss)

Load model


In [16]:
wandb.log({
        "optimizer": "Adam",
        "learning_rate": 0.0004,
     })

In [17]:
batch_size = 32

In [19]:
class callback(keras.callbacks.Callback):

    def __init__(self,data):
        super().__init__()
        self.batch = batch
        self.idx_mapping = [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m',
           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '']

    def GreedyDecoder(self, batch_logits):
        best_candidates = np.argmax(batch_logits, axis=2)
        #remove  repeated
        decoded = [np.array([k for k, _ in itertools.groupby(best_candidate)]) for best_candidate in best_candidates]
        return decoded

    def get_batch_transcripts(self,sequences):
        return [''.join(self.idx_mapping[char_label] for char_label in sequence if char_label not in (-1, 28)) for sequence in sequences]

    def on_epoch_end(self, epoch: int, logs=None):
        X, y = self.batch
        batch_logits = model.predict(X)
        decoded_labels = self.GreedyDecoder(batch_logits)
        predictions = self.get_batch_transcripts(decoded_labels)

        transcripts = ["".join([self.idx_mapping[i] for i in label]) for label in y]
        wer_score = wer(transcripts, predictions)
        print("\n")
        print(f"WER SCORE: {wer_score}")
        wandb.log({"WER":wer_score})
        print("*"*15)
        for i in np.random.randint(0, 32, 2):
          print(f"Transcript: {transcripts[i]}")
          print(f"prediction: {predictions[i]}")
          print("*"*15)
          

batch = next(iter(valid_gen))
callback = callback(batch)
val_monitor = WandbCallback(monitor="val_loss",mode="min", save_weights_only=(True), save_model=(False),  validation_steps=True, log_batch_frequency = 1, log_evaluation_frequency = 1)

In [22]:
ckeck_point = tf.keras.callbacks.ModelCheckpoint("./drive/MyDrive/savemodel2/", monitor="val_loss", verbose=1, save_weights_only=True, mode="min")

In [None]:
# continuing run after 5 epochs
History = model.fit(training_gen,
          validation_data =valid_gen, epochs = 100, callbacks = [callback, ckeck_point,  val_monitor])

Epoch 1/100
Instructions for updating:
Prefer tf.tensor_scatter_nd_add, which offers the same functionality with well-defined read-write semantics.
Instructions for updating:
Prefer tf.tensor_scatter_nd_update, which offers the same functionality with well-defined read-write semantics.


WER SCORE: 1.0
***************
Transcript: based on its experience during this period the secret service now recommends that additional personnel be made available to prs
prediction: 
***************
Transcript: the commission recommends that prompt and favorable consideration be given to this request
prediction: '
***************

Epoch 00001: saving model to ./drive/MyDrive/savemodel2/
Epoch 2/100


WER SCORE: 0.8890977443609023
***************
Transcript: the commission further recommends that the secret service coordinate its planning as closely as possible with all of the federal agencies from which it receives information
prediction: the commiion frthe repormns ta the sere seris corn is plane is 

In [None]:
wandb.finish