# Importações

In [1]:
import os
import librosa

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd

from scipy.io import wavfile

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.utils import np_utils
from keras.layers import Bidirectional, BatchNormalization, GRU, TimeDistributed
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.layers import Concatenate
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow import convert_to_tensor, concat

import warnings
warnings.filterwarnings("ignore")

2022-12-06 17:13:22.728650: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-06 17:13:23.112250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-06 17:13:23.112266: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-06 17:13:24.397594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

# Separando arquivos de audio

In [2]:
train = pd.read_csv(
    "./TTS-Portuguese-Corpus_22khz/train_TTS-Portuguese_Corpus_metadata.csv", 
    sep = "|",
    index_col = False
)

train = train[['wav_filename','transcript']]
train.head()

Unnamed: 0,wav_filename,transcript
0,wavs/sample-631.wav,"Depois que foi atropelado, só atravessa na fai..."
1,wavs/sample-2757.wav,A cidade também tem uma instituição de ensino ...
2,wavs/sample-5578.wav,Também os astronautas depressa se juntaram às...
3,wavs/sample-3712.wav,"Nessa idade, começou a praticar balé."
4,wavs/sample-3434.wav,Um exemplo de conhecimento de terceiro tipo é...


In [3]:
test = pd.read_csv("./TTS-Portuguese-Corpus_22khz/test_TTS-Portuguese_Corpus_metadata.csv", sep = "|",
    index_col = False)

test = test[['wav_filename','transcript']]
test.head()

Unnamed: 0,wav_filename,transcript
0,wavs/sample-5672.wav,A juventude tinha que revolucionar a escola
1,wavs/sample-5655.wav,A inauguração da vila é quarta ou quinta-feira
2,wavs/sample-5656.wav,Vote se você tiver o título de eleitor
3,wavs/sample-5755.wav,A inauguração da vila é quarta ou quinta-feira.
4,wavs/sample-5659.wav,Em muitas cidades a população está diminuindo.


# Usando librosa para converter arquivos de audio

https://librosa.org/doc/latest/generated/librosa.load.html

In [7]:
os.path.exists('./all_waves_df.pkl')

True

In [8]:
if os.path.exists('./all_waves_df.pkl'):
    all_waves_df = pd.read_pickle('./all_waves_df.pkl')
else:
    dir_path = './TTS-Portuguese-Corpus_22khz/'
    wav_paths = train['wav_filename']
    all_waves = []

    for index, wav_path in enumerate(wav_paths):
        samples, sample_rate = librosa.load(dir_path + wav_path)
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr = 8000)   
        all_waves.append((samples, train.loc[index,'transcript']))
    
    all_waves_df = pd.DataFrame(all_waves, columns = ['tensor', 'transcript'])
    all_waves_df.to_pickle('./all_waves_df.pkl')

In [5]:
all_waves_df.head()

Unnamed: 0,tensor,transcript
0,"[-0.0002558262, 0.007106832, 0.015172318, 0.01...","Depois que foi atropelado, só atravessa na fai..."
1,"[0.0007501879, -5.3689382e-05, -0.0017680669, ...",A cidade também tem uma instituição de ensino ...
2,"[0.018647028, 0.036927525, 0.04546329, 0.04682...",Também os astronautas depressa se juntaram às...
3,"[0.001063928, 0.0019695, 0.0022690534, 0.00247...","Nessa idade, começou a praticar balé."
4,"[0.0063640825, 0.00304457, -0.010580226, -0.02...",Um exemplo de conhecimento de terceiro tipo é...


In [6]:
all_waves_df['tensor_len'] = all_waves_df['tensor'].map(lambda x: len(x))

In [7]:
max_tensor_size = all_waves_df['tensor_len'].max()

In [8]:
all_waves_df['tensor_with_padding'] = all_waves_df['tensor'].map(lambda x: librosa.util.fix_length(x,size=max_tensor_size))

# Arquitetura da rede neural

Reshape arrays 2D para 3D pois o input para camada Conv1D deve ser um array 3D.

In [9]:
all_waves = np.array(all_waves).reshape(-1, len(all_waves), 1)

Biblioteca para brincar com os hiperparâmetros da rede neural: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

Criei um dicionário para facilitar a utilização dos hiperparâmetros da rede neural, mas ele usa tantos que talvez seja melhor separá-los de outra forma. Ainda estou estudando cada parâmetro para entender melhor o que ele fez.

In [10]:
hyperparams = {
    'momentum': 0.99,
    'epsilon': 1e-3,
    'strides': 1,
    'max_pooling': 3,
    'dropout': 0.3,
    'gru_batch_size': 128,
    'padding': 'valid',
    'activation_relu': 'relu',
    'activation_softmax': 'softmax',
    'merge_mode': 'sum',
    'center': True,
    'scale': True
}

In [11]:
def nn_architecture(len_wave_array, label_array):

    K.clear_session()

    inputs = Input(shape = (len_wave_array, 1))

    # First Batch Normalization layer
    x = BatchNormalization(
        axis = -1, 
        momentum = hyperparams['momentum'], 
        epsilon = hyperparams['epsilon'], 
        center = hyperparams['center'], 
        scale = hyperparams['scale'] 
    )(inputs)

    # First Conv1D layer
    x = Conv1D(
        8, 13, 
        padding = hyperparams['padding'], 
        activation = hyperparams['activation_relu'], 
        strides = hyperparams['strides']
    )(x)
    x = MaxPooling1D(hyperparams['max_pooling'])(x)
    x = Dropout(hyperparams['dropout'])(x)

    # Second Conv1D layer
    x = Conv1D(
        16, 11, 
        padding = hyperparams['padding'], 
        activation = hyperparams['activation_relu'], 
        strides = hyperparams['strides']
    )(x)
    x = MaxPooling1D(hyperparams['max_pooling'])(x)
    x = Dropout(hyperparams['dropout'])(x)

    # Third Conv1D layer
    x = Conv1D(
        32, 9, 
        padding = hyperparams['padding'], 
        activation = hyperparams['activation_relu'], 
        strides = hyperparams['strides']
    )(x)
    x = MaxPooling1D(hyperparams['max_pooling'])(x)
    x = Dropout(hyperparams['dropout'])(x)

    # Second Batch Normalization layer
    x = BatchNormalization(
        axis = -1, 
        momentum = hyperparams['momentum'], 
        epsilon = hyperparams['epsilon'], 
        center = hyperparams['center'], 
        scale = hyperparams['scale'] 
    )(x)

    # Bidirectional GRUs
    x = Bidirectional(GRU(hyperparams['gru_batch_size'], return_sequences = True), merge_mode = hyperparams['merge_mode'])(x)
    x = Bidirectional(GRU(hyperparams['gru_batch_size'], return_sequences = True), merge_mode = hyperparams['merge_mode'])(x)
    x = Bidirectional(GRU(hyperparams['gru_batch_size'], return_sequences = False), merge_mode = hyperparams['merge_mode'])(x)

   # Third Batch Normalization layer
    x = BatchNormalization(
        axis = -1, 
        momentum = hyperparams['momentum'], 
        epsilon = hyperparams['epsilon'], 
        center = hyperparams['center'], 
        scale = hyperparams['scale'] 
    )(x)

    # Dense Layer 1
    x = Dense(256, activation = hyperparams['activation_relu'])(x)
    outputs = Dense(len(label_array), activation = hyperparams['activation_softmax'])(x)

    model = Model(inputs, outputs)
    print(model.summary())

    return model

In [12]:
early_stop = EarlyStopping(
    monitor = 'val_loss', 
    mode = 'min', 
    verbose = 1, 
    patience = 10, 
    min_delta = 0.0001
)

checkpoint = ModelCheckpoint(
    'speech2text_model.hdf5', 
    monitor = 'val_acc', 
    verbose = 1, 
    save_best_only = True,
    mode = 'max'
)

A função de perda foi definida como "entropia cruzada categórica" pois se tratava de um problema de multi-classificação.
Esse link fala sobre funções de perda, mas ainda preciso dar uma estudada melhor no assunto: https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html.

Nas células abaixo ele chama x_train, y_train e x_valid, y_valid.
Entendi que x_valid, y_valid é o que costumamos chamar de x_test, y_test.
Mas ele já disponibilizou os datasets divididos em treino e teste, então não soube direito o que colocar nessas quatro variáveis. 

Célula abaixo resulta em erro porque não há elementos em len_8000_waves.

In [13]:
# len_8000_x_train, len_8000_x_valid, len_8000_y_train, len_8000_y_valid = train_test_split(
#     np.array(len_8000_waves),
#     np.array(len_8000_y),
#     # stratify = y, -> comentado porque nossas "classes" têm apenas 1 elemento cada, e para stratify precisa de pelo menos 2
#     test_size = 0.2,
#     random_state = 777,
#     shuffle = True
# )

Célula abaixo estava demorando demais para rodar, então comentei.

In [14]:
# hist = len_8000_model.fit(
#     x = np.asarray(len_8000_x_train).astype('float32'), 
#     y = np.asarray(len_8000_y_train).astype('float32'),
#     epochs = 100, 
#     callbacks = [early_stop, checkpoint], 
#     batch_size = 32, 
#     validation_data = (len_8000_x_valid, len_8000_y_valid)
# )

## Considerando todas as ondas
Mesma lógica da seção acima, mas trocando tudo que tem "len_8000_" para "all_".

In [15]:
all_labels = all_waves_df['transcript'].values

In [16]:
print(all_labels.shape)

(3024,)


In [17]:
all_waves_model = nn_architecture(max_tensor_size, all_labels)
all_waves_model

2022-12-06 16:44:43.693154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-06 16:44:43.693173: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-06 16:44:43.693188: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (claudio-vostro5490): /proc/driver/nvidia/version does not exist
2022-12-06 16:44:43.693546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 361440, 1)]       0         
                                                                 
 batch_normalization (BatchN  (None, 361440, 1)        4         
 ormalization)                                                   
                                                                 
 conv1d (Conv1D)             (None, 361428, 8)         112       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 120476, 8)        0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 120476, 8)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, 120466, 16)        1424  

<keras.engine.functional.Functional at 0x7feef9a252e0>

In [18]:
all_waves_model.compile(loss = 'categorical_crossentropy', optimizer = 'nadam', metrics = ['accuracy'])

In [19]:
all_waves_label_enconder = LabelEncoder()
all_waves_y = all_waves_label_enconder.fit_transform(all_labels)

all_waves_classes = list(all_waves_label_enconder.classes_)
all_waves_y = np_utils.to_categorical(all_waves_y, num_classes = len(all_labels))

In [20]:
all_waves_x_train, all_waves_x_test, all_waves_y_train, all_waves_y_test = train_test_split(
    np.array(all_waves_df['tensor_with_padding'].values),
    np.array(all_waves_y),
    test_size = 0.2,
    random_state = 777,
    shuffle = True
)

A célula abaixou rodou por 15min e não printou nada...

In [21]:
l= []
for i in all_waves_x_train:
    l.append(convert_to_tensor(i.reshape(1,len(i))))

In [22]:
t = l[0]
for i in l[1:]:
    t = Concatenate(axis=0)([t,i])

2022-12-06 16:44:59.568750: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 400475520 exceeds 10% of free system memory.
2022-12-06 16:44:59.664753: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 401921280 exceeds 10% of free system memory.
2022-12-06 16:44:59.760527: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 403367040 exceeds 10% of free system memory.
2022-12-06 16:44:59.862497: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 404812800 exceeds 10% of free system memory.
2022-12-06 16:44:59.956855: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 406258560 exceeds 10% of free system memory.


In [23]:
print(all_waves_y_train.shape)
print(t.shape)

(2419, 3024)
(2419, 361440)


In [24]:
hist = all_waves_model.fit(
    x = t, 
    y = np.asarray(all_waves_y_train).astype('float32'),
    epochs = 100, 
    callbacks = [early_stop, checkpoint], 
    batch_size = 32, 
    validation_data = (all_waves_x_test, all_waves_y_test)
)

Epoch 1/100
 1/76 [..............................] - ETA: 5:14:13 - loss: 8.0126 - accuracy: 0.0000e+00

Error: Canceled future for execute_request message before replies were done

In [None]:
plt.plot(hist.history['loss'], label = 'train')
plt.plot(hist.history['val_loss'], label = 'test')
plt.legend()
plt.show()