In [1]:
from tensorflow.keras import callbacks
from utils import get_dataset
from SpeechModel import SpeechModel
from tensorflow.keras.callbacks import EarlyStopping
from os import mkdir, path
import argparse
# parser = argparse.ArgumentParser(description="Script to train the model as described in the paper.")
# parser.add_argument("epochs", type=int,  help="Number of Epochs")

# parser.add_argument("-nc", type=bool, help="Disable caching. Enabled by default.")

# # TODO: add args for batch size, data directory, validation split, random state, etc.
# args = parser.parse_args()

# if args.epochs:
#     EPOCHS = args.epochs
# else:
#     EPOCHS = 10

# if args.nc:
#     CACHE= False
# else:
#     CACHE = True


In [2]:
def get_framed_mel_spectrograms(wav, sr=22050):
    """Convert a waveform into overlapping windows of 3-channel log-mel features.

    Pipeline: Hamming-windowed STFT (25 ms frames, 10 ms hop) -> magnitude ->
    64-band mel projection over 20-8000 Hz -> log -> first and second
    differences along axis 0 -> stack as channels -> windows of 64 STFT
    frames taken every 32 frames.

    Args:
        wav: Audio samples. Assumed to be a 1-D float tensor so that axis 0 of
            the spectrogram is the frame axis -- TODO confirm with callers.
        sr: Sample rate in Hz; may be a Python number or a scalar tensor.

    Returns:
        The tensor produced by ``tf.signal.frame`` over the stacked features.
        Trailing STFT frames that do not fill a whole 64-frame window are
        dropped (``pad_end=False``). Presumably shaped
        (num_windows, 64, 64, 3) for 1-D input -- TODO confirm.
    """
    def _ms_to_samples(duration_ms):
        # Clips are limited to 3 seconds (3000 milliseconds) by the loader;
        # window sizes are specified in milliseconds and converted to samples.
        return tf.cast(sr * (duration_ms / 1000), tf.int32)

    def _forward_difference(features):
        # features[t] - features[t + 1] along axis 0.
        # NOTE(review): tf.roll is circular, so the last row is differenced
        # against the first row rather than being padded.
        return features - tf.roll(features, -1, axis=0)

    spectrum = tf.signal.stft(
        wav,
        frame_length=_ms_to_samples(25),  # 25 ms
        frame_step=_ms_to_samples(10),  # 10 ms
        window_fn=tf.signal.hamming_window,
    )
    magnitude = tf.abs(spectrum)

    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=64,
        num_spectrogram_bins=tf.shape(spectrum)[-1],
        sample_rate=sr,
        lower_edge_hertz=20.0,
        upper_edge_hertz=8000.0,
    )
    # A set_shape call that previously followed this projection is disabled,
    # so no static shape is attached to the mel spectrogram here.
    mel = tf.tensordot(magnitude, mel_filterbank, 1)

    # The small offset keeps the log finite for silent bins.
    log_mel = tf.math.log(mel + 1e-6)
    delta = _forward_difference(log_mel)
    delta_delta = _forward_difference(delta)

    stacked = tf.stack([log_mel, delta, delta_delta], axis=-1)

    return tf.signal.frame(
        stacked, frame_length=64, frame_step=32, pad_end=False, axis=0
    )

In [4]:
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
import librosa
import numpy as np
# Passed as the `cache` argument of get_dataset() when the datasets are built.
CACHE = True
# Maps the two-digit emotion code found in a file name (third "-"-separated
# field, as parsed by get_dataset) to a human-readable emotion name.
# Key order matters: get_dataset enumerates the keys to assign integer
# class ids, so "01" -> 0, "02" -> 1, ... "08" -> 7.
EMOTION_DICT_RAVDEES = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}
def load_wav(file_path):
    """Load up to 3 seconds of mono audio and apply a pre-emphasis filter.

    Args:
        file_path: Path of the audio file. May be an eager string tensor (as
            delivered by ``tf.py_function``), ``bytes``, or a plain ``str``.

    Returns:
        A ``(wav, sr)`` tuple of float32 tensors: the pre-emphasised samples
        and the sample rate. No ``sr`` is passed to ``librosa.load``, so the
        audio is resampled to librosa's default rate.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    # tf.py_function hands over an eager tensor; plain paths are accepted too
    # so the loader can also be called directly (e.g. while debugging).
    if hasattr(file_path, "numpy"):
        file_path = file_path.numpy()
    # A string tensor's .numpy() is bytes; librosa documents str / path-like
    # inputs, so decode instead of relying on backend-specific bytes handling.
    if isinstance(file_path, bytes):
        file_path = file_path.decode("utf-8")

    wav, sr = librosa.load(file_path, mono=True, duration=3)

    if wav.size == 0:
        # Without this guard the wav[0] access below fails with an IndexError
        # that does not say which file is broken.
        raise ValueError(f"No audio samples could be read from {file_path!r}")

    # Pre-emphasis: y[0] = x[0], y[t] = x[t] - 0.97 * x[t - 1].
    pre_emp = 0.97
    wav = np.append(wav[0], wav[1:] - pre_emp * wav[:-1])

    wav = tf.convert_to_tensor(wav, dtype=tf.float32)
    sr = tf.convert_to_tensor(sr, dtype=tf.float32)
    return wav, sr
def _build_split_dataset(file_paths, labels, batch_size, cache):
    """Build one batched ``(features, label)`` tf.data pipeline for a file list."""

    def tf_compatible_file_loader(file_path):
        # librosa is plain Python, so it has to run through tf.py_function.
        wav, sr = tf.py_function(load_wav, [file_path], [
                                 tf.float32, tf.float32])
        return wav, sr

    features_ds = (
        tf.data.Dataset.from_tensor_slices(file_paths)
        .map(tf_compatible_file_loader, num_parallel_calls=tf.data.AUTOTUNE)
        .map(get_framed_mel_spectrograms, num_parallel_calls=tf.data.AUTOTUNE)
    )
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)

    split_ds = tf.data.Dataset.zip((features_ds, labels_ds)).batch(batch_size)
    if cache:
        split_ds = split_ds.cache()
    # prefetch goes last so that epochs served from the cache still overlap
    # input production with training; placing it before cache() left the
    # cached pipeline without any prefetching.
    return split_ds.prefetch(tf.data.AUTOTUNE)


def get_dataset(
    DATA_DIR: str,
    cache: bool = True,
    batch_size: int = 32,
    validation_split: float = 0.1,
    random_state=None,
):
    """Create the training and validation datasets from a folder of audio files.

    File names are expected to carry the emotion code as their third
    "-"-separated field; the code is mapped to an integer class id following
    the key order of ``EMOTION_DICT_RAVDEES``.

    Args:
        DATA_DIR: Directory containing the audio files.
        cache: Cache the batched datasets after the first pass when True.
        batch_size: Number of examples per batch.
        validation_split: Fraction of the files held out for validation.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split different on every run.

    Returns:
        ``(train_ds, val_ds)``: batched, prefetched ``tf.data.Dataset`` objects
        yielding ``(features, label)`` pairs.

    Raises:
        ValueError: If ``DATA_DIR`` contains no usable files.
        KeyError, IndexError: If a file name does not follow the expected
            naming scheme.
    """
    label_to_int = {code: index for index,
                    code in enumerate(EMOTION_DICT_RAVDEES)}

    def decompose_label(file_name: str):
        return label_to_int[file_name.split("-")[2]]

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # split reproducible once a random_state is supplied. Hidden entries and
    # sub-directories (e.g. .ipynb_checkpoints, .DS_Store) are not audio clips
    # and would otherwise break label parsing.
    file_names = sorted(
        name
        for name in os.listdir(DATA_DIR)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(DATA_DIR, name))
    )
    if not file_names:
        raise ValueError(f"No audio files found in {DATA_DIR!r}")

    labels = [decompose_label(name) for name in file_names]
    file_path_list = [os.path.join(DATA_DIR, name) for name in file_names]

    train_fps, val_fps, train_labels, val_labels = train_test_split(
        file_path_list,
        labels,
        test_size=validation_split,
        random_state=random_state,
    )

    train_ds = _build_split_dataset(train_fps, train_labels, batch_size, cache)
    val_ds = _build_split_dataset(val_fps, val_labels, batch_size, cache)
    return train_ds, val_ds
# Build the train/validation pipelines from the local "dataset" folder.
(train_ds, validation_ds) = get_dataset("dataset", cache=CACHE)


2022-04-25 22:31:34.711138: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# --- Training configuration -------------------------------------------------
# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 30
# Directory the trained model is written to.
MODEL_SAVE_DIR = "saved_model"

# --- Model -------------------------------------------------------------------
# SpeechModel is the project's model wrapper; create_model() returns the object
# that is trained and saved below (presumably a Keras model -- confirm in
# SpeechModel.py).
SP = SpeechModel()
model = SP.create_model()

# --- Callbacks ---------------------------------------------------------------
# Stop after 5 epochs without improvement of the monitored quantity (Keras'
# default, since no `monitor` is passed) and restore the best weights seen.
ESCallback = EarlyStopping(
    patience=5,
    restore_best_weights=True,
    verbose=True,
)


Downloading ResNet Weights


In [6]:
# Train with early stopping. The returned History object is kept so the
# loss/metric curves can be inspected in later cells.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=validation_ds,
    callbacks=[ESCallback],
)

# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it is safe to
# re-run and also works if MODEL_SAVE_DIR is later changed to a nested path.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# NOTE: the name carries the configured EPOCHS limit, not the number of epochs
# actually run (early stopping ended the recorded run at epoch 10).
# The export needs enough free disk space; the recorded run failed here with
# "No space left on device".
model.save(os.path.join(MODEL_SAVE_DIR, str(EPOCHS) + "epochs_SpeechModel"))

Epoch 1/30


2022-04-25 22:32:11.927615: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "109" frequency: 1600 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 3145728 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2022-04-25 22:58:29.067075: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "109" frequency: 1600 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 3145728 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 00010: early stopping


2022-04-26 02:58:17.447665: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-04-26 03:01:17.025667: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at save_restore_v2_ops.cc:136 : RESOURCE_EXHAUSTED: saved_model/30epochs_SpeechModel/variables/variables_temp/part-00000-of-00001.data-00000-of-00001.tempstate16237429839950581410; No space left on device


ResourceExhaustedError: saved_model/30epochs_SpeechModel/variables/variables_temp/part-00000-of-00001.data-00000-of-00001.tempstate16237429839950581410; No space left on device [Op:SaveV2]