In [1]:
import glob
import numpy as np
import os
import librosa
import soundfile as sf
import pickle

import sys
sys.path.append("../pyfiles/")
from util import play_audio, silence_removal, min_max
from get_mel_spectrogram import *

# Introduction
---
In this notebook, I'm going to explain the details of the datasets I'll use: [RAVDESS](https://www.kaggle.com/uwrfkaggler/ravdess-emotional-speech-audio) and [TESS](https://www.kaggle.com/ejlok1/toronto-emotional-speech-set-tess). Additionally, some preprocessing techniques needed for training, such as silence removal and mel-spectrogram conversion, are applied in this notebook. First, I'll introduce the basic information about the datasets.

---
I won't cover dataset features that are unrelated to my usage.

### Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS)
This dataset includes speech and song audio with a sampling rate of 48 kHz and a bit depth of 16 bits.
2 statements are spoken by 24 actors (12 female and 12 male) in 8 emotions such as "neutral", "happy", and so on.
Please visit [this link](https://www.kaggle.com/uwrfkaggler/ravdess-emotional-speech-audio) to download it, or download it quickly by following `A-download_Download_TESS_RAVDESS.ipynb`.

### Toronto emotional speech set (TESS)
This is also an audio dataset, labeled with 7 emotions such as "fear", "disgust", and so on.
The sampling rate is 24414 Hz.
The sentences are spoken by 2 female speakers, and every recording consists of the phrase "Say the word" followed by a certain word. Please visit [this link](https://www.kaggle.com/ejlok1/toronto-emotional-speech-set-tess) to download it or, of course, download it quickly by following `A-download_Download_TESS_RAVDESS.ipynb`.

---
Since the two datasets have different sampling rates, all audio is downsampled to 22050 Hz and its silent sections are removed.
Because the clips differ in length, their length is adjusted by random zero-padding or random cropping.
In addition, the audio labeled "calm" is merged into the label "neutral" because the two are similar.
In the end, the dataset is used under the following conditions.
- sampling rate: 22050 Hz
- no silent section
- emotions: "neutral", "happy", "sad", "angry"
- audio depth: 16 bit

---

# Preparation
---
The original directory structure is not intuitive and is difficult to understand at a glance, so I've organized all file paths into a new dictionary named `audio_files`, which stores the data in a methodical order.

---

## RAVDESS

In [4]:
# ----- adjustable parameters ----- #
ravdess_speech_dir = "./../../../dataset/RAVDESS/speech/"
ravdess_song_dir = "./../../../dataset/RAVDESS/song/"
# --------------------------------- #

# Make sure an "Actor_18" folder exists in the song tree before listing it.
# NOTE(review): presumably the song set ships without this actor and the empty
# placeholder keeps the sorted song folders index-aligned with r_actor_ids — confirm.
os.makedirs(ravdess_song_dir+"Actor_18", exist_ok=True)

# Per-actor folders of each recording type, in sorted (i.e. actor-number) order.
actor_speech_dir = sorted(glob.glob(ravdess_speech_dir + "Actor*/"))
actor_song_dir = sorted(glob.glob(ravdess_song_dir + "Actor*/"))

# Label vocabularies; list positions are used later as (1-based) codes / indices.
r_actor_ids = np.array([f"{actor_number}" for actor_number in range(1, 25)])
r_emotions = [
    "neutral", "calm", "happy", "sad",
    "angry", "fearful", "disgust", "surprised",
]
r_statements = ["kids", "dogs"]
r_types = [
    "normal_intensity_1", "normal_intensity_2",
    "strong_intensity_1", "strong_intensity_2",
]

In [5]:
# Build audio_files["ravdess"][record_type][actor_id][emotion][statement] -> sorted list of wav paths.
audio_files = {}
audio_files["ravdess"] = {}

# Each recording type has its own folder tree. Looking files up in the matching tree
# (instead of always using the speech tree) keeps the "song" entries from silently
# duplicating the speech paths.
actor_dirs_by_type = {"speech": actor_speech_dir, "song": actor_song_dir}

for record_type in ["speech", "song"]:
    audio_files["ravdess"][record_type] = {}
    actor_dirs = actor_dirs_by_type[record_type]
    for i in range(len(r_actor_ids)):
        # Sort once per actor; every filtered sub-list below then stays sorted.
        files = sorted(glob.glob(actor_dirs[i] + "*.wav"))
        by_emotion = {}
        for j in range(len(r_emotions)):
            by_emotion[r_emotions[j]] = {}
            for k in range(len(r_statements)):
                matched = []
                for path in files:
                    basename = os.path.basename(path).split(".")[0]
                    # Characters 6:8 of the file name hold the 1-based emotion code and
                    # characters 12:14 the 1-based statement code.
                    if int(basename[6:8]) == j + 1 and int(basename[12:14]) == k + 1:
                        matched.append(path)
                by_emotion[r_emotions[j]][r_statements[k]] = matched
        audio_files["ravdess"][record_type][r_actor_ids[i]] = by_emotion

---
Create the output directories for the preprocessed RAVDESS data under the selected save directory.

---

In [13]:
# ----- adjustable parameters ----- #
ravdess_save_dir = "./../../../dataset/RAVDESS/preprocessed/"
# --------------------------------- #

# Pre-create <save_dir>/<audio|feature>/<speech|song>/<actor>/<emotion>/ so that
# later cells can write files without checking for missing folders.
os.makedirs(ravdess_save_dir, exist_ok=True)
for data_type in ("audio", "feature"):
    for tp in ("speech", "song"):
        for actor in r_actor_ids:
            dir_actor = f"{ravdess_save_dir}{data_type}/{tp}/{actor}"
            os.makedirs(dir_actor, exist_ok=True)
            for emotion in r_emotions:
                dir_path = f"{dir_actor}/{emotion}"
                os.makedirs(dir_path, exist_ok=True)

## TESS

In [6]:
# ----- adjustable parameters ----- #
tess_speech_dir = "./../../../dataset/TESS/"
# --------------------------------- #

# All sub-folders whose name contains an "F", in sorted order.
dir_list = sorted(glob.glob(tess_speech_dir + "*F*/"))

t_actor_ids = ["OAF", "YAF"]
# t_emotions are the labels used as dictionary keys / folder names in this notebook;
# t_emotions_ are the matching tokens searched for in the wav file names.
t_emotions = ["neutral", "happy", "sad", "angry", "fear", "disgust", "surprised"]
t_emotions_ = ["neutral", "happy", "sad", "angry", "fear", "disgust", "ps"]

# Spoken-word list: the second "_"-separated field of every wav file name found in
# the fifth folder of dir_list, sorted alphabetically.
commands = sorted(
    os.path.basename(path).split("_")[1]
    for path in glob.glob(dir_list[4]+"*wav")
)

In [7]:
# Build audio_files["tess"][actor][emotion][command] -> wav path.
audio_files["tess"] = {}
known_commands = set(commands)
for actor in t_actor_ids:
    audio_files["tess"][actor] = {}
    for i in range(len(t_emotions)):
        emotion = t_emotions[i]
        audio_files["tess"][actor][emotion] = {}
        for folder in dir_list:
            files = glob.glob(folder + f"{actor}*{t_emotions_[i]}*.wav")
            if not files:
                continue
            files.sort()
            for path in files:
                # Take the spoken word from the file name itself (second "_"-separated
                # field, the same rule used to build `commands`). Pairing the sorted files
                # with the sorted commands by position would mislabel every later word
                # as soon as a single file is missing or extra.
                fields = os.path.basename(path).split("_")
                if len(fields) > 1 and fields[1] in known_commands:
                    audio_files["tess"][actor][emotion][fields[1]] = path
            # Only the first folder that contains matching files is used.
            break

---
Create the output directories for the preprocessed TESS data under the selected save directory.

---

In [14]:
# ----- adjustable parameters ----- #
tess_save_dir = "./../../../dataset/TESS/preprocessed/"
# --------------------------------- #

# Pre-create <save_dir>/<audio|feature>/<actor>/<emotion>/ for the preprocessed output.
os.makedirs(tess_save_dir, exist_ok=True)

for data_type in ("audio", "feature"):
    for actor in t_actor_ids:
        dir_actor = f"{tess_save_dir}{data_type}/{actor}"
        os.makedirs(dir_actor, exist_ok=True)
        for emotion in t_emotions:
            dir_path = f"{dir_actor}/{emotion}"
            os.makedirs(dir_path, exist_ok=True)

## Others
---
Some audio files are long enough for mel-spectrogram computation and some are not. The short ones therefore need to be padded, which is done by the function `transform` defined below. `args` holds the parameters for mel-spectrogram computation.

---

In [10]:
def transform(array, target=2**15):
    """Centre a signal inside a zero-filled buffer of `target` samples.

    The signal is passed through `min_max(..., mean0=True)` (presumably a
    zero-mean min-max normalisation — see `util.min_max`) and then surrounded
    by zeros, with the leading pad being half of the missing length (rounded
    towards zero) and the trailing pad taking the remainder.

    Parameters
    ----------
    array : np.ndarray
        Signal with time on axis 0. A 1-D input is handled as a single column
        and returned as 1-D again; otherwise it must have one column so that it
        can be concatenated with the zero padding.
    target : int
        Length of the returned signal along axis 0.

    Returns
    -------
    np.ndarray
        The normalised, zero-padded signal of length `target`.

    Raises
    ------
    ValueError
        If the input is longer than `target` (negative padding size).
    """
    was_flat = array.ndim == 1
    column = array.reshape(array.shape[0], 1) if was_flat else array

    missing = target - column.shape[0]
    padding = np.zeros((missing, 1))
    split = int(missing / 2)
    head, tail = padding[:split, :], padding[split:, :]

    padded = np.concatenate([head, min_max(column, mean0=True), tail])
    if was_flat:
        return padded.reshape(padded.shape[0])
    return padded

In [11]:
# Parameters handed to audio2mel for the mel-spectrogram computation.
args = {
    "max_wav_value": 2**15,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "sampling_rate": 22050,
    "mel_fmin": 0,
    "mel_fmax": 8000,
}
timesize = 160

# Preprocessing

In [None]:
# ----- adjustable parameters ----- #
save = False # save or not
# --------------------------------- #

# For every RAVDESS recording: load it at the target sampling rate, strip silence,
# pad clips of at most 3 seconds to exactly 3 seconds with `transform`, compute the
# feature with `audio2mel`, and (when `save` is set) write the audio and feature.
for actor in r_actor_ids:
    print(actor)
    for emotion in r_emotions:
        for statement in r_statements:
            for tp in ["speech", "song"]:
                path_list = audio_files["ravdess"][tp][actor][emotion][statement]
                for i, path in enumerate(path_list):
                    # `sr` has to be passed by keyword: recent librosa releases made every
                    # argument after the path keyword-only, so the positional form raises
                    # a TypeError there. The keyword form works on old and new versions.
                    x, fs = librosa.load(path, sr=args["sampling_rate"])
                    x = silence_removal(x)
                    # Clips longer than 3 seconds are used as they are.
                    if len(x) <= 3*fs:
                        x = transform(x, 3 * fs)
                    mel = audio2mel(None, args, x, fs)

                    # r_types[i] labels the i-th file (in sorted order) of this
                    # actor/emotion/statement group.
                    # NOTE(review): the file-name prefix is "speech_" for song recordings too;
                    # left unchanged in case downstream code relies on it.
                    audio_path = ravdess_save_dir + f"audio/{tp}/{actor}/{emotion}/speech_{actor}_{emotion}_{statement}_{r_types[i]}.wav"
                    feature_path = ravdess_save_dir + f"feature/{tp}/{actor}/{emotion}/speech_{actor}_{emotion}_{statement}_{r_types[i]}.pkl"
                    if save:
                        # NOTE(review): written as 24-bit PCM although the introduction
                        # states a 16-bit depth — confirm which one is intended.
                        sf.write(audio_path, x, fs, subtype='PCM_24')
                        with open(feature_path, mode='wb') as f:
                            pickle.dump(mel, f)

### TESS

In [None]:
# ----- adjustable parameters ----- #
save = False
# --------------------------------- #

# For every TESS recording: load it at the target sampling rate, strip silence,
# pad clips of at most 3 seconds to exactly 3 seconds with `transform`, compute the
# feature with `audio2mel`, and (when `save` is set) write the audio and feature.
for actor in t_actor_ids:
    print(actor)
    for emotion in t_emotions:
        print("   ", emotion)
        for command in commands:
            path = audio_files["tess"][actor][emotion][command]
            # `sr` has to be passed by keyword: recent librosa releases made every
            # argument after the path keyword-only, so the positional form raises
            # a TypeError there. The keyword form works on old and new versions.
            x, fs = librosa.load(path, sr=args["sampling_rate"])
            x = silence_removal(x)
            # Clips longer than 3 seconds are used as they are.
            if len(x) <= 3*fs:
                x = transform(x, 3 * fs)
            mel = audio2mel(None, args, x, fs)

            audio_path = tess_save_dir + f"audio/{actor}/{emotion}/{actor}_{emotion}_{command}.wav"
            feature_path = tess_save_dir + f"feature/{actor}/{emotion}/{actor}_{emotion}_{command}.pkl"

            if save:
                # NOTE(review): written as 24-bit PCM although the introduction
                # states a 16-bit depth — confirm which one is intended.
                sf.write(audio_path, x, fs, subtype='PCM_24')
                with open(feature_path, mode='wb') as f:
                    pickle.dump(mel, f)