# Audio Data Preparation for the Spectrograms, Data Balancing and Augmentation

In [41]:
import json
import os
import shutil
import librosa as librosa
import sounddevice as sd
import soundfile as sf
import random
import copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import zoom

In [61]:
# Read the metadata of the mixed dataset and collect its utterance ids.
with open('data/mixed_data.json', 'r') as metadata_file:
    data = json.load(metadata_file)

# Materialise the ids as a list so they can be counted and reused below.
ids = [utterance_id for utterance_id in data]

In [63]:
# Number of utterance ids loaded from data/mixed_data.json.
len(ids)

633

Only keeping the male and female utterances (633):

In [94]:

# Remove every .wav file from the mixed audio directory whose id (the part of
# the filename before the first '.') is not one of the loaded utterance ids.
# NOTE: this deletes files permanently.
directory = 'data/audio/mixed/'
keys = ids

# Refuse to run with an empty id list: every .wav file would be deleted.
if not keys:
    raise ValueError('No utterance ids loaded; refusing to delete audio files.')

# Set membership is O(1) per file instead of scanning the id list each time.
keys_to_keep = set(keys)

files = os.listdir(directory)

for filename in files:
    # Only .wav files are candidates for removal
    if not filename.endswith('.wav'):
        continue

    # ID from filename (named utterance_id so the builtin id() is not shadowed)
    utterance_id = filename.split('.')[0]

    # Delete the file when its id is not part of the dataset
    if utterance_id not in keys_to_keep:
        os.remove(os.path.join(directory, filename))

In [95]:
# Count the .wav files that remain in the mixed audio directory.
directory = 'data/audio/mixed/'

files = os.listdir(directory)

# Collect the entries carrying a .wav extension
wav_files = []
for entry in files:
    if entry.endswith('.wav'):
        wav_files.append(entry)
num_wav_files = len(wav_files)

print(f'There are {num_wav_files} .wav files in the audio directory.')

There are 633 .wav files in the audio directory.


Adding female utterances to a separate folder:

In [74]:
# Female-only utterance metadata, parsed from JSON into a Python object.
with open('data/F_data.json', 'r') as metadata_file:
    dataF = json.loads(metadata_file.read())

In [80]:
# View over the ids of the female utterances, plus a quick count.
keys_F = dataF.keys()
female_utterance_count = len(keys_F)
print("There are" , female_utterance_count, "female utterances.")

There are 203 female utterances.


In [81]:
# Copy the .wav file of every female utterance from the shared audio folder
# into its own directory.
#Source and target
source_dir = 'audio_files/'
target_dir = 'data/audio/F'

# shutil.copy raises FileNotFoundError when the target directory is missing,
# so create it first (no-op if it already exists).
os.makedirs(target_dir, exist_ok=True)

keys = keys_F

for filename in os.listdir(source_dir):
    if not filename.endswith('.wav'):
        continue

    # The id is the part of the filename before the first '.'; it is named
    # utterance_id so the builtin id() is not shadowed.
    utterance_id = filename.split('.')[0]

    if utterance_id in keys:
        shutil.copy(os.path.join(source_dir, filename),
                    os.path.join(target_dir, filename))

In [82]:
# Sanity check: how many .wav files ended up in the female directory?
directory = 'data/audio/F'

files = os.listdir(directory)

# Keep only the entries with a .wav extension and count them
wav_files = list(filter(lambda name: name.endswith('.wav'), files))
num_wav_files = len(wav_files)

print(f'There are {num_wav_files} .wav files in the female utterance audio directory.')

There are 203 .wav files in the female utterance audio directory.


Same procedure for the male utterances in a separate folder:

In [83]:
# Male-only utterance metadata, parsed from JSON into a Python object.
with open('data/M_data.json', 'r') as metadata_file:
    dataM = json.loads(metadata_file.read())

In [84]:
# View over the ids of the male utterances (the keys of the loaded JSON object).
keys_M = dataM.keys()

In [87]:
# Copy the .wav file of every male utterance from the shared audio folder
# into its own directory.
#Source and target
source_dir = 'audio_files/'
target_dir = 'data/audio/M'

# shutil.copy raises FileNotFoundError when the target directory is missing,
# so create it first (no-op if it already exists).
os.makedirs(target_dir, exist_ok=True)

keys = keys_M

for filename in os.listdir(source_dir):
    if not filename.endswith('.wav'):
        continue

    # The id is the part of the filename before the first '.'; it is named
    # utterance_id so the builtin id() is not shadowed.
    utterance_id = filename.split('.')[0]

    if utterance_id in keys:
        shutil.copy(os.path.join(source_dir, filename),
                    os.path.join(target_dir, filename))

In [88]:
# Sanity check: how many .wav files ended up in the male directory?
directory = 'data/audio/M'

files = os.listdir(directory)

# Walk the listing and keep the .wav entries
wav_files = []
for entry in files:
    if entry.endswith('.wav'):
        wav_files.append(entry)
num_wav_files = len(wav_files)

print(f'There are {num_wav_files} .wav files in the male utterance audio directory.')

There are 430 .wav files in the male utterance audio directory.


**Data Augmentation and Balancing**

As established by the text augmentation and balancing in the text_data_preparation file, the mixed dataset is balanced with respect to sarcasm. The male dataset, however, lacks 18 non-sarcastic utterances, and the mixed set contains far fewer female than male utterances. The female set differs by only one utterance between sarcastic and non-sarcastic and is therefore considered balanced.

In [130]:
# Sarcasm labels for the male utterances. They come from the text metadata
# file, which is not stored inside the audio folder.
with open('data/M_data.json') as f:
    text_data = json.load(f)

# Map every utterance id to its sarcasm flag converted to an integer.
sarcasm_labels_M = {}
for utterance_id, entry in text_data.items():
    sarcasm_labels_M[utterance_id] = int(entry['sarcasm'])

In [133]:
# One label is expected per male utterance (430 in total).
assert len(sarcasm_labels_M) == 430

In [140]:
#AUGMENTATION MALE UTTERANCES
# Creates 18 pitch-shifted copies of non-sarcastic male utterances and writes
# the combined (original + augmented) labels to data/audio/labels_M.json.

directory = 'data/audio/M/'
label = 'A'  #label for augmented data
labels = sarcasm_labels_M

# Fixed seed so the same 18 files are chosen on every run; a re-run then
# overwrites the existing augmented files instead of adding new ones.
sampler = random.Random(42)

# Only original recordings (ids present in the labels) are candidates. The
# augmented files are written into this same directory, so without this filter
# a re-run fails with a KeyError on their 'A_...' ids. Sorting removes the
# dependence on the arbitrary order returned by os.listdir.
files = sorted(
    file for file in os.listdir(directory)
    if file.endswith('.wav') and file.split('.')[0] in labels
)

non_sarcastic_files = [file for file in files if labels[file.split('.')[0]] == 0]

# Randomly (but reproducibly) select 18 files
selected_files = sampler.sample(non_sarcastic_files, 18)

# Work on a copy so the original label dictionary stays untouched
all_labels = copy.deepcopy(sarcasm_labels_M)

for filename in selected_files:
    y, sr = librosa.load(os.path.join(directory, filename))

    #Pitch shifting
    y_shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-2)  # Shift pitch down by 2 half-steps

    #New filename: the augmentation label is prefixed to the original name
    new_filename = f'{label}_{filename}'
    sf.write(os.path.join(directory, new_filename), y_shifted, sr)

    #label for the augmented audio
    all_labels[new_filename.split('.')[0]] = 0  # 0 for non-sarcastic

# Save all the labels as a JSON file
with open('data/audio/labels_M.json', 'w') as f:
    json.dump(all_labels, f)

The labels were written to `data/audio/labels_M.json` above. To use them later for training, they can be loaded back like this:

In [141]:
# Read the stored male labels (originals plus augmented entries) back from disk.
with open('data/audio/labels_M.json', 'r') as labels_file:
    labelsM = json.loads(labels_file.read())

In [143]:
# 430 original labels plus the 18 augmented entries.
assert len(labelsM) == 448

In [144]:
#Verify that every label value occurs equally often in the male label set

observed_values = list(labelsM.values())

# Occurrences per distinct label value
label_counts = {}
for value in set(observed_values):
    label_counts[value] = observed_values.count(value)

# Balanced means all distinct labels share one and the same count
is_balanced = len(set(label_counts.values())) == 1
print("The dataset is balanced." if is_balanced else "The dataset is not balanced.")

print(label_counts)

The dataset is balanced.
{0: 224, 1: 224}


Similar procedure for the mixed data:

In [3]:
# Sarcasm labels for the mixed dataset. They come from the text metadata
# file, which is not stored inside the audio folder.
with open('data/mixed_data.json') as f:
    text_data = json.load(f)

# Map every utterance id to its sarcasm flag converted to an integer.
sarcasm_labels_mixed = {}
for utterance_id, entry in text_data.items():
    sarcasm_labels_mixed[utterance_id] = int(entry['sarcasm'])

In [4]:
# One label is expected per utterance of the mixed dataset (633 in total).
assert len(sarcasm_labels_mixed) == 633

In [5]:
# Sarcasm labels for the female utterances. They come from the text metadata
# file, which is not stored inside the audio folder.
with open('data/F_data.json') as f:
    text_data = json.load(f)

# Map every utterance id to its sarcasm flag converted to an integer.
sarcasm_labels_F = {}
for utterance_id, entry in text_data.items():
    sarcasm_labels_F[utterance_id] = int(entry['sarcasm'])

In [8]:
# One label is expected per female utterance (203 in total).
assert len(sarcasm_labels_F) == 203

In [25]:
# Persist the female sarcasm labels as JSON (no augmentation for this set).
with open('data/audio/labels_F.json', 'w') as labels_file:
    labels_file.write(json.dumps(sarcasm_labels_F))

In [26]:
#CONTROL: read the stored female labels back and verify that none were lost
with open('data/audio/labels_F.json', 'r') as labels_file:
    labels_F = json.loads(labels_file.read())

num_labels = len(labels_F)

assert num_labels == 203

In [19]:
# Baseline before augmentation: number of .wav files in the mixed directory.
directory = 'data/audio/mixed'

files = os.listdir(directory)

# Keep only the entries with a .wav extension and count them
wav_files = list(filter(lambda name: name.endswith('.wav'), files))
num_wav_files = len(wav_files)

print(f'There are {num_wav_files} .wav files in the mixed utterance audio directory.')

There are 633 .wav files in the mixed utterance audio directory.


In [20]:
#AUGMENTATION MIXED UTTERANCES
#Augmenting female utterances to match
#(1) pitch shifting
# Writes pitch-shifted copies of selected female utterances into the mixed
# directory and stores the extended labels in data/audio/labels_mixed.json.

# Set the directory and label for augmented data
directory = 'data/audio/mixed/'
label = 'A1'

# Use the female labels
labels = sarcasm_labels_F

# Fixed seed so the same files are chosen on every run; a re-run then
# overwrites the existing 'A1_' files instead of adding a different selection.
sampler = random.Random(1)

# Get all the .wav files in the directory (sorted, because the order returned
# by os.listdir is arbitrary and would otherwise change the selection)
files = sorted(file for file in os.listdir(directory) if file.endswith('.wav'))

# Separate the sarcastic and non-sarcastic female files. Ids that are missing
# from the female labels (male or already augmented files) match neither value.
sarcastic_files = [file for file in files if labels.get(file.split('.')[0]) == 1]
non_sarcastic_files = [file for file in files if labels.get(file.split('.')[0]) == 0]

# Randomly (but reproducibly) select files for augmentation
selected_sarcastic_files = sampler.sample(sarcastic_files, 52)
selected_non_sarcastic_files = sampler.sample(non_sarcastic_files, 62)

# Copy the original labels
all_labels = copy.deepcopy(sarcasm_labels_mixed)

# (selected files, sarcasm label, pitch shift in half-steps)
# NOTE(review): the shift direction differs per class, so it correlates with
# the label -- confirm that this is intended.
augmentation_plan = [
    (selected_sarcastic_files, 1, -2),     # sarcastic: two half-steps down
    (selected_non_sarcastic_files, 0, 2),  # non-sarcastic: two half-steps up
]

for selected, sarcasm_value, n_steps in augmentation_plan:
    for filename in selected:
        y, sr = librosa.load(os.path.join(directory, filename))
        y_shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)
        new_filename = f'{label}_{filename}'
        sf.write(os.path.join(directory, new_filename), y_shifted, sr)
        all_labels[new_filename.split('.')[0]] = sarcasm_value

# Save all the labels as a JSON file
with open('data/audio/labels_mixed.json', 'w') as f:
    json.dump(all_labels, f)

In [21]:
# Number of .wav files in the mixed directory after the pitch-shift pass.
directory = 'data/audio/mixed'

files = os.listdir(directory)

# Walk the listing and keep the .wav entries
wav_files = []
for entry in files:
    if entry.endswith('.wav'):
        wav_files.append(entry)
num_wav_files = len(wav_files)

print(f'There are now {num_wav_files} .wav files in the mixed utterance audio directory after the first augmentation.')

There are now 747 .wav files in the mixed utterance audio directory after the first augmentation.


In [22]:
#Do it again for the second augmentation
#AUGMENTATION MIXED UTTERANCES
#Augmenting female utterances to match
#(2) additive Gaussian noise
# Writes noisy copies of selected female utterances into the mixed directory
# and adds their labels to data/audio/labels_mixed.json.

# Set the directory and label for augmented data
directory = 'data/audio/mixed/'
label = 'A2'

# Use the female labels
labels = sarcasm_labels_F

# Fixed seeds for both the file selection and the noise, so a re-run
# reproduces (and overwrites) the same 'A2_' files instead of adding new ones.
sampler = random.Random(2)
noise_rng = np.random.default_rng(2)

# Get all the .wav files in the directory (sorted, because the order returned
# by os.listdir is arbitrary and would otherwise change the selection)
files = sorted(file for file in os.listdir(directory) if file.endswith('.wav'))

# Separate the sarcastic and non-sarcastic female files. Ids that are missing
# from the female labels (male or already augmented files) match neither value.
sarcastic_files = [file for file in files if labels.get(file.split('.')[0]) == 1]
non_sarcastic_files = [file for file in files if labels.get(file.split('.')[0]) == 0]

# Randomly (but reproducibly) select files for augmentation
selected_sarcastic_files = sampler.sample(sarcastic_files, 52)
selected_non_sarcastic_files = sampler.sample(non_sarcastic_files, 61)

# Load the existing labels (they already contain the first augmentation)
with open('data/audio/labels_mixed.json', 'r') as f:
    all_labels = json.load(f)

# (selected files, sarcasm label); both classes get the same noise treatment
augmentation_plan = [
    (selected_sarcastic_files, 1),      # 1 for sarcastic
    (selected_non_sarcastic_files, 0),  # 0 for non-sarcastic
]

for selected, sarcasm_value in augmentation_plan:
    for filename in selected:
        y, sr = librosa.load(os.path.join(directory, filename))
        noise = noise_rng.normal(0, 0.007, y.shape)  # Generate Gaussian noise
        y_noisy = y + noise  # Add noise to the original audio
        new_filename = f'{label}_{filename}'
        sf.write(os.path.join(directory, new_filename), y_noisy, sr)
        all_labels[new_filename.split('.')[0]] = sarcasm_value


# Save all the labels as a JSON file
with open('data/audio/labels_mixed.json', 'w') as f:
    json.dump(all_labels, f)

In [23]:
# Number of .wav files in the mixed directory after the noise pass.
directory = 'data/audio/mixed'

files = os.listdir(directory)

# Keep only the entries with a .wav extension and count them
wav_files = list(filter(lambda name: name.endswith('.wav'), files))
num_wav_files = len(wav_files)

print(f'There are now {num_wav_files} .wav files in the mixed utterance audio directory after the second augmentation.')

There are now 860 .wav files in the mixed utterance audio directory after the second augmentation.


To check that all the correct audio files are present, we rely on the labels that were created at the same time and check that the sets are now balanced:

In [39]:
# Summary statistics over the three stored label files: class balance per
# dataset and the number of original male/female ids in the mixed set.

# Load the labels
with open('data/audio/labels_mixed.json', 'r') as f:
    mixed_labels = json.load(f)

with open('data/audio/labels_F.json', 'r') as f:
    labels_F = json.load(f)

with open('data/audio/labels_M.json', 'r') as f:
    labels_M = json.load(f)

# Mixed
sarcastic_count = sum(1 for label in mixed_labels.values() if label == 1)
non_sarcastic_count = sum(1 for label in mixed_labels.values() if label == 0)

# Mixed: F vs M
# Direct dictionary membership replaces a linear scan over all keys per id.
male_count_mixed = sum(1 for utterance_id in mixed_labels if utterance_id in labels_M)
female_count_mixed = sum(1 for utterance_id in mixed_labels if utterance_id in labels_F)

# Female Set
sarcastic_count_F = sum(1 for label in labels_F.values() if label == 1)
non_sarcastic_count_F = sum(1 for label in labels_F.values() if label == 0)
# Male Set
sarcastic_count_M = sum(1 for label in labels_M.values() if label == 1)
non_sarcastic_count_M = sum(1 for label in labels_M.values() if label == 0)

print(f'Sarcastic count: {sarcastic_count}')
print(f'Non-sarcastic count: {non_sarcastic_count}')
print(f'Male count in mixed labels: {male_count_mixed}')
print(f'Female count in mixed labels: {female_count_mixed}')
print(f'Sarcastic count in  female labels: {sarcastic_count_F}')
print(f'Non-sarcastic count in female labels: {non_sarcastic_count_F}')
print(f'Sarcastic count in sarcasmM_labels: {sarcastic_count_M}')
print(f'Non-sarcastic count in sarcasmM_labels: {non_sarcastic_count_M}')


Sarcastic count: 430
Non-sarcastic count: 430
Male count in mixed labels: 430
Female count in mixed labels: 203
Sarcastic count in  female labels: 102
Non-sarcastic count in female labels: 101
Sarcastic count in sarcasmM_labels: 224
Non-sarcastic count in sarcasmM_labels: 224


**Now to create the correct spectrograms for each of the three datasets:**

In [42]:
#NOT TO BE RE-RUN
#Creation of spectrograms from the waveform data sorted into their proper directories
#For every .wav file three 224x224 arrays (red, green and blue channel of the
#colour-mapped log-spectrogram) are saved as .npy files.

#FEMALE UTTERANCES

#Directory containing audio files
audio_dir = 'data/audio/F/'

#Directory to save spectrograms
spectrogram_dir = 'data/audio/spectrograms_F/'

#np.save does not create missing directories, so make sure the target exists
os.makedirs(spectrogram_dir, exist_ok=True)

#Fixed waveform length in samples: the longest waveform found in the dataset
#(1_213.wav). Shorter waveforms are zero-padded, longer ones are truncated.
max_length = 882882

#The colormap lookup does not depend on the file, so do it once
colormap = plt.get_cmap('viridis')

for file_name in os.listdir(audio_dir):
    if not file_name.endswith('.wav'):
        continue

    file_path = os.path.join(audio_dir, file_name)
    y, sr = librosa.load(file_path)

    #Padding (or truncating) the waveform to the maximum length found
    if len(y) < max_length:
        y_padded = np.pad(y, (0, max_length - len(y)), 'constant')
    else:
        y_padded = y[:max_length]

    #Compute the short-time Fourier transform
    D = librosa.stft(y_padded)

    #Convert the amplitude to decibels (logarithmic scale)
    D_log = librosa.amplitude_to_db(abs(D))

    #Resize the spectrogram to 224x224 -> wanted dimensions for the Beit model
    resize_factor_x = 224 / D_log.shape[1]
    resize_factor_y = 224 / D_log.shape[0]
    D_log_resized = zoom(D_log, (resize_factor_y, resize_factor_x))

    #Min-max normalisation to [0, 1], the range the colormap expects for
    #float input. Dividing by the maximum alone leaves negative decibel
    #values below 0 (all drawn in the lowest colour) and breaks down when
    #the maximum itself is not positive.
    db_min = D_log_resized.min()
    db_range = D_log_resized.max() - db_min
    if db_range > 0:
        D_log_normalised = (D_log_resized - db_min) / db_range
    else:
        #Constant spectrogram: avoid a division by zero
        D_log_normalised = np.zeros_like(D_log_resized)

    #Conversion to colour image
    D_log_resized_color = colormap(D_log_normalised)

    #Remove the alpha channel of the RGBA image
    D_log_resized_color = D_log_resized_color[:, :, :3]

    #Split the image into its three colour channels
    red_channel = D_log_resized_color[:, :, 0]
    green_channel = D_log_resized_color[:, :, 1]
    blue_channel = D_log_resized_color[:, :, 2]

    #Save the spectrograms separately for each colour channel
    red_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_red.npy'))
    green_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_green.npy'))
    blue_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_blue.npy'))

    np.save(red_spectrogram_path, red_channel)
    np.save(green_spectrogram_path, green_channel)
    np.save(blue_spectrogram_path, blue_channel)

In [46]:
#To check whether correct number of files: three channel arrays (.npy) are
#expected per .wav file. Only these extensions are counted, so unrelated
#directory entries (e.g. hidden system files) cannot cause a false failure.
directory = 'data/audio/spectrograms_F/'

files = os.listdir(directory)
num_files = len([file for file in files if file.endswith('.npy')])

num_audio_files = len([file for file in os.listdir('data/audio/F/') if file.endswith('.wav')])

assert num_files == 3*num_audio_files, (
    f'Expected {3*num_audio_files} spectrogram files, found {num_files}.'
)

In [43]:
#NOT TO BE RE-RUN
#Creation of spectrograms from the waveform data sorted into their proper directories
#For every .wav file three 224x224 arrays (red, green and blue channel of the
#colour-mapped log-spectrogram) are saved as .npy files.

#MALE UTTERANCES

#Directory containing audio files
audio_dir = 'data/audio/M/'

#Directory to save spectrograms
spectrogram_dir = 'data/audio/spectrograms_M/'

#np.save does not create missing directories, so make sure the target exists
os.makedirs(spectrogram_dir, exist_ok=True)

#Fixed waveform length in samples: the longest waveform found in the dataset
#(1_213.wav). Shorter waveforms are zero-padded, longer ones are truncated.
max_length = 882882

#The colormap lookup does not depend on the file, so do it once
colormap = plt.get_cmap('viridis')

for file_name in os.listdir(audio_dir):
    if not file_name.endswith('.wav'):
        continue

    file_path = os.path.join(audio_dir, file_name)
    y, sr = librosa.load(file_path)

    #Padding (or truncating) the waveform to the maximum length found
    if len(y) < max_length:
        y_padded = np.pad(y, (0, max_length - len(y)), 'constant')
    else:
        y_padded = y[:max_length]

    #Compute the short-time Fourier transform
    D = librosa.stft(y_padded)

    #Convert the amplitude to decibels (logarithmic scale)
    D_log = librosa.amplitude_to_db(abs(D))

    #Resize the spectrogram to 224x224 -> wanted dimensions for the Beit model
    resize_factor_x = 224 / D_log.shape[1]
    resize_factor_y = 224 / D_log.shape[0]
    D_log_resized = zoom(D_log, (resize_factor_y, resize_factor_x))

    #Min-max normalisation to [0, 1], the range the colormap expects for
    #float input. Dividing by the maximum alone leaves negative decibel
    #values below 0 (all drawn in the lowest colour) and breaks down when
    #the maximum itself is not positive.
    db_min = D_log_resized.min()
    db_range = D_log_resized.max() - db_min
    if db_range > 0:
        D_log_normalised = (D_log_resized - db_min) / db_range
    else:
        #Constant spectrogram: avoid a division by zero
        D_log_normalised = np.zeros_like(D_log_resized)

    #Conversion to colour image
    D_log_resized_color = colormap(D_log_normalised)

    #Remove the alpha channel of the RGBA image
    D_log_resized_color = D_log_resized_color[:, :, :3]

    #Split the image into its three colour channels
    red_channel = D_log_resized_color[:, :, 0]
    green_channel = D_log_resized_color[:, :, 1]
    blue_channel = D_log_resized_color[:, :, 2]

    #Save the spectrograms separately for each colour channel
    red_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_red.npy'))
    green_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_green.npy'))
    blue_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_blue.npy'))

    np.save(red_spectrogram_path, red_channel)
    np.save(green_spectrogram_path, green_channel)
    np.save(blue_spectrogram_path, blue_channel)

In [47]:
#To check whether correct number of files: three channel arrays (.npy) are
#expected per .wav file. Only these extensions are counted, so unrelated
#directory entries (e.g. hidden system files) cannot cause a false failure.
directory = 'data/audio/spectrograms_M/'

files = os.listdir(directory)
num_files = len([file for file in files if file.endswith('.npy')])

num_audio_files = len([file for file in os.listdir('data/audio/M/') if file.endswith('.wav')])

assert num_files == 3*num_audio_files, (
    f'Expected {3*num_audio_files} spectrogram files, found {num_files}.'
)

In [45]:
#NOT TO BE RE-RUN
#Creation of spectrograms from the waveform data sorted into their proper directories
#For every .wav file three 224x224 arrays (red, green and blue channel of the
#colour-mapped log-spectrogram) are saved as .npy files.

#MIXED UTTERANCES

#Directory containing audio files
audio_dir = 'data/audio/mixed/'

#Directory to save spectrograms
spectrogram_dir = 'data/audio/spectrograms_mixed/'

#np.save does not create missing directories, so make sure the target exists
os.makedirs(spectrogram_dir, exist_ok=True)

#Fixed waveform length in samples: the longest waveform found in the dataset
#(1_213.wav). Shorter waveforms are zero-padded, longer ones are truncated.
max_length = 882882

#The colormap lookup does not depend on the file, so do it once
colormap = plt.get_cmap('viridis')

for file_name in os.listdir(audio_dir):
    if not file_name.endswith('.wav'):
        continue

    file_path = os.path.join(audio_dir, file_name)
    y, sr = librosa.load(file_path)

    #Padding (or truncating) the waveform to the maximum length found
    if len(y) < max_length:
        y_padded = np.pad(y, (0, max_length - len(y)), 'constant')
    else:
        y_padded = y[:max_length]

    #Compute the short-time Fourier transform
    D = librosa.stft(y_padded)

    #Convert the amplitude to decibels (logarithmic scale)
    D_log = librosa.amplitude_to_db(abs(D))

    #Resize the spectrogram to 224x224 -> wanted dimensions for the Beit model
    resize_factor_x = 224 / D_log.shape[1]
    resize_factor_y = 224 / D_log.shape[0]
    D_log_resized = zoom(D_log, (resize_factor_y, resize_factor_x))

    #Min-max normalisation to [0, 1], the range the colormap expects for
    #float input. Dividing by the maximum alone leaves negative decibel
    #values below 0 (all drawn in the lowest colour) and breaks down when
    #the maximum itself is not positive.
    db_min = D_log_resized.min()
    db_range = D_log_resized.max() - db_min
    if db_range > 0:
        D_log_normalised = (D_log_resized - db_min) / db_range
    else:
        #Constant spectrogram: avoid a division by zero
        D_log_normalised = np.zeros_like(D_log_resized)

    #Conversion to colour image
    D_log_resized_color = colormap(D_log_normalised)

    #Remove the alpha channel of the RGBA image
    D_log_resized_color = D_log_resized_color[:, :, :3]

    #Split the image into its three colour channels
    red_channel = D_log_resized_color[:, :, 0]
    green_channel = D_log_resized_color[:, :, 1]
    blue_channel = D_log_resized_color[:, :, 2]

    #Save the spectrograms separately for each colour channel
    red_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_red.npy'))
    green_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_green.npy'))
    blue_spectrogram_path = os.path.join(spectrogram_dir, file_name.replace('.wav', '_blue.npy'))

    np.save(red_spectrogram_path, red_channel)
    np.save(green_spectrogram_path, green_channel)
    np.save(blue_spectrogram_path, blue_channel)

In [48]:
#To check whether correct number of files: three channel arrays (.npy) are
#expected per .wav file. Only these extensions are counted, so unrelated
#directory entries (e.g. hidden system files) cannot cause a false failure.
directory = 'data/audio/spectrograms_mixed/'

files = os.listdir(directory)
num_files = len([file for file in files if file.endswith('.npy')])

num_audio_files = len([file for file in os.listdir('data/audio/mixed/') if file.endswith('.wav')])

assert num_files == 3*num_audio_files, (
    f'Expected {3*num_audio_files} spectrogram files, found {num_files}.'
)