### Play audio

In [None]:
from IPython.display import Audio  # for playing audio files

In [None]:
# Sample recording to listen to before turning it into a spectrogram.
wav_path = '/Users/hela/Code/pata/Audio_data/01/01_block1_1_56_TA_1.wav'

# Build the player widget; the bare name on the last line makes the notebook render it.
audio = Audio(wav_path)
audio

### Convert .wav to a STFT spectrogram
Short-time Fourier transform (STFT)

In [None]:
import os

import librosa  # for processing audio data
import librosa.display  # specshow lives here; must be imported explicitly on librosa < 0.10
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# one file OPTION 1 (size in inches, and no 224*224 needed for the model)

# Load the recording: audio_file is the waveform, sr the sampling rate returned by librosa.
audio_file, sr = librosa.load('/Users/hela/Code/pata/Audio_data/01/01_block1_1_56_TA_1.wav')

# Complex-valued STFT; center=False keeps the frames left-aligned.
s = librosa.stft(audio_file, center=False)

# Take the magnitude explicitly before converting to dB: amplitude_to_db expects real
# amplitudes and emits a warning (discarding the phase) when given the complex STFT.
# This also matches what OPTION 2 does.
S_db = librosa.amplitude_to_db(np.abs(s), ref=np.max)

plt.figure(figsize=(4,5))
librosa.display.specshow(S_db)
# plt.savefig('/Users/hela/Code/pata/spectrograms_test/file.png', bbox_inches='tight', pad_inches=0, dpi=100)
plt.show()

Comment:
- audio_file = the first value returned by librosa.load: the audio time series (a 1D NumPy array of amplitude values)
- sr = sampling rate (e.g., 16000 Hz for speech), the second value returned by librosa.load. By default librosa resamples the audio to 22050 Hz; sr=None uses the file’s native sampling rate.
- librosa.stft = computes STFT, returns a complex-valued matrix where rows represent frequency bins, columns represent time frames
- hop_length = number of samples between consecutive windows (e.g., 32 samples ≈ 2 ms at 16 kHz). A small hop length (high overlap) ensures smooth temporal resolution. It is not set in the code here, so librosa's default of win_length // 4 (75% overlap) is used.
- center=False = uses left-aligned frames (frame t starts at sample t * hop_length), instead of centered frames
- librosa.amplitude_to_db = converts an amplitude (magnitude) spectrogram to dB, a logarithmic scale that better matches human perception and highlights dynamic range (librosa.power_to_db is the counterpart for power spectrograms)
- ref=np.max: Sets the reference to the maximum amplitude value, normalizing the dB scale so the loudest part is 0 dB
- bbox_inches='tight' = ensures the plot fits well in the saved image.

https://librosa.org/doc/0.9.2/generated/librosa.stft.html

In [None]:
# one file OPTION 2 (224*224 needed for the model, no white frame)

# Read the recording and convert the magnitude of its STFT to a dB-scaled spectrogram
audio_file, sr = librosa.load('/Users/hela/Code/pata/Audio_data/01/01_block1_2_1_TA_2.wav')
s = librosa.stft(audio_file, center=False)
S_db = librosa.amplitude_to_db(np.abs(s), ref=np.max)

# 2.24 in x 2.24 in canvas at 100 dpi, with a single axes stretched over the whole figure
fig = plt.figure(figsize=(2.24, 2.24), dpi=100)
ax = fig.add_axes([0., 0., 1., 1.])  # [left, bottom, width, height] in figure fraction
ax.set_axis_off()  # hide ticks, labels and frame

# Draw the spectrogram on that axes
librosa.display.specshow(S_db, sr=sr, x_axis=None, y_axis=None, cmap='magma', ax=ax)

# Optional: write the image to disk without borders
# plt.savefig('/Users/hela/Code/pata/spectrograms_test/file1.png', bbox_inches='tight', pad_inches=0, dpi=100)
plt.show()

In [None]:
# check the image size (needed for the model to be 224x224)
from PIL import Image

# NOTE(review): file1.png is only written when the savefig line in the OPTION 2
# single-file cell is uncommented; otherwise this cell fails on a fresh run.
# Open inside a context manager so the file handle is released once the size is read.
with Image.open('/Users/hela/Code/pata/spectrograms_test/file1.png') as img:
    width, height = img.size  # (width, height) in pixels
print(width, height)

In [None]:
# one folder, all files

# Input folder with the recordings and output folder for the spectrogram images.
# (Named audio_dir rather than dir so the builtin dir() is not shadowed.)
audio_dir = '/Users/hela/Code/pata/Audio_data/01/'
out_dir = '/Users/hela/Code/pata/spectrograms/'
os.makedirs(out_dir, exist_ok=True)  # savefig does not create missing folders

for file in sorted(os.listdir(audio_dir)):
    # os.listdir returns every entry, including non-audio ones (e.g. a hidden
    # .DS_Store); skip those instead of letting librosa.load fail on them.
    if not file.lower().endswith('.wav'):
        continue

    audio_file, sr = librosa.load(os.path.join(audio_dir, file))
    s = librosa.stft(audio_file, center=False)
    # Magnitude first: amplitude_to_db warns when given the complex STFT directly.
    S_db = librosa.amplitude_to_db(np.abs(s), ref=np.max)

    fig = plt.figure(figsize=(5,4))
    librosa.display.specshow(S_db)

    # splitext strips only the final extension, so names containing extra dots stay intact.
    fig_dir = os.path.join(out_dir, os.path.splitext(file)[0] + '.png')
    fig.savefig(fig_dir, bbox_inches='tight')
    plt.close(fig)  # release the figure so memory does not grow over the loop

In [None]:
# all folders OPTION 1

# Root folder holding one sub-folder per recording set, and the image output folder.
major_dir = '/Users/hela/Code/pata/Audio_data/'
out_dir = '/Users/hela/Code/pata/spectrograms/'
os.makedirs(out_dir, exist_ok=True)  # savefig does not create missing folders

for folder in sorted(os.listdir(major_dir)):
    folder_path = os.path.join(major_dir, folder)
    # Plain files in the root (e.g. a hidden .DS_Store) would make os.listdir raise.
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        # Only .wav recordings can be loaded; skip everything else.
        if not file.lower().endswith('.wav'):
            continue

        audio_file, sr = librosa.load(os.path.join(folder_path, file))
        s = librosa.stft(audio_file, center=False)
        # Magnitude first: amplitude_to_db warns when given the complex STFT directly.
        S_db = librosa.amplitude_to_db(np.abs(s), ref=np.max)

        fig = plt.figure(figsize=(5,4))
        librosa.display.specshow(S_db)

        # All images land in one flat folder, named after the recording only;
        # identically named files from different sub-folders would overwrite each other.
        fig_dir = os.path.join(out_dir, os.path.splitext(file)[0] + '.png')
        fig.savefig(fig_dir, bbox_inches='tight')
        plt.close(fig)  # release the figure so memory does not grow over the loop

In [None]:
# all folders OPTION 2

# Root folder holding one sub-folder per recording set, and the image output folder.
major_dir = '/Users/hela/Code/pata/Audio_data/'
out_dir = '/Users/hela/Code/pata/spectrograms_test/'
os.makedirs(out_dir, exist_ok=True)  # savefig does not create missing folders

for folder in sorted(os.listdir(major_dir)):
    folder_path = os.path.join(major_dir, folder)
    # Plain files in the root (e.g. a hidden .DS_Store) would make os.listdir raise.
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        # Only .wav recordings can be loaded; skip everything else.
        if not file.lower().endswith('.wav'):
            continue

        # Load audio and compute the dB-scaled magnitude spectrogram
        audio_file, sr = librosa.load(os.path.join(folder_path, file))
        s = librosa.stft(audio_file, center=False)
        S_db = librosa.amplitude_to_db(np.abs(s), ref=np.max)

        # Create a 2.24 in x 2.24 in figure at 100 dpi with one borderless, full-canvas axes
        fig = plt.figure(figsize=(2.24, 2.24), dpi=100)
        ax = fig.add_axes([0., 0., 1., 1.])  # [left, bottom, width, height] in figure fraction
        ax.set_axis_off()

        # Display spectrogram on that axes
        librosa.display.specshow(S_db, sr=sr, x_axis=None, y_axis=None, cmap='magma', ax=ax)

        # Save; images from all sub-folders go into one flat folder named after the
        # recording only, so identical file names would overwrite each other.
        fig_dir = os.path.join(out_dir, os.path.splitext(file)[0] + '.png')
        fig.savefig(fig_dir, bbox_inches='tight', pad_inches=0, dpi=100)
        plt.close(fig)  # release the figure so memory does not grow over the loop

In [None]:
# check the image size final
from PIL import Image

# NOTE(review): this path is inside spectrograms/, the folder written by the
# OPTION 1 loops; the OPTION 2 loop writes to spectrograms_test/ — confirm
# which output is meant to be checked here.
# Open inside a context manager so the file handle is released once the size is read.
with Image.open('/Users/hela/Code/pata/spectrograms/07_block1_33_11_PA_1.png') as img:
    width, height = img.size  # (width, height) in pixels
print(width, height)