In [None]:
# Colab-only setup: attach Google Drive to the runtime's filesystem so the
# dataset and output folders under /content/drive are reachable.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import os, glob, random
import numpy as np
import matplotlib.pyplot as plt
import librosa, librosa.display
from tqdm import tqdm

In [None]:
# --- Run configuration -------------------------------------------------------
# Root folder of the input .wav files (scanned one sub-folder level deep).
DATA_PATH = "/content/drive/MyDrive/DS340_Final_Project/Audio_Speech_Actors_01-24"

# Destination root; one sub-folder per emotion label is created beneath it.
OUT_PATH = "/content/drive/MyDrive/DS340_Final_Project/augmented_spectrograms_dataset_speech"

# How many random augmented variants are rendered for every input clip.
AUG_PER_FILE = 3

# Every clip is resampled to this rate (Hz) when it is loaded.
SR_TARGET = 22050

In [None]:
# Two-digit emotion code (third "-"-separated field of a clip's file name)
# mapped to the label used as the output sub-folder name.
# NOTE(review): codes look like the RAVDESS naming scheme - confirm against the dataset docs.
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Pre-create one output directory per label; harmless if they already exist.
for lbl in emotion_map.values():
    label_dir = os.path.join(OUT_PATH, lbl)
    os.makedirs(label_dir, exist_ok=True)

In [None]:
def pitch_shift(y, sr, n_steps):
    """Return a pitch-shifted version of the signal `y`.

    Thin wrapper around ``librosa.effects.pitch_shift``.

    Args:
        y: Audio time series.
        sr: Sample rate of `y`.
        n_steps: Number of steps to shift by (semitones with librosa's
            default ``bins_per_octave``); negative values shift down.

    Returns:
        The shifted audio time series produced by librosa.
    """
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted

def time_stretch(y, rate):
    """Return the signal `y` time-stretched by the factor `rate`.

    Thin wrapper around ``librosa.effects.time_stretch``.

    Args:
        y: Audio time series.
        rate: Stretch factor passed straight to librosa (per the librosa
            docs, > 1 speeds the signal up and < 1 slows it down).

    Returns:
        The stretched audio time series produced by librosa.
    """
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched

def make_and_save_spectrogram(y, sr, save_path):
    """Render a mel spectrogram of `y` and write it to `save_path` as an image.

    The signal is converted to a 128-band mel power spectrogram, scaled to
    decibels relative to its own maximum, and drawn on a 3x3 inch figure with
    the axes hidden and no padding around the plot.

    Args:
        y: Audio time series.
        sr: Sample rate of `y`.
        save_path: Destination path handed to ``Figure.savefig``.

    Returns:
        None. The only effect is the file written at `save_path`.
    """
    S   = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_d = librosa.power_to_db(S, ref=np.max)

    # Use an explicit figure/axes pair instead of pyplot's implicit "current
    # figure" so the drawing cannot land on some other open figure.
    fig, ax = plt.subplots(figsize=(3, 3))
    try:
        librosa.display.specshow(S_d, sr=sr, x_axis='time', y_axis='mel', ax=ax)
        ax.axis('off')
        fig.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if drawing or saving fails; this
        # function is called thousands of times in a loop and leaked figures
        # accumulate in memory.
        plt.close(fig)

In [None]:
# Candidate pitch shifts handed to pitch_shift(); 0 means "leave the pitch alone".
pitch_choices = [-4, -2, 0, 2, 4]

# Candidate stretch factors handed to time_stretch(); 1.0 means "leave the tempo alone".
speed_choices = [
    0.25, 0.4, 0.5, 0.8,
    1.0,
    1.2, 1.5, 2.0, 2.5,
]

In [None]:
def random_augment(y, sr, rng=None):
    """Return `y` after one random pitch-shift and one random time-stretch.

    A pitch step is drawn from `pitch_choices` and a stretch rate from
    `speed_choices`. A draw of 0 steps or a rate of 1.0 skips that stage, so
    when both draws are neutral the input object itself is returned unchanged.
    The input array is never modified; each applied stage produces a new array.

    Args:
        y: Audio time series.
        sr: Sample rate of `y`.
        rng: Optional object with a ``choice(seq)`` method (for example a
            ``random.Random`` instance) used for both draws. Defaults to the
            global ``random`` module, which is the original behaviour.

    Returns:
        The augmented audio time series (or `y` itself if nothing was applied).
    """
    if rng is None:
        rng = random

    # --- Pitch
    n_steps = rng.choice(pitch_choices)
    if n_steps != 0:
        y = pitch_shift(y, sr, n_steps=n_steps)

    # --- Speed
    rate = rng.choice(speed_choices)
    if rate != 1.0:
        y = time_stretch(y, rate=rate)

    return y

In [None]:
# Seed the global RNG used by random_augment() so a re-run of this cell
# regenerates exactly the same augmented variants.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# glob returns files in filesystem-dependent order; sorting fixes the order in
# which random draws are consumed, which the seed above relies on.
wav_paths = sorted(glob.glob(os.path.join(DATA_PATH, "*", "*.wav")))
print(f"Found {len(wav_paths)} wav files")

for wav_path in tqdm(wav_paths, desc="Processing clips"):
    filename   = os.path.basename(wav_path)
    parts      = filename.split("-")
    if len(parts) < 3:
        continue                              # unexpected file-name

    # The third "-"-separated field of the file name is the emotion code.
    emotion_code = parts[2]
    emotion_lbl  = emotion_map.get(emotion_code)
    if emotion_lbl is None:
        continue                              # skip unknown code

    # ensure label directory exists
    save_dir = os.path.join(OUT_PATH, emotion_lbl)
    os.makedirs(save_dir, exist_ok=True)

    # load audio once (resampled to SR_TARGET)
    y, sr = librosa.load(wav_path, sr=SR_TARGET)
    stem  = os.path.splitext(filename)[0]

    # ---- original spectrogram
    make_and_save_spectrogram(
        y, sr,
        save_path=os.path.join(save_dir, f"{stem}_orig.png"))

    # ---- N random augmentations
    for k in range(AUG_PER_FILE):
        # random_augment never writes to its input (it only rebinds its local
        # name to new arrays), so no defensive copy of `y` is needed.
        y_aug = random_augment(y, sr)
        make_and_save_spectrogram(
            y_aug, sr,
            save_path=os.path.join(save_dir, f"{stem}_aug{k}.png"))

print("All spectrograms written to:", OUT_PATH)

Found 1440 wav files


Processing clips: 100%|██████████| 1440/1440 [25:11<00:00,  1.05s/it]

All spectrograms written to: /content/drive/MyDrive/DS340_Final_Project/augmented_spectrograms_dataset_speech



