In [1]:
# Mount Google Drive at /content/drive; the dataset and output paths used by
# the later cells all live under /content/drive/MyDrive.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so
# this cell will fail elsewhere — confirm that is the intended environment.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import os

# Root folder that receives the pitch-augmented spectrogram images. It is
# created up front (no error if it already exists) so later cells can write
# into it directly.
base_out = "/content/drive/MyDrive/DS340_Final_Project/augmented_spectrograms_pitch"
os.makedirs(base_out, exist_ok=True)

# Input roots scanned for audio. The processing loop treats every
# sub-directory of these folders as one actor holding .wav recordings.
data_dirs = [
    "/content/drive/MyDrive/DS340_Final_Project/Audio_Song_Actors_01-24",
    "/content/drive/MyDrive/DS340_Final_Project/Audio_Speech_Actors_01-24",
]

In [9]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob

In [10]:
# Pitch offsets applied to every recording; each value is passed as `n_steps`
# to librosa.effects.pitch_shift (semitones with librosa's default of 12 bins
# per octave) and also names the output folder, e.g. "pitch_-4" / "pitch_+2".
pitch_shifts = [-4, -2, +2, +4]

In [11]:
# Two-digit emotion code -> label. The code is read from the third
# dash-separated field of each .wav file name, and the label doubles as the
# name of the output sub-folder for that emotion.
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

In [12]:
# Build the pitch-augmented image set: for every pitch offset, pitch-shift each
# recording, convert it to a dB-scaled mel-spectrogram and save it as
#   base_out/pitch_<signed offset>/<emotion>/<original stem>_pitch<offset>.png
# Existing images with the same name are overwritten.
for n_steps in pitch_shifts:
    # 1) one shared directory per pitch offset (e.g. "pitch_-4", "pitch_+2")
    pitch_dir = os.path.join(base_out, f"pitch_{n_steps:+}")
    os.makedirs(pitch_dir, exist_ok=True)

    # 2) inside it, one subfolder per emotion label
    for emo in emotion_map.values():
        os.makedirs(os.path.join(pitch_dir, emo), exist_ok=True)

    # 3) process both Song & Speech folders into the same tree
    for data_path in data_dirs:
        desc = f"Pitch={n_steps:+} [{os.path.basename(data_path)}]"
        for actor in tqdm(os.listdir(data_path), desc=desc):
            actor_path = os.path.join(data_path, actor)
            if not os.path.isdir(actor_path):
                # stray files next to the actor folders are ignored
                continue

            for fn in os.listdir(actor_path):
                # Split the extension once so the (case-insensitive) filter and
                # the output name below always agree; a ".WAV" file previously
                # passed the filter but kept its name, so savefig was handed a
                # non-image extension.
                stem, ext = os.path.splitext(fn)
                if ext.lower() != ".wav":
                    continue

                # The emotion code is the third dash-separated field of the
                # file name. Names with fewer fields are skipped instead of
                # raising IndexError and aborting the whole (long) run.
                fields = fn.split("-")
                if len(fields) < 3:
                    continue
                emotion = emotion_map.get(fields[2])
                if emotion is None:
                    continue

                # load at the file's native sampling rate (sr=None) & pitch-shift
                wav_path = os.path.join(actor_path, fn)
                y, sr    = librosa.load(wav_path, sr=None)
                y_shift  = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

                # mel-spectrogram -> dB, referenced to the clip's own peak power
                S    = librosa.feature.melspectrogram(y=y_shift, sr=sr)
                S_db = librosa.power_to_db(S, ref=np.max)

                # save into pitch_dir/emotion as a borderless 3x3 inch image
                outname   = f"{stem}_pitch{n_steps:+}.png"
                save_path = os.path.join(pitch_dir, emotion, outname)

                # Explicit figure/axes so the figure is always released, even
                # if drawing or saving fails part-way through.
                fig, ax = plt.subplots(figsize=(3, 3))
                try:
                    librosa.display.specshow(S_db, sr=sr,
                                             x_axis='time', y_axis='mel',
                                             ax=ax)
                    ax.axis('off')
                    fig.savefig(save_path, bbox_inches='tight', pad_inches=0)
                finally:
                    plt.close(fig)

Pitch=-4 [Audio_Song_Actors_01-24]: 100%|██████████| 24/24 [16:51<00:00, 42.13s/it]
Pitch=-4 [Audio_Speech_Actors_01-24]: 100%|██████████| 25/25 [19:06<00:00, 45.86s/it]
Pitch=-2 [Audio_Song_Actors_01-24]: 100%|██████████| 24/24 [03:16<00:00,  8.19s/it]
Pitch=-2 [Audio_Speech_Actors_01-24]: 100%|██████████| 25/25 [04:15<00:00, 10.21s/it]
Pitch=+2 [Audio_Song_Actors_01-24]: 100%|██████████| 24/24 [03:32<00:00,  8.84s/it]
Pitch=+2 [Audio_Speech_Actors_01-24]: 100%|██████████| 25/25 [04:25<00:00, 10.63s/it]
Pitch=+4 [Audio_Song_Actors_01-24]: 100%|██████████| 24/24 [03:48<00:00,  9.52s/it]
Pitch=+4 [Audio_Speech_Actors_01-24]: 100%|██████████| 25/25 [04:40<00:00, 11.22s/it]
