In [1]:
# Colab-only step: attach Google Drive at /content/drive so the dataset paths
# used below (under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import glob
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [3]:
# Dataset root on the mounted Drive; sung and spoken recordings live in
# separate sub-folders directly beneath it.
base_path = "/content/drive/MyDrive/audio_dataset"
song_path, speech_path = (
    os.path.join(base_path, subfolder) for subfolder in ("songs", "speech")
)

In [4]:
def load_audio_paths(base_dir, audio_type):
    """Collect one metadata record per ``.wav`` file found one level below ``base_dir``.

    Every sub-directory of ``base_dir`` is treated as an actor folder; files
    placed directly in ``base_dir`` and non-``.wav`` files are ignored.

    Args:
        base_dir: Directory whose immediate sub-directories hold the recordings.
        audio_type: Label stored verbatim in each record's ``"type"`` field.

    Returns:
        A list of dicts with keys ``"filepath"``, ``"actor"`` (the sub-directory
        name), ``"trial"`` (the file name without its extension) and ``"type"``.
        Records are ordered by actor folder, then by file path, so the result
        does not depend on the filesystem's listing order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``base_dir`` cannot be read.
    """
    data = []
    # os.listdir/glob return entries in arbitrary order; sorting keeps the
    # resulting table (and any seeded split built from it) reproducible.
    for actor_folder in sorted(os.listdir(base_dir)):
        actor_path = os.path.join(base_dir, actor_folder)
        if not os.path.isdir(actor_path):
            continue
        # Escape the directory part so folder names containing [, ], * or ?
        # are matched literally instead of being interpreted as glob patterns.
        pattern = os.path.join(glob.escape(actor_path), "*.wav")
        for wav_file in sorted(glob.glob(pattern)):
            # splitext removes only the final extension; str.replace would also
            # strip any ".wav" embedded earlier in the file name.
            trial_id = os.path.splitext(os.path.basename(wav_file))[0]
            data.append({
                "filepath": wav_file,
                "actor": actor_folder,
                "trial": trial_id,
                "type": audio_type
            })
    return data

In [5]:
# Index both corpora separately, then stack the records (songs first) into a
# single table with one row per audio file.
song_data = load_audio_paths(base_dir=song_path, audio_type="song")
speech_data = load_audio_paths(base_dir=speech_path, audio_type="speech")
df = pd.DataFrame([*song_data, *speech_data])
print("Total audio files:", df.shape[0])


Total audio files: 2452


In [12]:
# NOTE(review): img_size is not used in this cell; presumably the target size
# for a later resize/transform step — confirm against the cells below.
img_size = (128, 128)
# Same location as before, now derived from base_path so the dataset root is
# defined in exactly one place.
spectrogram_dir = os.path.join(base_path, "spectrograms")
# Set to True to force re-rendering, e.g. after changing the mel parameters.
overwrite_spectrograms = False
os.makedirs(spectrogram_dir, exist_ok=True)
spectrogram_paths = []
labels = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    image_name = f"{row['actor']}_{row['trial']}_{row['type']}.png"
    full_path = os.path.join(spectrogram_dir, image_name)

    # Rendering the full corpus takes minutes, so images written by an earlier
    # run are reused. The size check rejects empty files left behind by an
    # interrupted save.
    already_rendered = os.path.exists(full_path) and os.path.getsize(full_path) > 0
    if overwrite_spectrograms or not already_rendered:
        y, sr = librosa.load(row['filepath'], sr=22050)
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        # Convert power to decibels relative to the clip's own maximum.
        S_DB = librosa.power_to_db(S, ref=np.max)

        # Draw on an explicit Axes instead of relying on pyplot's current figure.
        fig, ax = plt.subplots(figsize=(2, 2))
        try:
            librosa.display.specshow(S_DB, sr=sr, x_axis='time', y_axis='mel', ax=ax)
            ax.axis('off')  # save the bare image: no ticks, labels or frame
            fig.savefig(full_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even if drawing or saving fails, so a
            # long loop does not accumulate open figures.
            plt.close(fig)

    spectrogram_paths.append(full_path)
    labels.append(row['type'])

# Encode labels: map each distinct value of the "type" column to an integer id
# (the mapping can be read back from le.classes_).
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

100%|██████████| 2452/2452 [05:21<00:00,  7.64it/s]
