In [12]:
import os
import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import multiprocessing as mp

# Extract a fixed-length feature vector from one audio file
# (pydub decodes the file, librosa computes the features)
def extract_features(file_path):
    """Return time-averaged MFCC, spectral-centroid and ZCR features of a file.

    The file is decoded with pydub, down-mixed to mono and scaled to the
    [-1.0, 1.0) range before being handed to librosa.

    Args:
        file_path: Path to an audio file that pydub/ffmpeg can decode.

    Returns:
        A 1-D array holding the per-coefficient means of 13 MFCCs, followed
        by the mean spectral centroid and the mean zero-crossing rate, or
        None if the file could not be processed.
    """
    try:
        audio = AudioSegment.from_file(file_path)
        sr = audio.frame_rate
        samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
        # pydub returns multi-channel audio interleaved (L, R, L, R, ...).
        # Passing that straight to librosa would treat it as a mono signal of
        # twice the length, so average the channels into a real mono signal.
        if audio.channels > 1:
            samples = samples.reshape((-1, audio.channels)).mean(axis=1)
        # Scale by the actual bit depth instead of assuming 16-bit samples.
        y = samples / float(2 ** (8 * audio.sample_width - 1))
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        # Collapse the time axis so every file yields a vector of equal length.
        return np.hstack((
            np.mean(mfccs, axis=1),
            np.mean(centroid, axis=1),
            np.mean(zcr, axis=1),
        ))
    except Exception as e:
        # Deliberately broad: one undecodable file must not abort the batch.
        print(f"Error processing {file_path}: {str(e)}")
        return None  # Return None on failure

# Function to process all files within a directory
def process_directory(directory):
    """Extract features for every regular file directly inside *directory*.

    Sub-directories are skipped, and files for which ``extract_features``
    returns None are left out of the result.

    Args:
        directory: Path of the folder to scan (a trailing separator is fine).

    Returns:
        A tuple ``(folder_name, features)`` where ``folder_name`` is the last
        path component of *directory* and ``features`` is a list of
        ``(file_name, feature_vector)`` pairs in sorted file-name order.
    """
    features = []
    # os.listdir() order is arbitrary; sort so the output rows are reproducible.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        feat = extract_features(file_path)
        if feat is not None:
            features.append((file_name, feat))
    # normpath drops a trailing separator, which would otherwise make the
    # folder name come out as an empty string.
    folder_name = os.path.basename(os.path.normpath(directory))
    return (folder_name, features)

# Extract, standardise and PCA-reduce features for the first max_dirs
# sub-folders of the dataset (folders are processed sequentially)
def main(data_folder, max_dirs, output_csv='extracted_features_selected_folders.csv'):
    """Build a reduced feature table for the first *max_dirs* sub-folders.

    Features are collected per file, standardised with ``StandardScaler``,
    reduced with PCA and written to *output_csv* together with the folder
    and file name of every row.

    Args:
        data_folder: Root folder whose immediate sub-folders contain audio.
        max_dirs: Number of sub-folders (in alphabetical order) to process.
        output_csv: Path of the CSV file to write.

    Returns:
        The PCA-reduced feature matrix as a 2-D array with one row per file.
        When no features could be extracted an empty array is returned and
        no CSV is written.
    """
    sub_paths = (os.path.join(data_folder, d) for d in os.listdir(data_folder))
    # Sort before slicing so "the first max_dirs" is well defined.
    directories = sorted(p for p in sub_paths if os.path.isdir(p))[:max_dirs]

    results_list = []
    for directory in directories:
        folder, file_features = process_directory(directory)
        results_list.extend((folder, file, feat) for file, feat in file_features)

    if not results_list:
        # Return an empty array rather than a list so callers can rely on
        # ndarray attributes such as .size in every case.
        return np.empty((0, 0))

    # Normalize features
    data = np.array([item[2] for item in results_list])
    normalized_features = StandardScaler().fit_transform(data)

    # PCA cannot produce more components than there are samples or features,
    # so cap the requested 20 by both dimensions.
    n_samples, n_features = normalized_features.shape
    n_components = min(20, n_samples, n_features)
    reduced_features = PCA(n_components=n_components).fit_transform(normalized_features)

    # Save to CSV with folder and file names
    df = pd.DataFrame(reduced_features)
    df['folder'] = [item[0] for item in results_list]  # Add folder names
    df['file'] = [item[1] for item in results_list]  # Add file names
    df.to_csv(output_csv, index=False)
    return reduced_features

# Script entry point: run the pipeline on a local copy of the dataset.
if __name__ == '__main__':
    data_folder = 'fma_large'  # Update this path to your dataset
    max_dirs = 5  # Set the number of directories you want to process
    features = main(data_folder, max_dirs)
    # len() works whether main() returns an ndarray or an empty list, whereas
    # .size exists only on arrays and would raise AttributeError on a list.
    if len(features) > 0:
        print("Features saved to 'extracted_features_selected_folders.csv'")
    else:
        print("No features extracted.")


Error processing fma_large/001/001486.mp3: Decoding failed. ffmpeg returned error code: 1

Output from ffmpeg/avlib:

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libso