In [4]:
import numpy as np
import pandas as pd
import librosa
import glob
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Directory where the MFCC CSV files are stored.
# NOTE(review): nothing in the visible code reads data_dir -- the glob pattern
# used further down ('*-MFCC.csv') is relative to the current working
# directory. Confirm whether this variable is consumed elsewhere or is a
# leftover placeholder.
data_dir = "path_to_your_mfcc_files"  # Replace with your path

# Function to compute statistical summaries for each feature type
def compute_statistics(feature_matrix, feature_type):
    """Summarise every row of a 2-D feature matrix with six statistics.

    For each row the mean, standard deviation, minimum, maximum, skewness
    and kurtosis are computed over that row's values.

    Args:
        feature_matrix: 2-D array indexed as ``feature_matrix[row, :]``.
        feature_type: Prefix used when building the feature names
            (for example ``"mfcc"`` yields ``"mfcc_mean_1"``).

    Returns:
        A ``(values, names)`` pair of equally long lists. Both are grouped by
        statistic, not by row: all means first, then all standard deviations,
        mins, maxs, skews and finally kurtoses. Names are 1-based, following
        the pattern ``"{feature_type}_{statistic}_{row_number}"``.
    """
    row_count = feature_matrix.shape[0]

    # (label used in the feature name, reducer applied to one row), listed in
    # the order the statistic groups appear in the output.
    reducers = (
        ("mean", np.mean),
        ("std", np.std),
        ("min", np.min),
        ("max", np.max),
        ("skew", skew),
        ("kurt", kurtosis),
    )

    values = [
        reduce_row(feature_matrix[row, :])
        for _, reduce_row in reducers
        for row in range(row_count)
    ]
    names = [
        f"{feature_type}_{label}_{row + 1}"
        for label, _ in reducers
        for row in range(row_count)
    ]
    return values, names

# Process each CSV file and extract features.
# Every matching file is read as a header-less matrix, extended with its
# first- and second-order deltas, and reduced by compute_statistics to one
# flat feature vector (one row of the final table).
all_song_features = []
all_file_names = []
feature_names = None

# glob returns matches in a filesystem-dependent order; sort them so the row
# order of the resulting table is reproducible across runs and machines.
file_names = sorted(glob.glob('*-MFCC.csv'))
if not file_names:
    # Without this guard feature_names stays None and the DataFrame step
    # below dies with "'NoneType' object is not subscriptable".
    raise FileNotFoundError(
        "No files matching '*-MFCC.csv' were found in the current working directory."
    )

for file_name in file_names:
    # Load MFCC data (the CSV has no header row)
    mfcc_data = pd.read_csv(file_name, header=None).values

    # Compute delta and delta-delta (librosa differentiates along the last
    # axis by default)
    delta_mfcc = librosa.feature.delta(mfcc_data)
    delta2_mfcc = librosa.feature.delta(mfcc_data, order=2)

    # Compute statistical summaries for each feature type in the specified order
    mfcc_stats, mfcc_names = compute_statistics(mfcc_data, "mfcc")
    delta_stats, delta_names = compute_statistics(delta_mfcc, "delta")
    delta2_stats, delta2_names = compute_statistics(delta2_mfcc, "delta2")

    # Concatenate features for the song
    song_features = mfcc_stats + delta_stats + delta2_stats

    # Capture feature names only once (taken from the first file processed);
    # "file_name" is kept as the leading entry so the list mirrors the final
    # column layout.
    if feature_names is None:
        feature_names = ["file_name"] + mfcc_names + delta_names + delta2_names

    # A file with a different number of rows yields a vector of a different
    # length; report which file is inconsistent instead of letting the
    # DataFrame constructor fail later without naming it.
    expected_length = len(feature_names) - 1
    if len(song_features) != expected_length:
        raise ValueError(
            f"{file_name!r} produced {len(song_features)} features, "
            f"expected {expected_length}; all MFCC files must have the same number of rows."
        )

    all_song_features.append(song_features)

    # Capture file name
    all_file_names.append(file_name)

# Convert list of feature vectors to a DataFrame with feature names as column
# headers, then put the source file name in the first column.
feature_df = pd.DataFrame(all_song_features, columns=feature_names[1:])
feature_df.insert(0, "file_name", all_file_names)

In [7]:
# Persist the per-song feature table. to_csv is called with its defaults, so
# the DataFrame index is written as an unnamed leading column.
output_csv_path = 'train_master.csv'
feature_df.to_csv(output_csv_path)