In [1]:
import os
import librosa
import numpy as np
import pandas as pd

# Input directory: scanned (non-recursively) for the audio files to process.
audio_dir = r"/Users/dinesh/College/final proj/attempt3/updatedMoseiData/audio"
# Output directory: receives one .npy feature file per processed clip plus
# a combined CSV of all feature vectors.
features_dir = r"/Users/dinesh/College/final proj/attempt3/features/audio"
# Create the output directory (and any missing parents) up front;
# exist_ok=True makes re-running the script a no-op here.
os.makedirs(features_dir, exist_ok=True)

# Function to extract features from an audio file using librosa
def extract_features(audio_path, n_mfcc=12):
    """Return a fixed-length summary feature vector for one audio file.

    Each feature is computed per frame by librosa and then averaged over
    the whole clip, so the result does not depend on clip duration.

    Args:
        audio_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute (default 12).

    Returns:
        A 1-D numpy array of length ``n_mfcc + 4`` laid out as
        ``[mfcc means..., pitch mean, zero-crossing rate,
        spectral centroid, spectral bandwidth]``.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None)

    # 1. MFCCs, averaged over time (one mean per coefficient)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfccs_mean = np.mean(mfccs.T, axis=0)

    # 2. Pitch (using piptrack): average the pitch candidates whose
    # magnitude is above the median magnitude. The mask is built once and
    # reused for both the emptiness check and the selection.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    strong = magnitudes > np.median(magnitudes)
    # Fall back to 0 when nothing exceeds the median, which avoids taking
    # the mean of an empty selection (NaN plus a runtime warning).
    pitch_mean = np.mean(pitches[strong]) if np.any(strong) else 0.0

    # 3. Zero-crossing rate (related to voice source parameters)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=y))

    # 4. Spectral centroid (related to timbre, tone)
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # 5. Spectral bandwidth (related to dispersion quotient)
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))

    # Combine all features into a single flat vector
    return np.hstack(
        [mfccs_mean, pitch_mean, zcr, spectral_centroid, spectral_bandwidth]
    )

# Process each audio file.
# List the directory once: the original re-listed it for every processed
# file just to print the total, which is wasted I/O on large directories.
dir_entries = os.listdir(audio_dir)
total_entries = len(dir_entries)

features_list = []
# NOTE: `i` indexes every directory entry, not only the .wav files, and
# os.listdir returns entries in arbitrary order. The numbers in the output
# file names can therefore have gaps and are tied to this listing order.
for i, file_name in enumerate(dir_entries):
    if not file_name.endswith('.wav'):  # Assuming the files are in .wav format
        continue
    audio_path = os.path.join(audio_dir, file_name)
    features = extract_features(audio_path)
    # Save the features as a numpy file
    feature_file = os.path.join(features_dir, f'features_{i+1}.npy')
    np.save(feature_file, features)
    features_list.append(features)
    print(f"Processed {i+1}/{total_entries} files")

# Optionally, save all features in a single CSV file (one row per
# processed file, in processing order)
features_df = pd.DataFrame(features_list)
features_df.to_csv(os.path.join(features_dir, 'all_features.csv'), index=False)

print("Feature extraction completed!")


Processed 1/2089 files
Processed 2/2089 files
Processed 3/2089 files
Processed 4/2089 files
Processed 5/2089 files
Processed 6/2089 files
Processed 7/2089 files
Processed 8/2089 files
Processed 9/2089 files
Processed 10/2089 files
Processed 11/2089 files
Processed 12/2089 files
Processed 13/2089 files
Processed 14/2089 files
Processed 15/2089 files
Processed 16/2089 files
Processed 17/2089 files
Processed 18/2089 files
Processed 19/2089 files
Processed 20/2089 files
Processed 21/2089 files
Processed 22/2089 files
Processed 23/2089 files
Processed 24/2089 files
Processed 25/2089 files
Processed 26/2089 files
Processed 27/2089 files
Processed 28/2089 files
Processed 29/2089 files
Processed 30/2089 files
Processed 31/2089 files
Processed 32/2089 files
Processed 33/2089 files
Processed 34/2089 files
Processed 35/2089 files
Processed 36/2089 files
Processed 37/2089 files
Processed 38/2089 files
Processed 39/2089 files
Processed 40/2089 files
Processed 41/2089 files
Processed 42/2089 files
P