In [1]:
!pip install librosa numpy pandas scipy tqdm


Collecting librosa
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting numpy
  Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting scipy
  Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting scikit-learn>=1.1.0 (from librosa)
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.0 (from librosa)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Using cached soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting poo

In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from tqdm import tqdm

In [3]:
# Root folder that is searched recursively for .flac files.
# Set the MEDIVOICE_AUDIO_DIR environment variable to point at your own copy of
# the dataset; when it is unset (or empty) the original local path is used.
AUDIO_DIR = os.environ.get("MEDIVOICE_AUDIO_DIR") or r"C:\Users\Rishi S Etagi\Desktop\medivoice\LIBRI"

# CSV file (relative to the working directory) that receives one row per audio clip.
OUTPUT_CSV = "speech_features.csv"

In [None]:
def extract_features(filepath, sr=16000, n_mfcc=13, peak_height=0.02, peak_distance=1000):
    """Compute a flat dictionary of summary speech features for one audio file.

    Parameters
    ----------
    filepath : str or path-like
        Audio file passed to ``librosa.load``.
    sr : int or None, default 16000
        Sample rate requested from ``librosa.load``. The rate returned by the
        loader is the one used for every later computation.
    n_mfcc : int, default 13
        Number of MFCC coefficients; one ``mfcc_<k>`` entry is produced for each.
    peak_height : float, default 0.02
        Minimum value of ``|y|`` for a sample to be counted as a peak.
    peak_distance : int, default 1000
        Minimum distance, in samples, between two counted peaks.

    Returns
    -------
    dict
        Keys, in order: ``filename`` (base name of ``filepath``), ``pitch_mean``,
        ``energy``, ``speaking_rate`` and ``mfcc_1`` ... ``mfcc_<n_mfcc>``.

    Notes
    -----
    Exceptions raised by librosa (for example for unreadable files) are not
    caught here; they propagate to the caller.
    """
    y, sr = librosa.load(filepath, sr=sr)

    # Spectral shape: each MFCC coefficient is averaged over time (axis=1).
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc_mean = np.mean(mfccs, axis=1)

    # Pitch: keep only the candidates whose magnitude is above the median
    # magnitude, then average them. Falls back to 0 when nothing is selected.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)]
    pitch_mean = np.mean(pitch) if len(pitch) > 0 else 0

    # Loudness proxy. NOTE: this is the *sum* of the RMS values returned by
    # librosa, so it grows with clip length. It is deliberately left unchanged
    # so that newly extracted rows stay comparable with existing CSVs.
    energy = np.sum(librosa.feature.rms(y=y))

    # Speaking rate: number of peaks of the rectified waveform per second.
    envelope = np.abs(y)
    peaks, _ = find_peaks(envelope, height=peak_height, distance=peak_distance)
    duration = len(y) / sr
    speaking_rate = len(peaks) / duration if duration > 0 else 0

    return {
        "filename": os.path.basename(filepath),
        "pitch_mean": pitch_mean,
        "energy": energy,
        "speaking_rate": speaking_rate,
        **{f"mfcc_{i+1}": mfcc_mean[i] for i in range(len(mfcc_mean))}
    }


# Collect every FLAC file below AUDIO_DIR first. This gives a single progress bar
# for the whole run (instead of one bar per directory that also counts non-audio
# files) and, thanks to the sort, a row order that does not depend on the order
# in which the file system lists entries. The extension check is case-insensitive
# so files named "*.FLAC" are not skipped.
flac_paths = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk(AUDIO_DIR)
    for file in files
    if file.lower().endswith(".flac")
)

features = []
for path in tqdm(flac_paths, desc="Extracting features"):
    try:
        features.append(extract_features(path))
    except Exception as e:
        # Best-effort extraction: report the failing file and keep going.
        # tqdm.write prints the message without breaking the progress bar.
        tqdm.write(f"Error processing {os.path.basename(path)}: {e}")

100%|██████████| 2703/2703 [01:47<00:00, 25.19it/s]


In [7]:
# Turn the list of per-file feature dictionaries into a table (one row per clip)
# and save it to OUTPUT_CSV without writing the row index as a column.
df = pd.DataFrame(data=features)
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
