In [21]:
!pip install speechbrain
!pip install torchaudio


Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting sentencepiece (from speechbrain)
  Using cached sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Collecting torchaudio (from speechbrain)
  Downloading torchaudio-2.7.1-cp312-cp312-win_amd64.whl.metadata (6.6 kB)
Collecting huggingface_hub (from speechbrain)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from huggingface_hub->speechbrain)
  Using cached PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.12-py3-none-any.whl.metadata (24 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl.metadata (2.8 kB)
Collecting t

In [22]:
import os
import librosa
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from tqdm import tqdm
from speechbrain.pretrained import SpeakerRecognition
import torchaudio


  from .autonotebook import tqdm as notebook_tqdm
  if ismodule(module) and hasattr(module, '__file__'):
  from speechbrain.pretrained import SpeakerRecognition


In [23]:
# Root folder that is walked recursively for .flac recordings.
# The default is the original author's local path; set the MEDIVOICE_AUDIO_DIR
# environment variable to point at the dataset on another machine instead of
# editing the notebook.
AUDIO_DIR = os.environ.get(
    "MEDIVOICE_AUDIO_DIR",
    r"C:\Users\Rishi S Etagi\Desktop\medivoice\LIBRI",
)

# Destination of the per-file feature table (relative paths resolve against
# the notebook's working directory).
OUTPUT_CSV = "speech_features.csv"

In [24]:
# Sample rate (Hz) every recording is resampled to on load.
_TARGET_SR = 16000

# Mean-pitch boundary (Hz): below it the label is "male", otherwise "female".
_GENDER_PITCH_THRESHOLD_HZ = 165.0


def _voiced_pitch_values(y, sr):
    """Return the positive pitch candidates that ``librosa.piptrack`` finds in ``y``.

    Only entries whose magnitude exceeds the median magnitude of the whole
    track are kept, and zero-valued entries are dropped afterwards.

    NOTE(review): ``piptrack`` is called with its default frequency band.
    The librosa docs list defaults of fmin=150 Hz / fmax=4000 Hz, which would
    exclude most low male F0 values and admit harmonics -- confirm that this
    band (and the 165 Hz gender threshold) is appropriate for the data.
    """
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    candidates = pitches[magnitudes > np.median(magnitudes)]
    return candidates[candidates > 0]


def _gender_from_pitch(pitch, threshold_hz=_GENDER_PITCH_THRESHOLD_HZ):
    """Map an array of pitch candidates to "male", "female" or "unknown"."""
    # Without any pitch candidate there is nothing to base a decision on.
    # (Previously the mean fell back to 0, which was always labelled "male".)
    if len(pitch) == 0:
        return "unknown"
    return "male" if np.mean(pitch) < threshold_hz else "female"


def predict_gender(filepath, threshold_hz=_GENDER_PITCH_THRESHOLD_HZ):
    """Estimate the speaker's gender from the mean pitch of one audio file.

    Parameters
    ----------
    filepath : str
        Path of the audio file; it is loaded with ``librosa.load`` at 16 kHz.
    threshold_hz : float, optional
        Mean pitch below this value yields "male", otherwise "female".
        Defaults to 165 Hz.

    Returns
    -------
    str
        "male" or "female"; "unknown" when no pitch candidate is found or
        when loading/analysis raises (the error is printed, not re-raised).
    """
    try:
        y, sr = librosa.load(filepath, sr=_TARGET_SR)
        return _gender_from_pitch(_voiced_pitch_values(y, sr), threshold_hz)
    except Exception as e:
        # Best effort by design: a broken file must not abort a batch run.
        print(f"Gender detection error in {filepath}: {e}")
        return "unknown"


def extract_features(filepath):
    """Compute the per-file speech feature row for one recording.

    The file is loaded once at 16 kHz and every feature, including the gender
    label, is derived from that single signal.

    Parameters
    ----------
    filepath : str
        Path of the audio file.

    Returns
    -------
    dict
        Keys: ``filename`` (basename of ``filepath``), ``pitch_mean``,
        ``pitch_min``, ``pitch_max``, ``pitch_depth`` (max - min), ``energy``
        (sum of the frame-wise RMS values), ``speaking_rate`` (envelope peaks
        per second), ``gender`` and ``mfcc_1`` .. ``mfcc_13`` (per-coefficient
        mean over time).

    Raises
    ------
    Exception
        Whatever ``librosa`` / ``scipy`` raise for an unreadable or unusable
        file; errors are not caught here.
    """
    y, sr = librosa.load(filepath, sr=_TARGET_SR)

    # MFCCs: average each of the 13 coefficients over time.
    mfcc_mean = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)

    # Pitch-related features; all of them fall back to 0 when no candidate exists.
    pitch = _voiced_pitch_values(y, sr)
    if len(pitch) > 0:
        pitch_mean, pitch_min, pitch_max = np.mean(pitch), np.min(pitch), np.max(pitch)
    else:
        pitch_mean = pitch_min = pitch_max = 0
    pitch_depth = pitch_max - pitch_min

    # Energy: sum of frame-wise RMS, so it grows with the recording length.
    energy = np.sum(librosa.feature.rms(y=y))

    # Speaking rate: peaks of the rectified waveform that reach 0.02 and are
    # at least 1000 samples apart, counted per second of audio.
    envelope = np.abs(y)
    peaks = find_peaks(envelope, height=0.02, distance=1000)[0]
    duration = len(y) / sr
    speaking_rate = len(peaks) / duration if duration > 0 else 0

    # Reuse the pitch values computed above instead of calling
    # predict_gender(filepath), which would load and analyse the file again.
    gender = _gender_from_pitch(pitch)

    row = {
        "filename": os.path.basename(filepath),
        "pitch_mean": pitch_mean,
        "pitch_min": pitch_min,
        "pitch_max": pitch_max,
        "pitch_depth": pitch_depth,
        "energy": energy,
        "speaking_rate": speaking_rate,
        "gender": gender,
    }
    row.update({f"mfcc_{i}": value for i, value in enumerate(mfcc_mean, start=1)})
    return row

# Extraction loop
# Collect every .flac path up front so that one progress bar covers the whole
# run (wrapping `files` inside os.walk starts a new bar per directory and also
# counts non-audio files). Sorting makes the row order of the result
# reproducible, and the extension check is case-insensitive.
flac_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(AUDIO_DIR)
    for name in names
    if name.lower().endswith(".flac")
)

features = []      # one feature dict per successfully processed file
failed_files = []  # paths whose extraction raised, kept for later inspection
for path in tqdm(flac_paths, desc="Extracting features"):
    try:
        features.append(extract_features(path))
    except Exception as e:
        # Keep going: one unreadable recording should not abort the batch.
        print(f"Error processing {os.path.basename(path)}: {e}")
        failed_files.append(path)


100%|██████████| 2703/2703 [01:14<00:00, 36.23it/s]


In [25]:
# An empty feature list almost always means AUDIO_DIR is wrong or contains no
# readable .flac files; stop here instead of silently writing an empty CSV.
if not features:
    raise ValueError(
        f"No features were extracted from {AUDIO_DIR!r}; "
        f"nothing to write to {OUTPUT_CSV!r}."
    )

# One row per recording; the index carries no information, so it is not written.
df = pd.DataFrame(features)
df.to_csv(OUTPUT_CSV, index=False)
