In [35]:
import os
# Set a Hugging Face Hub related environment variable for this kernel process.
# NOTE(review): presumably intended to avoid symlink problems in the Hub cache
# on Windows — confirm the variable name against the huggingface_hub
# environment-variable reference (the documented switch I am aware of is
# HF_HUB_DISABLE_SYMLINKS_WARNING), and make sure this cell runs before any
# library that reads it is imported.
os.environ["HF_HUB_DISABLE_SYMLINKS_CREATION"] = "1"


In [33]:
!pip install speechbrain




In [7]:
import os
import librosa
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from tqdm import tqdm
from speechbrain.pretrained import SpeakerRecognition # type:ignore 
import torchaudio
import joblib
from feat_gender import extract_features_from_audio


In [8]:
# Root folder that is walked recursively for .flac recordings. The original
# machine-specific location stays as the default; set the MEDIVOICE_AUDIO_DIR
# environment variable to run the notebook elsewhere without editing this cell.
AUDIO_DIR = os.environ.get(
    "MEDIVOICE_AUDIO_DIR",
    r"C:\Users\Rishi S Etagi\Desktop\medivoice\LIBRI",
)

# CSV file that receives one row of extracted features per recording.
OUTPUT_CSV = "speech_features.csv"

In [9]:
# Load the gender classifier and the scaler that predict_gender applies to the
# features before calling the classifier. Both paths are relative to the
# notebook's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load .pkl files that come from a trusted source.
gender_model = joblib.load("gender_classifier.pkl")
gender_scaler = joblib.load("gender_scaler.pkl")

In [10]:
def predict_gender(filepath):
    """Predict the speaker gender label for a single audio file.

    The file is passed to ``extract_features_from_audio``, the result is
    transformed with ``gender_scaler`` and then classified by ``gender_model``.

    Args:
        filepath: Path of the audio file to classify.

    Returns:
        The first element of the classifier's prediction, or the string
        ``"unknown"`` when any step raises (the error is printed, not raised).
    """
    try:
        raw_features = extract_features_from_audio(filepath)
        scaled_features = gender_scaler.transform(raw_features)
        return gender_model.predict(scaled_features)[0]
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole extraction run.
        print(f"Error predicting gender for {filepath}: {e}")
        return "unknown"

In [11]:




def extract_features(filepath):
    """Build a flat feature record for one audio file.

    The audio is loaded through librosa with ``sr=16000`` and summarized as:
    the per-coefficient mean of 13 MFCCs, pitch statistics taken from
    ``librosa.piptrack`` (only bins whose magnitude is above the median
    magnitude and whose pitch is positive), the sum of the RMS values, a
    peaks-per-second "speaking rate" of the absolute waveform, and the label
    returned by ``predict_gender``.

    Args:
        filepath: Path of the audio file to analyse.

    Returns:
        dict with the keys ``filename``, ``pitch_mean``, ``pitch_min``,
        ``pitch_max``, ``pitch_depth``, ``energy``, ``speaking_rate``,
        ``gender`` followed by ``mfcc_1`` ... ``mfcc_N`` in that order.
    """
    signal, sample_rate = librosa.load(filepath, sr=16000)

    # MFCCs: reduce along axis 1 so one value per coefficient remains.
    mfcc_mean = np.mean(
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13), axis=1
    )

    # Pitch: keep the stronger-than-median bins, then discard zero entries.
    pitches, magnitudes = librosa.piptrack(y=signal, sr=sample_rate)
    strong_bins = magnitudes > np.median(magnitudes)
    voiced = pitches[strong_bins]
    voiced = voiced[voiced > 0]

    if len(voiced) > 0:
        pitch_mean = np.mean(voiced)
        pitch_min = np.min(voiced)
        pitch_max = np.max(voiced)
    else:
        # No usable pitch estimates: every statistic falls back to 0.
        pitch_mean = pitch_min = pitch_max = 0
    pitch_depth = pitch_max - pitch_min

    # Energy: RMS values summed over the whole recording.
    energy = np.sum(librosa.feature.rms(y=signal))

    # Speaking rate: amplitude peaks per second of audio.
    peaks, _ = find_peaks(np.abs(signal), height=0.02, distance=1000)
    duration = len(signal) / sample_rate
    if duration > 0:
        speaking_rate = len(peaks) / duration
    else:
        speaking_rate = 0

    gender = predict_gender(filepath)

    record = {
        "filename": os.path.basename(filepath),
        "pitch_mean": pitch_mean,
        "pitch_min": pitch_min,
        "pitch_max": pitch_max,
        "pitch_depth": pitch_depth,
        "energy": energy,
        "speaking_rate": speaking_rate,
        "gender": gender,
    }
    # MFCC columns are appended last and numbered from 1.
    for i, coefficient in enumerate(mfcc_mean):
        record[f"mfcc_{i+1}"] = coefficient
    return record

# Extraction loop
# Gather every .flac path up front so that a single progress bar covers the
# whole directory tree (wrapping `files` inside os.walk starts a new bar per
# directory and also counts non-audio files). Sorting makes the row order of
# the resulting table reproducible, since os.walk order is not guaranteed.
flac_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(AUDIO_DIR)
    for name in names
    if name.endswith(".flac")
)

features = []
for path in tqdm(flac_paths):
    try:
        features.append(extract_features(path))
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {os.path.basename(path)}: {e}")


  0%|          | 0/2703 [00:00<?, ?it/s]

100%|██████████| 2703/2703 [03:20<00:00, 13.47it/s]


In [14]:
# Build the frame in this cell so it also runs on a fresh kernel: `df` was
# otherwise only created by a cell further down (execution counts 14 vs 13),
# which makes Restart & Run All fail here with a NameError.
df = pd.DataFrame(features)
print(df['gender'].value_counts())


gender
female    2202
male       501
Name: count, dtype: int64


In [13]:
# Turn the list of per-file feature dicts into one table (one row per
# successfully processed recording) and write it to OUTPUT_CSV without the
# index column.
df = pd.DataFrame(features)
df.to_csv(OUTPUT_CSV, index=False)


In [15]:
# Normalization statistics over the feature columns; the file identifier and
# the predicted label are excluded.
numeric_df = df.drop(columns=["filename", "gender"])

mean = numeric_df.mean().values
# pandas' std() is the sample standard deviation (ddof=1). The small epsilon
# presumably guards a later division by std for constant columns.
std = numeric_df.std().values + 1e-8

# mean/std are positional arrays, so the column order is stored next to them;
# a consumer can then line the statistics up with its own feature vector.
# The extra key is additive: readers that only use "mean"/"std" are unaffected.
# (joblib is already imported in the imports cell, so it is not re-imported here.)
joblib.dump(
    {"mean": mean, "std": std, "columns": list(numeric_df.columns)},
    "normalization_stats.pkl",
)



['normalization_stats.pkl']