In [35]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_CREATION"] = "1"


In [33]:
!pip install speechbrain




In [37]:
import os
import librosa
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from tqdm import tqdm
from speechbrain.pretrained import SpeakerRecognition # type:ignore 
import torchaudio
# from pyAudioAnalysis import audioTrainTest as aT
import joblib



In [38]:
AUDIO_DIR = r"C:\Users\Rishi S Etagi\Desktop\medivoice\LIBRI"

OUTPUT_CSV = "speech_features.csv"

In [42]:
def predict_gender(filepath):
    import librosa
    y, sr = librosa.load(filepath, sr=16000)
    
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)]
    pitch = pitch[pitch > 0]

    if len(pitch) == 0:
        return "unknown"
    
    pitch_mean = np.mean(pitch)
    pitch_min = np.min(pitch)
    pitch_max = np.max(pitch)
    pitch_depth = pitch_max - pitch_min

    energy = np.sum(librosa.feature.rms(y=y))

    # Basic logic:
    # Male generally has lower mean pitch and lower pitch depth
    # Female generally has higher mean pitch and higher energy

    if pitch_mean < 160 and pitch_depth < 40:
        return 'male'
    elif pitch_mean >= 160 and energy > 0.1:
        return 'female'
    else:
        # ambiguous case, fallback to pitch_mean only
        return 'female' if pitch_mean >= 150 else 'male'

# def predict_gender(filepath):
#     try:
#         [Result, P, classNames] = aT.fileClassification(filepath, "svm_gender_model", "svm")
#         return classNames[int(Result)]
#     except Exception as e:
#         print(f"Error detecting gender for {filepath}: {e}")
#         return "unknown"
    
    


def extract_features(filepath):
    y, sr = librosa.load(filepath, sr=16000)

    # MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfccs, axis=1)

    # Pitch-related features
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)]
    pitch = pitch[pitch > 0]  # Filter out zeros

    pitch_mean = np.mean(pitch) if len(pitch) > 0 else 0
    pitch_min = np.min(pitch) if len(pitch) > 0 else 0
    pitch_max = np.max(pitch) if len(pitch) > 0 else 0
    pitch_depth = pitch_max - pitch_min

    # Energy
    energy = np.sum(librosa.feature.rms(y=y))

    # Speaking Rate
    envelope = np.abs(y)
    peaks, _ = find_peaks(envelope, height=0.02, distance=1000)
    duration = len(y) / sr
    speaking_rate = len(peaks) / duration if duration > 0 else 0

    gender = predict_gender(filepath)

    return {
        "filename": os.path.basename(filepath),
        "pitch_mean": pitch_mean,
        "pitch_min": pitch_min,
        "pitch_max": pitch_max,
        "pitch_depth": pitch_depth,
        "energy": energy,
        "speaking_rate": speaking_rate,
        "gender": gender,
        **{f"mfcc_{i+1}": mfcc_mean[i] for i in range(len(mfcc_mean))}
    }

# Extraction loop
features = []
for root, dirs, files in os.walk(AUDIO_DIR):
    for file in tqdm(files):
        if file.endswith(".flac"):
            path = os.path.join(root, file)
            try:
                f = extract_features(path)
                features.append(f)
            except Exception as e:
                print(f"Error processing {file}: {e}")


  0%|          | 0/2703 [00:00<?, ?it/s]

100%|██████████| 2703/2703 [01:38<00:00, 27.49it/s]


In [43]:
print(df['gender'].value_counts())


gender
female    2703
Name: count, dtype: int64


In [44]:
df = pd.DataFrame(features)
df.to_csv(OUTPUT_CSV, index=False)
