In [17]:
# !pip install librosa scikit-learn pandas matplotlib seaborn tqdm
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [18]:
DATA_DIR = "data"
LANGUAGES = ["Spanish", "Korean", "Italian", "German"]
SAMPLE_RATE = 16000      # Standard for speech
DURATION = 60            # seconds (as per project)
N_MFCC = 13              # Common choice for speech

In [19]:
def extract_features(file_path, sr=SAMPLE_RATE, duration=DURATION, n_mfcc=N_MFCC):
    """
    Load MP3 audio and extract MFCCs (mean + std per coefficient).
    Returns a flat feature vector of length 2 * n_mfcc.
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        # Pad if shorter than DURATION
        if len(y) < sr * duration:
            y = np.pad(y, (0, sr * duration - len(y)), mode='constant')
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mean = np.mean(mfccs, axis=1)
        std = np.std(mfccs, axis=1)
        return np.concatenate([mean, std])
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

In [21]:
features, labels = [], []

for lang in LANGUAGES:
    lang_dir = os.path.join(DATA_DIR, lang)
    if not os.path.exists(lang_dir):
        print(f"Warning: {lang_dir} not found.")
        continue
        
    # Check for male and female subdirectories
    for gender in ["Male", "Female"]:
        gender_dir = os.path.join(lang_dir, gender)
        if not os.path.exists(gender_dir):
            print(f"Warning: {gender_dir} not found.")
            continue
            
        for file in tqdm(os.listdir(gender_dir), desc=f"Loading {lang}/{gender}"):
            if file.endswith(".mp3"):
                path = os.path.join(gender_dir, file)
                feat = extract_features(path)
                if feat is not None:
                    features.append(feat)
                    labels.append(lang)  # Only language matters for labeling

# Save to CSV
df = pd.DataFrame(features)
df["label"] = labels
df.to_csv("extracted_features.csv", index=False)
print(f"\n✅ Extracted {len(df)} samples with {df.shape[1]-1} features each.")

Loading Spanish/Male: 100%|██████████| 90/90 [00:10<00:00,  8.63it/s]
Loading Spanish/Female: 100%|██████████| 90/90 [00:09<00:00,  9.05it/s]
Loading Korean/Male: 100%|██████████| 90/90 [00:09<00:00,  9.37it/s]
Loading Korean/Female: 100%|██████████| 90/90 [00:10<00:00,  8.66it/s]
Loading Italian/Male: 100%|██████████| 90/90 [00:10<00:00,  8.80it/s]
Loading Italian/Female: 100%|██████████| 90/90 [00:09<00:00,  9.56it/s]
Loading German/Male: 100%|██████████| 90/90 [00:09<00:00,  9.83it/s]
Loading German/Female: 100%|██████████| 90/90 [00:10<00:00,  8.96it/s]


✅ Extracted 712 samples with 26 features each.



