In [6]:
# !pip install librosa scikit-learn pandas matplotlib seaborn tqdm
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [7]:
DATA_DIR = "data"
LANGUAGES = ["Spanish", "Korean", "Italian", "German"]
SAMPLE_RATE = 16000      # Standard for speech
DURATION = 60            # seconds (as per project)
N_MFCC = 13              # Common choice for speech

In [8]:
def extract_features(file_path, sr=SAMPLE_RATE, duration=DURATION, n_mfcc=N_MFCC):
    """
    Load MP3 audio and extract MFCCs (mean + std per coefficient).
    Returns a flat feature vector of length 2 * n_mfcc.
    """
    try:
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        # Pad if shorter than DURATION
        if len(y) < sr * duration:
            y = np.pad(y, (0, sr * duration - len(y)), mode='constant')
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mean = np.mean(mfccs, axis=1)
        std = np.std(mfccs, axis=1)
        return np.concatenate([mean, std])
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

In [9]:
features, labels = [], []

for lang in LANGUAGES:
    lang_dir = os.path.join(DATA_DIR, lang)
    if not os.path.exists(lang_dir):
        print(f"Warning: {lang_dir} not found.")
        continue
        
    # Check for male and female subdirectories
    for gender in ["Male", "Female"]:
        gender_dir = os.path.join(lang_dir, gender)
        if not os.path.exists(gender_dir):
            print(f"Warning: {gender_dir} not found.")
            continue
            
        for file in tqdm(os.listdir(gender_dir), desc=f"Loading {lang}/{gender}"):
            if file.endswith(".mp3"):
                path = os.path.join(gender_dir, file)
                feat = extract_features(path)
                if feat is not None:
                    features.append(feat)
                    labels.append(lang)  # Only language matters for labeling

# Save to CSV
df = pd.DataFrame(features)
df["label"] = labels
df.to_csv("extracted_features.csv", index=False)
print(f"\n✅ Extracted {len(df)} samples with {df.shape[1]-1} features each.")

Loading Spanish/Male: 100%|██████████| 90/90 [00:11<00:00,  7.80it/s]
Loading Spanish/Female: 100%|██████████| 90/90 [00:11<00:00,  8.04it/s]
Loading Korean/Male: 100%|██████████| 90/90 [00:11<00:00,  7.72it/s]
Loading Korean/Female: 100%|██████████| 90/90 [00:13<00:00,  6.72it/s]
Loading Italian/Male: 100%|██████████| 90/90 [00:11<00:00,  7.87it/s]
Loading Italian/Female: 100%|██████████| 90/90 [00:10<00:00,  8.63it/s]
Loading German/Male: 100%|██████████| 90/90 [00:10<00:00,  8.86it/s]
Loading German/Female: 100%|██████████| 90/90 [00:11<00:00,  8.05it/s]


✅ Extracted 712 samples with 26 features each.





In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split into train and test sets (80/20 stratified split)
df = pd.DataFrame(features)
df["label"] = labels

y_encoded = pd.factorize(df["label"])[0]

# Stratified split
train_indices, test_indices = train_test_split(
    range(len(df)), test_size=0.2, stratify=y_encoded, random_state=42
)

df_train = df.iloc[train_indices].reset_index(drop=True)
df_test = df.iloc[test_indices].reset_index(drop=True)

# Encode labels to numbers
le = LabelEncoder()
le.fit(df["label"])
df_train["en_label"] = le.transform(df_train["label"])
df_test["en_label"] = le.transform(df_test["label"])

# Add index column (sample number)
df_train.insert(0, "sample_id", range(1, len(df_train) + 1))
df_test.insert(0, "sample_id", range(1, len(df_test) + 1))

# Save as separate CSV files
df_train.to_csv("train_features.csv", index=False)
df_test.to_csv("test_features.csv", index=False)

print(f"✅ Training set: {len(df_train)} samples saved to train_features.csv")
print(f"✅ Test set: {len(df_test)} samples saved to test_features.csv")

df

✅ Training set: 569 samples saved to train_features.csv
✅ Test set: 143 samples saved to test_features.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,label
0,-318.677582,76.730209,45.628063,35.019958,3.965368,-41.809654,-10.338325,-40.877865,-18.791592,-12.403339,...,21.120277,23.694580,18.851728,18.645567,13.152010,15.159042,11.134150,10.884776,10.505372,Spanish
1,-347.584564,81.671066,35.001820,24.909710,3.784000,-29.446495,-24.129908,-30.706806,-20.125753,-12.018286,...,19.895252,19.163012,17.405811,16.132006,13.326885,14.734371,10.078533,10.839331,10.699685,Spanish
2,-376.462616,62.923275,42.590950,25.251196,7.806287,-30.876791,-16.002266,-27.947130,-18.684361,-12.636662,...,19.477505,21.848541,17.131063,18.283098,14.070754,14.109545,10.638796,10.645078,10.470984,Spanish
3,-298.727051,71.501999,48.119728,24.471537,10.393909,-26.041599,-19.835464,-28.709284,-18.038946,-15.919062,...,19.950577,18.421413,16.129679,14.687603,11.626188,13.749689,8.753366,11.242859,10.363416,Spanish
4,-298.497253,61.133537,46.624790,25.759184,0.902360,-30.987343,-18.643078,-35.942532,-15.234863,-13.362785,...,19.957750,21.954889,15.852568,19.193199,14.548494,13.967322,11.708312,11.392424,10.514035,Spanish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,-415.490265,62.610229,13.264565,16.243914,4.326336,-1.964846,-6.200542,-3.965514,-3.682459,1.345128,...,18.474501,15.948372,15.529987,15.631176,14.403872,10.023800,12.423157,9.772285,8.405737,German
708,-323.043640,60.528969,3.393697,12.330757,-1.532031,-2.873862,-5.982039,-5.970890,-6.787503,-1.438415,...,20.042074,16.670088,14.673621,16.308836,15.774516,11.538981,11.760208,10.759164,9.073681,German
709,-309.468781,56.973454,6.465310,8.270527,2.023496,-3.397437,-8.958847,-7.524642,-5.540236,1.133760,...,18.937189,16.106758,15.357042,14.774331,13.755350,10.281881,12.239145,10.828581,9.052409,German
710,-401.070038,60.755714,9.908932,12.414324,0.562268,-1.954595,-2.719823,-0.471427,-4.068000,-0.843919,...,20.241062,17.484093,13.814991,14.803244,16.068365,10.742634,11.233427,9.659823,8.852901,German
