In [1]:
pip install librosa scikit-learn tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

# Path to cleaned dataset
base_dir = r"C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset"

# Prepare lists for features and labels
features = []
labels = []

# Iterate through folders (accents)
for accent_label in os.listdir(base_dir):
    accent_path = os.path.join(base_dir, accent_label)
    if not os.path.isdir(accent_path):
        continue

    for file in tqdm(os.listdir(accent_path), desc=f"Processing {accent_label}"):
        if not file.endswith(".mp3"):
            continue
        file_path = os.path.join(accent_path, file)

        try:
            # Load audio file
            y, sr = librosa.load(file_path, sr=16000)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            mfcc_mean = np.mean(mfcc.T, axis=0)

            features.append(mfcc_mean)
            labels.append(accent_label)

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

# Save as DataFrame
df = pd.DataFrame(features)
df["label"] = labels

# Save to CSV
csv_path = r"C:\Users\sagni\Downloads\Accent Detectection\mfcc_features.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ MFCC features saved to: {csv_path}")


Processing afrikaans: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.02s/it]
Processing albanian: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 16.50it/s]
Processing amazigh: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 13.37it/s]
Processing amharic: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:01<00:00, 13.44it/s]
Processing arabic: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:07<00:00, 13.10it/s]
Processing armenian: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 17.05it/s]
Processing azerbaijani: 100%|███████████████████████████████████


✅ MFCC features saved to: C:\Users\sagni\Downloads\Accent Detectection\mfcc_features.csv





In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load features
df = pd.read_csv(r"C:\Users\sagni\Downloads\Accent Detectection\mfcc_features.csv")

X = df.drop("label", axis=1)
y = df["label"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Results
print("\n🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))



🎯 Accuracy: 0.2912621359223301

📄 Classification Report:
               precision    recall  f1-score   support

     amharic       0.00      0.00      0.00         5
      arabic       0.12      0.12      0.12        24
   belarusan       0.00      0.00      0.00         2
     bengali       0.00      0.00      0.00         3
     bosnian       0.00      0.00      0.00         3
   bulgarian       0.00      0.00      0.00         3
     burmese       0.00      0.00      0.00         1
   cantonese       0.00      0.00      0.00         6
     catalan       0.00      0.00      0.00         1
    chaldean       0.00      0.00      0.00         1
    croatian       0.00      0.00      0.00         1
       czech       0.00      0.00      0.00         3
       dutch       0.00      0.00      0.00        13
     english       0.33      0.97      0.49       120
         ewe       0.00      0.00      0.00         1
       farsi       0.00      0.00      0.00         4
      fijian       0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
