In [2]:
import os
import shutil

source_dir = r"C:\Users\sagni\Downloads\Accent Detectection\archive (1)\recordings\recordings"
target_dir = r"C:\Users\sagni\Downloads\Accent Detectection\organized_by_accent"

os.makedirs(target_dir, exist_ok=True)

for file in os.listdir(source_dir):
    if file.endswith(".mp3"):
        accent = ''.join([char for char in file if not char.isdigit()]).replace(".mp3", "").strip()
        accent_folder = os.path.join(target_dir, accent)
        os.makedirs(accent_folder, exist_ok=True)
        shutil.copy(os.path.join(source_dir, file), os.path.join(accent_folder, file))

print("✅ Files organized by accent.")


✅ Files organized by accent.


In [6]:
import os
import numpy as np
import librosa
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

# Path to your organized accent folder
data_dir = r"C:\Users\sagni\Downloads\Accent Detectection\organized_by_accent"

# Step 1: Extract MFCC features and labels
X_raw, y_raw = [], []

for label in os.listdir(data_dir):
    label_path = os.path.join(data_dir, label)
    if not os.path.isdir(label_path):
        continue

    for file in os.listdir(label_path):
        if file.endswith(".mp3"):
            file_path = os.path.join(label_path, file)
            try:
                audio, sr = librosa.load(file_path, sr=16000)
                mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                X_raw.append(mfcc_mean)
                y_raw.append(label)
            except Exception as e:
                print(f"⚠️ Skipping {file_path}: {e}")

print(f"🔢 Total raw samples: {len(X_raw)}")

# Step 2: Remove classes with < 2 samples
label_counts = Counter(y_raw)
X_filtered, y_filtered = zip(*[
    (x, label) for x, label in zip(X_raw, y_raw) if label_counts[label] >= 2
])

print(f"✅ Samples after filtering: {len(X_filtered)}")
print(f"🔠 Unique classes after filtering: {len(set(y_filtered))}")

# Step 3: Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_filtered)
X_filtered = np.array(X_filtered)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Step 5: Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
unique_test_labels = np.unique(y_test)
target_names = le.inverse_transform(unique_test_labels)
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=target_names))


🔢 Total raw samples: 4276
✅ Samples after filtering: 4120
🔠 Unique classes after filtering: 244

📊 Classification Report:

                precision    recall  f1-score   support

     afrikaans       0.00      0.00      0.00         1
  afrikaans.mp       0.00      0.00      0.00         1
      albanian       0.00      0.00      0.00         2
   albanian.mp       0.00      0.00      0.00         2
       amharic       0.00      0.00      0.00         4
    amharic.mp       0.00      0.00      0.00         4
        arabic       0.04      0.05      0.05        20
     arabic.mp       0.00      0.00      0.00        20
      armenian       0.00      0.00      0.00         2
   armenian.mp       0.00      0.00      0.00         2
   azerbaijani       0.00      0.00      0.00         1
azerbaijani.mp       0.00      0.00      0.00         1
       bambara       0.00      0.00      0.00         1
    bambara.mp       0.00      0.00      0.00         1
     belarusan       0.00      0.00 