In [1]:
import os
import librosa
import numpy as np
from collections import Counter
from tqdm import tqdm

# Dataset root: one sub-folder per accent label, each holding .mp3 clips.
# The original machine-specific path stays as the default so existing runs are
# unchanged; set ACCENT_DATASET_DIR to run the notebook on another machine.
base_dir = os.environ.get(
    "ACCENT_DATASET_DIR",
    r"C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset",
)
MIN_SAMPLES = 10  # classes with fewer .mp3 files than this are dropped
MAX_FILES_PER_CLASS = 100  # Optional limit on files read per class

def extract_features(file_path, sr=22050, max_len=130, n_mfcc=13):
    """Load one audio file and return a fixed-width stacked feature matrix.

    MFCC, chroma, spectral-contrast and zero-crossing-rate features are
    computed with librosa, stacked along the first axis, and then zero-padded
    or truncated along the second (frame) axis to exactly ``max_len`` columns.

    Parameters
    ----------
    file_path : str
        Path of the audio file handed to ``librosa.load``.
    sr : int
        Sampling rate requested from ``librosa.load``.
    max_len : int
        Number of columns (frames) in the returned matrix.
    n_mfcc : int
        Number of MFCC rows to compute. Defaults to 13, the value that was
        previously hard-coded, so existing callers get the same output.

    Returns
    -------
    numpy.ndarray
        2-D array with ``max_len`` columns. With the default arguments the
        recorded run of this notebook produced 33 rows per file.

    Raises
    ------
    ValueError
        If the decoded signal contains no samples.
    """
    y, sr = librosa.load(file_path, sr=sr)
    if y.size == 0:
        # Fail with a readable message instead of an opaque error deeper in librosa.
        raise ValueError(f"No audio samples decoded from {file_path}")

    # Extract MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Magnitude spectrogram, computed once and shared by chroma and contrast
    stft = np.abs(librosa.stft(y))
    chroma = librosa.feature.chroma_stft(S=stft, sr=sr)

    # Spectral Contrast
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sr)

    # Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y)

    # Stack features into one matrix (rows = features, columns = frames)
    features = np.vstack([mfcc, chroma, contrast, zcr])

    # Pad/truncate to fixed time axis (second axis)
    n_frames = features.shape[1]
    if n_frames < max_len:
        # Short clip: append zero columns on the right only.
        features = np.pad(features, ((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        features = features[:, :max_len]

    return features

# Count how many .mp3 clips each label folder under base_dir contains
label_counts = Counter()
for label in os.listdir(base_dir):
    path = os.path.join(base_dir, label)
    if os.path.isdir(path):
        label_counts[label] = len([f for f in os.listdir(path) if f.endswith('.mp3')])

# Keep only classes with enough samples
valid_labels = {label for label, count in label_counts.items() if count >= MIN_SAMPLES}
print(f"✅ Keeping {len(valid_labels)} classes: {valid_labels}")

# Extract features
X = []
y = []
# Class index = position of the label in alphabetical order
label_to_index = {label: idx for idx, label in enumerate(sorted(valid_labels))}

# Walk labels and files in sorted order. Set iteration order and os.listdir
# order are both arbitrary, so without sorting the sample order (and which
# files survive the MAX_FILES_PER_CLASS cut) can change between kernel
# restarts, making any later seeded train/test split non-reproducible.
for label in tqdm(sorted(valid_labels), desc="Extracting features"):
    label_path = os.path.join(base_dir, label)
    files = sorted(f for f in os.listdir(label_path) if f.endswith('.mp3'))
    files = files[:MAX_FILES_PER_CLASS]  # Optional: limit per class

    for file in files:
        # Built outside the try so the error message below always refers to
        # the file that actually failed.
        file_path = os.path.join(label_path, file)
        try:
            features = extract_features(file_path)
        except Exception as e:
            # Best effort: report unreadable/corrupt clips and keep going.
            print(f"❌ Error processing {file_path}: {e}")
            continue
        X.append(features)
        y.append(label_to_index[label])

X = np.array(X)
y = np.array(y)

print(f"✅ Feature shape: {X.shape}, Labels: {y.shape}")


✅ Keeping 35 classes: {'russian', 'bengali', 'italian', 'hindi', 'spanish', 'nepali', 'amharic', 'swedish', 'thai', 'tagalog', 'japanese', 'vietnamese', 'german', 'punjabi', 'urdu', 'cantonese', 'romanian', 'french', 'farsi', 'bulgarian', 'ukrainian', 'korean', 'dutch', 'pashto', 'macedonian', 'greek', 'polish', 'mandarin', 'serbian', 'arabic', 'english', 'turkish', 'kurdish', 'portuguese', 'miskito'}


Extracting features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [04:07<00:00,  7.07s/it]

✅ Feature shape: (1144, 33, 130), Labels: (1144,)





In [2]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Prepare input shape
# Add the trailing channel axis only while X is still 3-D, so re-running this
# cell does not keep appending axes (which would break Conv2D's input shape).
if X.ndim == 3:
    X = X[..., np.newaxis]  # (samples, features, time, 1)

# Pin the one-hot width to the label map instead of letting it be inferred
# from max(y) + 1, which shrinks if the highest-indexed class lost all its
# files to load errors.
num_classes = len(label_to_index)
y_cat = to_categorical(y, num_classes=num_classes)

# Many classes with few clips each: stratify so every class keeps roughly the
# same share in train and test. scikit-learn raises ValueError when a class is
# too small to stratify; fall back to the original plain split in that case.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_cat, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_cat, test_size=0.2, random_state=42
    )

# CNN model: two conv/pool/dropout stages followed by a dense classifier head
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=X.shape[1:]),
    MaxPooling2D((2,2)),
    Dropout(0.3),

    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')  # one output unit per class
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
# NOTE(review): the held-out test split doubles as validation data here, so
# val_* metrics are not an independent estimate if they guide model choices.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.0411 - loss: 17.2314 - val_accuracy: 0.0830 - val_loss: 3.5442
Epoch 2/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.0606 - loss: 3.5476 - val_accuracy: 0.1004 - val_loss: 3.5187
Epoch 3/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.0679 - loss: 3.5194 - val_accuracy: 0.1092 - val_loss: 3.5159
Epoch 4/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.0644 - loss: 3.4713 - val_accuracy: 0.0961 - val_loss: 3.4888
Epoch 5/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.1183 - loss: 3.3539 - val_accuracy: 0.1004 - val_loss: 3.4768
Epoch 6/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.1021 - loss: 3.2986 - val_accuracy: 0.0873 - val_loss: 3.4660
Epoch 7/30
[1m29/29[0m [32m━━━