In [1]:
import os
import shutil
from collections import Counter
from tqdm import tqdm

# Path to original recordings
source_folder = r"C:\Users\sagni\Downloads\Accent Detectection\archive (1)\recordings\recordings"
# Path where cleaned data will be saved
destination_folder = r"C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset"

# Create destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Step 1: Collect all file paths and labels
file_paths = []
labels = []

for filename in os.listdir(source_folder):
    if filename.endswith(".mp3"):
        file_path = os.path.join(source_folder, filename)
        label = filename.split(".")[0].lower().strip()  # Remove extension and normalize
        file_paths.append(file_path)
        labels.append(label)

# Step 2: Count how many samples each label has
label_counts = Counter(labels)
MIN_SAMPLES = 20

# Step 3: Filter out underrepresented classes
filtered_files = []
filtered_labels = []

for file_path, label in zip(file_paths, labels):
    if label_counts[label] >= MIN_SAMPLES:
        filtered_files.append(file_path)
        filtered_labels.append(label)

print(f"Total files before filtering: {len(file_paths)}")
print(f"Total files after filtering: {len(filtered_files)}")
print(f"Remaining labels: {set(filtered_labels)}")

# Step 4: Save to cleaned folder (organized by label)
for file_path, label in tqdm(zip(filtered_files, filtered_labels), total=len(filtered_files), desc="Copying files"):
    label_folder = os.path.join(destination_folder, label)
    os.makedirs(label_folder, exist_ok=True)

    dest_path = os.path.join(label_folder, os.path.basename(file_path))
    shutil.copy(file_path, dest_path)

print(f"\n✅ Cleaned dataset saved to: {destination_folder}")


Total files before filtering: 2138
Total files after filtering: 0
Remaining labels: set()


Copying files: 0it [00:00, ?it/s]


✅ Cleaned dataset saved to: C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset





In [2]:
from collections import Counter
import os

source_folder = r"C:\Users\sagni\Downloads\Accent Detectection\archive (1)\recordings\recordings"

labels = []

for filename in os.listdir(source_folder):
    if filename.endswith(".mp3"):
        label = filename.split(".")[0].lower().strip()
        labels.append(label)

label_counts = Counter(labels)

print("Class sample counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


Class sample counts:
afrikaans1: 1
afrikaans2: 1
afrikaans3: 1
afrikaans4: 1
afrikaans5: 1
agni1: 1
akan1: 1
albanian1: 1
albanian2: 1
albanian3: 1
albanian4: 1
albanian5: 1
albanian6: 1
albanian7: 1
albanian8: 1
albanian9: 1
amazigh1: 1
amazigh2: 1
amharic1: 1
amharic10: 1
amharic11: 1
amharic12: 1
amharic13: 1
amharic14: 1
amharic15: 1
amharic16: 1
amharic17: 1
amharic18: 1
amharic19: 1
amharic2: 1
amharic20: 1
amharic3: 1
amharic4: 1
amharic5: 1
amharic6: 1
amharic7: 1
amharic8: 1
amharic9: 1
arabic1: 1
arabic10: 1
arabic100: 1
arabic101: 1
arabic102: 1
arabic11: 1
arabic12: 1
arabic13: 1
arabic14: 1
arabic15: 1
arabic16: 1
arabic17: 1
arabic18: 1
arabic19: 1
arabic2: 1
arabic20: 1
arabic21: 1
arabic22: 1
arabic23: 1
arabic24: 1
arabic25: 1
arabic26: 1
arabic27: 1
arabic28: 1
arabic29: 1
arabic3: 1
arabic30: 1
arabic31: 1
arabic32: 1
arabic33: 1
arabic34: 1
arabic35: 1
arabic36: 1
arabic37: 1
arabic38: 1
arabic39: 1
arabic4: 1
arabic40: 1
arabic41: 1
arabic42: 1
arabic43: 1
arabic44

In [3]:
import os
import shutil
from collections import defaultdict
from tqdm import tqdm

# Update these paths
source_folder = r"C:\Users\sagni\Downloads\Accent Detectection\archive (1)\recordings\recordings"
destination_folder = r"C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset"

# Minimum samples required per accent
MIN_SAMPLES = 2

# Count files per accent
accent_files = defaultdict(list)

for filename in os.listdir(source_folder):
    if filename.endswith(".mp3"):
        label = filename.split(".")[0].lower().strip()
        filepath = os.path.join(source_folder, filename)
        accent_files[label].append(filepath)

# Filter labels with at least MIN_SAMPLES files
filtered_accents = {label: files for label, files in accent_files.items() if len(files) >= MIN_SAMPLES}

print(f"Total files before filtering: {sum(len(v) for v in accent_files.values())}")
print(f"Total files after filtering: {sum(len(v) for v in filtered_accents.values())}")
print(f"Remaining labels: {set(filtered_accents.keys())}")

# Copy filtered files to cleaned_dataset
if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)
os.makedirs(destination_folder)

for label, files in tqdm(filtered_accents.items(), desc="Copying files"):
    label_folder = os.path.join(destination_folder, label)
    os.makedirs(label_folder, exist_ok=True)
    for file in files:
        shutil.copy(file, os.path.join(label_folder, os.path.basename(file)))

print(f"\n✅ Cleaned dataset saved to: {destination_folder}")


Total files before filtering: 2138
Total files after filtering: 0
Remaining labels: set()


Copying files: 0it [00:00, ?it/s]


✅ Cleaned dataset saved to: C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset





In [4]:
import os
import shutil
import re
from collections import defaultdict
from tqdm import tqdm

# Paths
source_folder = r"C:\Users\sagni\Downloads\Accent Detectection\archive (1)\recordings\recordings"
destination_folder = r"C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset"

# Minimum samples required
MIN_SAMPLES = 2

# Step 1: Group files by accent (extracting only letters)
accent_files = defaultdict(list)

for filename in os.listdir(source_folder):
    if filename.endswith(".mp3"):
        match = re.match(r"([a-zA-Z]+)", filename)  # Match only alphabetic prefix
        if match:
            label = match.group(1).lower()
            filepath = os.path.join(source_folder, filename)
            accent_files[label].append(filepath)

# Step 2: Filter accents with enough samples
filtered_accents = {label: files for label, files in accent_files.items() if len(files) >= MIN_SAMPLES}

print(f"Total files before filtering: {sum(len(v) for v in accent_files.values())}")
print(f"Total files after filtering: {sum(len(v) for v in filtered_accents.values())}")
print(f"Remaining labels: {set(filtered_accents.keys())}")

# Step 3: Copy files
if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)
os.makedirs(destination_folder)

for label, files in tqdm(filtered_accents.items(), desc="Copying files"):
    label_folder = os.path.join(destination_folder, label)
    os.makedirs(label_folder, exist_ok=True)
    for file in files:
        shutil.copy(file, os.path.join(label_folder, os.path.basename(file)))

print(f"\n✅ Cleaned dataset saved to: {destination_folder}")


Total files before filtering: 2138
Total files after filtering: 2060
Remaining labels: {'turkish', 'kazakh', 'norwegian', 'pashto', 'catalan', 'bambara', 'tswana', 'rotuman', 'dutch', 'korean', 'gujarati', 'tigrigna', 'punjabi', 'serbian', 'shona', 'filipino', 'finnish', 'luo', 'bengali', 'garifuna', 'icelandic', 'hausa', 'danish', 'igbo', 'tajiki', 'latvian', 'hadiyya', 'yiddish', 'armenian', 'ga', 'mauritian', 'urdu', 'wolof', 'burmese', 'ewe', 'belarusan', 'yoruba', 'ukrainian', 'romanian', 'satawalese', 'hungarian', 'miskito', 'vietnamese', 'tibetan', 'hebrew', 'ngemba', 'twi', 'uyghur', 'lao', 'slovak', 'xiang', 'khmer', 'bafang', 'ganda', 'arabic', 'kikongo', 'tagalog', 'greek', 'afrikaans', 'fanti', 'cantonese', 'kiswahili', 'thai', 'polish', 'amazigh', 'kikuyu', 'hmong', 'portuguese', 'slovenian', 'czech', 'mongolian', 'ibibio', 'lithuanian', 'tamil', 'telugu', 'hindi', 'azerbaijani', 'kambaata', 'bosnian', 'pulaar', 'farsi', 'oriya', 'croatian', 'russian', 'english', 'malayala

Copying files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 122/122 [00:07<00:00, 17.31it/s]


✅ Cleaned dataset saved to: C:\Users\sagni\Downloads\Accent Detectection\cleaned_dataset



