In [1]:
!rm -rf /kaggle/working/*

In [2]:
import csv
from datasets import load_dataset
from huggingface_hub import login

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret('HF_TOKEN')

In [4]:
login(token=HF_TOKEN)

In [5]:
LANGUAGES = {
    'id': 'Indonesian',
    'fr': 'French',
    'es': 'Spanish',
    'ja': 'Japanese',
    'ne-NP': 'Nepali',
    'nl': 'Dutch',
    'lt': 'Lithuanian',
    'pt': 'Portuguese',
    'zh-CN': 'Chinese (China)',
    'ro': 'Romanian',
    'is': 'Icelandic',
    'hi': 'Hindi',
    'ps': 'Pashto',
    'sl': 'Slovenian',
    'pa-IN': 'Punjabi',
    'fi': 'Finnish',
    'en': 'English',
}

In [6]:
NUMBER_OF_PIECES = 1000
DURATION_SECONDS = 3
DATASET_DIR = '/kaggle/working/processed_languages'
METADATA_FILE = '/kaggle/working/metadata.csv'

In [7]:
def process_language(language_code, language):
    print(f'processing language: {language}')
    try:
        dataset = load_dataset(
            'mozilla-foundation/common_voice_17_0', 
            language_code, 
            split='train', 
            streaming=True, 
            trust_remote_code=True
        )
    except Exception as e:
        print(f'failed to load {language}: {e}')
        return []

    metadata = []
    count = 0

    for sample in dataset:
        if count >= NUMBER_OF_PIECES:
            break

        try:
            array = sample['audio']['array']
            sampling_rate = sample["audio"]["sampling_rate"]
            piece_size = DURATION_SECONDS * sampling_rate
            number_of_pieces = len(array) // piece_size

            if len(array) < (sampling_rate * DURATION_SECONDS):
                continue

            for i in range(number_of_pieces):
                piece = array[i*piece_size : (i+1)*piece_size]
                metadata.append({
                    'name': f'{language_code}{count}speech',
                    'array': piece,
                    'language_code': language_code,
                    'label': 'speech',
                })
                count += 1

        except Exception as e:
            print(f'error processing sample: {e}')
    
    print(f'processed {count} pieces for language {language} ({language_code})')
    return metadata

In [8]:
def save_metadata_csv(full_metadata):
    with open(METADATA_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=full_metadata[0].keys())
        writer.writeheader()
        writer.writerows(full_metadata)
    print(f"metadata saved to {METADATA_FILE}")

In [9]:
full_metadata = []
for language_code, language in LANGUAGES.items():
    metadata = process_language(language_code, language)
    if metadata:
        full_metadata.extend(metadata)

processing language: Indonesian


README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

Reading metadata...: 4970it [00:00, 10503.03it/s]


processed 1001 pieces for language Indonesian (id)
processing language: French


Reading metadata...: 558054it [00:19, 28114.81it/s]


processed 1000 pieces for language French (fr)
processing language: Spanish


Reading metadata...: 336846it [00:08, 39377.84it/s]


processed 1000 pieces for language Spanish (es)
processing language: Japanese


Reading metadata...: 10039it [00:00, 24323.06it/s]


processed 1000 pieces for language Japanese (ja)
processing language: Nepali


Reading metadata...: 283it [00:00, 1813.30it/s]


processed 197 pieces for language Nepali (ne-NP)
processing language: Dutch


Reading metadata...: 34898it [00:01, 33368.23it/s]


processed 1001 pieces for language Dutch (nl)
processing language: Lithuanian


Reading metadata...: 7253it [00:00, 11637.75it/s]


processed 1000 pieces for language Lithuanian (lt)
processing language: Portuguese


Reading metadata...: 21968it [00:02, 9924.53it/s] 


processed 1000 pieces for language Portuguese (pt)
processing language: Chinese (China)


Reading metadata...: 29406it [00:01, 17922.37it/s]


processed 1000 pieces for language Chinese (China) (zh-CN)
processing language: Romanian


Reading metadata...: 5141it [00:00, 23229.49it/s]


processed 1000 pieces for language Romanian (ro)
processing language: Icelandic


Reading metadata...: 14it [00:00, 114.44it/s]


processed 29 pieces for language Icelandic (is)
processing language: Hindi


Reading metadata...: 4689it [00:00, 18925.77it/s]


processed 1000 pieces for language Hindi (hi)
processing language: Pashto


Reading metadata...: 1027it [00:00, 3596.79it/s]


processed 1000 pieces for language Pashto (ps)
processing language: Slovenian


Reading metadata...: 1388it [00:00, 5437.65it/s]


processed 948 pieces for language Slovenian (sl)
processing language: Punjabi


Reading metadata...: 732it [00:00, 2555.27it/s]


processed 891 pieces for language Punjabi (pa-IN)
processing language: Finnish


Reading metadata...: 2076it [00:00, 11335.97it/s]


processed 1001 pieces for language Finnish (fi)
processing language: English


Reading metadata...: 1101170it [00:26, 42078.90it/s]


processed 1000 pieces for language English (en)


In [10]:
save_metadata_csv(full_metadata)

metadata saved to /kaggle/working/metadata.csv
