In [1]:
import csv
from datasets import load_dataset
from huggingface_hub import login

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret('HF_TOKEN')

In [3]:
login(token=HF_TOKEN)

In [4]:
LANGUAGES = {
    'id': 'Indonesian',
    'fr': 'French',
    'es': 'Spanish',
    'ja': 'Japanese',
    'nl': 'Dutch',
    'lt': 'Lithuanian',
    'pt': 'Portuguese',
    'zh-CN': 'Chinese',
    'ro': 'Romanian',
    'sl': 'Slovenian',
    'pa-IN': 'Punjabi',
    'fi': 'Finnish',
    'en': 'English'
}

In [5]:
NUMBER_OF_PIECES = 1500
DURATION_SECONDS = 3
DATASET_DIR = '/kaggle/working/processed_languages'
METADATA_FILE = '/kaggle/working/metadata.csv'

In [6]:
def process_language(language_code, language):
    print(f'processing language: {language}')
    try:
        dataset = load_dataset(
            'mozilla-foundation/common_voice_17_0', 
            language_code, 
            split='train', 
            streaming=True, 
            trust_remote_code=True
        )
    except Exception as e:
        print(f'failed to load {language}: {e}')
        return []

    metadata = []
    count = 0

    for sample in dataset:
        if count >= NUMBER_OF_PIECES:
            break

        try:
            array = sample['audio']['array']
            sampling_rate = sample["audio"]["sampling_rate"]
            piece_size = DURATION_SECONDS * sampling_rate
            number_of_pieces = len(array) // piece_size

            if len(array) < (sampling_rate * DURATION_SECONDS):
                continue

            for i in range(number_of_pieces):
                piece = array[i*piece_size : (i+1)*piece_size]
                metadata.append({
                    'name': f'{language_code}{count}speech',
                    'array': piece,
                    'language_code': language_code,
                    'label': 'speech',
                })
                count += 1

        except Exception as e:
            print(f'error processing sample: {e}')
    
    print(f'processed {count} pieces for language {language} ({language_code})')
    return metadata

In [7]:
def save_metadata_csv(full_metadata):
    with open(METADATA_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=full_metadata[0].keys())
        writer.writeheader()
        writer.writerows(full_metadata)
    print(f"metadata saved to {METADATA_FILE}")

In [8]:
full_metadata = []
for language_code, language in LANGUAGES.items():
    metadata = process_language(language_code, language)
    if metadata:
        full_metadata.extend(metadata)

processing language: Indonesian


README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

Reading metadata...: 4970it [00:00, 16453.72it/s]


processed 1500 pieces for language Indonesian (id)
processing language: French


Reading metadata...: 558054it [00:12, 44582.52it/s]


processed 1500 pieces for language French (fr)
processing language: Spanish


Reading metadata...: 336846it [00:08, 42078.68it/s]


processed 1501 pieces for language Spanish (es)
processing language: Japanese


Reading metadata...: 10039it [00:00, 28538.62it/s]


processed 1500 pieces for language Japanese (ja)
processing language: Dutch


Reading metadata...: 34898it [00:00, 44023.40it/s]


processed 1500 pieces for language Dutch (nl)
processing language: Lithuanian


Reading metadata...: 7253it [00:00, 29217.89it/s]


processed 1500 pieces for language Lithuanian (lt)
processing language: Portuguese


Reading metadata...: 21968it [00:00, 32102.70it/s]


processed 1501 pieces for language Portuguese (pt)
processing language: Chinese


Reading metadata...: 29406it [00:01, 15957.58it/s]


processed 1500 pieces for language Chinese (zh-CN)
processing language: Romanian


Reading metadata...: 5141it [00:00, 24431.93it/s]


processed 1500 pieces for language Romanian (ro)
processing language: Slovenian


Reading metadata...: 1388it [00:00, 15868.37it/s]


processed 948 pieces for language Slovenian (sl)
processing language: Punjabi


Reading metadata...: 732it [00:00, 11348.95it/s]


processed 891 pieces for language Punjabi (pa-IN)
processing language: Finnish


Reading metadata...: 2076it [00:00, 18883.18it/s]


processed 1500 pieces for language Finnish (fi)
processing language: English


Reading metadata...: 1101170it [00:22, 49039.91it/s]


processed 1500 pieces for language English (en)


In [9]:
save_metadata_csv(full_metadata)

metadata saved to /kaggle/working/metadata.csv
