In [1]:
import pickle
from datasets import load_dataset
from huggingface_hub import login

## Список языков, отобранных для датасета:
1. Indonesian
2. French
3. Spanish
4. Japanese
5. Dutch
6. Lithuanian
7. Portuguese
8. Chinese
9. Romanian
10. Slovenian
11. Punjabi
12. Finnish
13. English

In [2]:
LANGUAGES = ['Indonesian', 'French', 'Spanish', 'Japanese', 'Dutch', 'Lithuanian', 'Portuguese', 'Chinese', 'Romanian', 'Finnish', 'English']

In [3]:
LANGUAGES_CODES = {
    'Indonesian': 'id',
    'French': 'fr',
    'Spanish': 'es',
    'Japanese': 'ja',
    'Dutch': 'nl',
    'Lithuanian': 'lt',
    'Portuguese': 'pt',
    'Chinese': 'zh-CN',
    'Romanian': 'ro',
    'Finnish': 'fi',
    'English': 'en'
}

In [4]:
NUMBER_OF_PIECES = 1500
DURATION_SECONDS = 3

In [5]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret('HF_TOKEN')
login(token=HF_TOKEN)

In [6]:
def process_language(language_code, language):
    print(f'processing language: {language} ({language_code})')
    try:
        dataset = load_dataset(
            'mozilla-foundation/common_voice_16_1', 
            language_code, 
            split='train', 
            streaming=True, 
            trust_remote_code=True
        )
    except Exception as e:
        print(f'failed to load {language}: {e}')
        return []

    language_data = []
    language_count = 0

    for sample in dataset:
        if language_count >= NUMBER_OF_PIECES:
            break

        array = sample['audio']['array']
        sampling_rate = sample['audio']['sampling_rate']
        piece_size = DURATION_SECONDS * sampling_rate
        if len(array) < (sampling_rate * DURATION_SECONDS):
            continue

        number_of_pieces_from_sample = len(array) // piece_size

        for i in range(number_of_pieces_from_sample):
            piece = array[i*piece_size : (i+1)*piece_size]
            
            language_data.append({
                'name': f'{language_code}{language_count}speech',
                'array': piece,
                'language_code': language_code,
                'label': 'speech',
            })
            language_count += 1
    
    print(f'processed {language_count} pieces for language {language} ({language_code})')
    return language_data

In [7]:
for language in LANGUAGES:
    language_data = process_language(LANGUAGES_CODES[language], language)
    with open('speech_dataset.pkl', 'ab') as file:
        pickle.dump(language_data, file)

processing language: Indonesian (id)


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

common_voice_16_1.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/77.3k [00:00<?, ?B/s]

Reading metadata...: 4969it [00:00, 11651.63it/s]


processed 1500 pieces for language Indonesian (id)
processing language: French (fr)


Reading metadata...: 541822it [00:15, 34865.24it/s]


processed 1500 pieces for language French (fr)
processing language: Spanish (es)


Reading metadata...: 327717it [00:08, 37170.16it/s]


processed 1501 pieces for language Spanish (es)
processing language: Japanese (ja)


Reading metadata...: 9616it [00:00, 20738.12it/s]


processed 1500 pieces for language Japanese (ja)
processing language: Dutch (nl)


Reading metadata...: 34605it [00:01, 27850.52it/s]


processed 1500 pieces for language Dutch (nl)
processing language: Lithuanian (lt)


Reading metadata...: 6728it [00:00, 20442.40it/s]


processed 1500 pieces for language Lithuanian (lt)
processing language: Portuguese (pt)


Reading metadata...: 21685it [00:00, 27797.43it/s]


processed 1500 pieces for language Portuguese (pt)
processing language: Chinese (zh-CN)


Reading metadata...: 29406it [00:01, 18021.27it/s]


processed 1501 pieces for language Chinese (zh-CN)
processing language: Romanian (ro)


Reading metadata...: 5113it [00:00, 14533.29it/s]


processed 1500 pieces for language Romanian (ro)
processing language: Finnish (fi)


Reading metadata...: 2131it [00:00, 8981.03it/s]


processed 1500 pieces for language Finnish (fi)
processing language: English (en)


Reading metadata...: 1090061it [00:32, 34016.51it/s]


processed 1500 pieces for language English (en)
