In [2]:
import pickle
from datasets import load_dataset
from huggingface_hub import login

## Список языков, отобранных для датасета:
1. Indonesian
2. French
3. Spanish
4. Japanese
5. Dutch
6. Lithuanian
7. Portuguese
8. Romanian
9. Finnish
10. English 

In [3]:
LANGUAGES = ['Indonesian', 'French', 'Spanish', 'Japanese', 'Dutch', 'Lithuanian', 'Portuguese', 'Romanian', 'Finnish', 'English']

In [4]:
LANGUAGES_CODES = {
    'Indonesian': 'id',
    'French': 'fr',
    'Spanish': 'es',
    'Japanese': 'ja',
    'Dutch': 'nl',
    'Lithuanian': 'lt',
    'Portuguese': 'pt',
    'Romanian': 'ro',
    'Finnish': 'fi',
    'English': 'en'
}

In [5]:
NUMBER_OF_PIECES = 1500
DURATION_SECONDS = 3

In [6]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret('HF_TOKEN')
login(token=HF_TOKEN)

BackendError: Unexpected response from the service. Response: {'errors': ['No user secrets exist for kernel id 83124945 and label HF_TOKEN.'], 'error': {'code': 5, 'details': []}, 'wasSuccessful': False}.

In [None]:
def process_language(language_code, language):
    print(f'processing language: {language} ({language_code})')
    try:
        dataset = load_dataset(
            'mozilla-foundation/common_voice_16_1', 
            language_code, 
            split='train', 
            streaming=True, 
            trust_remote_code=True
        )
    except Exception as e:
        print(f'failed to load {language}: {e}')
        return []

    language_data = []
    language_count = 0

    for sample in dataset:
        if language_count >= NUMBER_OF_PIECES:
            break

        array = sample['audio']['array']
        sampling_rate = sample['audio']['sampling_rate']
        piece_size = DURATION_SECONDS * sampling_rate
        if len(array) < (sampling_rate * DURATION_SECONDS):
            continue

        number_of_pieces_from_sample = len(array) // piece_size

        for i in range(number_of_pieces_from_sample):
            piece = array[i*piece_size : (i+1)*piece_size]
            
            language_data.append({
                'name': f'{language_code}{language_count}speech',
                'array': piece,
                'language_code': language_code,
                'label': 'speech',
            })
            language_count += 1
    
    print(f'processed {language_count} pieces for language {language} ({language_code})')
    return language_data

In [None]:
for language in LANGUAGES:
    language_data = process_language(LANGUAGES_CODES[language], language)
    with open('speech_dataset.pkl', 'ab') as file:
        pickle.dump(language_data, file)