In [1]:
import os
import csv
import pickle
import librosa
import pandas as pd

## Список языков, отобранных для датасета:
1. Indonesian
2. French
3. Spanish
4. Japanese
5. Dutch
6. Portuguese
7. Romanian
8. Slovenian
9. Finnish
10. English

In [2]:
LANGUAGES = ['Indonesian', 'French', 'Spanish', 'Japanese', 'Dutch', 'Lithuanian', 'Portuguese', 'Romanian', 'Finnish', 'English']

In [3]:
LANGUAGES_CODES = {
    'Indonesian': 'id',
    'French': 'fr',
    'Spanish': 'es',
    'Japanese': 'ja',
    'Dutch': 'nl',
    'Lithuanian': 'lt',
    'Portuguese': 'pt',
    'Romanian': 'ro',
    'Finnish': 'fi',
    'English': 'en'
}

In [4]:
NUMBER_OF_PIECES = 1500
DURATION_SECONDS = 3

In [5]:
with open('/kaggle/input/jukebox-mirror/artistsIdsAndLanguages.csv', 'r') as file:
    df = pd.read_csv(file, delimiter=';')
    languages_and_artists_ids = df.groupby('language_label')['artist_id'].apply(list).to_dict()

In [6]:
def process_artist_folder(JukeBox_folder, language_code, artist_id):
    folder_count = 0
    folder_data = []
    for wav_file in os.listdir(JukeBox_folder):
        song_id = wav_file[:-4]
        wav_path = os.path.join(JukeBox_folder, wav_file)
        array, sampling_rate = librosa.load(wav_path, dtype='float64', sr=48000)
        piece_size = DURATION_SECONDS * sampling_rate
        number_of_pieces_from_wav = len(array) // piece_size
        for i in range(number_of_pieces_from_wav):
            piece = array[i*piece_size : (i+1)*piece_size]
            folder_data.append({
                'name': f'{language_code}{song_id}{folder_count}music',
                'array': piece,
                'language_code': language_code,
                'label': 'music',
            })
            folder_count += 1
    return folder_count, folder_data

In [7]:
def process_language(language, language_code, artists_ids):
    print(f'processing language: {language}')
    language_data = []
    language_count = 0
    for artist_id in artists_ids:
        if language_count >= NUMBER_OF_PIECES:
            break
        artist_folder = f'/kaggle/input/jukebox-mirror/ALLTHEDATA/ALLTHEDATA/{artist_id}'
        folder_count, folder_data = process_artist_folder(artist_folder, language_code, artist_id)
        language_count += folder_count
        language_data.extend(folder_data[:NUMBER_OF_PIECES])
    print(f'processed {language_count} pieces for {language} language ({language_code})')
    return language_data

In [8]:
for language, artists_ids in languages_and_artists_ids.items():
    if language in LANGUAGES:
        language_data = process_language(language, LANGUAGES_CODES[language], artists_ids)
        with open('music_dataset.pkl', 'ab') as file:
            pickle.dump(language_data, file)

processing language: Dutch
processed 5800 pieces for Dutch language (nl)
processing language: English
processed 1884 pieces for English language (en)
processing language: Finnish
processed 70 pieces for Finnish language (fi)
processing language: French
processed 2360 pieces for French language (fr)
processing language: Indonesian
processed 2090 pieces for Indonesian language (id)
processing language: Japanese
processed 1670 pieces for Japanese language (ja)
processing language: Lithuanian
processed 2630 pieces for Lithuanian language (lt)
processing language: Portuguese
processed 2108 pieces for Portuguese language (pt)
processing language: Romanian
processed 1830 pieces for Romanian language (ro)
processing language: Spanish
processed 1890 pieces for Spanish language (es)
