In [1]:
# Start from a clean slate: delete everything matched by /kaggle/working/*.
# Destructive and not undoable; by default the shell's `*` does not match dotfiles.
!rm -rf /kaggle/working/*

In [2]:
import os
import json
import csv
import pandas as pd
import soundfile as sf
import numpy as np

In [3]:
# Language code -> English language name.
# Only languages whose name appears here are processed further down.
LANGUAGES = dict([
    ('id', 'Indonesian'),
    ('fr', 'French'),
    ('es', 'Spanish'),
    ('ja', 'Japanese'),
    ('nl', 'Dutch'),
    ('lt', 'Lithuanian'),
    ('pt', 'Portuguese'),
    ('zh-CN', 'Chinese'),
    ('ro', 'Romanian'),
    ('sl', 'Slovenian'),
    ('pa-IN', 'Punjabi'),
    ('fi', 'Finnish'),
    ('en', 'English'),
])

In [4]:
# English language name -> language code.
# This is the inverse of LANGUAGES; the two mappings are maintained by hand,
# so any entry added to one must be added to the other as well.
LANGUAGES_CODES = dict([
    ('Indonesian', 'id'),
    ('French', 'fr'),
    ('Spanish', 'es'),
    ('Japanese', 'ja'),
    ('Dutch', 'nl'),
    ('Lithuanian', 'lt'),
    ('Portuguese', 'pt'),
    ('Chinese', 'zh-CN'),
    ('Romanian', 'ro'),
    ('Slovenian', 'sl'),
    ('Punjabi', 'pa-IN'),
    ('Finnish', 'fi'),
    ('English', 'en'),
])

In [5]:
# --- Output locations -------------------------------------------------------
# CSV file written by save_metadata_csv().
METADATA_FILE = '/kaggle/working/metadata.csv'
# Not referenced by any of the cells below.
DATASET_DIR = '/kaggle/working/processed_languages'

# --- Slicing parameters -----------------------------------------------------
# Length, in seconds, of each piece cut from an audio file.
DURATION_SECONDS = 3
# Per language and split: once this many pieces are collected, no further
# artist folders are read.
NUMBER_OF_PIECES = 1_500

In [6]:
# Group the artist ids by partition and language:
#   result[partition][language_label] -> list of artist_id values
result = {}
with open('/kaggle/input/jukebox-mirror/artistsIdsSplitAndLanguages.csv', 'r') as file:
    df = pd.read_csv(file, delimiter=';')

for split in ('TRAIN', 'TEST'):
    in_split = df['partition'] == split
    artists_per_language = df[in_split].groupby('language_label')['artist_id'].apply(list)
    result[split] = artists_per_language.to_dict()

# Keep a readable copy of the grouping in the current working directory.
with open('artists_by_languages.json', 'w', encoding='utf-8') as jsfile:
    json.dump(result, jsfile, ensure_ascii=False, indent=4)

In [7]:
def process_directory(JukeBox_folder, language_code, artist_id):
    """Cut every WAV file of one folder into fixed-length pieces.

    Each ``.wav`` file is read with ``soundfile`` and split into consecutive,
    non-overlapping pieces of ``DURATION_SECONDS`` seconds. A trailing
    remainder shorter than one piece is dropped.

    Parameters
    ----------
    JukeBox_folder : str
        Directory whose ``.wav`` files are processed (not recursive).
    language_code : str
        Stored with every piece and used as the prefix of its name.
    artist_id
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple of (int, list of dict)
        The number of pieces produced and one record per piece with the keys
        ``'name'``, ``'array'``, ``'language_code'`` and ``'label'``.

    Notes
    -----
    Piece names are numbered from 0 on every call, so they are unique only
    within a single call.
    """
    dir_count = 0
    metadata = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # piece numbering reproducible between runs.
    for wav_file in sorted(os.listdir(JukeBox_folder)):
        # Anything that is not a WAV file would make sf.read() fail.
        if not wav_file.lower().endswith('.wav'):
            continue
        wav_path = os.path.join(JukeBox_folder, wav_file)
        array, sample_rate = sf.read(wav_path)
        # int() keeps the slice bounds integral even for a fractional duration.
        piece_size = int(DURATION_SECONDS * sample_rate)
        number_of_pieces = len(array) // piece_size
        for i in range(number_of_pieces):
            piece = array[i * piece_size:(i + 1) * piece_size]
            metadata.append({
                'name': f'{language_code}{dir_count}music',
                'array': piece,
                'language_code': language_code,
                'label': 'music',
            })
            dir_count += 1
    return dir_count, metadata

In [8]:
def save_metadata_csv(full_metadata):
    """Write the collected piece records to ``METADATA_FILE`` as CSV.

    Parameters
    ----------
    full_metadata : list of dict
        One dict per piece. The keys of the first dict become the CSV header,
        so every record is expected to carry the same keys.

    Notes
    -----
    When ``full_metadata`` is empty nothing is written and any existing file
    is left untouched (indexing the first record used to raise ``IndexError``
    after the file had already been truncated).

    The ``csv`` module stringifies non-string values with ``str()``.
    NOTE(review): if a column holds numpy arrays (the ``'array'`` field built
    by ``process_directory`` appears to), numpy's default print options
    abbreviate large arrays with ``...``, so the samples cannot be recovered
    from this file - confirm whether that column is meant to be stored here.
    """
    if not full_metadata:
        print(f"no metadata to save; {METADATA_FILE} was not written")
        return
    fieldnames = list(full_metadata[0].keys())
    with open(METADATA_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(full_metadata)
    print(f"metadata saved to {METADATA_FILE}")

In [9]:
# Collect the piece records of every selected language, for both splits.
full_metadata = []
# Next free piece number per language code. process_directory() restarts its
# numbering at 0 for every artist folder, so the names are reassigned here to
# keep every 'name' in full_metadata unique (across artists and splits).
next_piece_index = {}
for split in ['TEST', 'TRAIN']:
    for language, artists_ids in result[split].items():
        # Testing against LANGUAGES_CODES guarantees the lookup below succeeds.
        if language not in LANGUAGES_CODES:
            continue
        print(f'processing language: {language}')
        language_code = LANGUAGES_CODES[language]
        count = 0
        for artist_id in artists_ids:
            # The limit is only checked between artists, so the final count
            # can end up above NUMBER_OF_PIECES.
            if count >= NUMBER_OF_PIECES:
                break
            artist_dir = f'/kaggle/input/jukebox-mirror/{split}/{split}/{artist_id}'
            dir_count, dir_metadata = process_directory(artist_dir, language_code, artist_id)
            first_index = next_piece_index.get(language_code, 0)
            for offset, piece_info in enumerate(dir_metadata):
                piece_info['name'] = f'{language_code}{first_index + offset}music'
            next_piece_index[language_code] = first_index + len(dir_metadata)
            count += dir_count
            full_metadata.extend(dir_metadata)
        print(f'processed {count} pieces for language {language} ({language_code})')

processing language: Dutch
processed 149 pieces for language Dutch (nl)
processing language: English
processed 1640 pieces for language English (en)
processing language: French
processed 1890 pieces for language French (fr)
processing language: Japanese
processed 680 pieces for language Japanese (ja)
processing language: Punjabi
processed 140 pieces for language Punjabi (pa-IN)
processing language: Romanian
processed 440 pieces for language Romanian (ro)
processing language: Spanish
processed 480 pieces for language Spanish (es)
processing language: Dutch
processed 5520 pieces for language Dutch (nl)
processing language: English
processed 1664 pieces for language English (en)
processing language: French
processed 4049 pieces for language French (fr)
processing language: Indonesian
processed 2090 pieces for language Indonesian (id)
processing language: Japanese
processed 1609 pieces for language Japanese (ja)
processing language: Lithuanian
processed 2630 pieces for language Lithuanian (lt)

In [10]:
# Write every collected piece record to METADATA_FILE as CSV.
save_metadata_csv(full_metadata)

metadata saved to /kaggle/working/metadata.csv
