# Dataset creation

Randomly extracts 3-second clips from the source audio files, computes musicnn tags for each clip, and stores the mel spectrogram, the audio clip and the tags.

In [None]:
# Mount Google Drive so the data folders under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the audio analysis, audio I/O and tagging packages imported below.
!pip install librosa soundfile musicnn 
#matplotlib

In [None]:
import os, sys
import numpy as np
import librosa
import random
import soundfile as sf
import string

from tqdm.notebook import tqdm
from musicnn.extractor import extractor
from musicnn.tagger import top_tags

import matplotlib.pyplot as plt

# Alphabet from which the random output filenames are drawn.
letters = string.ascii_uppercase

In [None]:
# Folder holding the original source audio files (same path the generation
# cell reads from).
AUDIO_DIR = "/content/drive/MyDrive/CODE/CondVAEmelspec/data/audio_orig/"
# Optional sanity check: uncomment to count the available source files.
#files = os.listdir(AUDIO_DIR)
#print(len(files))

In [None]:
# audio params
# SR is the sampling rate requested from librosa.load; the remaining values
# are passed to librosa.feature.melspectrogram as hop_length, n_fft and n_mels.
SR = 16000 #22050
FFT_HOP = 256
FFT_SIZE = 512
N_MELS = 96

In [None]:
# Move into the spectrogram output folder and count the entries it contains.
%cd /content/drive/MyDrive/CODE/CondVAEmelspec/data/spectrograms/
!ls | wc -l

In [None]:
# Master switch: set to False to run the notebook without (re)generating data.
generate_dataset = True

if generate_dataset:

    # Source recordings and the three parallel output folders. A sample is
    # identified by one random filename shared across audio/, spectrograms/
    # and tags/.
    AUDIO_DATA_DIR = "/content/drive/MyDrive/CODE/CondVAEmelspec/data/audio_orig/"
    SAVE_AUDIO_DIR = "/content/drive/MyDrive/CODE/CondVAEmelspec/data/audio/"
    SAVE_SPEC_DIR = "/content/drive/MyDrive/CODE/CondVAEmelspec/data/spectrograms/"
    SAVE_TAGS_DIR = "/content/drive/MyDrive/CODE/CondVAEmelspec/data/tags/"

    # Create the output folders up front; otherwise a missing folder makes
    # every write below fail and the loop only prints 5000 errors.
    for out_dir in (SAVE_AUDIO_DIR, SAVE_SPEC_DIR, SAVE_TAGS_DIR):
        os.makedirs(out_dir, exist_ok=True)

    # collect audio data, skipping '.aif.asd' entries (presumably analysis
    # sidecar files rather than audio -- TODO confirm)
    audio_data = os.listdir(AUDIO_DATA_DIR)
    audio_data = [name for name in audio_data if '.aif.asd' not in name]
    random.shuffle(audio_data)

    # Fail early with a clear message instead of erroring on every iteration.
    if not audio_data:
        raise FileNotFoundError('no audio files found in ' + AUDIO_DATA_DIR)

    # Clip length in seconds (loop-invariant, so defined once).
    MAX_DURATION = 3

    # cut randomly 3 seconds audio files
    # NOTE: 5000 is the number of attempts, not of stored clips; attempts that
    # raise or yield a too-short clip produce no output.
    for i in tqdm(range(5000)):

        # Chosen outside the try block so the error report below can always
        # name the file being processed.
        audio_file = os.path.join(AUDIO_DATA_DIR, random.choice(audio_data))

        # Files this attempt may have created; removed again if the attempt
        # fails so audio/, spectrograms/ and tags/ stay in sync.
        output_paths = []

        try:

            # Random start offset in whole seconds (0..100).
            start_time = random.randint(0, 100)
            sampled_y, sr = librosa.load(audio_file, sr=SR, offset=start_time, duration=MAX_DURATION)

            # norm
            sampled_y = librosa.util.normalize(sampled_y)

            # trim silence
            sampled_y, _ = librosa.effects.trim(sampled_y)

            # compute duration
            duration = librosa.get_duration(y=sampled_y, sr=sr)

            # Keep only clips that are still MAX_DURATION seconds long after
            # trimming (shorter ones truncate to a smaller integer).
            if int(duration) != MAX_DURATION:
                continue

            # generate rnd filename
            filename = ''.join(random.choice(letters) for _ in range(10))

            audio_path = os.path.join(SAVE_AUDIO_DIR, filename + '.wav')
            spec_path = os.path.join(SAVE_SPEC_DIR, filename + '.npy')
            musicnn_tags_path = os.path.join(SAVE_TAGS_DIR, filename + '.npy')
            output_paths = [audio_path, spec_path, musicnn_tags_path]

            # AUDIO (written first: the musicnn extractor reads it from disk)
            sf.write(audio_path, sampled_y, sr, 'PCM_24')

            # MELSPEC
            melspec = librosa.feature.melspectrogram(y=sampled_y,
                                                     sr=sr,
                                                     hop_length=FFT_HOP,
                                                     n_fft=FFT_SIZE,
                                                     n_mels=N_MELS) #.T

            np.save(spec_path, melspec)

            #print('\nmelspec.shape', melspec.shape)
            #plt.imshow(melspec)

            # MUSICNN TAGS
            musicnn_res = extractor(audio_path, model='MTT_musicnn')

            # First element of the result is the taggram; average it over
            # axis 0 to get a single likelihood vector for the clip.
            taggram = musicnn_res[0]
            tags_likelihood_mean = np.mean(taggram, axis=0)
            np.save(musicnn_tags_path, tags_likelihood_mean)

        except Exception as e:
            # Best effort: report the failure and move on to the next attempt.
            print('exception on:', audio_file)
            print(e)

            # Drop whatever this attempt already wrote, so no sample is left
            # with audio/spectrogram but without tags.
            for partial_path in output_paths:
                if os.path.exists(partial_path):
                    os.remove(partial_path)