# Imports

In [11]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from skimage import io

# Prepare Data

In [20]:
def read_audio(f_names: list, path: str, has_labels: bool,
               labels_csv: str = '../res/train.csv'):
    """Load audio clips from disk and, optionally, look up their genre labels.

    Parameters
    ----------
    f_names : list
        File names (including extension) of the clips inside ``path``.
    path : str
        Directory containing the clips; a trailing separator is optional.
    has_labels : bool
        When true, ``labels_csv`` is read and one label per clip is returned.
    labels_csv : str, optional
        CSV file with a ``new_id`` column (clip file name without extension)
        and a ``genre`` column. Defaults to the path this notebook has always
        used, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(data, srs, labels)`` when ``has_labels`` is true, otherwise
        ``(data, srs)``. ``data[i]`` is the mono waveform ``librosa.load``
        returned for ``f_names[i]``, ``srs[i]`` is the sampling rate it
        reported, and ``labels[i]`` is the ``genre`` value of the matching row.

    Raises
    ------
    KeyError
        If a clip has no row in ``labels_csv`` (raised by the ``.loc`` lookup).
    """
    y_df = None
    if has_labels:
        y_df = pd.read_csv(labels_csv, header=0,
                           dtype={'new_id': str, 'genre': np.int16})
        y_df = y_df.set_index('new_id')

    data, srs, labels = [], [], []
    for i, f_name in enumerate(f_names):
        # sr=None asks librosa to keep the file's own sampling rate.
        x, sr = librosa.load(os.path.join(path, f_name), sr=None, mono=True)
        data.append(x)
        srs.append(sr)

        if has_labels:
            # Labels are keyed by the file name without its extension;
            # splitext copes with extensions of any length, unlike [:-4].
            clip_id = os.path.splitext(f_name)[0]
            labels.append(y_df.loc[clip_id].genre)
        if i % 100 == 0:
            print('i=', i, '\t num points:', x.shape, 'samp_rate:', sr)
    return (data, srs, labels) if has_labels else (data, srs)

def save_audio_as_spectrogram(data, srs, f_names, path,
                              hop_length=512, time_steps=2550, n_mels=128):
    """Render each waveform as a log-mel spectrogram and save it as an 8-bit PNG.

    Parameters
    ----------
    data : sequence
        Waveforms, one per entry of ``f_names`` (same order). The sequence is
        only read; it is not modified.
    srs : sequence
        Sampling rate of each waveform, same order as ``data``.
    f_names : list
        Source file names; the extension is replaced by ``.png`` to build the
        output file name.
    path : str
        Output directory; a trailing separator is optional.
    hop_length : int, optional
        Samples between successive spectrogram columns. The FFT window is
        ``2 * hop_length``.
    time_steps : int, optional
        Each clip is cut to at most ``time_steps * hop_length`` samples before
        the transform.
    n_mels : int, optional
        Number of mel bands, i.e. the image height.

    Returns
    -------
    None
        Images are written to ``path`` and a completion message is printed.
    """
    def scale_minmax(x_audio, lo=0.0, hi=1.0):
        """Linearly rescale ``x_audio`` so that its values span ``[lo, hi]``."""
        lowest = x_audio.min()
        span = x_audio.max() - lowest
        shifted = x_audio - lowest
        if span == 0:
            # Constant input (e.g. digital silence): dividing would give 0/0 = NaN,
            # so map every value to the lower bound instead.
            return shifted + lo
        return shifted / span * (hi - lo) + lo

    max_samples = time_steps * hop_length

    for i, f_name in enumerate(f_names):
        # Slice into a local so the caller's list keeps the full-length audio.
        clip = data[i][:max_samples]
        mels = librosa.feature.melspectrogram(y=clip, sr=srs[i],
                                              n_mels=n_mels,
                                              n_fft=hop_length * 2,
                                              hop_length=hop_length)
        mels = np.log(mels + 1e-9)  # add small number to avoid log(0)

        # min-max scale to fit inside 8-bit range
        img = scale_minmax(mels, 0, 255).astype(np.uint8)
        img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
        img = 255 - img  # invert. make black==more energy

        # save as PNG, named after the source file with its extension swapped
        out_name = os.path.splitext(f_name)[0] + '.png'
        io.imsave(os.path.join(path, out_name), img)
    print("Finished! Images saved to", path)
    
def read_spectrogram(path: str, f_names: list, expected_shape=None):
    """Read spectrogram PNGs from ``path`` and report any with an unexpected shape.

    Parameters
    ----------
    path : str
        Directory containing the PNGs; a trailing separator is optional.
    f_names : list
        File names whose extension is replaced by ``.png`` before reading, so
        either the original audio names or the PNG names can be passed.
    expected_shape : tuple, optional
        Shape every image should have. When omitted, the notebook-level
        ``expected_spectro_shape`` is used if it has been defined; if it has
        not, the shape check is skipped.

    Returns
    -------
    list
        One image array per entry of ``f_names``, in the same order.
    """
    if expected_shape is None:
        # Fall back to the notebook constant without failing if the cell that
        # defines it has not been run yet.
        expected_shape = globals().get('expected_spectro_shape')

    img_data = []
    for f_name in f_names:
        # splitext handles extensions of any length, unlike slicing off 3 chars.
        png_name = os.path.splitext(f_name)[0] + '.png'
        img = io.imread(os.path.join(path, png_name))
        if expected_shape is not None and tuple(expected_shape) != img.shape:
            print(img.shape)
        img_data.append(img)
    print("Spectrogram from", path, "read in!")
    return img_data

## Set Path Variables


In [6]:
# Directories holding the raw clips and the spectrogram PNGs rendered from them.
train_wav_path = '../res/wav/train/'
test_wav_path = '../res/wav/test/'
train_spectro_path = '../res/spectrogram/train/'
test_spectro_path = '../res/spectrogram/test/'

# os.listdir() returns entries in arbitrary order. Sort every listing so that
# runs are reproducible and a clip and its spectrogram (same name, different
# extension) sit at the same position in their respective lists.
# NOTE: these listings reflect what is on disk when this cell runs; re-run it
# after generating spectrograms to pick up the new files.
train_wav_names = sorted(os.listdir(train_wav_path))
test_wav_names = sorted(os.listdir(test_wav_path))
train_spectro_names = sorted(os.listdir(train_spectro_path))
test_spectro_names = sorted(os.listdir(test_spectro_path))

# Shape each spectrogram PNG is expected to have; read_spectrogram prints the
# shape of any image that differs from it.
expected_spectro_shape = (128, 2551)

print("num train wavs:", len(train_wav_names))
print("num test wavs:", len(test_wav_names))
print("num train spectros:", len(train_spectro_names))
print("num test spectros:", len(test_spectro_names))
print("expected_spectro_shape:", expected_spectro_shape)

Number of train .wav files in audio folder: 2400
i= 0 	 num points: (1321967,) samp_rate: 44100
i= 25 	 num points: (1323119,) samp_rate: 44100
i= 50 	 num points: (1323119,) samp_rate: 44100
i= 75 	 num points: (1321967,) samp_rate: 44100
i= 100 	 num points: (1321967,) samp_rate: 44100
i= 125 	 num points: (1321967,) samp_rate: 44100
i= 150 	 num points: (1323119,) samp_rate: 44100
i= 175 	 num points: (1323119,) samp_rate: 44100
i= 200 	 num points: (1321967,) samp_rate: 44100
i= 225 	 num points: (1321967,) samp_rate: 44100
i= 250 	 num points: (1323119,) samp_rate: 44100
i= 275 	 num points: (1323119,) samp_rate: 44100
i= 300 	 num points: (1323119,) samp_rate: 44100
i= 325 	 num points: (1321967,) samp_rate: 44100
i= 350 	 num points: (1323119,) samp_rate: 44100
i= 375 	 num points: (1323119,) samp_rate: 44100
i= 400 	 num points: (1321967,) samp_rate: 44100
i= 425 	 num points: (1323119,) samp_rate: 44100
i= 450 	 num points: (1323119,) samp_rate: 44100
i= 475 	 num points: (132

## Read Training .wav Files and Save as Spectrograms

In [21]:
# Load every training clip together with its genre label, then render each
# clip to a spectrogram PNG in the training spectrogram folder.
print("Number of train .wav files in audio folder:", len(train_wav_names))
training_wav, training_srs, training_labels = read_audio(
    f_names=train_wav_names,
    path=train_wav_path,
    has_labels=True,
)

save_audio_as_spectrogram(
    data=training_wav,
    srs=training_srs,
    f_names=train_wav_names,
    path=train_spectro_path,
)

../res/spectrogram/train/00907299
../res/spectrogram/train/00907479
../res/spectrogram/train/00907482


## Read Testing .wav Files and Save as Spectrograms

In [None]:
# Load every test clip (no labels available), then render each clip to a
# spectrogram PNG in the test spectrogram folder.
# The message previously said "train" although it counts the test files.
print("Number of test .wav files in audio folder:", len(test_wav_names))
testing_wav, testing_srs = read_audio(test_wav_names, test_wav_path, False)

save_audio_as_spectrogram(testing_wav,
                          testing_srs,
                          test_wav_names,
                          test_spectro_path)

## Read Training & Testing Spectrogram PNGs

In [None]:
# Drive the read from the wav name lists rather than from the spectrogram
# directory listings: read_spectrogram swaps the extension for 'png' itself,
# so training_x[i] is the image of train_wav_names[i] and therefore lines up
# with training_labels[i]. The directory listings were taken before the PNGs
# were (re)generated, so on a fresh run they can be empty or out of date.
training_x = read_spectrogram(train_spectro_path, train_wav_names)
testing_x = read_spectrogram(test_spectro_path, test_wav_names)

# TF Training

In [None]:
# NOTE(review): this model definition looks like an unfinished placeholder.
# - input_shape=(28, 28) does not match expected_spectro_shape (128, 2551)
#   defined in the path-variables cell — presumably a leftover; confirm.
# - Conv2D is called with no arguments; the Keras docs list `filters` and
#   `kernel_size` as required, so this cell likely raises as written — TODO.
# - Flatten comes before Conv2D; convolution layers normally precede
#   flattening — verify the intended layer order.
model = keras.models.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Conv2D()
])
# model = keras.Model(inputs=inputs, outputs=outputs)
