# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from skimage import io

# Handle Data

In [3]:
def read_audio(f_names: list, path: str):
    """Load every audio file named in ``f_names`` from the directory ``path``.

    Each file is decoded with ``librosa.load(..., sr=None, mono=True)``, i.e.
    at the file's own sampling rate and mixed down to a single channel.

    Parameters
    ----------
    f_names : list
        File names (including extension) relative to ``path``.
    path : str
        Directory prefix. It is concatenated directly with each file name, so
        it must end with a path separator.

    Returns
    -------
    (data, srs) : tuple of two lists
        ``data[i]`` is the sample array of ``f_names[i]`` and ``srs[i]`` is its
        sampling rate. Both lists have ``len(f_names)`` entries.
    """
    data = []
    srs = []

    for i, f_name in enumerate(f_names):
        x, sr = librosa.load(path + f_name, sr=None, mono=True)
        data.append(x)
        srs.append(sr)

        # Progress report every 100 files; loading is slow for large folders.
        if i % 100 == 0:
            print('i=', i, '\t num points:', x.shape, 'samp_rate:', sr)
    print("Finished reading", len(data), "audio files from", path)
    return data, srs

def read_labels(f_names: list, csv_path: str = '../res/train.csv'):
    """Look up the genre label of every file in ``f_names``.

    The CSV at ``csv_path`` must have a header row with the columns ``new_id``
    and ``genre``. ``new_id`` is read as a string so that identifiers keep any
    leading zeros. A file's identifier is its name without the extension.

    Parameters
    ----------
    f_names : list
        File names such as ``'<new_id>.wav'`` or ``'<new_id>.png'``.
    csv_path : str, optional
        Location of the label CSV. Defaults to the training label file.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(f_names)``; ``labels[i]`` is the genre of
        ``f_names[i]``.

    Raises
    ------
    KeyError
        If a file's identifier is not present in the ``new_id`` column.
    """
    y_df = pd.read_csv(csv_path, header=0, dtype={'new_id': str, 'genre': np.int16})
    genre_by_id = y_df.set_index('new_id')['genre']

    labels = np.zeros(len(f_names))
    for i, f_name in enumerate(f_names):
        # splitext instead of a fixed [:-4] slice so the lookup does not depend
        # on the extension being exactly three characters long.
        labels[i] = genre_by_id.loc[os.path.splitext(f_name)[0]]
    print("Finished reading", len(labels), 'labels')
    return labels

def read_spectrogram(path: str, f_names: list):
    """Load the PNG spectrograms for ``f_names`` into one array.

    The extension of every name is replaced by ``.png`` before loading, so the
    function accepts either the ``.wav`` or the ``.png`` name list. The image
    size comes from the module-level ``expected_spectro_shape``.

    Parameters
    ----------
    path : str
        Directory prefix, concatenated directly with each file name (must end
        with a path separator).
    f_names : list
        File names whose stem identifies the spectrogram image.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(f_names), rows, cols)`` where
        ``(rows, cols)`` is ``expected_spectro_shape``.

    Raises
    ------
    ValueError
        If an image does not have ``expected_spectro_shape``.
    """
    img_data = np.zeros(shape=(len(f_names), expected_spectro_shape[0], expected_spectro_shape[1]))
    for i, f_name in enumerate(f_names):
        img = io.imread(path + os.path.splitext(f_name)[0] + '.png')
        # Validate the image itself BEFORE storing it: the preallocated slot
        # always has the expected shape, so checking it after the assignment
        # could never detect a mismatch.
        if img.shape != tuple(expected_spectro_shape):
            print("index:", i, "has shape", img.shape)
            raise ValueError(
                "Spectrogram %r has shape %s, expected %s"
                % (f_name, img.shape, tuple(expected_spectro_shape)))
        img_data[i] = img
    print("Spectrogram from", path, "read in! Shape is:", img_data.shape)
    return img_data

def save_audio_as_spectrogram(data: list, srs: list, f_names: list, path: str):
    """Render each audio clip as a log-mel spectrogram and save it as a PNG.

    For clip ``i`` the first ``time_steps * hop_length`` samples are turned
    into a mel spectrogram with ``n_mels`` bands, log-compressed, min-max
    scaled to 0..255, flipped vertically, inverted and written to
    ``path + <stem of f_names[i]> + '.png'``.

    Parameters
    ----------
    data : list
        ``data[i]`` is the sample array of clip ``i``. The list and its arrays
        are left unmodified.
    srs : list
        ``srs[i]`` is the sampling rate of clip ``i``.
    f_names : list
        Source file names; only the stem is used for the output name.
    path : str
        Output directory prefix, concatenated directly with the file name
        (must end with a path separator).

    Returns
    -------
    None
    """
    def scale_minmax(x_audio, lo=0.0, hi=1.0):
        """Linearly rescale ``x_audio`` so its values span ``[lo, hi]``."""
        x_min = x_audio.min()
        span = x_audio.max() - x_min
        if span == 0:
            # A constant input (e.g. a silent clip) would otherwise divide by
            # zero and produce NaNs; map it to the lower bound instead.
            return np.full_like(x_audio, lo, dtype=float)
        return (x_audio - x_min) / span * (hi - lo) + lo

    hop_length = 512   # samples between successive spectrogram frames
    time_steps = 2550  # number of hops kept per clip (sets the image width)
    n_mels = 128       # number of mel bands (image height)
    max_samples = time_steps * hop_length

    for i, f_name in enumerate(f_names):
        # Work on a truncated view rather than writing it back into `data`,
        # so the caller's list is not altered as a side effect.
        clip = data[i][:max_samples]
        mels = librosa.feature.melspectrogram(y=clip, sr=srs[i],
                                              n_mels=n_mels,
                                              n_fft=hop_length * 2,
                                              hop_length=hop_length)
        mels = np.log(mels + 1e-9)  # add small number to avoid log(0)

        # min-max scale to fit inside 8-bit range
        img = scale_minmax(mels, 0, 255).astype(np.uint8)
        img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
        img = 255 - img  # invert. make black==more energy

        # save as PNG
        io.imsave(path + os.path.splitext(f_name)[0] + '.png', img)

        if i % 100 == 0:
            print('i=', i, '\t img shape:', img.shape)
    print("Finished! Images saved to", path)

## Set Path Variables


In [4]:
train_wav_path = '../res/wav/train/'
test_wav_path = '../res/wav/test/'
train_spectro_path = '../res/spectrogram/train/'
test_spectro_path = '../res/spectrogram/test/'


def _list_files(directory: str, extension: str) -> list:
    """Return the sorted names in ``directory`` ending with ``extension``.

    ``os.listdir`` returns entries in arbitrary order and includes every entry
    (hidden files, sub-directories). Sorting makes runs reproducible and keeps
    the wav and spectrogram lists in the same order; filtering keeps stray
    entries away from the audio/image loaders. The match is case-insensitive.
    """
    return sorted(name for name in os.listdir(directory)
                  if name.lower().endswith(extension))


train_wav_names = _list_files(train_wav_path, '.wav')
test_wav_names = _list_files(test_wav_path, '.wav')
train_spectro_names = _list_files(train_spectro_path, '.png')
test_spectro_names = _list_files(test_spectro_path, '.png')

# (rows, cols) of every spectrogram PNG: mel bands x time frames.
expected_spectro_shape = (128, 2551)
# Number of output units of the classifier's softmax layer.
num_classes = 6

print("num train wavs:", len(train_wav_names))
print("num test wavs:", len(test_wav_names))
print("num train spectros:", len(train_spectro_names))
print("num test spectros:", len(test_spectro_names))
print("expected_spectro_shape:", expected_spectro_shape)

num train wavs: 2400
num test wavs: 1202
num train spectros: 2400
num test spectros: 1200
expected_spectro_shape: (128, 2551)


## Read Training .wav Files and Save as Spectrograms

In [4]:
# Decode every training clip, then write one spectrogram PNG per clip.
print("Number of train .wav files in audio folder:", len(train_wav_names))
training_wav, training_srs = read_audio(f_names=train_wav_names, path=train_wav_path)

save_audio_as_spectrogram(
    data=training_wav,
    srs=training_srs,
    f_names=train_wav_names,
    path=train_spectro_path,
)

Number of train .wav files in audio folder: 2400
i= 0 	 num points: (1321967,) samp_rate: 44100
i= 100 	 num points: (1321967,) samp_rate: 44100
i= 200 	 num points: (1321967,) samp_rate: 44100
i= 300 	 num points: (1323119,) samp_rate: 44100
i= 400 	 num points: (1321967,) samp_rate: 44100
i= 500 	 num points: (1321967,) samp_rate: 44100
i= 600 	 num points: (1323119,) samp_rate: 44100
i= 700 	 num points: (1321967,) samp_rate: 44100
i= 800 	 num points: (1321967,) samp_rate: 44100
i= 900 	 num points: (1323119,) samp_rate: 44100
i= 1000 	 num points: (1321967,) samp_rate: 44100
i= 1100 	 num points: (1321967,) samp_rate: 44100
i= 1200 	 num points: (1321967,) samp_rate: 44100
i= 1300 	 num points: (1323119,) samp_rate: 44100
i= 1400 	 num points: (1323119,) samp_rate: 44100
i= 1500 	 num points: (1439471,) samp_rate: 48000
i= 1600 	 num points: (1323119,) samp_rate: 44100
i= 1700 	 num points: (1321967,) samp_rate: 44100
i= 1800 	 num points: (1323119,) samp_rate: 44100
i= 1900 	 num

## Read Testing .wav Files and Save as Spectrograms

In [None]:
# Decode every test clip, then write one spectrogram PNG per clip.
# The message now says "test": it reports the size of the test file list.
print("Number of test .wav files in audio folder:", len(test_wav_names))
testing_wav, testing_srs = read_audio(test_wav_names, test_wav_path)

save_audio_as_spectrogram(testing_wav,
                          testing_srs,
                          test_wav_names,
                          test_spectro_path)

## Read Training & Testing Spectrogram PNGs

In [None]:
# Sanity check: number of entries listed for the test .wav directory.
print(len(test_wav_names))

In [5]:
training_x = read_spectrogram(train_spectro_path, train_spectro_names)
# Look the labels up from the SAME name list the images were loaded from.
# The wav and spectrogram lists come from two separate directory listings
# whose order is not guaranteed to match, so using the wav list here could
# pair images with the wrong labels. read_labels only uses the file stem, so
# the '.png' names work as-is.
training_labels = read_labels(train_spectro_names)
assert len(training_x) == len(training_labels), "images and labels are misaligned"
# testing_x = read_spectrogram(test_spectro_path, test_spectro_names)

# Hold out the last 20% of the clips for evaluation (no shuffling).
train_size = int(len(training_x) * .8)
train_set_x = training_x[:train_size]
train_set_y = training_labels[:train_size]

eval_set_x = training_x[train_size:]
eval_set_y = training_labels[train_size:]

Spectrogram from ../res/spectrogram/train/ read in! Shape is: (2400, 128, 2551)
Finished reading 2400 labels


# TF Model

In [6]:
def create_CNN(input_shape=None):
    """Build and compile the spectrogram classifier.

    Architecture, in order: one 5x5 convolution with 8 filters (ReLU), 3x3
    max-pooling with stride 1, dropout, flatten, batch normalisation, a
    50-unit ReLU dense layer, dropout, and a softmax layer with
    ``num_classes`` units (module-level constant).

    Parameters
    ----------
    input_shape : tuple, optional
        Shape of one input sample, forwarded to the first convolution layer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with sparse categorical cross-entropy loss, the
        Adadelta optimizer and the accuracy metric.
    """
    stack = [
        layers.Conv2D(filters=8, kernel_size=5, strides=1, activation='relu',
                      input_shape=input_shape),
        layers.MaxPool2D(pool_size=(3, 3), strides=1),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.BatchNormalization(),
        layers.Dense(50, activation='relu'),
        layers.Dropout(0.25),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = keras.models.Sequential(stack)
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'],
    )
    return model

## TF Training

In [None]:
# Add the single grey-scale channel axis where the active Keras image data
# format expects it. `keras.backend` is reached through the top-of-notebook
# `keras` import, so no extra import is needed in this cell.
img_rows = expected_spectro_shape[0]
img_cols = expected_spectro_shape[1]
if keras.backend.image_data_format() == 'channels_first':
    train_set_x = train_set_x.reshape(train_set_x.shape[0], 1, img_rows, img_cols)
    eval_set_x = eval_set_x.reshape(eval_set_x.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    train_set_x = train_set_x.reshape(train_set_x.shape[0], img_rows, img_cols, 1)
    eval_set_x = eval_set_x.reshape(eval_set_x.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

model = create_CNN(input_shape=input_shape)
# The comma after `train_set_y` was missing, which made this cell a SyntaxError.
model.fit(train_set_x, train_set_y,
          epochs=5,
          verbose=1,
          validation_data=(eval_set_x, eval_set_y))
# Scores are computed on the held-out evaluation split.
score = model.evaluate(eval_set_x, eval_set_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

## TF Test Predictions

## File Writing