## Model Development
A convolutional neural network (CNN) is implemented using Keras as an interface for TensorFlow. CNNs are artificial neural networks designed for image classification.

The genre labels were obtained from the metadata of each .mp3 file using the `mutagen` Python module.

In [5]:
import warnings;
warnings.filterwarnings('ignore');

%matplotlib inline
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display
import random
import warnings
import os
from PIL import Image
import pathlib
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf # this includes keras, keras.layers in TensorFlow 2.0 

## Melspec functions

In [6]:
def audio_file_to_mel_spectrogram(filepath, num_bins, hop_length):
    """
    Load an audio file and return its mel spectrogram on a decibel scale.

    Only the fixed-length window produced by ``create_image_window`` is
    analysed, not the whole track.

    :param filepath [string]: path of the audio file to load
    :param num_bins [int]: number of mel bands (passed as ``n_mels``)
    :param hop_length [int]: hop between frames in samples; the FFT size
        is twice this value

    :return mel_spect [numpy.ndarray]: mel power spectrogram converted to dB,
        referenced to its own maximum (``ref=np.max``)
    """
    # sr=None asks librosa to keep the file's own sampling rate; mono=True
    # requests a single channel.
    samples, sample_rate = librosa.load(filepath, sr=None, mono=True)

    # Restrict the analysis to the fixed-length excerpt used as image width.
    excerpt = create_image_window(samples, hop_length)

    fft_size = hop_length * 2
    power_spect = librosa.feature.melspectrogram(
        y=excerpt,
        sr=sample_rate,
        n_fft=fft_size,
        n_mels=num_bins,
        hop_length=hop_length,
    )

    # Power -> decibels, relative to the loudest bin of this spectrogram.
    return librosa.power_to_db(power_spect, ref=np.max)

def create_image_window(y, hop_length, time_steps=384, start_sample=0):
    """
    Cut a fixed-length window out of an audio signal.

    The window length is chosen so that a spectrogram computed with
    ``hop_length`` has roughly ``time_steps`` frames, i.e. ``time_steps``
    is the intended image width.

    :param y: sliceable sequence of audio samples (e.g. a 1-D numpy array)
    :param hop_length [int]: number of samples per spectrogram time-step
    :param time_steps [int]: number of time-steps to keep (image width);
        defaults to 384
    :param start_sample [int]: index of the first sample of the window;
        defaults to 0 (the beginning of the clip)
    :return: ``y[start_sample:start_sample + time_steps * hop_length]``.
        If the signal is shorter than the requested window, the slice is
        simply shorter; no padding is applied.
    """
    # Total number of samples needed to cover the requested time-steps.
    length_samples = time_steps * hop_length

    return y[start_sample:start_sample + length_samples]

## Building the model

In [None]:
# Build the in-memory dataset: one mel spectrogram per MP3 file, labelled by
# the genre folder that contains it.
# Possible follow-up: image augmentation to create additional spectrograms
# (suggest trying with and without).
# Ref: https://www.kdnuggets.com/2020/02/audio-data-analysis-deep-learning-python-part-2.html
# Ref: https://keras.io/api/preprocessing/image/

MP3_DIR = '/home/atj64/Downloads/fma_small/'
IMG_DIR = '/home/atj64/Downloads/fma_melspecs/'

# List the genre folders by explicit path. os.chdir() returns None, so its
# result cannot be used as a directory name; the working directory is left
# untouched. Non-directory entries are ignored so stray files do not become
# genres.
genres = [
    entry for entry in os.listdir(MP3_DIR)
    if os.path.isdir(os.path.join(MP3_DIR, entry))
]

# Map each genre name to an integer class label (its position in the listing).
genre_dict = {name: label for label, name in enumerate(genres)}
print(genre_dict)

mel_specs = []
mel_genre = []
for genre in genres:
    print(genre)
    genre_path = os.path.join(MP3_DIR, genre)
    for song in os.listdir(genre_path):
        song_path = os.path.join(genre_path, song)
        try:
            spectrogram = audio_file_to_mel_spectrogram(song_path, num_bins=128, hop_length=512)
        except Exception as err:
            # Best effort: an unreadable file is reported and skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during this long loop.
            print("Skipping: " + genre + " " + song + " (" + repr(err) + ")")
            continue
        # Append features and label together so the two lists stay aligned.
        mel_specs.append(spectrogram)
        mel_genre.append(genre_dict[genre])
X = np.array(mel_specs)
y = mel_genre
        

{'Instrumental': 0, 'Hip-Hop': 1, 'Folk': 2, 'Pop': 3, 'Experimental': 4, 'Electronic': 5, 'International': 6, 'Rock': 7}
Instrumental
Hip-Hop
Folk
Pop
Experimental
Electronic
Skipping: Electronic 099134.mp3
International
Rock
Skipping: Rock 108925.mp3
Skipping: Rock 041965.mp3


In [None]:
# Hold out 20% of the samples for testing; stratifying on the labels keeps the
# genre proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)


# Disabled experiment: the directory-based image pipeline below is wrapped in a
# triple-quoted string, so it is evaluated as an unused string literal and none
# of it runs. In particular `dataset`, `train_datagen`, `val_datagen`,
# `training_set` and `val_set` are NOT defined by this cell.
# NOTE(review): before re-enabling, check `tf.keras.image_dataset_from_directory`
# and `label_mode="category"` against the installed Keras version -- the
# documented spellings appear to be
# `tf.keras.preprocessing.image_dataset_from_directory` and
# `label_mode="categorical"`; confirm.
# NOTE(review): the './training_data_fma/...' paths are relative, so they
# resolve against whatever the working directory is at that point -- confirm.
'''
dataset = tf.keras.image_dataset_from_directory(
    IMG_DIR,
    labels="inferred",
    label_mode="category",
    class_names=None,
    color_mode="rgb",
    batch_size=32,
    image_size=(256, 256),
    shuffle=True,
    seed=None,
    validation_split=None,
    subset=None,
    interpolation="bilinear",
    follow_links=False,
)

train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255, # normalize the dataset
    shear_range=0.2, # randomize some transformations
    zoom_range=0.2, # zoom
    horizontal_flip=True) # is this needed or helpful?
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Flow images from a directory
training_set = train_datagen.flow_from_directory(
    './training_data_fma/train',
    target_size=(64,64), # to just use default size, this would be None
    batch_size=47, # 32 is default, should evenly divide total number of files. 4606 files in train directory
    class_mode='categorical', # categorical - must include y_col column with classes of each image
    shuffle = False)

val_set = val_datagen.flow_from_directory(
    './training_data_fma/val',
    target_size=(64,64), # must be same size as target
    batch_size=47,
    class_mode='categorical',
    shuffle=False)
'''
# Create a CNN
# The network is declared as one ordered list of layers: three
# convolution -> average-pool -> ReLU stages, then a small dense classifier
# head with dropout, ending in an 8-way softmax.
# NOTE(review): the input is declared as 64x64 images with 3 channels, while X
# is built from mel spectrograms with num_bins=128 -- confirm which data is
# meant to feed this model.
input_shape = (64, 64, 3)  # the first layer needs the input shape up front
model = tf.keras.Sequential([
    # Stage 1: strided convolution (this is the layer that carries input_shape)
    tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), strides=(2, 2),
                           input_shape=input_shape),
    tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2)),
    tf.keras.layers.Activation('relu'),
    # Stage 2
    tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), padding="same"),
    tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2)),
    tf.keras.layers.Activation('relu'),
    # Stage 3
    tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), padding="same"),
    tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2)),
    tf.keras.layers.Activation('relu'),
    # Classifier head: flatten, regularise with dropout, one hidden dense layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(units=64),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(rate=0.5),
    # Output: one unit per class, softmax for multi-class probabilities
    tf.keras.layers.Dense(units=8),
    tf.keras.layers.Activation('softmax'),
])
model.summary()


In [None]:
# Compile the model for training with stochastic gradient descent (SGD)
# Ref 2: https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/SGD
# Hyper-parameters follow the tutorial defaults. Within this cell `epochs` is
# only used to derive the learning-rate decay.
# NOTE(review): the fit call elsewhere in this notebook hard-codes its own
# epoch count, and `batch_size` is not used by any code visible here -- confirm
# the intended values.
epochs = 200
batch_size = 8
learning_rate = 0.01
decay_rate = learning_rate / epochs
momentum = 0.9
# `learning_rate=` is the supported keyword (`lr=` is a deprecated alias).
# NOTE(review): the `decay` keyword is only accepted by older/legacy Keras
# optimizers -- confirm against the installed TensorFlow version.
sgd = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
# Pass the configured optimizer object. The string "sgd" would build a fresh
# default SGD and silently ignore the settings above.
model.compile(optimizer=sgd, loss="categorical_crossentropy", metrics=['accuracy'])

In [None]:
# fit the model
# time it
# Model.fit accepts generators / keras Sequence objects directly, so it is
# used in place of the deprecated Model.fit_generator.
# NOTE(review): in the code visible here `training_set` and `val_set` are only
# assigned inside a disabled triple-quoted block, so they must be defined
# before this cell can run -- confirm where they come from.
model.fit(
    training_set,
    steps_per_epoch=90,
    epochs=50,
    validation_data=val_set,
    validation_steps=200)