In [None]:
# Imports
import os
import librosa
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.python.keras import utils
import tensorflow as tf
from keras.utils import to_categorical
from numba import cuda
# Bind the first CUDA device (index 0) through numba and then immediately close
# the CUDA context(s) again.
# NOTE(review): presumably this is here to release GPU memory left over from an
# earlier run before TensorFlow allocates its own -- confirm the intent.
# NOTE(review): this probably fails on a machine without a CUDA-capable GPU -- verify
# before running the notebook on CPU-only hardware.
cuda.select_device(0)
cuda.close()

In [2]:
def extract_mel_spectrogram(directory, n_frames=660):
    """Build a mel-spectrogram dataset from every audio file in ``directory``.

    Each file is loaded with librosa, converted to a mel spectrogram
    (``n_fft=2048``, ``hop_length=1024``), converted to decibels relative to
    the clip's own peak, and then truncated or right-padded along the time
    axis so that every spectrogram has exactly ``n_frames`` columns.

    The genre label is the part of the file name before the first ``.``
    (e.g. ``jazz.00001.wav`` -> ``jazz``) and is encoded with the fixed
    mapping below. A name whose prefix is not in the mapping yields ``NaN``
    in ``y`` (pandas ``map`` semantics), exactly as before.

    Parameters
    ----------
    directory : str or os.PathLike
        Folder that directly contains the audio files.
    n_frames : int, optional
        Number of time frames every spectrogram is fitted to (default 660,
        the value the original code targeted).
        NOTE(review): with hop_length=1024 a 30 s clip at librosa's default
        22050 Hz gives roughly 647 frames, so most clips would be padded --
        confirm that 660 is the intended width.

    Returns
    -------
    X : numpy.ndarray
        Stacked spectrograms, one per file, ordered by file name.
    y : numpy.ndarray
        Integer genre codes aligned with ``X``.
    """
    label_dict = {
        'jazz': 0,
        'reggae': 1,
        'rock': 2,
        'blues': 3,
        'hiphop': 4,
        'country': 5,
        'metal': 6,
        'classical': 7,
        'disco': 8,
        'pop': 9
    }

    # The context manager closes the directory handle. Sorting by name makes
    # the sample order (and therefore any seeded split) independent of the
    # filesystem's listing order.
    with os.scandir(directory) as entries:
        audio_files = sorted((entry for entry in entries if entry.is_file()),
                             key=lambda entry: entry.name)

    labels = []
    mel_specs = []

    for entry in audio_files:
        y, sr = librosa.load(entry.path)

        # Genre is the file-name prefix; use DirEntry.name instead of slicing
        # the repr of the DirEntry object.
        labels.append(entry.name.split('.')[0])

        # Mel spectrogram in decibels, referenced to the clip's peak power.
        spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024)
        spec = librosa.power_to_db(spec, ref=np.max)

        mel_specs.append(_fit_time_axis(spec, n_frames))

    X = np.array(mel_specs)
    y = pd.Series(labels, dtype=object).map(label_dict).values

    # Returning the mel spectrograms and labels
    return X, y


def _fit_time_axis(spec, n_frames):
    """Truncate or right-pad ``spec`` along axis 1 to exactly ``n_frames`` columns."""
    current = spec.shape[1]
    if current >= n_frames:
        return spec[:, :n_frames]
    # Pad with the clip's quietest level so the padding reads as silence.
    # ndarray.resize() must not be used here: it operates on the flattened
    # buffer and would shift every mel row relative to the others.
    fill = spec.min() if spec.size else 0.0
    return np.pad(spec, ((0, 0), (0, n_frames - current)),
                  mode='constant', constant_values=fill)

In [None]:

# Build the full dataset from the audio folder, then hold out 20% of it.
# The split is stratified on the label array and seeded for repeatability.
X, y = extract_mel_spectrogram('../data/data')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=42,
)

In [None]:

# This cell rebinds X_train / X_test / y_train / y_test in place, so running it
# twice would rescale already-scaled spectrograms and one-hot encode labels that
# are already one-hot. Fail loudly instead of silently corrupting the data.
if X_train.ndim != 3 or X_test.ndim != 3 or y_train.ndim != 1 or y_test.ndim != 1:
    raise RuntimeError('Data already preprocessed: re-run the train/test split cell first.')

# The spectrograms are in decibels relative to each clip's peak, so values are
# <= 0; dividing by -80 flips the sign and rescales them.
# NOTE(review): -80 presumably matches the floor of the dB values
# (power_to_db's default dynamic range) -- confirm with X_train.min().
X_train /= -80
X_test /= -80

# Add the trailing channel axis expected by Conv2D: (samples, 128, 660, 1).
X_train = X_train.reshape(X_train.shape[0], 128, 660, 1)
X_test = X_test.reshape(X_test.shape[0], 128, 660, 1)

# One-hot encode the 10 genre codes for categorical cross-entropy.
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

In [None]:

# Seed both NumPy and TensorFlow before any layer is created.
np.random.seed(23456)
tf.random.set_seed(123)

# Two conv/pool stages followed by a small dense head with a 10-way softmax.
# Input is a single-channel 128 x 660 spectrogram.
cnn_model = Sequential(
    [
        Conv2D(filters=16, kernel_size=(3,3), activation='relu', input_shape=(128,660,1)),
        MaxPooling2D(pool_size=(2,4)),
        Conv2D(filters=32, kernel_size=(3,3), activation='relu'),
        MaxPooling2D(pool_size=(2,4)),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.25),
        Dense(10, activation='softmax'),
    ],
    name='cnn_1',
)

cnn_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 20 epochs; the held-out split is passed as validation data so its
# loss and accuracy are recorded after every epoch.
history = cnn_model.fit(
    X_train,
    y_train,
    batch_size=16,
    validation_data=(X_test, y_test),
    epochs=20,
)

In [None]:

# Per-epoch loss values recorded by fit() for the training and validation data.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Epochs are numbered from 1. Plotting against this range (instead of the
# implicit 0-based list index) keeps every point on its own epoch tick, and
# deriving the length from the history avoids hard-coding the epoch count.
epochs = list(range(1, len(train_loss) + 1))

plt.figure(figsize=(15,6))

plt.plot(epochs, train_loss, label='Training Loss', color='blue')
plt.plot(epochs, test_loss, label='Testing Loss', color='red')

plt.title('Training and Testing Loss by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Categorical Crossentropy', fontsize = 18)
plt.xticks(epochs, epochs)

plt.legend(fontsize = 18);

In [None]:

# Per-epoch accuracy recorded by fit() for the training and validation data.
# Stored under their own names so the loss series from the previous cell are
# not overwritten with accuracy values.
train_accuracy = history.history['accuracy']
test_accuracy = history.history['val_accuracy']

# Epochs are numbered from 1. Plotting against this range (instead of the
# implicit 0-based list index) keeps every point on its own epoch tick, and
# deriving the length from the history avoids hard-coding the epoch count.
epochs = list(range(1, len(train_accuracy) + 1))

plt.figure(figsize=(15,6))

plt.plot(epochs, train_accuracy, label='Training Accuracy', color='blue')
plt.plot(epochs, test_accuracy, label='Testing Accuracy', color='red')

# Set title
plt.title('Training and Testing Accuracy by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Accuracy', fontsize = 18)
plt.xticks(epochs, epochs)

plt.legend(fontsize = 18);

In [None]:
# Score the held-out spectrograms with the trained network.
predictions = cnn_model.predict(X_test, verbose=1)

# First listing: how many held-out samples truly belong to each class id
# (y_test is one-hot, so a 1 in column k marks class k).
for class_id in range(10):
    actual_count = np.count_nonzero(y_test[:, class_id] == 1)
    print(f'{class_id}: {actual_count}')

# Second listing: how many samples the model assigned to each class id
# (the class with the highest softmax score wins).
predicted_classes = np.argmax(predictions, axis=1)
for class_id in range(10):
    predicted_count = np.count_nonzero(predicted_classes == class_id)
    print(f'{class_id}: {predicted_count}')

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))
conf_matrix

In [None]:
# Class id -> genre name, used to label both axes of the confusion matrix.
labels_dict = dict(enumerate([
    'jazz',
    'reggae',
    'rock',
    'blues',
    'hiphop',
    'country',
    'metal',
    'classical',
    'disco',
    'pop',
]))

# Wrap the matrix in a DataFrame whose columns and rows carry genre names.
confusion_df = pd.DataFrame(conf_matrix).rename(columns=labels_dict)
confusion_df.index = confusion_df.columns

# Annotated heatmap: one cell per (actual, predicted) genre pair.
plt.figure(figsize=(20, 12))
sns.set(font_scale=2);
ax = sns.heatmap(
    confusion_df,
    annot=True,
    cmap=sns.cubehelix_palette(50),
);
ax.set(ylabel='Actual Values', xlabel='Predicted Values');