In [48]:
# set the matplotlib backend so figures can be saved in the background
#TODO: cleanup, lint
import matplotlib
matplotlib.use("Agg")
 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plot
import librosa
import keras
from keras.layers import Activation, Dense, Dropout, Conv2D, \
                         Flatten, MaxPooling2D
    
# import the necessary packages
from musicrec.vgg import VGGNet
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD
from keras import regularizers
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import argparse
import random
from datetime import datetime
import pickle
import cv2
import os
import sys
import tensorflow as tf
import pickle
import json

from keras.models import Sequential

import warnings

#env parameters
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [49]:
# Paramters
data_folder = Path("../../../audio/testfiles/GTZAN/genres/")
output_root = './../../../models/'
output_folder = Path('./output/cvnn.model')
output_model = output_root + '/cnn_dong_model_weights.h5'
output_architecture = output_root + '/cnn_dong_model_architecture.json'
output_whole = output_root + 'cnn_dong_model_whole.h5'
output_label = output_root + 'label.pkl'
output_test_paths = output_root + 'test_paths.pkl'
spectogram_folder = Path("./img_data/")
# Duration of songsnippet in seconds
duration = 2.97
# Matplotlib colormap for spectogram
spectogram_cmap = 'binary' 
# Predefined list of genres
pred_genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split() 
epochs = 120

# parameters
sr = 22050 # if sampling rate is different, resample it to this

# parameters for calculating spectrogram in mel scale
fmax = 1500 # maximum frequency considered
fft_window_points = 512
fft_window_dur = fft_window_points * 1.0 / sr # 23ms windows
hop_size = int(fft_window_points/ 2) # 50% overlap between consecutive frames
n_mels = 64

# segment duration
num_fft_windows = 256 # num fft windows per music segment
segment_in_points = num_fft_windows * 255 # number of data points that ensure the spectrogram has size: 64 * 256
segment_dur = segment_in_points * 1.0 / sr

num_genres=10
input_shape=(64, 256, 1)

randomseed = 11
#randomseed = datetime.now()
# Seed for RNG
random.seed(datetime.now())


In [50]:
#Get directories of all songs
songs = []
genres = []

spectograms = []

for g in data_folder.iterdir():
    genres.append(g.name)
    for i in g.iterdir():
        songs.append(i)

In [51]:
mel_specs = []  
labels = []

offset_1 = duration
offset_2 = duration*2

def load_specs(offset = 0):
    offset = duration*offset
    for song in songs:
        y, sr = librosa.load(song, mono=True, offset=offset, duration=duration)
        m_sp = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=fft_window_points,
                                              hop_length=hop_size, n_mels=n_mels,
                                              fmax=fmax)
        #if m_sp.shape != (128, 128): continue
        mel_specs.append([m_sp])
        label = song.parts[-2]
        labels.append(label)
        spectograms.append( (m_sp, label, song) )
        input_shape = m_sp.shape + (1,)

#data augmentation:
def get_songs_num_offsets(num = 1):
    for i  in range(num):
        load_specs(offset = i)
    return songs

#get_songs_num_offsets(num = 4)
load_specs(offset = 0)
#load_specs(offset = 1)
#load_specs(offset = 2)
#load_specs(offset = 3)
#load_specs(offset = 4)
#load_specs(offset = 5)
#load_specs(offset = 6)
#load_specs(offset = 7)
#load_specs(offset = 8)
#load_specs(offset = 9)
#spectograms = list(zip(mel_specs, labels))
print("Total number of samples: ", len(mel_specs))

#set dynamic input shape
input_shape = np.shape(mel_specs[2]) + (1,)
print("Input shape: " + str(input_shape))
np.shape(mel_specs)

Total number of samples:  1000
Input shape: (1, 64, 256, 1)


(1000, 1, 64, 256)

In [52]:
batch_size = input_shape[1] # use second dim as batch size, as it varies with duration

In [53]:
imagesize_x = np.shape(mel_specs)[1]
imagesize_y = np.shape(mel_specs)[2]
print("Spectogram dimensions: ", imagesize_x, "x", imagesize_y)

Spectogram dimensions:  1 x 64


In [54]:
# Shuffle the spectograms
random.shuffle(spectograms)
#spectograms[0][1]

In [66]:
#TODO: change
testsplit = len(mel_specs)*2/3
train = spectograms[:int(testsplit)]
test = spectograms[int(testsplit):]

X_train, y_train, p_train = zip(*train)
X_test, y_test, p_test = zip(*test)

# Reshape for CNN input
X_train = np.array([x.reshape(input_shape) for x in X_train])
X_test = np.array([x.reshape(input_shape) for x in X_test])

# One-Hot encoding for classes
lb = LabelBinarizer()
y_train_bin = lb.fit_transform(y_train)
y_test_bin = lb.transform(y_test)


with open(output_label, 'wb') as f:
    pickle.dump(lb, f)
    
with open(output_test_paths, 'wb') as f:
    pickle.dump(p_test, f)

"""
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.fit_transform(y_test)
#print(integer_encoded)

y_train_bin = np.array(keras.utils.to_categorical(y_train_int, 10))
y_test_bin = np.array(keras.utils.to_categorical(y_test_int, 10))
#print("Encoding of test data:\n", y_test, "=>\n", y_test_int, "=>\n", y_test_bin)
"""
0
np.shape(p_test)

(334,)


(334,)

In [62]:
np.shape(test)

(655, 3)

In [45]:
y_train = y_train_bin
y_test = y_test_bin

In [46]:
"""

# construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=30, width_shift_range=0.1, 
                         height_shift_range=0.1, shear_range=0.2, 
                         zoom_range=0.2,horizontal_flip=True, 
                         fill_mode="nearest")
 
# initialize our VGG-like Convolutional Neural Network
model = VGGNet.build(width=imagesize_x, height=imagesize_y, depth=1, 
                          classes=len(lb.classes_))
"""
0

0

In [None]:
#old model for urban
def cnn_urban_model_build():
    model = Sequential()

    model.add(Conv2D(24, (5, 5), strides=(1, 1), input_shape=input_shape))
    model.add(MaxPooling2D((4, 2), strides=(4, 2)))
    model.add(Activation('relu'))

    model.add(Conv2D(48, (5, 5), padding="valid"))
    model.add(MaxPooling2D((4, 2), strides=(4, 2)))
    model.add(Activation('relu'))

    model.add(Conv2D(48, (5, 5), padding="valid"))
    model.add(Activation('relu'))

    model.add(Flatten())
    model.add(Dropout(rate=0.5))

    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.5))

    model.add(Dense(10))
    model.add(Activation('softmax'))

    model.compile(
        optimizer="Adam",
        loss="categorical_crossentropy",
        metrics=['accuracy'])
    print(model.summary)
    return model

#model for GTZAN
def cnn_dong_model_build():

    model = Sequential()
    model.add(Conv2D(64, kernel_size=(3, 3),
                     activation='relu', kernel_regularizer=regularizers.l2(0.01),
                     input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 4)))

    model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 4)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
    model.add(Dropout(0.5))
    model.add(Dense(num_genres, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(decay=1e-6),
                  metrics=['accuracy'])
    print(model.summary)
    return model

In [None]:
model = cnn_dong_model_build()

In [None]:
H = model.fit(
	x=X_train, 
	y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data= (X_test, y_test))

score = model.evaluate(x=X_test,y=y_test)

print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(X_test, batch_size=batch_size)
print(classification_report(y_test_bin.argmax(axis=1),
                            predictions.argmax(axis=1), 
                            target_names=lb.classes_))
 
# plot the training loss and accuracy
N = np.arange(0, epochs)
plt.style.use("ggplot")
plt.figure()
"""
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.plot(N, H.history["acc"], label="train_acc")
plt.plot(N, H.history["val_acc"], label="val_acc")
plt.title("Training Loss and Accuracy (SmallVGGNet)")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()
"""
0

In [None]:
cm = confusion_matrix(y_test_bin.argmax(axis=1), predictions.argmax(axis=1))

#cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm

In [None]:
# Save the weights
#model.save_weights(output_model)

# Save the model architecture
#with open(output_architecture, 'w') as f:
#    f.write(model.to_json())
    
#model.save(output_whole)