In [None]:
import math
import json
import librosa
import os
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive
import tensorflow as tf
# Mount Google Drive into the Colab filesystem, then make the Data folder the
# working directory so that relative paths used later in the notebook resolve
# inside it.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Data

Mounted at /content/drive
/content/drive/MyDrive/Data


In [None]:

SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION


def preprocess(dataset_path, num_mfcc=40, n_fft=2048, hop_length=512, num_segments=10):
    """Extract per-segment MFCC features from a folder-per-genre audio dataset.

    Every sub-folder found below ``dataset_path`` is treated as one genre.
    Each audio file is loaded at ``SAMPLE_RATE``, cut into ``num_segments``
    equal slices, and an MFCC matrix is computed for every slice.

    :param dataset_path: Root folder of the dataset (str or path-like)
    :param num_mfcc: Number of MFCC coefficients per frame
    :param n_fft: FFT window size passed to librosa
    :param hop_length: Hop length (in samples) passed to librosa
    :param num_segments: Number of slices each track is divided into
    :return: dict with
        ``"mapping"`` - genre names, index == integer label,
        ``"labels"``  - one integer label per stored segment,
        ``"mfcc"``    - one (frames x num_mfcc) nested list per stored segment
    """
    # os.walk() yields plain strings; normalise path-like input so the root
    # folder can be recognised by value (the previous identity check only
    # worked when the very same str object came back from os.walk()).
    dataset_path = os.fspath(dataset_path)

    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # loop through all genre sub-folders
    for dirpath, dirnames, filenames in os.walk(dataset_path):

        # Sorting in place makes os.walk() descend in a fixed order, so the
        # genre -> label assignment is the same on every run/filesystem.
        dirnames.sort()

        # the dataset root itself is not a genre
        if dirpath == dataset_path:
            continue

        # save genre label (i.e., sub-folder name) in the mapping
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        # Derive the label from the mapping position rather than from the
        # walk counter, so it always indexes the entry just appended.
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        # process all audio files in genre sub-dir
        for f in sorted(filenames):

            file_path = os.path.join(dirpath, f)

            # Hard-coded exclusion kept from the original notebook
            # (presumably this file fails to load - TODO confirm).
            if file_path == '/content/drive/My Drive/Data/genres_original/jazz/jazz.00054.wav':
                continue

            # load audio file
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # process all segments of audio file
            for d in range(num_segments):

                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment

                # extract mfcc; transpose so rows are time frames
                mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
                mfcc = mfcc.T

                # store only mfcc feature with expected number of vectors
                # (segments cut short by a short track are dropped)
                if len(mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label_index)
    return data

In [None]:
# Build feature/label arrays from the GTZAN folder and split them into
# train / validation / test sets (60% / 15% / 25% of all segments).
# NOTE(review): the split is done per segment, so segments of the same track
# can end up in different splits - reported accuracy may be optimistic.
SPLIT_SEED = 42  # fixed seed so the splits are reproducible across runs

dataset_path='/content/drive/My Drive/Data/genres_original'
mfcc_data=preprocess(dataset_path)
x=np.array(mfcc_data["mfcc"])
y=np.array(mfcc_data["labels"])
z=np.array(mfcc_data["mapping"])  # genre names, indexed by integer label

# add a trailing channel axis: (segments, frames, mfcc) -> (segments, frames, mfcc, 1)
x=x.reshape(x.shape[0],x.shape[1],x.shape[2],1)
# one-hot encode the integer labels
y=tf.keras.utils.to_categorical(y,num_classes=10)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=SPLIT_SEED)
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,test_size=0.2,random_state=SPLIT_SEED)

# The former `y_*[y_* == 10] = 9` fix-ups were removed: after one-hot encoding
# the label arrays only contain 0 and 1, so those assignments never matched.
input_shape=x_train.shape[1:]


Processing: jazz

Processing: pop

Processing: rock

Processing: metal

Processing: blues

Processing: reggae

Processing: country

Processing: disco

Processing: classical

Processing: hiphop


In [None]:
from tensorflow.keras import models, layers

# CNN genre classifier, assembled layer by layer:
# one 32-filter conv stage, two 128-filter conv stages with dropout,
# global average pooling, then a dense head ending in a 10-way softmax.
cnn_model = models.Sequential()

# stage 1: 32 filters, no dropout
cnn_model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='valid', input_shape=input_shape))
cnn_model.add(layers.MaxPooling2D(2, padding='same'))

# stages 2 and 3: identical 128-filter blocks followed by 30% dropout
for _ in range(2):
    cnn_model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='valid'))
    cnn_model.add(layers.MaxPooling2D(2, padding='same'))
    cnn_model.add(layers.Dropout(0.3))

# classifier head
cnn_model.add(layers.GlobalAveragePooling2D())
cnn_model.add(layers.Dense(512, activation='relu'))
cnn_model.add(layers.Dense(10, activation='softmax'))


In [None]:
# The network ends in a 10-way softmax and is trained on one-hot labels, i.e.
# a single-label multi-class problem, so categorical cross-entropy is the
# matching loss (binary cross-entropy would score each output unit as an
# independent yes/no decision). `metrics` is passed as a list, as the compile
# API documents; the metric keeps the name 'acc' so history/log keys are unchanged.
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 38, 32)       320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 64, 19, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 62, 17, 128)       36992     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 31, 9, 128)        0         
 g2D)                                                            
                                                                 
 dropout (Dropout)           (None, 31, 9, 128)        0         
                                                                 
 conv2d_2 (Conv2D)           (None, 29, 7, 128)        1

In [None]:

# Train for 40 epochs in mini-batches of 32, monitoring the validation split
# after every epoch; verbose=2 prints one summary line per epoch.
history = cnn_model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=40,
    validation_data=(x_val, y_val),
    verbose=2,
)



Epoch 1/40
188/188 - 99s - loss: 0.2655 - acc: 0.3597 - val_loss: 0.2131 - val_acc: 0.4973 - 99s/epoch - 527ms/step
Epoch 2/40
188/188 - 90s - loss: 0.2018 - acc: 0.5358 - val_loss: 0.1817 - val_acc: 0.6048 - 90s/epoch - 479ms/step
Epoch 3/40
188/188 - 91s - loss: 0.1774 - acc: 0.6116 - val_loss: 0.1876 - val_acc: 0.5794 - 91s/epoch - 484ms/step
Epoch 4/40
188/188 - 90s - loss: 0.1630 - acc: 0.6535 - val_loss: 0.1517 - val_acc: 0.6809 - 90s/epoch - 481ms/step
Epoch 5/40
188/188 - 92s - loss: 0.1496 - acc: 0.6832 - val_loss: 0.1599 - val_acc: 0.6769 - 92s/epoch - 487ms/step
Epoch 6/40
188/188 - 92s - loss: 0.1410 - acc: 0.6984 - val_loss: 0.1426 - val_acc: 0.7196 - 92s/epoch - 487ms/step
Epoch 7/40
188/188 - 96s - loss: 0.1313 - acc: 0.7301 - val_loss: 0.1262 - val_acc: 0.7457 - 96s/epoch - 513ms/step
Epoch 8/40
188/188 - 97s - loss: 0.1217 - acc: 0.7568 - val_loss: 0.1255 - val_acc: 0.7623 - 97s/epoch - 514ms/step
Epoch 9/40
188/188 - 93s - loss: 0.1100 - acc: 0.7788 - val_loss: 0.1125

In [None]:
# Score the trained model on the held-out test split; evaluate() yields the
# loss followed by the compiled accuracy metric.
test_loss, test_acc = cnn_model.evaluate(
    x=x_test,
    y=y_test,
    verbose=2,
)
print("\nTest accuracy:", test_acc)


79/79 - 8s - loss: 0.0794 - acc: 0.8915 - 8s/epoch - 103ms/step

Test accuracy: 0.8914697766304016


In [None]:
def predict(model, X, y, mapping=None):
    """Predict a single sample using the trained model and print the result.

    :param model: Trained classifier exposing ``predict(batch)``
    :param X: Input data for ONE sample (no batch axis)
    :param y: One-hot encoded target label for that sample
    :param mapping: Optional sequence of label names indexed by class id.
        When omitted, the notebook-level genre array ``z`` is used, so
        existing three-argument calls behave exactly as before.
    """
    # Resolve the label names explicitly instead of silently relying on a
    # global; the fallback keeps the original behaviour.
    if mapping is None:
        mapping = z

    # add a batch dimension - model.predict() expects a 4d array in this case
    batch = X[np.newaxis, ...]  # e.g. (130, 40, 1) -> (1, 130, 40, 1)

    # perform prediction
    prediction = model.predict(batch)

    # index of the highest score for the single sample in the batch
    predicted_index = int(np.argmax(prediction, axis=1)[0])

    # convert the one-hot encoded label back to the original integer label
    original_label = int(np.argmax(y))

    # get mappings for target and predicted label
    target = mapping[original_label]
    predicted = mapping[predicted_index]

    print("Target: {}, Predicted label: {}".format(target, predicted))

# Spot-check the classifier on a single held-out test sample.
sample_index = 100
X_to_predict, y_to_predict = x_test[sample_index], y_test[sample_index]

# predict sample
predict(cnn_model, X_to_predict, y_to_predict)

Target: hiphop, Predicted label: hiphop


In [None]:
# Persist the trained model under a relative file name, i.e. in whatever the
# current working directory is when this cell runs.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5 format,
# which would explain the truncated save_model warning in the output - confirm
# before switching formats, since any loader depends on this file name.
cnn_model.save('CNN_GTZAN.h5')

  saving_api.save_model(
