In [None]:
import math
import json
import librosa
import os
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Make the Drive data folder the working directory; relative paths used later
# (e.g. the saved model file) resolve against it.
%cd /content/drive/MyDrive/Data

Mounted at /content/drive
/content/drive/MyDrive/Data


In [None]:

SAMPLE_RATE = 22050
TRACK_DURATION = 30  # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION


def preprocess(dataset_path, num_mfcc=40, n_fft=2048, hop_length=512, num_segments=10,
               skip_files=("jazz.00054.wav",)):
    """Extract per-segment MFCC features from a folder-per-genre audio dataset.

    Every directory below ``dataset_path`` is treated as one genre. Each audio
    file is loaded at ``SAMPLE_RATE``, cut into ``num_segments`` equal slices of
    ``SAMPLES_PER_TRACK / num_segments`` samples, and an MFCC matrix is computed
    for each slice. Slices that do not yield the expected number of MFCC frames
    (for example because the track is shorter than ``TRACK_DURATION``) are
    dropped so that all stored matrices have the same shape.

    :param dataset_path: Root folder (str or path-like) containing one sub-folder per genre
    :param num_mfcc (int): Number of MFCC coefficients per frame
    :param n_fft (int): FFT window size passed to librosa
    :param hop_length (int): Hop length, in samples, passed to librosa
    :param num_segments (int): Number of slices each track is divided into
    :param skip_files: File names (not full paths) to ignore, e.g. known-corrupt
        recordings; ``None`` or an empty iterable skips nothing
    :return: dict with keys ``"mapping"`` (genre names), ``"labels"`` (index into
        ``"mapping"`` for each stored slice) and ``"mfcc"`` (one nested list of
        shape (frames, num_mfcc) per stored slice)
    """
    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # Normalise once so the root can be recognised by value; the previous
    # identity check (`is not`) only worked when a plain str was passed in.
    root = os.path.normpath(os.fspath(dataset_path))
    ignored_names = set(skip_files or ())

    # loop through all genre sub-folders
    for dirpath, dirnames, filenames in os.walk(root):

        # os.walk order is otherwise arbitrary; sorting in place makes the
        # traversal, and therefore the label <-> genre mapping, reproducible.
        dirnames.sort()

        # the dataset root itself is not a genre
        if dirpath == root:
            continue

        # save genre label (i.e., sub-folder name) in the mapping
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        label = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        # process all audio files in genre sub-dir
        for f in sorted(filenames):

            # skip files that are known to be unreadable
            if f in ignored_names:
                continue

            # load audio file
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # process all segments of audio file
            for d in range(num_segments):

                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment

                # extract mfcc; transpose so rows are time frames
                mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate,
                                            n_mfcc=num_mfcc, n_fft=n_fft,
                                            hop_length=hop_length).T

                # store only mfcc feature with expected number of vectors
                if len(mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
    return data

In [None]:
dataset_path='/content/drive/My Drive/Data/genres_original'

# Fixed seed so the train/validation/test partition (and therefore every
# metric reported further down) is reproducible across kernel restarts.
SPLIT_SEED = 42

# Extract MFCC features, integer labels and the label -> genre-name mapping.
mfcc_data = preprocess(dataset_path)
x = np.array(mfcc_data["mfcc"])
y = np.array(mfcc_data["labels"])
z = np.array(mfcc_data["mapping"])

# NOTE(review): the split is done per segment, so slices of one track can end
# up in both the training and the test set; a track-level (grouped) split
# would give a less optimistic estimate.
# 75% / 25% train+val / test, then 80% / 20% train / val.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=SPLIT_SEED)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SPLIT_SEED)

# (time frames, MFCC coefficients) for a single sample, used to build the model.
input_shape = (x_train.shape[1], x_train.shape[2])


Processing: jazz

Processing: pop

Processing: rock

Processing: metal

Processing: blues

Processing: reggae

Processing: country

Processing: disco

Processing: classical

Processing: hiphop


In [None]:
import tensorflow as tf

# Stacked LSTM classifier: the first LSTM returns the full sequence so the
# second one can consume it, followed by a dense layer and a 10-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=input_shape, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])

In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Integer class labels -> sparse categorical cross-entropy.
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# Train with the held-out validation split monitored after every epoch.
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          batch_size=32,
          epochs=60,
          verbose=2)

# Persist the trained model (HDF5 format) in the current working directory.
model.save("GTZAN_LSTM.h5")



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 130, 64)           26880     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 10)                650       
                                                                 
Total params: 64714 (252.79 KB)
Trainable params: 64714 (252.79 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/60
188/188 - 38s - loss: 1.7043 - accuracy: 0.3837 - val_loss: 1.5403 - val_accuracy: 0.4419 - 38s/epoch - 202ms/step
Epoch 2/60
188/188 - 32s

  saving_api.save_model(


In [None]:

# Score the trained model on the held-out test split and report its accuracy.
test_loss, test_acc = model.evaluate(x=x_test, y=y_test, verbose=2)
print("\nTest accuracy: " + str(test_acc))


79/79 - 3s - loss: 0.8375 - accuracy: 0.8174 - 3s/epoch - 35ms/step

Test accuracy: 0.8173808455467224


In [None]:
def predict(model, X, y, mapping=None):
    """Predict a single sample using the trained model and print the result.

    :param model: Trained classifier exposing ``predict``
    :param X: Input data for one sample, without the batch dimension
    :param y (int): Target class index
    :param mapping: Optional sequence of class names indexed by class id;
        when omitted, the notebook-level ``z`` array is used
    """
    labels = z if mapping is None else mapping

    # add a leading batch dimension - model.predict() expects a batch of
    # samples, e.g. (130, 40) -> (1, 130, 40) with the default preprocessing
    batch = X[np.newaxis, ...]

    # perform prediction
    prediction = model.predict(batch)

    # get index with max value; the batch holds one sample, so take element 0
    # as a plain int instead of keeping a 1-element array
    predicted_index = int(np.argmax(prediction, axis=1)[0])

    # get mappings for target and predicted label
    target = labels[y]
    predicted = labels[predicted_index]

    print("Target: {}, Predicted label: {}".format(target, predicted))
# Pick one held-out example (features and its true label) from the test split.
X_to_predict, y_to_predict = x_test[199], y_test[199]

# Run the model on that example and print target vs. prediction.
predict(model, X_to_predict, y_to_predict)

Target: pop, Predicted label: ['pop']


In [None]:
from sklearn.metrics import f1_score, confusion_matrix

# Class predictions for the whole test set: most probable class per sample.
y_pred = model.predict(x_test).argmax(axis=1)

# Weighted F1: per-class F1 averaged with class support as weights.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("F1 Score:", f1)

# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

F1 Score: 0.8167335257591312
Confusion Matrix:
[[164   2  13   0  10   5   6   2  27   0]
 [  1 219   5   0   2   3   9  14   2   2]
 [  7   6 185  13   6   8  19  25   2   0]
 [  0   0  10 225   4   0   1   3   0   1]
 [  4   1   9   0 205   4   9   2   2   2]
 [  4   3   8   0   3 206  11  13   1   3]
 [  7   4  13   0  10   4 190   6   1   2]
 [  0   7  12   4   0   3   6 236   0   7]
 [  5   0   3   0   0   0   2   4 243   0]
 [  0  13   7   4   6  14   6  17   2 168]]
