In [3]:
import os
import librosa
import math
import json
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras import models, layers
import tensorflow as tf
import matplotlib.pyplot as plt

DATASET_PATH = "data/training/"
JSON_PATH = "data.json"
SAMPLE_RATE = 22050
TRACK_DURATION = 5 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION


2023-07-27 18:22:40.817543: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):

    data = {
        "mapping": [],
        "mfcc": [],
        "labels": []
    }

    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length)

    # loop through all genre sub-folder
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        # ensure we're processing a genre sub-folder level
        if dirpath is not dataset_path:

            # save genre label (i.e., sub-folder name) in the mapping
            semantic_label = dirpath.split("/")[-1]
            data["mapping"].append(semantic_label)
            print("\nProcessing: {}".format(semantic_label))

            # process all audio files in genre sub-dir
            for f in filenames:

                # load audio file
                file_path = os.path.join(dirpath, f)
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

                # process all segments of audio file
                for d in range(num_segments):

                    # calculate start and finish sample for current segment
                    start = num_samples_per_segment * d
                    finish = start + num_samples_per_segment

                    # extract mfcc
                    mfcc = librosa.feature.mfcc(y=signal,
                                                sr=SAMPLE_RATE, n_mfcc=20, n_fft=2048, hop_length=512, n_mels=128)

                    mfcc = mfcc.T

                    # store only mfcc feature with expected number of vectors
                    if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(i-1)
                        print("{}, segment:{}".format(file_path, d+1))

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [4]:
save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)


Processing: MR

Processing: AS


KeyboardInterrupt: 

In [3]:
def load_data(data_path):

    with open(data_path, "r") as fp:
        data = json.load(fp)

    X = np.array(data["mfcc"])
    y = np.array(data["labels"])
    

    return X, y

In [4]:
def load_dataset(test_size, validation_size):
    # load data
    X, y = load_data(JSON_PATH)


    # create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # create train/validation split
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)

    # add an axis to nd array
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


In [5]:
def build_model(input_shape):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', data_format='channels_first'))
    model.add(layers.BatchNormalization())

    model.add(layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same', data_format='channels_first'))
    model.add(layers.BatchNormalization())

    model.add(layers.Conv2D(32, (2, 2), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same', data_format='channels_first'))
    model.add(layers.BatchNormalization())

    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(5, activation='softmax'))

    return model

In [8]:
# load data
X_train, X_validation, X_test, y_train, y_validation, y_test = load_dataset(0.3, 0.2)
print(X_train.shape)

input_shape = (X_train[1], X_train[2], X_train[3])
model = build_model(input_shape)

optimazer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimazer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_validation, y_validation))



(32, 130, 20, 1)


TypeError: only integer scalar arrays can be converted to a scalar index

In [5]:
# evaluate model on test set
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print("Accuracy on test set is: {}".format(test_accuracy))

# plot accuracy/error for training and validation
plt.figure(figsize=(8, 8))
plt.subplot(2, 1, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend(loc='lower right')
plt.ylabel('Accuracy')


NameError: name 'model' is not defined

In [9]:
print(X_test)

[[[[-3.94451538e+02]
   [ 9.50516586e+01]
   [ 6.77674103e+01]
   ...
   [ 6.12180328e+00]
   [ 4.62497520e+00]
   [ 3.09662485e+00]]

  [[-3.15396667e+02]
   [ 1.66267914e+02]
   [ 7.30263519e+01]
   ...
   [ 3.85163093e+00]
   [ 5.06497478e+00]
   [ 3.18533087e+00]]

  [[-3.00560974e+02]
   [ 1.77877533e+02]
   [ 7.02767944e+01]
   ...
   [ 6.01160240e+00]
   [ 5.99095535e+00]
   [ 4.07767105e+00]]

  ...

  [[-4.70819519e+02]
   [ 0.00000000e+00]
   [ 0.00000000e+00]
   ...
   [ 0.00000000e+00]
   [ 0.00000000e+00]
   [ 0.00000000e+00]]

  [[-4.70501190e+02]
   [ 4.46713448e-01]
   [ 4.36372697e-01]
   ...
   [-2.22656518e-01]
   [-2.67100215e-01]
   [-3.07194173e-01]]

  [[-4.67021759e+02]
   [ 5.30994987e+00]
   [ 5.12979126e+00]
   ...
   [-1.19331145e+00]
   [-1.25106215e+00]
   [-1.25720203e+00]]]


 [[[-4.50180328e+02]
   [ 6.64707489e+01]
   [ 3.63920898e+01]
   ...
   [-8.48662317e-01]
   [-3.64365458e+00]
   [-3.05392480e+00]]

  [[-4.26619263e+02]
   [ 9.25543823e+01]
   [

In [10]:
# test model
X = X_test[5]
y = y_test[5]

prediction = model.predict(X[np.newaxis, ...])
predicted_index = np.argmax(prediction, axis=1)

print("Expected index: {}, Predicted index: {}".format(y, predicted_index))

# save model
model.save("model_mfcc.h5")



Expected index: 1, Predicted index: [1]


In [11]:
# test with real data
audio = "data/training/MR/New_MR_103.wav"
signal, sr = librosa.load(audio)

# extract MFCCs
mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=20, n_fft=2048, hop_length=512, n_mels=128)
mfcc = mfcc.T

# load model
model = tf.keras.models.load_model("model_mfcc.h5")

# add an axis to input data for sample
mfcc = mfcc[np.newaxis, ...]

# get the predicted label
predictions = model.predict(mfcc)
predicted_index = np.argmax(predictions, axis=1)

print("Predicted keyword is: {}".format(predicted_index))





ValueError: in user code:

    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/engine/training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/engine/training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/engine/training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/engine/training.py", line 2079, in predict_step
        return self(x, training=False)
    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/mamet/anaconda3/envs/tf/lib/python3.10/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 130, 20, 1), found shape=(None, 96, 20)
