In [18]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Load the dataset
background_dir = "dataset/background"
keyword_dir = "dataset/apsara"
background_files = [os.path.join(background_dir, f) for f in os.listdir(background_dir)]
keyword_files = [os.path.join(keyword_dir, f) for f in os.listdir(keyword_dir)]

# Extract features
def extract_features(file):
    audio, sr = librosa.load(file, sr=None, mono=True)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return mfccs.T

background_features = [extract_features(f) for f in background_files]
keyword_features = [extract_features(f) for f in keyword_files]

print(len(background_features + keyword_features))

# Pad or truncate MFCC feature vectors to a fixed length
max_len = max(len(seq) for seq in background_features + keyword_features)
print(max_len)
background_features = pad_sequences(background_features, maxlen=max_len, dtype="float32", padding="post", truncating="post", value=0.0)
keyword_features = pad_sequences(keyword_features, maxlen=max_len, dtype="float32", padding="post", truncating="post", value=0.0)


# Prepare the data
X = np.concatenate((background_features, keyword_features), axis=0)
y = np.concatenate((np.zeros(len(background_features)), np.ones(len(keyword_features))))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train_norm = (X_train - mean) / std
X_test_norm = (X_test - mean) / std

# Define the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=X_train_norm[0].shape),
    tf.keras.layers.Reshape(target_shape=(X_train_norm[0].shape[0], 1)),
    tf.keras.layers.Conv1D(filters=16, kernel_size=3, activation="relu"),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation="relu"),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(X_train_norm, y_train, validation_data=(X_test_norm, y_test), epochs=10)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_norm, y_test)
print(f"Test set accuracy: {accuracy}")

# Save the model
model.save("keyword_detection_model.h5")

200
173
(160, 173, 13) (40, 173, 13)
(160,) (40,)


ValueError: Exception encountered when calling layer "reshape_9" (type Reshape).

total size of new array must be unchanged, input_shape = [173, 13], output_shape = [173, 1]

Call arguments received:
  • inputs=tf.Tensor(shape=(None, 173, 13), dtype=float32)