In [1]:
# Importing necessary libraries
import os
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [14]:
# Function to load raw audio data and extract features
def load_and_extract_features(audio_path, sample_rate=16000, target_length=16000):
    audio_binary = tf.io.read_file(audio_path)
    audio, _ = tf.audio.decode_wav(audio_binary)
    audio = tf.squeeze(audio, axis=-1)  # Remove the last dimension (single channel)

    # Ensure the audio has the target length
    audio_length = tf.shape(audio)[0]
    if audio_length < target_length:
        # If shorter, pad with zeros
        audio = tf.pad(audio, paddings=[[0, target_length - audio_length]])
    elif audio_length > target_length:
        # If longer, truncate
        audio = audio[:target_length]

    # Reshape to (target_length, 1) for a single channel
    audio = tf.reshape(audio, (target_length, 1))

    return audio.numpy()


# Function to prepare the dataset
def prepare_dataset(data_path):
    labels = []
    audio_data = []

    for folder in os.listdir(data_path):
        label = folder
        folder_path = os.path.join(data_path, folder)
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            feature = load_and_extract_features(file_path)
            audio_data.append(feature)
            labels.append(label)

    return np.array(audio_data), np.array(labels)


In [15]:
# Same data_path for both training dataset and sample audio directory
data_path = 'VCR/dataset/audio/speakers/' 
# Load and preprocess the dataset
audio_data, labels = prepare_dataset(data_path)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(audio_data, encoded_labels, test_size=0.2, random_state=42)

In [17]:
# Build a simple neural network
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(16000, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Reshape input data to fit the model
X_train = X_train.reshape(X_train.shape[0], 16000, 1)
X_test = X_test.reshape(X_test.shape[0], 16000, 1)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=40, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x124af5db550>

In [18]:
# Evaluate the model
evaluation = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {evaluation[1] * 100:.2f}%")

Test Accuracy: 23.66%


In [20]:
# Demonstrate functionality by predicting a sample voice command
for folder in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder)
    if os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            if filename.endswith('.wav'):
                sample_audio_path = os.path.join(folder_path, filename)
                
                # Perform prediction for each file
                sample_input_data = load_and_extract_features(sample_audio_path)
                sample_input_data = sample_input_data.reshape(1, 16000, 1)

                # Make a prediction
                predicted_class_index = np.argmax(model.predict(sample_input_data))
                predicted_class_label = label_encoder.classes_[predicted_class_index]

                print(f"File: {filename}, Predicted Class: {predicted_class_label}")
                break
        break


File: 256b2060-4479-11e9-a9a5-5dbec3b8816a.wav, Predicted Class: 2BqVo8kVB2Skwgyb


In [21]:
# Measure accuracy using the test set
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)
accuracy = np.sum(y_pred == y_true) / len(y_true)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")



AxisError: axis 1 is out of bounds for array of dimension 1