In [1]:
!git clone https://github.com/suessmeister/coe379L_final_project.git


fatal: destination path 'coe379L_final_project' already exists and is not an empty directory.


In [2]:
!cd coe379L_final_project && git pull origin main

[33mhint: Pulling without specifying how to reconcile divergent branches is[m
[33mhint: discouraged. You can squelch this message by running one of the following[m
[33mhint: commands sometime before your next pull:[m
[33mhint: [m
[33mhint:   git config pull.rebase false  # merge (the default strategy)[m
[33mhint:   git config pull.rebase true   # rebase[m
[33mhint:   git config pull.ff only       # fast-forward only[m
[33mhint: [m
[33mhint: You can replace "git config" with "git config --global" to set a default[m
[33mhint: preference for all repositories. You can also pass --rebase, --no-rebase,[m
[33mhint: or --ff-only on the command line to override the configured default per[m
[33mhint: invocation.[m
From https://github.com/suessmeister/coe379L_final_project
 * branch            main       -> FETCH_HEAD
Already up to date.


In [3]:
pip install librosa

[0mNote: you may need to restart the kernel to use updated packages.


In [13]:
import os
import numpy as np
import librosa
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [14]:
folders = ["hey", "hi", "hello"]
base_path = "coe379L_final_project/data"
sample_rate = 16000
n_mfcc = 13

In [15]:
def extract_mfcc_mean(file_path, n_mfcc=n_mfcc, sr=sample_rate):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfcc.T, axis=0)

In [16]:
X = []
y = []

for label in folders:
    folder_path = os.path.join(base_path, label)
    if not os.path.exists(folder_path):
        print(f"Warning: Folder missing -> {folder_path}")
        continue

    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            features = extract_mfcc_mean(file_path)
            X.append(features)
            y.append(label)

X = np.array(X)


In [17]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)
# Save the encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.3, random_state=42
)

model = Sequential([
    Dense(32, activation="relu", input_shape=(n_mfcc,)),
    Dense(32, activation="relu"),
    Dense(len(folders), activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=30, batch_size=4, validation_data=(X_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7d998c4bc850>

In [19]:
loss, acc = model.evaluate(X_test, y_test)
print(f"\nFinal Test Accuracy: {acc:.4f}")
model.save("artifacts/dense_mfcc_model.h5")
print("\nSaved model as dense_mfcc_model.h5")


Final Test Accuracy: 1.0000

Saved model as dense_mfcc_model.h5


  saving_api.save_model(


## The next step will be to test on an unseen dataset. I will use a different microphone as well as the same microphone and compare MFCC results.

In [20]:
def validate(valid_path):
    for label in folders:
        folder_path = os.path.join(valid_path, label)
        if not os.path.exists(folder_path):
            print(f"Missing validation folder: {folder_path}")
            continue
    
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):
                file_path = os.path.join(folder_path, file)
    
                features = extract_mfcc_mean(file_path)
                features = np.expand_dims(features, axis=0)
    
                pred = model.predict(features)
                pred_idx = np.argmax(pred)
                predicted_label = le.inverse_transform([pred_idx])[0]
    
                print(f"{file} ({label}) -> Predicted: {predicted_label}")
    
                y_true.append(label)
                y_pred.append(predicted_label)

    print("\nCONFUSION MATRIX:")
    print(confusion_matrix(y_true, y_pred, labels=folders))
    
    print("\nCLASSIFICATION REPORT:")
    print(classification_report(y_true, y_pred, labels=folders))

In [27]:
from tensorflow.keras.models import load_model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

valid_path_1 = "coe379L_final_project/valid_data_same_speaker" # same mic array.
valid_path_2 = "coe379L_final_project/valid_data" # different mic array with added background noise.  
valid_paths = [valid_path_1, valid_path_2]

model = load_model("dense_mfcc_model.h5")
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)



In [28]:
y_true = [] 
y_pred = []

for valid_path in valid_paths:
    validate(valid_path)


vh2.wav (hey) -> Predicted: hey
vh3.wav (hey) -> Predicted: hey
vh4.wav (hey) -> Predicted: hey
vh5.wav (hey) -> Predicted: hey
vh1.wav (hey) -> Predicted: hey
vi2.wav (hi) -> Predicted: hello
vi4.wav (hi) -> Predicted: hi
vi3.wav (hi) -> Predicted: hi
vi5.wav (hi) -> Predicted: hello
vi1.wav (hi) -> Predicted: hey
vj3.wav (hello) -> Predicted: hi
vj4.wav (hello) -> Predicted: hey
vj1.wav (hello) -> Predicted: hello
vj5.wav (hello) -> Predicted: hello
vj2.wav (hello) -> Predicted: hello

CONFUSION MATRIX:
[[5 0 0]
 [1 2 2]
 [1 1 3]]

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         hey       0.71      1.00      0.83         5
          hi       0.67      0.40      0.50         5
       hello       0.60      0.60      0.60         5

    accuracy                           0.67        15
   macro avg       0.66      0.67      0.64        15
weighted avg       0.66      0.67      0.64        15

vh2.wav (hey) -> Predicted: hey
vh3.wav (hey) -> Predict

## Now, let's put this on our STM32 chip. We'll export some of the same test samples to demonstrate this workflow.


In [29]:
# Match the training data
n_mfcc = 13
sample_rate = 16000

def get_C_array(file_path):
    # Load audio
    audio, sr = librosa.load(file_path, sr=sample_rate)
    
    # Extract MFCC, just like we did in the training
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    
    # Average over time 
    mfcc_mean = np.mean(mfcc, axis=1)
    
    print("MFCC vector (13 floats):")
    print(mfcc_mean)
    
    # Print in C array format for STM32
    print("\nHere is the C array for this sample")
    c_array = ", ".join([f"{x:.4f}" for x in mfcc_mean]) # this line was generated by AI
    print(f"float test_mfcc[13] = {{ {c_array} }};")


In [30]:
hi1 = "coe379L_final_project/valid_data_same_speaker/hi/vi4.wav" # hi examples
hi2 = "coe379L_final_project/valid_data/hi/vi3.wav" 

hey1 = "coe379L_final_project/valid_data_same_speaker/hey/vh3.wav" # hey examples
hey2 = "coe379L_final_project/valid_data/hey/vh1.wav" 

hello1 = "coe379L_final_project/valid_data_same_speaker/hello/vj5.wav" # hello examples
hello2 = "coe379L_final_project/valid_data/hello/vj4.wav" 

paths = [hi1, hi2, hey1, hey2, hello1, hello2]

for path in paths:
    print(path)
    get_C_array(path)

coe379L_final_project/valid_data_same_speaker/hi/vi4.wav
MFCC vector (13 floats):
[-5.7831769e+02  4.3509212e+01 -1.4505867e+01  5.1716151e+00
 -1.4796649e+01 -2.1161077e+00 -4.4688196e+00 -4.5387645e+00
  1.6780006e+00  3.7075694e+00 -5.5144513e-01 -3.7968655e+00
 -4.8061695e+00]

Here is the C array for this sample
float test_mfcc[13] = { -578.3177, 43.5092, -14.5059, 5.1716, -14.7966, -2.1161, -4.4688, -4.5388, 1.6780, 3.7076, -0.5514, -3.7969, -4.8062 };
coe379L_final_project/valid_data/hi/vi3.wav
MFCC vector (13 floats):
[-567.71204    106.95062    -22.197748    -3.5886517    3.8074598
   20.865473    18.63842    -13.193054    -7.8210144   10.953506
   21.705854     2.9293861   -7.7166734]

Here is the C array for this sample
float test_mfcc[13] = { -567.7120, 106.9506, -22.1977, -3.5887, 3.8075, 20.8655, 18.6384, -13.1931, -7.8210, 10.9535, 21.7059, 2.9294, -7.7167 };
coe379L_final_project/valid_data_same_speaker/hey/vh3.wav
MFCC vector (13 floats):
[-5.6070874e+02  3.5677265e+01

In [None]:
folders = ["hey", "hi", "hello"]
n_mfcc = 13
sample_rate = 16000

valid_path = "coe379L_final_project/valid_data"

def extract_mfcc(file_path, n_mfcc=n_mfcc, sr=sample_rate):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfcc.T, axis=0)

# Loop through each class folder
for label in folders:
    folder_path = os.path.join(valid_path, label)
    if not os.path.exists(folder_path):
        continue
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            mfcc_features = extract_mfcc(file_path)
            prediction = model.predict(np.expand_dims(mfcc_features, axis=0))
            predicted_label = folders[np.argmax(prediction)]
            print(f"{file} ({label}) -> Predicted: {predicted_label}")


## Alternative Implementation -- Not Working, But an Interesting NN approach nonetheless.


In [None]:
import os
import numpy as np
import librosa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


folders = ["hey", "hi", "hello"]
base_path = "coe379L_final_project/data"  # path to training data
sample_rate = 16000
n_mfcc = 13  
max_len = 32  # alternative approach -- do not do past 32 MFCC 

# an alternative approach to extracting MFCC based on fixed length 
def extract_mfcc(file_path, n_mfcc=n_mfcc, sr=sample_rate, max_len=max_len):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0,0),(0,pad_width)), mode='constant') # This line was generated by AI
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

X = []
y = []

for label in folders:
    folder_path = os.path.join(base_path, label)
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            mfcc = extract_mfcc(file_path)
            X.append(mfcc)
            y.append(label)

X = np.array(X)
X = X[..., np.newaxis]  # This line was generated by AI 

# encoding 
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)


n_samples, n_mfcc, n_time, n_channel = X.shape
X_reshaped = X.reshape((n_samples, n_mfcc*n_time)) # This line was generated by AI 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X = X_scaled.reshape((n_samples, n_mfcc, n_time, n_channel))


X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42
)


model = Sequential([
    Conv2D(16, (3,3), activation='relu', input_shape=(n_mfcc, max_len, 1)),
    MaxPooling2D((2,2)),
    Dropout(0.2),
    Conv2D(32, (3,3), activation='relu'),
    MaxPooling2D((2,2)),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(len(folders), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=30, batch_size=4, validation_data=(X_test, y_test))

# predictions now 
loss, acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {acc:.2f}")

In [31]:
import numpy as np
from tensorflow.keras.models import load_model
import pickle

# Load model and label encoder
model = load_model("dense_mfcc_model.h5")
# with open("label_encoder.pkl", "rb") as f:
#     le = pickle.load(f)

hi1_array = np.array([
    -578.3177, 43.5092, -14.5059, 5.1716, -14.7966, -2.1161, -4.4688,
    -4.5388, 1.6780, 3.7076, -0.5514, -3.7969, -4.8062
], dtype=np.float32)

hi1_array = hi1_array.reshape(1, -1)
pred = model.predict(hi1_array)
print("Raw model outputs:", pred)



Raw model outputs: [[0.2952013  0.04858567 0.6562131 ]]
