# Improve the Model (Version 4)
 🔧 What We’ll Do:
- Make the model deeper: More Conv1D + BatchNorm + Dropout
- Use Bidirectional LSTM to better capture emotion flow
- Stratified split so all emotion classes are balanced

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Dropout, BatchNormalization
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

Load features

In [12]:
X = np.load("../../data/features/features_v4.npy") #data\features\features_v4.npy
y = np.load("../../data/features/labels_v4.npy")
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1902, 59, 173), y shape: (1902,)


Encode labels

In [13]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

Train-test split with stratify

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_categorical
)

Build better model

In [15]:
# Build the improved model
model = Sequential([
    Input(shape=(59, 173)),                              # ✅ Corrected input
    Conv1D(64, 5, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.3),
    
    Conv1D(128, 5, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.3),

    Bidirectional(LSTM(64)),
    Dropout(0.4),
    
    Dense(64, activation='relu'),
    Dropout(0.3),
    
    Dense(y_categorical.shape[1], activation='softmax')
])

In [16]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# # Optional: Callbacks
# callbacks = [
#     EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
#     ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
# ]

In [21]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    # callbacks=callbacks
)

Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.5550 - loss: 1.1849 - val_accuracy: 0.4672 - val_loss: 1.4415
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.5845 - loss: 1.1173 - val_accuracy: 0.4672 - val_loss: 1.4334
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step - accuracy: 0.5621 - loss: 1.1814 - val_accuracy: 0.4751 - val_loss: 1.4309
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.5760 - loss: 1.1575 - val_accuracy: 0.4751 - val_loss: 1.4335
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.5570 - loss: 1.1449 - val_accuracy: 0.4777 - val_loss: 1.4347
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.5621 - loss: 1.1743 - val_accuracy: 0.4829 - val_loss: 1.4455
Epoch 7/50
[1m48/48[0m [32m━━━━

In [22]:
model.save("emotion_model_v4.keras")

In [23]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test accuracy (v4):", test_acc)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5054 - loss: 1.4017
Test accuracy (v4): 0.4803149700164795


# Prediction

In [None]:
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import joblib  # assuming you saved your LabelEncoder
import os

from src.preprocessing.preprocess import extract_features_from_audio  
# ------------------ Step 1: Load Pretrained Model ------------------
model = tf.keras.models.load_model("emotion_model_v4.keras")

# ------------------ Step 2: Load Label Encoder ------------------
# Either load it if saved, or recreate and fit it again if not saved
# Option A: Load saved encoder
# encoder = joblib.load("label_encoder.pkl")

# Option B: Recreate and fit using original labels
original_labels = np.load("labels_v4.npy")
encoder = LabelEncoder()
encoder.fit(original_labels)

# ------------------ Step 3: Feature Extraction Function ------------------
def extract_features(file_path, max_pad_len=173):
    y, sr = librosa.load(file_path, sr=22050)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    
    # Padding or truncating to fixed length
    if mfccs.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]
    
    return mfccs

# ------------------ Step 4: Load and Preprocess New Sample ------------------
file_path = "sample.wav"  # replace with your new file
mfcc_features = extract_features_from_audio(file_path)  # shape: (40, 173)
mfcc_features = np.expand_dims(mfcc_features, axis=0)  # shape: (1, 40, 173)

# ------------------ Step 5: Predict ------------------
prediction = model.predict(mfcc_features)
predicted_label_index = np.argmax(prediction)
predicted_emotion = encoder.inverse_transform([predicted_label_index])[0]

print("Predicted Emotion:", predicted_emotion)
