In [None]:
!pip install -q numpy pandas librosa matplotlib seaborn scikit-learn keras

In [None]:
# -------------------------
# Import
# -------------------------
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, BatchNormalization, Dropout,
                                     GlobalAveragePooling1D, Dense, Multiply, Reshape)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow.keras.backend as K
import tensorflow as tf
# from keras_focal_loss import categorical_focal_loss

In [None]:
# -------------------------
# Focal Loss Function
# -------------------------
def categorical_focal_loss(gamma=2., alpha=.25):
    """Create a focal-loss callable for one-hot targets and probability outputs.

    Args:
        gamma: Exponent applied to ``1 - y_pred`` in the per-class weight.
        alpha: Constant factor multiplied into the per-class weight.

    Returns:
        A function ``focal_loss(y_true, y_pred)`` that returns the weighted
        cross-entropy summed over axis 1 (one value per sample).
    """
    def focal_loss(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1) so the log stays finite.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1. - eps)
        # Per-class weight: alpha * (1 - p) ** gamma.
        modulating = alpha * K.pow(1 - probs, gamma)
        return K.sum(modulating * (-y_true * K.log(probs)), axis=1)
    return focal_loss

In [None]:
# -------------------------
# Set random seeds for reproducibility
# -------------------------
SEED = 42  # one value shared by every generator seeded in this cell
np.random.seed(SEED)      # NumPy global generator
tf.random.set_seed(SEED)  # TensorFlow global seed

In [None]:
# -------------------------
# Colab Specific: Mount Google Drive
# -------------------------
# Colab-only step: `google.colab` is imported here rather than in the import cell.
print("Mounting Google Drive...")
from google.colab import drive
# Mount point is /content/drive; the ZIP paths used in the unzip cell live under it.
drive.mount('/content/drive')
print("Google Drive mounted.")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted.


In [None]:
# -------------------------
# Unzip Dataset
# -------------------------
ZIP_SPEECH_PATH = '/content/drive/My Drive/mars_project/Audio_Speech_Actors_01-24.zip'
ZIP_SONG_PATH = '/content/drive/My Drive/mars_project/Audio_Song_Actors_01-24.zip'
EXTRACT_DIR = '/content/extracted_audio_data'
os.makedirs(EXTRACT_DIR, exist_ok=True)

!unzip -q "{ZIP_SPEECH_PATH}" -d "{EXTRACT_DIR}"
!unzip -q "{ZIP_SONG_PATH}" -d "{EXTRACT_DIR}"

In [None]:
# -------------------------
# Optimized Configuration
# -------------------------
DATA_PATH = EXTRACT_DIR

# Emotion names in filename-code order: position 1 -> '01', position 2 -> '02', ...
# Code '08' ('surprised') is intentionally left out, so it has no entry in the mapping.
_EMOTION_NAMES = ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust')
emotion_dict = {f'{code:02d}': name for code, name in enumerate(_EMOTION_NAMES, start=1)}


In [None]:
# -------------------------
# Feature Extraction
# -------------------------
def extract_features(file_path, sr=22050, duration=3, offset=0.5, n_mfcc=60, target_length=130):
    """Load one audio file and return a fixed-length MFCC matrix.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load``.
        duration: Number of seconds to load.
        offset: Number of seconds to skip at the start of the file.
        n_mfcc: Number of MFCC coefficients to compute.
        target_length: Number of frames in the returned matrix; shorter clips
            are zero-padded at the end, longer clips are truncated.

    Returns:
        Array of shape ``(target_length, n_mfcc)`` (frames first).
    """
    y, sr = librosa.load(file_path, duration=duration, offset=offset, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    n_frames = mfcc.shape[1]
    if n_frames < target_length:
        # Pad only along the frame axis, after the existing frames.
        mfcc = np.pad(mfcc, ((0, 0), (0, target_length - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :target_length]
    # librosa returns (n_mfcc, frames); transpose so frames come first.
    return mfcc.T

In [None]:
# -------------------------
# Load Data
# -------------------------
features = []

def process_directory(path):
    """Append ``[mfcc_matrix, emotion_label]`` for every usable clip under ``path``.

    Only ``.wav`` files located in the immediate sub-folders of ``path`` are
    read. The third ``-``-separated field of the file name is looked up in
    ``emotion_dict``; files whose code is not in the mapping, or whose name has
    fewer than three fields, are skipped. Results are appended to the
    module-level ``features`` list.

    Args:
        path: Directory whose sub-folders contain the audio files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split) stable between runs.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if not file.endswith('.wav'):
                continue
            name_fields = file.split('-')
            if len(name_fields) < 3:
                continue  # no emotion field to read
            label = emotion_dict.get(name_fields[2])
            if label is not None:
                file_path = os.path.join(folder_path, file)
                features.append([extract_features(file_path), label])

# The speech and song archives were both extracted into EXTRACT_DIR, so a single
# pass covers both. Scanning the directory twice would append every clip twice
# and put identical samples on both sides of the train/test split.
print("Processing Speech and Song Data...")
process_directory(EXTRACT_DIR)

Processing Speech Data...
Processing Song Data...


In [None]:
# -------------------------
# Create DataFrame
# -------------------------
df = pd.DataFrame(features, columns=['feature', 'emotion'])
# Keep only the rows whose label is something other than "surprised"
df = df.loc[df['emotion'].ne('surprised')]

In [None]:
# -------------------------
# Encode 
# -------------------------
# Stack the per-clip MFCC matrices into one array and pull out the string labels
X = np.array(list(df['feature']))
y = np.array(df['emotion'].to_list())

# String labels -> integer ids -> one-hot rows
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)

In [None]:
# -------------------------
# Train Test Split
# -------------------------
# X, y_cat and the fitted `le` are produced by the "Encode" cell; they are not
# rebuilt here, so the label encoder is fitted exactly once.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, stratify=y_cat, random_state=42
)

# Integer class ids of the training rows as they are right after the split
# (a later cell extends y_train, so this is a snapshot, not a live view).
y_train_labels = np.argmax(y_train, axis=1)


In [None]:
# -------------------------
# Augment
# -------------------------
# Per-emotion augmentation settings: pitch-shift range in semitones (passed to
# np.random.randint, so the upper bound is exclusive), additive-noise scale,
# probability of applying a time stretch, and the time-stretch rate range.
_AUGMENT_SETTINGS = {
    'happy': {'steps': (-4, 5), 'noise': 0.01, 'stretch_prob': 0.7, 'rate': (0.85, 1.15)},
    'sad': {'steps': (-2, 3), 'noise': 0.005, 'stretch_prob': 0.5, 'rate': (0.9, 1.1)},
}


def augment_audio_feature(mfcc, label, sr=22050):
    """Return an augmented copy of one MFCC matrix.

    The matrix is turned back into a waveform with librosa's ``mfcc_to_audio``,
    perturbed (random pitch shift, additive Gaussian noise and, with some
    probability, a time stretch) when ``label`` is 'happy' or 'sad', and
    converted to MFCCs again. Other labels get no perturbation.

    Args:
        mfcc: Array of shape ``(frames, n_mfcc)``.
        label: Emotion name of the sample.
        sr: Sampling rate used for the reconstruction and the effects.

    Returns:
        Array with the same shape as ``mfcc`` (zero-padded or truncated along
        the frame axis).
    """
    n_frames, n_mfcc = mfcc.shape
    y = librosa.feature.inverse.mfcc_to_audio(mfcc.T, sr=sr)

    settings = _AUGMENT_SETTINGS.get(label)
    if settings is not None:
        # sr / n_steps / rate are passed by keyword: recent librosa releases
        # reject them as positional arguments.
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=np.random.randint(*settings['steps']))
        y += settings['noise'] * np.random.randn(len(y))
        if np.random.rand() < settings['stretch_prob']:
            y = librosa.effects.time_stretch(y, rate=np.random.uniform(*settings['rate']))

    mfcc_aug = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc_aug.shape[1] < n_frames:
        mfcc_aug = np.pad(mfcc_aug, ((0, 0), (0, n_frames - mfcc_aug.shape[1])), mode='constant')
    else:
        mfcc_aug = mfcc_aug[:, :n_frames]
    return mfcc_aug.T

# Apply augmentation to happy/sad in training data
X_augmented = []
y_augmented = []

for x, y_onehot in zip(X_train, y_train):
    label_idx = np.argmax(y_onehot)
    label_str = le.classes_[label_idx]
    if label_str in ('happy', 'sad'):
        X_augmented.append(augment_audio_feature(x, label_str))
        y_augmented.append(y_onehot)

# Append augmented data. Skipped when nothing was augmented, because an empty
# array cannot be concatenated with the 3-D training array.
# NOTE: this rebinds X_train / y_train, so re-running the cell augments the
# already-extended set; re-run the split cell first.
if X_augmented:
    X_train = np.concatenate([X_train, np.array(X_augmented)], axis=0)
    y_train = np.concatenate([y_train, np.array(y_augmented)], axis=0)


In [None]:
# -------------------------
# Scale
# -------------------------
# The model, training and evaluation cells all read X_train / X_test, and the
# fitted `scaler` is saved next to the model. To keep those consistent, the
# standardised arrays are bound back to X_train / X_test at the end of this cell.
if globals().get('X_train_scaled') is X_train:
    # X_train is already the output of this cell; fitting again would
    # standardise standardised data and overwrite `scaler` with wrong statistics.
    print("Features are already standardised - skipping.")
else:
    num_samples, time_steps, num_mfcc = X_train.shape
    scaler = StandardScaler()

    # StandardScaler works on 2-D input: treat every frame as one row so the
    # statistics are computed per MFCC coefficient.
    X_train_flat = X_train.reshape(num_samples * time_steps, num_mfcc)
    X_train_scaled = scaler.fit_transform(X_train_flat).reshape(num_samples, time_steps, num_mfcc)

    # The test set is transformed with the statistics fitted on the training set.
    X_test_flat = X_test.reshape(X_test.shape[0] * time_steps, num_mfcc)
    X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape[0], time_steps, num_mfcc)

    X_train, X_test = X_train_scaled, X_test_scaled


In [None]:
# -------------------------
# Compute Class Weights
# -------------------------
# Integer class id of every (possibly augmented) training row
train_class_ids = np.argmax(y_train, axis=1)
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_class_ids),
    y=train_class_ids,
)
# Position in the weight array -> weight, the mapping later passed to model.fit
class_weight_dict = {idx: weight for idx, weight in enumerate(class_weights)}

In [None]:
# -------------------------
# SE Block
# -------------------------
from tensorflow.keras.layers import GlobalAveragePooling1D, Multiply, Dense

def SE_Block(input_tensor, reduction_ratio=16):
    """Re-weight the channels of ``input_tensor`` with a learned per-channel gate.

    The tensor is average-pooled over its middle axis, passed through a
    bottleneck ``Dense`` (relu) and a ``Dense`` with one sigmoid output per
    channel, and the resulting gate is multiplied back onto the input.

    Args:
        input_tensor: Keras tensor whose last axis is the channel axis.
        reduction_ratio: Divisor applied to the channel count to size the
            bottleneck layer.

    Returns:
        Tensor with the same shape as ``input_tensor``.

    Raises:
        ValueError: If the channel dimension of ``input_tensor`` is unknown.
    """
    filters = input_tensor.shape[-1]
    if filters is None:
        raise ValueError("SE_Block requires a statically known channel dimension")
    # With fewer channels than reduction_ratio the integer division gives 0,
    # which is not a valid Dense width; keep at least one bottleneck unit.
    bottleneck_units = max(1, filters // reduction_ratio)
    se = GlobalAveragePooling1D()(input_tensor)
    se = Dense(bottleneck_units, activation='relu')(se)
    se = Dense(filters, activation='sigmoid')(se)
    # (batch, filters) -> (batch, 1, filters) so the gate broadcasts over the middle axis.
    se = Reshape((1, filters))(se)
    return Multiply()([input_tensor, se])

In [None]:
# -------------------------
# Final Model with SE Blocks
# -------------------------
num_classes = y_cat.shape[1]
input_shape = X_train.shape[1:]

inp = Input(shape=input_shape)

# Three identical stages that differ only in the number of convolution filters:
# Conv1D -> BatchNorm -> MaxPool -> Dropout -> SE block.
x = inp
for n_filters in (64, 128, 256):
    x = Conv1D(n_filters, kernel_size=5, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)
    x = SE_Block(x)

# Classification head
x = GlobalAveragePooling1D()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.4)(x)
out = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inp, outputs=out)
model.compile(
    loss=categorical_focal_loss(gamma=2),
    optimizer=Adam(1e-4),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# -------------------------
# Training
# -------------------------
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

# Both callbacks pick the best epoch / learning rate from the validation loss.
# Monitoring the test set would tune the model on the same data that is later
# reported as "test" performance, so a validation split is carved out of the
# training data instead and X_test / y_test stay untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.1,
    stratify=np.argmax(y_train, axis=1),
    random_state=42,
)

history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 79ms/step - accuracy: 0.3047 - loss: 0.3021 - val_accuracy: 0.4923 - val_loss: 0.2573 - learning_rate: 1.0000e-04
Epoch 2/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 76ms/step - accuracy: 0.5506 - loss: 0.1754 - val_accuracy: 0.6344 - val_loss: 0.1530 - learning_rate: 1.0000e-04
Epoch 3/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 74ms/step - accuracy: 0.6253 - loss: 0.1425 - val_accuracy: 0.7091 - val_loss: 0.1204 - learning_rate: 1.0000e-04
Epoch 4/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 77ms/step - accuracy: 0.6733 - loss: 0.1177 - val_accuracy: 0.7362 - val_loss: 0.1054 - learning_rate: 1.0000e-04
Epoch 5/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 80ms/step - accuracy: 0.6973 - loss: 0.1053 - val_accuracy: 0.7483 - val_loss: 0.0947 - learning_rate: 1.0000e-04
Epoch 6/100
[1m226/226[0m [32m━━

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

# Collapse model outputs and one-hot targets to integer class ids
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Fraction of test samples predicted correctly
overall_accuracy = accuracy_score(y_true_classes, y_pred_classes)
print(f"\n✅ Overall Accuracy: {overall_accuracy * 100:.2f}%")

# Macro-averaged F1 score
f1_macro = f1_score(y_true_classes, y_pred_classes, average='macro')
print(f"✅ F1 Score (Macro): {f1_macro * 100:.2f}%")

# Per-class accuracy: confusion-matrix diagonal divided by its row sums
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes)
class_wise_accuracy = np.diag(conf_matrix) / conf_matrix.sum(axis=1)
print("\n✅ Per-Class Accuracy:")
for class_name, class_acc in zip(le.classes_, class_wise_accuracy):
    print(f"{class_name}: {class_acc * 100:.2f}%")

# Precision / recall / F1 table, labelled with the encoder's class names
print("\nClassification Report:")
print(classification_report(y_true_classes, y_pred_classes, target_names=le.classes_))


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step

✅ Overall Accuracy: 92.20%
✅ F1 Score (Macro): 93.11%

✅ Per-Class Accuracy:
angry: 87.38%
calm: 98.67%
disgust: 100.00%
fearful: 100.00%
happy: 87.71%
neutral: 100.00%
sad: 79.40%

Classification Report:
              precision    recall  f1-score   support

       angry       1.00      0.87      0.93       301
        calm       0.96      0.99      0.97       301
     disgust       0.93      1.00      0.97       153
     fearful       0.76      1.00      0.86       301
       happy       0.95      0.88      0.91       301
     neutral       1.00      1.00      1.00       150
         sad       0.96      0.79      0.87       301

    accuracy                           0.92      1808
   macro avg       0.94      0.93      0.93      1808
weighted avg       0.93      0.92      0.92      1808



In [None]:
# evaluate() yields the loss followed by the single 'accuracy' metric the model was compiled with
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(test_acc * 100))

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.9309 - loss: 0.0188
Test Accuracy: 92.20%


In [None]:
# -------------------------
# Save Model and Scaler
# -------------------------
# NOTE(review): the model was compiled with a custom loss closure; loading this
# file probably needs `custom_objects` or `compile=False` - confirm in the loader.
model.save('emotion_model.h5')
import pickle
# Pickle the fitted preprocessing objects next to the model file
for pkl_name, artifact in (('scaler.pkl', scaler), ('label_encoder.pkl', le)):
    with open(pkl_name, 'wb') as f:
        pickle.dump(artifact, f)


