In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Constants
DATASET_PATH = '/kaggle/input/audio-emotions/Emotions'
SAMPLE_RATE = 22050
DURATION = 3
MAX_SAMPLES = SAMPLE_RATE * DURATION
TARGET_EMOTIONS = ['Angry', 'Happy', 'Suprised', 'Disgusted']

# Feature extraction
def extract_features(file_path):
    try:
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
        if len(audio) < MAX_SAMPLES:
            audio = np.pad(audio, (0, MAX_SAMPLES - len(audio)), 'constant')
        else:
            audio = audio[:MAX_SAMPLES]

        mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=13)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=SAMPLE_RATE)
        chroma = librosa.feature.chroma_stft(y=audio, sr=SAMPLE_RATE)
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        features = np.concatenate([mfccs, spectral_contrast, chroma, mel_spec_db], axis=0).T
        return features
    except Exception as e:
        print(f"Error extracting features from {file_path}: {e}")
        return None

# Prepare dataset with filtering
def prepare_dataset():
    features = []
    labels = []

    for emotion_folder in os.listdir(DATASET_PATH):
        if emotion_folder not in TARGET_EMOTIONS:
            continue
        
        emotion_path = os.path.join(DATASET_PATH, emotion_folder)
        if not os.path.isdir(emotion_path):
            continue

        print(f"Processing {emotion_folder} files...")
        for audio_file in os.listdir(emotion_path):
            if audio_file.endswith('.wav'):
                file_path = os.path.join(emotion_path, audio_file)
                audio_features = extract_features(file_path)
                if audio_features is not None:
                    features.append(audio_features)
                    labels.append(emotion_folder)

    features = np.array(features)
    labels = np.array(labels)
    print("Loaded emotion classes:", np.unique(labels))

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    categorical_labels = to_categorical(encoded_labels)

    X_train, X_test, y_train, y_test = train_test_split(
        features, categorical_labels, test_size=0.2, random_state=42, stratify=categorical_labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_test, y_test, test_size=0.5, random_state=42, stratify=y_test
    )

    return X_train, X_val, X_test, y_train, y_val, y_test, label_encoder

# Build model
def build_model(input_shape, num_classes):
    model = Sequential([
        Conv1D(64, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),

        Conv1D(128, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),

        Conv1D(256, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),

        LSTM(128, return_sequences=False),
        Dropout(0.4),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),

        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train model
def train_model(X_train, y_train, X_val, y_val):
    input_shape = (X_train.shape[1], X_train.shape[2])
    num_classes = y_train.shape[1]
    model = build_model(input_shape, num_classes)

    checkpoint = ModelCheckpoint('best_emotion_model.keras', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5, verbose=1)

    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[checkpoint, early_stopping, reduce_lr],
        verbose=1
    )

    return model, history

# Evaluate model
def evaluate_model(model, history, X_test, y_test, label_encoder):
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print(f"Test accuracy: {accuracy:.4f}")

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    class_names = label_encoder.classes_
    report = classification_report(y_true_classes, y_pred_classes, target_names=class_names)
    print("\nClassification Report:\n", report)

    cm = confusion_matrix(y_true_classes, y_pred_classes)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    plt.close()

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train')
    plt.plot(history.history['val_accuracy'], label='Validation')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train')
    plt.plot(history.history['val_loss'], label='Validation')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.close()

    return accuracy, report

# Save model and encoder
def save_model(model, label_encoder):
    model.save('emotion_recognition_model.keras')
    np.save('label_encoder_classes.npy', label_encoder.classes_)
    print("Model and label encoder saved successfully.")

# Predict emotion for new audio
def predict_emotion(file_path, model, label_encoder):
    features = extract_features(file_path)
    features = np.expand_dims(features, axis=0)
    prediction = model.predict(features)[0]
    predicted_class = np.argmax(prediction)
    predicted_emotion = label_encoder.classes_[predicted_class]
    confidence = prediction[predicted_class]
    return predicted_emotion, confidence

# Main execution
if __name__ == "__main__":
    print("Starting audio emotion recognition training...")

    X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = prepare_dataset()
    print(f"Training set: {X_train.shape}")
    print(f"Validation set: {X_val.shape}")
    print(f"Test set: {X_test.shape}")

    print("Training model...")
    model, history = train_model(X_train, y_train, X_val, y_val)

    print("Evaluating model...")
    accuracy, report = evaluate_model(model, history, X_test, y_test, label_encoder)

    print("Saving model...")
    save_model(model, label_encoder)

    print("Training complete!")
    print(f"Final test accuracy: {accuracy:.4f}")

    # Example usage
    print("\nExample prediction:")
    example_file = os.path.join(DATASET_PATH, 'Happy', os.listdir(os.path.join(DATASET_PATH, 'Happy'))[0])
    emotion, confidence = predict_emotion(example_file, model, label_encoder)
    print(f"Predicted emotion: {emotion} with confidence: {confidence:.4f}")


Starting audio emotion recognition training...
Processing Suprised files...
Processing Disgusted files...
Processing Angry files...
Processing Happy files...
Loaded emotion classes: ['Angry' 'Disgusted' 'Happy' 'Suprised']
Training set: (5431, 130, 160)
Validation set: (679, 130, 160)
Test set: (679, 130, 160)
Training model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3921 - loss: 1.6854
Epoch 1: val_accuracy improved from -inf to 0.31959, saving model to best_emotion_model.keras
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.3924 - loss: 1.6843 - val_accuracy: 0.3196 - val_loss: 1.3191 - learning_rate: 0.0010
Epoch 2/100
[1m167/170[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.4760 - loss: 1.2748
Epoch 2: val_accuracy improved from 0.31959 to 0.51399, saving model to best_emotion_model.keras
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4766 - loss: 1.2730 - val_accuracy: 0.5140 - val_loss: 1.0423 - learning_rate: 0.0010
Epoch 3/100
[1m168/170[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.5838 - loss: 1.0211
Epoch 3: val_accuracy did not improve from 0.51399
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━

In [2]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import load_model

# Load trained model and label encoder classes
model = load_model('/kaggle/working/emotion_recognition_model.keras')
label_classes = np.load('/kaggle/working/label_encoder_classes.npy')

# Constants
SAMPLE_RATE = 22050
DURATION = 3
MAX_SAMPLES = SAMPLE_RATE * DURATION

# Feature extraction (same as training)
def extract_features(file_path):
    try:
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
        if len(audio) < MAX_SAMPLES:
            audio = np.pad(audio, (0, MAX_SAMPLES - len(audio)), 'constant')
        else:
            audio = audio[:MAX_SAMPLES]

        mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=13)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=SAMPLE_RATE)
        chroma = librosa.feature.chroma_stft(y=audio, sr=SAMPLE_RATE)
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        features = np.concatenate([mfccs, spectral_contrast, chroma, mel_spec_db], axis=0)
        features = features.T
        return features
    except Exception as e:
        print(f"[ERROR] Feature extraction failed: {e}")
        return None

# Prediction
def predict_emotion(file_path):
    features = extract_features(file_path)
    if features is None:
        return None, None
    features = np.expand_dims(features, axis=0)
    prediction = model.predict(features)[0]
    predicted_index = np.argmax(prediction)
    predicted_emotion = label_classes[predicted_index]
    confidence = prediction[predicted_index]
    return predicted_emotion, confidence

# Continuous testing loop
def test_loop():
    print("🎤 Emotion Recognition Testing")
    print("Give path to a .wav file or type 'exit' to stop.\n")

    while True:
        file_path = input("🔎 Enter .wav file path: ").strip()

        if file_path.lower() == 'exit':
            print("👋 Exiting testing loop.")
            break

        if not os.path.isfile(file_path) or not file_path.endswith('.wav'):
            print("⚠️  Invalid file path. Please provide a valid .wav file.\n")
            continue

        emotion, confidence = predict_emotion(file_path)
        if emotion:
            print(f"✅ Emotion: {emotion} | 🔥 Confidence: {confidence:.2f}\n")
        else:
            print("❌ Could not predict emotion.\n")

if __name__ == "__main__":
    test_loop()


🎤 Emotion Recognition Testing
Give path to a .wav file or type 'exit' to stop.



StdinNotImplementedError: raw_input was called, but this frontend does not support input requests.