### Load Model & Preprocessing Objects

In [2]:
import pickle
import numpy as np
from tensorflow.keras.models import model_from_json, load_model
from tensorflow.keras.preprocessing import sequence
import librosa

# Rebuild the CNN from its serialized JSON architecture description.
with open("CNN_model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the trained weights into the rebuilt architecture, then compile it.
# `loaded_model` is the model the prediction cells below call.
loaded_model.load_weights("CNN_model.weights.h5")
loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Load the fully trained model (if needed)
# NOTE(review): `trained_model` is not referenced by any cell visible in this
# notebook; consider dropping this load if nothing else uses it.
trained_model = load_model("trained_model.h5")

# Load preprocessing objects.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load pickles that were produced by a trusted training run.
# `encoder2.inverse_transform` is used below to turn model outputs into labels.
with open("encoder2.pickle", "rb") as f:
    encoder2 = pickle.load(f)

# `scaler2.transform` is applied to every feature vector before prediction.
with open("scaler2.pickle", "rb") as f:
    scaler2 = pickle.load(f)

print("✅ All models and preprocessing objects loaded successfully!")


  saveable.load_own_variables(weights_store.get(inner_path))


✅ All models and preprocessing objects loaded successfully!


### Feature Extraction Functions

In [3]:
def zcr(data, frame_length, hop_length):
    """Return the per-frame zero-crossing rate of ``data``.

    Args:
        data: Audio signal handed to ``librosa.feature.zero_crossing_rate``.
        frame_length: Analysis window length, forwarded to librosa.
        hop_length: Step between successive windows, forwarded to librosa.

    Returns:
        The librosa result with all length-1 axes removed by ``np.squeeze``.
    """
    return np.squeeze(
        librosa.feature.zero_crossing_rate(
            data,
            frame_length=frame_length,
            hop_length=hop_length,
        )
    )

def rmse(data, frame_length=2048, hop_length=512):
    """Return the per-frame RMS value of ``data``.

    Args:
        data: Audio signal, passed to ``librosa.feature.rms`` as ``y``.
        frame_length: Analysis window length, forwarded to librosa.
        hop_length: Step between successive windows, forwarded to librosa.

    Returns:
        The librosa result with all length-1 axes removed by ``np.squeeze``.
    """
    rms_kwargs = {"y": data, "frame_length": frame_length, "hop_length": hop_length}
    return np.squeeze(librosa.feature.rms(**rms_kwargs))

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
    """Return the MFCCs of ``data``, frame-major.

    ``frame_length`` and ``hop_length`` are forwarded to librosa (as ``n_fft``
    and ``hop_length``) so the MFCC framing follows the same settings the
    caller uses for ZCR and RMS. They were previously accepted but ignored.
    The defaults (2048 / 512) match librosa's own defaults, so results for
    default calls are unchanged.

    Args:
        data: Audio signal, passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sample rate of ``data``.
        frame_length: FFT window length used for the underlying spectrogram.
        hop_length: Step between successive analysis windows.
        flatten: When True (default) return a 1-D vector; otherwise return
            the transposed matrix with length-1 axes squeezed out.

    Returns:
        The transposed MFCC matrix, flattened with ``np.ravel`` when
        ``flatten`` is True, or squeezed with ``np.squeeze`` otherwise.
    """
    coefficients = librosa.feature.mfcc(
        y=data,
        sr=sr,
        n_fft=frame_length,
        hop_length=hop_length,
    )
    # Transpose before flattening so the flattened layout is unchanged from
    # the original implementation (row-major over the transposed matrix).
    transposed = coefficients.T
    return np.ravel(transposed) if flatten else np.squeeze(transposed)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512, target_size=2376):
    """Build a fixed-length feature vector from an audio signal.

    The vector is the concatenation of ``zcr``, ``rmse`` and ``mfcc`` (in that
    order), zero-padded on the right or truncated so that it always has
    ``target_size`` entries.

    Args:
        data: Audio signal.
        sr: Sample rate, forwarded to ``mfcc``.
        frame_length: Analysis window length, forwarded to all three extractors.
        hop_length: Hop between windows, forwarded to all three extractors.
        target_size: Length of the returned vector.
            NOTE(review): 2376 presumably equals 108 frames x 22 values per
            frame for a 2.5 s clip at these defaults - confirm against training.

    Returns:
        1-D numpy array of length ``target_size``.
    """
    # The empty float64 seed keeps the result dtype identical to the
    # original np.array([]) based concatenation.
    pieces = [
        np.empty(0),
        zcr(data, frame_length, hop_length),
        rmse(data, frame_length, hop_length),
        mfcc(data, sr, frame_length, hop_length),
    ]
    features = np.hstack(pieces)

    shortfall = target_size - len(features)
    if shortfall > 0:
        # Too short: append zeros up to the target length.
        return np.pad(features, (0, shortfall), mode='constant')

    # Long enough (or exact): keep only the leading target_size values.
    return features[:target_size]


def get_predict_feat(path):
    """Load an audio file and turn it into a model-ready input tensor.

    Reads 2.5 s of audio starting 0.6 s into the file, extracts the feature
    vector, scales it with ``scaler2`` and appends a trailing axis.

    The sample rate reported by ``librosa.load`` is now passed on to
    ``extract_features`` instead of being discarded, and the row width is
    taken from the extracted vector rather than repeating the literal 2376,
    so the two can no longer disagree.

    Args:
        path: Path of the audio file to load.

    Returns:
        Array of shape ``(1, n_features, 1)``.
    """
    audio, s_rate = librosa.load(path, duration=2.5, offset=0.6)
    features = extract_features(audio, sr=s_rate)
    # One sample (row) with as many columns as extract_features produced.
    row = np.reshape(features, (1, -1))
    scaled_row = scaler2.transform(row)
    return np.expand_dims(scaled_row, axis=2)


### Prediction Function

In [4]:
# Numeric emotion codes mapped to display names.
# NOTE(review): not referenced by any code visible in this notebook -
# prediction() takes its label from encoder2.inverse_transform instead.
# The 1-8 numbering presumably mirrors the RAVDESS emotion codes - TODO confirm.
emotions1 = {1: 'Neutral', 2: 'Calm', 3: 'Happy', 4: 'Sad', 5: 'Angry', 6: 'Fear', 7: 'Disgust', 8: 'Surprise'}

def prediction(path1):
    """Predict the emotion of one audio file and print it.

    Args:
        path1: Path of the audio file to classify.

    Returns:
        None. The decoded label is only printed.
    """
    model_output = loaded_model.predict(get_predict_feat(path1))
    # inverse_transform yields a 2-D result; the label is its first cell.
    decoded_labels = encoder2.inverse_transform(model_output)
    top_label = decoded_labels[0][0]
    print(f"🎤 Predicted Emotion: {top_label}")


In [5]:
# Smoke test: classify a single clip from the RAVDESS dataset folder.
prediction("Datasets/RAVDESS/Actor_02/03-01-01-01-01-01-02.wav")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step
🎤 Predicted Emotion: neutral


In [6]:
# Classify one CREMA-D clip per emotion, in the order listed.
for _crema_clip in (
    "Datasets/CREMA -D/AudioWAV/1001_DFA_HAP_XX.wav",
    "Datasets/CREMA -D/AudioWAV/1001_DFA_ANG_XX.wav",
    "Datasets/CREMA -D/AudioWAV/1001_DFA_DIS_XX.wav",
    "Datasets/CREMA -D/AudioWAV/1001_DFA_FEA_XX.wav",
    "Datasets/CREMA -D/AudioWAV/1001_DFA_NEU_XX.wav",
    "Datasets/CREMA -D/AudioWAV/1001_DFA_SAD_XX.wav",
):
    prediction(_crema_clip)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
🎤 Predicted Emotion: happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
🎤 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
🎤 Predicted Emotion: disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
🎤 Predicted Emotion: fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
🎤 Predicted Emotion: neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
🎤 Predicted Emotion: sad


In [7]:
# Classify one TESS clip per emotion for the OAF speaker folders, in the order listed.
for _tess_oaf_clip in (
    "Datasets/TESS Toronto emotional speech set data/OAF_angry/OAF_back_angry.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_disgust/OAF_back_disgust.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_Fear/OAF_back_fear.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_happy/OAF_back_happy.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_neutral/OA_bite_neutral.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_Pleasant_surprise/OAF_back_ps.wav",
    "Datasets/TESS Toronto emotional speech set data/OAF_Sad/OAF_back_sad.wav",
):
    prediction(_tess_oaf_clip)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
🎤 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
🎤 Predicted Emotion: surprise
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
🎤 Predicted Emotion: fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
🎤 Predicted Emotion: happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
🎤 Predicted Emotion: neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
🎤 Predicted Emotion: surprise
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
🎤 Predicted Emotion: sad


In [8]:
# Classify one TESS clip per emotion for the YAF speaker folders, in the order listed.
for _tess_yaf_clip in (
    "Datasets/TESS Toronto emotional speech set data/YAF_angry/YAF_back_angry.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_disgust/YAF_back_disgust.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_fear/YAF_back_fear.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_happy/YAF_back_happy.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_neutral/YAF_back_neutral.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_pleasant_surprised/YAF_back_ps.wav",
    "Datasets/TESS Toronto emotional speech set data/YAF_sad/YAF_back_sad.wav",
):
    prediction(_tess_yaf_clip)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
🎤 Predicted Emotion: angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
🎤 Predicted Emotion: disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
🎤 Predicted Emotion: fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
🎤 Predicted Emotion: happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
🎤 Predicted Emotion: neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
🎤 Predicted Emotion: surprise
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
🎤 Predicted Emotion: sad


In [9]:
import sounddevice as sd
import numpy as np
import librosa
import time

def record_audio(duration=2.5, sr=22050):
    """Record mono audio from the microphone and return it as a 1-D array.

    Args:
        duration: Recording length in seconds.
        sr: Sample rate requested from the sound device.

    Returns:
        The recorded float32 samples with the channel axis squeezed out.
    """
    print("🎤 Listening... Speak now!")
    n_frames = int(duration * sr)
    recording = sd.rec(n_frames, samplerate=sr, channels=1, dtype='float32')
    # sd.rec is followed by sd.wait() so the buffer is complete before use.
    sd.wait()
    return np.squeeze(recording)

def extract_features_live(data, sr=22050, frame_length=2048, hop_length=512, target_size=2376):
    """Extract the fixed-length feature vector for a live microphone buffer.

    This previously duplicated ``extract_features`` line for line. It now
    delegates to it so file-based and live inference always share the same
    feature pipeline and cannot drift apart.

    Args:
        data: Audio signal.
        sr: Sample rate of ``data``.
        frame_length: Analysis window length.
        hop_length: Hop between analysis windows.
        target_size: Length of the returned vector (padded or truncated).

    Returns:
        1-D numpy array of length ``target_size``.
    """
    return extract_features(
        data,
        sr=sr,
        frame_length=frame_length,
        hop_length=hop_length,
        target_size=target_size,
    )

def live_emotion_recognition(max_iterations=None):
    """Record from the microphone in a loop and print the detected emotion.

    Each pass records one clip, extracts and scales its features, runs the
    model and prints the decoded label, then pauses for 0.5 s.

    Changes from the original: the model output is no longer stored in a local
    called ``prediction`` (which shadowed the notebook's ``prediction()``
    function inside this scope), and the row width is derived from the feature
    vector instead of repeating the literal 2376.

    Args:
        max_iterations: Number of clips to process before returning. ``None``
            (default) keeps the original behaviour of looping until the
            kernel is interrupted.

    Returns:
        None.
    """
    completed = 0
    try:
        while max_iterations is None or completed < max_iterations:
            audio_data = record_audio()
            features = extract_features_live(audio_data)
            # One sample (row) with as many columns as the extractor produced.
            features = np.reshape(features, (1, -1))
            features_scaled = scaler2.transform(features)
            input_data = np.expand_dims(features_scaled, axis=2)

            class_scores = loaded_model.predict(input_data)
            detected_emotion = encoder2.inverse_transform(class_scores)

            print(f"🗣️ Detected Emotion: {detected_emotion[0][0]}")

            completed += 1
            time.sleep(0.5)  # Small delay before next prediction

    except KeyboardInterrupt:
        # Interrupting the kernel is the intended way to stop an unbounded run.
        print("\n🛑 Real-time emotion recognition stopped.")

# Start real-time emotion detection.
# NOTE: this call loops until the kernel is interrupted (KeyboardInterrupt),
# so any cell below it will not run until the loop is stopped.
live_emotion_recognition()

🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
🗣️ Detected Emotion: fear
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
🗣️ Detected Emotion: disgust
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
🗣️ Detected Emotion: disgust
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
🗣️ Detected Emotion: disgust
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
🗣️ Detected Emotion: fear
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
🗣️ Detected Emotion: angry
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
🗣️ Detected Emotion: disgust
🎤 Listening... Speak now!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
🗣️ Detected Emotion: 

In [None]:
import os
import glob

# Folder containing the audio files. Expressed relative to the working
# directory, like every other dataset path in this notebook, instead of an
# absolute path that only exists on one machine.
folder_path = os.path.join("Datasets", "TESS Toronto emotional speech set data", "OAF_happy")

# Get all .wav files in the folder. glob returns matches in an unspecified
# order, so sort them to make the output reproducible between runs.
audio_files = sorted(glob.glob(os.path.join(folder_path, "*.wav")))

# An empty match would otherwise make this cell finish silently.
if not audio_files:
    print(f"⚠️ No .wav files found in: {os.path.abspath(folder_path)}")

# Loop through each file and predict
for file in audio_files:
    print(f"🔍 Processing: {os.path.basename(file)}")
    prediction(file)
    print("-" * 50)  # Separator for readability
