In [10]:
import os
import numpy as np
import librosa
import librosa.feature
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical

In [13]:
def load_ravdess_data(data_path):
    labels = []
    features = []
    emotions = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    }
    
    for root, _, files in os.walk(data_path):
        for file in files:
            if file.endswith(".wav"):
                file_path = os.path.join(root, file)
                emotion = None
                try:
                    emotion_code = file.split("-")[2]
                    emotion = emotions[emotion_code]
                except KeyError:
                    print(f"skipping file {file_path}: unrecognized emotion code {emotion_code}")
                    continue
                except IndexError:
                     print(f"skipping file {file_path}: file name format is incorrect")
                     continue
                    
                if emotion:
                    try:
                         y, sr = librosa.load(file_path, duration=2.5, offset=0.5)
                         if y is None or sr is None :
                              print(f"skipping file {file_path}: failed to load audio")
                              continue
                        
                         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                         mfccs_scaled = np.mean(mfccs.T, axis=0)
                
                         features.append(mfccs_scaled)
                         labels.append(emotion)
                   
                    except Exception as e:
                        print(f"skipping file {file_path}: {e}")
                        
    return np.array(features), np.array(labels)
            

In [14]:
data_path = "Audio_Speech_Actors_01-24"
X, y = load_ravdess_data(data_path)

In [15]:
if X.size == 0 or y.size == 0:
    raise ValueError("no data loaded. please check the data pathor file format.")
    

In [17]:
encoder = LabelEncoder()
y_encoder = encoder.fit_transform(y)
y_categorical = to_categorical(y_encoder)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2,random_state=42)

In [21]:
model = Sequential()
model.add(Conv1D(64,kernel_size=3, activation='relu', input_shape=(X_train.shape[1],1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Conv1D(128,kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(np.unique(y)),activation='softmax'))

In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [23]:
X_train_reshaped = np.expand_dims(X_train, axis=2)
X_test_reshaped = np.expand_dims(X_test, axis=2)

In [25]:
history = model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_data=(X_test_reshaped, y_test))

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1224 - loss: 14.0349 - val_accuracy: 0.2188 - val_loss: 2.0574
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1432 - loss: 3.9772 - val_accuracy: 0.1701 - val_loss: 2.0644
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1639 - loss: 2.5491 - val_accuracy: 0.2153 - val_loss: 2.0486
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1463 - loss: 2.3384 - val_accuracy: 0.1979 - val_loss: 2.0523
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1708 - loss: 2.1881 - val_accuracy: 0.1979 - val_loss: 2.0379
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1557 - loss: 2.1275 - val_accuracy: 0.1944 - val_loss: 2.0318
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━

In [26]:
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Accuracy: {accuracy* 100:.2f}%")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3635 - loss: 1.7797 
Test Accuracy: 34.72%


In [28]:
model.save('emotion_recognition_model.keras')