# Speech Emotion Recognition (SER) - AI Club Task
This project uses a 2D CNN to classify emotions from the RAVDESS dataset.
The target classes are: Neutral, Calm, Happy, Sad, Angry, Fearful, Disgust, Surprised.

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os

# Visualizing Angry vs Sad Spectrograms
def plot_spec(path, title):
    """Draw the mel-spectrogram of one audio file on a new figure.

    The recording is loaded, trimmed with ``librosa.effects.trim``, turned
    into a mel-spectrogram and converted to decibels relative to its own
    maximum before being rendered with a colorbar.

    Args:
        path: Location of the audio file to load.
        title: Text placed above the plot.

    Note:
        The figure is created but not shown; the caller decides when to
        call ``plt.show()``.
    """
    signal, sample_rate = librosa.load(path)

    # Phase 1: silence trimming -- keep only the trimmed samples, the second
    # element of the returned pair is not needed here.
    trimmed = librosa.effects.trim(signal)[0]

    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=trimmed, sr=sample_rate),
        ref=np.max,
    )

    plt.figure(figsize=(10, 3))
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.title(title)
    plt.colorbar(format='%+2.0f dB')

# Replace these paths with actual files in your data folder.
# Each pair is (audio file, figure title); one figure is drawn per pair and
# all of them are displayed together by the single plt.show() at the end.
for wav_path, caption in (
    ('data/Actor_01/03-01-05-01-01-01-01.wav', 'Mel-Spectrogram: Angry (High Energy)'),
    ('data/Actor_01/03-01-04-01-01-01-01.wav', 'Mel-Spectrogram: Sad (Low Energy)'),
):
    plot_spec(wav_path, caption)
plt.show()

In [None]:
def extract_features(data_path):
    """Build fixed-size log-mel spectrogram features and emotion labels.

    Every ``Actor*`` sub-directory of ``data_path`` is scanned for ``.wav``
    files. At most the first 3 seconds of each file are loaded and converted
    to a 128-band log-mel spectrogram, which is then zero-padded or truncated
    along the time axis to exactly 128 frames.

    The label is the third dash-separated field of the file name minus one,
    e.g. ``03-01-05-01-01-01-01.wav`` -> ``4``.

    Entries that cannot be dataset samples are skipped instead of crashing
    the whole scan: ``Actor*`` names that are not directories, files without
    a ``.wav`` extension (``.DS_Store`` and similar), and ``.wav`` files whose
    name does not carry a numeric third field (these trigger a warning).
    Directory listings are sorted so the sample order is reproducible.

    Args:
        data_path: Directory that contains the ``Actor*`` folders.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` stacks one 128x128
        spectrogram per accepted file and ``y`` holds the matching integer
        labels. Both arrays are empty (shape ``(0,)``) when no file is
        accepted.
    """
    import warnings  # local import: only needed for the malformed-name path

    n_mels = 128
    n_frames = 128

    X, y = [], []
    # Sorted traversal: os.listdir order is filesystem dependent.
    for folder in sorted(os.listdir(data_path)):
        path = os.path.join(data_path, folder)
        # Skip anything that is not an actor directory (archives, notes, ...).
        if not folder.startswith('Actor') or not os.path.isdir(path):
            continue
        for file in sorted(os.listdir(path)):
            # Hidden/system files and other non-audio entries are not samples.
            if not file.lower().endswith('.wav'):
                continue
            try:
                emotion = int(file.split('-')[2]) - 1  # Labeling
            except (IndexError, ValueError):
                warnings.warn(
                    f"Skipping {os.path.join(path, file)!r}: "
                    "file name has no numeric emotion field"
                )
                continue

            audio, sr = librosa.load(os.path.join(path, file), duration=3)
            # Log-Mel Spectrogram + Padding to 128x128
            mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
            db = librosa.power_to_db(mel)
            missing = n_frames - db.shape[1]
            if missing > 0:
                # NOTE(review): np.pad fills with 0 here, i.e. 0 dB, which is
                # not the quietest value on a dB scale -- confirm this is the
                # intended padding before changing it (it affects features).
                db = np.pad(db, ((0, 0), (0, missing)))
            else:
                db = db[:, :n_frames]
            X.append(db)
            y.append(emotion)
    return np.array(X), np.array(y)

# Load every recording found under data/ and give each 128x128 spectrogram
# a trailing single-channel axis, the input layout used by the 2D CNN.
X, y = extract_features('data/')
X = np.reshape(X, (len(X), 128, 128, 1))

In [None]:
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; stratify=y keeps the class
# proportions of y in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)

# Two Conv -> BatchNorm -> MaxPool stages followed by a small dense head.
model = models.Sequential()

model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))  # Regularization to prevent overfitting
model.add(layers.Dense(8, activation='softmax'))  # one output per emotion class

# Labels are plain integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# 10% of the training partition is set aside for validation during fitting.
model.fit(X_train, y_train, epochs=30, validation_split=0.1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Evaluate on the held-out test set: the predicted class is the index of the
# largest softmax output for each sample.
y_pred = np.argmax(model.predict(X_test), axis=1)
emotions = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']

# Pin the label set to the class ids 0..7 instead of letting scikit-learn
# infer it from the data. Otherwise a class that is absent from both y_test
# and y_pred makes target_names mismatch the report rows and shifts the
# heatmap tick labels onto the wrong classes.
label_ids = list(range(len(emotions)))

print(classification_report(y_test, y_pred, labels=label_ids, target_names=emotions))

# Rows of the confusion matrix are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred, labels=label_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=emotions, yticklabels=emotions)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Persist the trained model to 'emotion_model.h5' (relative to the current
# working directory).
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format;
# newer Keras releases recommend '.keras' -- confirm before changing, since
# other code may load this exact file name.
model.save('emotion_model.h5')