In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uwrfkaggler/ravdess-emotional-speech-audio?dataset_version_number=1...


100%|██████████| 429M/429M [00:06<00:00, 69.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1


In [None]:
dataset_path = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1"


In [None]:
import os
import librosa

# Path to an example audio file (you should specify an actual path here)
example_audio_file = os.path.join(dataset_path, "Actor_01", "03-01-01-01-01-01-01.wav")

# Load the audio file
audio, sr = librosa.load(example_audio_file, sr=None) #no resample

# Print the audio properties
print(f"Audio shape: {audio.shape}") #audio length
print(f"Sample rate: {sr}")


Audio shape: (158558,)
Sample rate: 48000


In [None]:
import librosa.display
import numpy as np

# Function to extract MFCC Mel-Frequency Cepstral Coefficients features from an audio signal
def extract_features(audio, sr):
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return np.mean(mfcc.T, axis=0)

# Extract features from the example audio
features = extract_features(audio, sr)

# Print the extracted features
print("Extracted MFCC features:")
print(features)


Extracted MFCC features:
[-726.2172      68.54142      3.2933977   12.2053       5.5102777
   13.66741     -2.9838285    3.0980296   -3.3108134   -1.5643842
   -7.8616524   -2.1242816    2.849204 ]


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize an empty list to store features and labels
X = []
y = []

# Define emotion labels
emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

# Iterate through the dataset and extract features and labels
for actor in os.listdir(dataset_path):
    actor_path = os.path.join(dataset_path, actor)
    if os.path.isdir(actor_path):
        for file in os.listdir(actor_path):
            if file.endswith(".wav"):
                file_path = os.path.join(actor_path, file)
                # Load the audio file
                audio, sr = librosa.load(file_path, sr=None)
                # Extract features
                features = extract_features(audio, sr)
                X.append(features)
                # Get the emotion from the file name (e.g., angry, happy, etc.)
                emotion = file.split("-")[2]  # Assuming emotion is the 3rd part of filename
                y.append(emotion)

# Convert X and y to numpy arrays
X = np.array(X)
y = np.array(y)

# Encode the labels (emotions)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


Training data shape: (1152, 13)
Test data shape: (288, 13)


In [None]:


from sklearn.metrics import classification_report, confusion_matrix

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

          01       0.71      0.31      0.43        16
          02       0.47      0.71      0.56        31
          03       0.62      0.48      0.54        48
          04       0.53      0.50      0.51        36
          05       0.60      0.46      0.52        39
          06       0.55      0.63      0.59        35
          07       0.59      0.68      0.63        40
          08       0.60      0.65      0.62        43

    accuracy                           0.57       288
   macro avg       0.58      0.55      0.55       288
weighted avg       0.58      0.57      0.56       288

Confusion Matrix:
[[ 5  7  1  1  0  0  2  0]
 [ 1 22  1  5  0  1  1  0]
 [ 0  1 23  3  6  6  1  8]
 [ 1  5  2 18  2  5  1  2]
 [ 0  3  5  3 18  2  3  5]
 [ 0  2  1  2  2 22  5  1]
 [ 0  6  2  1  1  0 27  3]
 [ 0  1  2  1  1  4  6 28]]


In [None]:
new_audio_file = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_01/03-01-01-01-01-01-01.wav"


In [None]:
import os
import librosa

# Correct path to an actual audio file
new_audio_file = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_01/03-01-01-01-01-01-01.wav"

# Load the audio file
audio, sr = librosa.load(new_audio_file, sr=None)

# Extract features (assuming `extract_features` is defined elsewhere)
new_features = extract_features(audio, sr).reshape(1, -1)

# Predict emotion for the new audio file
predicted_label = classifier.predict(new_features)
predicted_emotion = encoder.inverse_transform(predicted_label)

# Output the predicted emotion
print(f"Predicted Emotion: {predicted_emotion[0]}")


Predicted Emotion: 01


In [None]:
emotion_classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [None]:
# Predicted label (e.g., 0, 1, etc.)
predicted_label = 1  # Example predicted label

# Map the numeric label to the corresponding emotion name
predicted_emotion = emotion_classes[predicted_label]

print(f"Predicted Emotion: {predicted_emotion}")


Predicted Emotion: disgust


In [None]:
import librosa

# Path to the new audio file (make sure the path is correct)
new_audio_file = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_01/03-01-01-01-01-01-01.wav"

# Load the audio file
audio, sr = librosa.load(new_audio_file, sr=None)

# Extract features (assuming `extract_features` is defined elsewhere)
new_features = extract_features(audio, sr).reshape(1, -1)

# Predict emotion for the new audio file
predicted_label = classifier.predict(new_features)

# Emotion classes in words
emotion_classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Map the predicted label to the corresponding emotion name
predicted_emotion = emotion_classes[predicted_label[0]]

# Output the predicted emotion
print(f"Predicted Emotion: {predicted_emotion}")


Predicted Emotion: angry


In [None]:
print(f"y_train shape before one-hot encoding: {y_train.shape}")


y_train shape before one-hot encoding: (1003, 7, 7, 7, 7)


In [None]:
# Check the unique values in y_train to ensure they are valid
print(f"Unique labels in y_train: {np.unique(y_train)}")


Unique labels in y_train: [0. 1.]


In [None]:
y_train = keras.utils.to_categorical(y_train, len(emotion_classes))  # One-hot encoding


In [None]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (1003, 13, 1)
y_train shape: (1003, 7, 7, 7, 7, 7)


In [None]:
import os
import librosa
import numpy as np

# Path to the dataset directory
dataset_path = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1"

# Emotion classes in words
emotion_classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Function to extract features (make sure this function is defined as per your requirements)
def extract_features(audio, sr):
    # For example, extract MFCCs (Mel-frequency cepstral coefficients)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

# Loop through all actors and their corresponding audio files
for actor_folder in os.listdir(dataset_path):
    actor_path = os.path.join(dataset_path, actor_folder)
    if os.path.isdir(actor_path):  # Ensure it's a directory
        for audio_file in os.listdir(actor_path):
            audio_file_path = os.path.join(actor_path, audio_file)

            # Ensure it's a file, not a directory
            if os.path.isfile(audio_file_path):
                try:
                    # Load the audio file
                    audio, sr = librosa.load(audio_file_path, sr=None)

                    # Extract features from the audio
                    features = extract_features(audio, sr).reshape(1, -1)

                    # Predict the emotion label
                    predicted_label = classifier.predict(features)

                    # Map the predicted label to the corresponding emotion
                    predicted_emotion = emotion_classes[predicted_label[0]]

                    # Print or store the result
                    print(f"File: {audio_file_path}, Predicted Emotion: {predicted_emotion}")
                except Exception as e:
                    print(f"Error processing file {audio_file_path}: {e}")


File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-03-02-02-02-17.wav, Predicted Emotion: fear
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-02-02-01-01-17.wav, Predicted Emotion: disgust
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-06-02-02-02-17.wav, Predicted Emotion: sad
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-06-01-02-01-17.wav, Predicted Emotion: sad
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-06-01-01-02-17.wav, Predicted Emotion: sad
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_17/03-01-04-01-01-02-17.wav, Predicted Emotion: happy
File: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/A

In [None]:
# Ensure labels are within the correct range
print(np.unique(y_train))  # Print unique values in y_train to verify

# If the labels are out of bounds (i.e., greater than the number of classes), handle them
assert np.all(np.array(y_train) < len(emotion_classes)), "Label index out of range"

# One-hot encoding
y_train = keras.utils.to_categorical(y_train, len(emotion_classes))  # One-hot encoding


[0. 1.]


In [None]:
# Check for any labels that are out of bounds
y_train = [label for label in y_train if label < len(emotion_classes)]

# Verify the unique labels again
print(np.unique(y_train))

# One-hot encode the cleaned labels
y_train = keras.utils.to_categorical(y_train, len(emotion_classes))  # One-hot encoding


[0 1 2 3 4 5 6]
