<h1 style="color:Green; font-size:3em;">Import and Check RAVDESS Dataset</h1>

<h2 style="color:purple; font-size:2em;">Dataset Info</h2>
<p style="font-size:15px;"">-Name: RAVDESS – Ryerson Audio-Visual Database of Emotional Speech and Song<br>
-Subset: Speech-only (No song, no video)<br>
-Total samples: 1440 audio files (60 per actor × 24 actors)<br>
-Emotions: 8 (neutral, calm, happy, sad, angry, fearful, disgust, surprised)<br>
-File format: .wav (mono, 48kHz)</p>

<h2 style="color:purple; font-size:2em;">Import Required Libraries</h2>

In [1]:
import os 
import librosa
import numpy as np 
import pandas as pd
from glob import glob 
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

<h2 style="color:purple;">Set Data Directory</h2>

In [2]:
DATA_PATH = "../data/"
OUTPUT_PATH = "./processed_data"
os.makedirs(OUTPUT_PATH, exist_ok=True)

<h2 style="color:purple;">Define Emotion Code Mapping</h2>

In [3]:
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

<h2 style="color:purple;">Emotion Categories for feature enhancement</h2>

In [4]:
high_arousal = ['angry', 'fearful', 'happy', 'surprised']
low_arousal = ['calm', 'sad', 'neutral', 'disgust']

<h2 style="color:purple;">Feature Extraction</h2>

In [5]:
def extract_features(y, sr, emotion):
    """Extract enhanced audio features with emotion-specific processing"""
    features = {}
    
    # Basic features
    features['duration'] = librosa.get_duration(y=y, sr=sr)
    features['loudness'] = np.mean(librosa.amplitude_to_db(np.abs(y)))
    
    # Voice activity detection
    intervals = librosa.effects.split(y, top_db=20)
    active_frames = np.sum([end-start for start, end in intervals])
    features['voice_ratio'] = active_frames / len(y)
    
    # Spectral features with temporal statistics
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    
    for name, feature in [('centroid', spectral_centroid),
                         ('rolloff', spectral_rolloff),
                         ('bandwidth', spectral_bandwidth)]:
        features[f'{name}_mean'] = np.mean(feature)
        features[f'{name}_std'] = np.std(feature)
        features[f'{name}_skew'] = skew(feature)
        features[f'{name}_kurt'] = kurtosis(feature)
    
    # MFCCs with deltas
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    
    for i in range(13):
        features[f'mfcc_{i+1}'] = np.mean(mfcc[i])
        features[f'mfcc_{i+1}_delta'] = np.mean(mfcc_delta[i])
    
    # Chroma and advanced features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    
    for i in range(12):
        features[f'chroma_{i+1}'] = np.mean(chroma[i])
    for i in range(7):
        features[f'contrast_{i+1}'] = np.mean(contrast[i])
    for i in range(6):
        features[f'tonnetz_{i+1}'] = np.mean(tonnetz[i])
    
    # Emotion-specific features
    if emotion in high_arousal:
        harmonic, percussive = librosa.effects.hpss(y)
        features['energy_ratio'] = np.sum(percussive**2) / (np.sum(harmonic**2) + 1e-6)
    elif emotion in low_arousal:
        features['silence_ratio'] = 1 - features['voice_ratio']
    
    # Pitch features
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch_mean = np.mean(pitches[magnitudes > np.median(magnitudes)])
    features['pitch_mean'] = pitch_mean if not np.isnan(pitch_mean) else 0
    
    return features

<h2 style="color:purple;">Audio processing function with augmentation
</h2>

In [6]:
def process_file(file_path):
    try:
        # Parse filename
        filename = os.path.basename(file_path)
        parts = filename.split('-')
        actor_num = int(parts[6].split('.')[0])
        emotion_code = parts[2]
        emotion = emotion_map[emotion_code]

        # Load and preprocess audio
        y, sr = librosa.load(file_path, sr=16000, mono=True)
        y = librosa.effects.preemphasis(y)
        y = librosa.effects.trim(y, top_db=25)[0]
        
        # RMS normalization
        rms = np.sqrt(np.mean(y**2))
        y = y / (rms + 1e-6)
        
        # Base features
        features = {
            'filename': filename,
            'actor': actor_num,
            'gender': 'male' if actor_num % 2 == 0 else 'female',
            'emotion': emotion,
            'emotion_code': emotion_code
        }
        features.update(extract_features(y, sr, emotion))
        
        # Data augmentation
        augmented_features = []
        
        # Time stretching
        y_stretch = librosa.effects.time_stretch(y, rate=0.8)
        stretch_feats = extract_features(y_stretch, sr, emotion)
        stretch_feats.update({
            'filename': filename + '_stretch',
            'actor': actor_num,
            'gender': 'male' if actor_num % 2 == 0 else 'female',
            'emotion': emotion,
            'emotion_code': emotion_code
        })
        augmented_features.append(stretch_feats)
        
        # Pitch shifting
        y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)
        shift_feats = extract_features(y_shift, sr, emotion)
        shift_feats.update({
            'filename': filename + '_shift',
            'actor': actor_num,
            'gender': 'male' if actor_num % 2 == 0 else 'female',
            'emotion': emotion,
            'emotion_code': emotion_code
        })
        augmented_features.append(shift_feats)
        
        return [features] + augmented_features
    
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

<h2 style="color:purple;">Process all audio files
</h2>

In [7]:
print("Finding all audio files...")
all_files = []
for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith(".wav"):
            all_files.append(os.path.join(root, file))

print(f"Processing {len(all_files)} files with augmentation...")
results = []
for file in tqdm(all_files, desc="Processing files"):
    data = process_file(file)
    if data:
        results.extend(data)

Finding all audio files...
Processing 1440 files with augmentation...


Processing files: 100%|███████████████████| 1440/1440 [1:55:50<00:00,  4.83s/it]


<h3 style='color:purple'>Create Dataframe and Save</h3>

In [8]:
df = pd.DataFrame(results)
df

Unnamed: 0,filename,actor,gender,emotion,emotion_code,duration,loudness,voice_ratio,centroid_mean,centroid_std,...,contrast_7,tonnetz_1,tonnetz_2,tonnetz_3,tonnetz_4,tonnetz_5,tonnetz_6,silence_ratio,pitch_mean,energy_ratio
0,03-01-07-01-01-02-18.wav,18,male,disgust,07,2.176,-17.363924,0.764706,3103.861734,1133.595952,...,51.832581,-0.018983,-0.025689,-0.019540,0.037315,-0.004874,-0.002966,0.235294,2245.666748,
1,03-01-07-01-01-02-18.wav_stretch,18,male,disgust,07,2.720,-16.133003,0.776471,3117.783534,1044.945930,...,51.422385,-0.005568,-0.029710,-0.014867,0.044577,-0.008606,-0.012778,0.223529,2203.087402,
2,03-01-07-01-01-02-18.wav_shift,18,male,disgust,07,2.176,-16.485405,0.823529,3229.374024,1080.269650,...,53.300893,-0.029735,0.004281,0.034622,-0.051420,0.008897,0.000196,0.176471,2135.273682,
3,03-01-05-01-02-01-18.wav,18,male,angry,05,1.632,-12.006447,0.980392,3149.329740,1393.901212,...,50.715246,-0.001349,-0.016009,0.012247,-0.020919,-0.010431,-0.004213,,2139.219238,23.973597
4,03-01-05-01-02-01-18.wav_stretch,18,male,angry,05,2.040,-12.320189,0.988235,3173.226784,1353.186854,...,50.693348,0.019958,-0.022524,0.015145,-0.022695,-0.009523,-0.004047,,2079.598633,13.721988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,03-01-07-02-02-02-21.wav_stretch,21,female,disgust,07,2.440,-17.121801,0.681967,3172.201855,1212.521763,...,50.792559,-0.010724,0.030826,-0.064574,-0.008770,-0.001116,0.002784,0.318033,2215.400879,
4316,03-01-07-02-02-02-21.wav_shift,21,female,disgust,07,1.952,-17.909994,0.819672,3362.734136,1220.548073,...,52.656174,0.015232,0.011795,0.079857,0.013958,-0.009284,-0.002340,0.180328,2332.780518,
4317,03-01-07-01-02-01-21.wav,21,female,disgust,07,1.696,-17.534981,0.792453,2965.614712,1101.992031,...,51.646616,-0.024405,-0.000358,-0.067770,-0.032034,-0.011848,0.000847,0.207547,2187.190674,
4318,03-01-07-01-02-01-21.wav_stretch,21,female,disgust,07,2.120,-16.670708,0.815094,2993.489932,1055.075245,...,51.237660,-0.024020,0.016230,-0.064485,0.004123,-0.017546,-0.010262,0.184906,2201.601807,


In [9]:
df = pd.DataFrame(results)
df.to_csv("audio_metadata.csv", index=False)

In [10]:
print(f"\nDone! Processed {len(df)} files.")
print("Saved to 'audio_metadata.csv'")
print("\nSample data:")
print(df.head())


Done! Processed 4320 files.
Saved to 'audio_metadata.csv'

Sample data:
                           filename  actor gender  emotion emotion_code  \
0          03-01-07-01-01-02-18.wav     18   male  disgust           07   
1  03-01-07-01-01-02-18.wav_stretch     18   male  disgust           07   
2    03-01-07-01-01-02-18.wav_shift     18   male  disgust           07   
3          03-01-05-01-02-01-18.wav     18   male    angry           05   
4  03-01-05-01-02-01-18.wav_stretch     18   male    angry           05   

   duration   loudness  voice_ratio  centroid_mean  centroid_std  ...  \
0     2.176 -17.363924     0.764706    3103.861734   1133.595952  ...   
1     2.720 -16.133003     0.776471    3117.783534   1044.945930  ...   
2     2.176 -16.485405     0.823529    3229.374024   1080.269650  ...   
3     1.632 -12.006447     0.980392    3149.329740   1393.901212  ...   
4     2.040 -12.320189     0.988235    3173.226784   1353.186854  ...   

   contrast_7  tonnetz_1  tonnetz_2  

In [11]:
# Summary
print("\nShape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())


Shape of dataset: (4320, 74)

Columns: ['filename', 'actor', 'gender', 'emotion', 'emotion_code', 'duration', 'loudness', 'voice_ratio', 'centroid_mean', 'centroid_std', 'centroid_skew', 'centroid_kurt', 'rolloff_mean', 'rolloff_std', 'rolloff_skew', 'rolloff_kurt', 'bandwidth_mean', 'bandwidth_std', 'bandwidth_skew', 'bandwidth_kurt', 'mfcc_1', 'mfcc_1_delta', 'mfcc_2', 'mfcc_2_delta', 'mfcc_3', 'mfcc_3_delta', 'mfcc_4', 'mfcc_4_delta', 'mfcc_5', 'mfcc_5_delta', 'mfcc_6', 'mfcc_6_delta', 'mfcc_7', 'mfcc_7_delta', 'mfcc_8', 'mfcc_8_delta', 'mfcc_9', 'mfcc_9_delta', 'mfcc_10', 'mfcc_10_delta', 'mfcc_11', 'mfcc_11_delta', 'mfcc_12', 'mfcc_12_delta', 'mfcc_13', 'mfcc_13_delta', 'chroma_1', 'chroma_2', 'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8', 'chroma_9', 'chroma_10', 'chroma_11', 'chroma_12', 'contrast_1', 'contrast_2', 'contrast_3', 'contrast_4', 'contrast_5', 'contrast_6', 'contrast_7', 'tonnetz_1', 'tonnetz_2', 'tonnetz_3', 'tonnetz_4', 'tonnetz_5', 'tonn

In [12]:
# Check unique emotions
print("\nEmotion classes:", df['emotion'].unique())


Emotion classes: ['disgust' 'angry' 'sad' 'calm' 'neutral' 'fearful' 'happy' 'surprised']


In [13]:
# Check for missing values
print("\nMissing values:\n", df.isnull().sum())


Missing values:
 filename            0
actor               0
gender              0
emotion             0
emotion_code        0
                 ... 
tonnetz_5           0
tonnetz_6           0
silence_ratio    2304
pitch_mean          0
energy_ratio     2016
Length: 74, dtype: int64


In [14]:
# Summary stats
print("\nBasic statistics:")
print(df.describe(include='all'))


Basic statistics:
                        filename        actor gender  emotion emotion_code  \
count                       4320  4320.000000   4320     4320         4320   
unique                      4320          NaN      2        8            8   
top     03-01-07-01-01-02-18.wav          NaN   male  disgust           07   
freq                           1          NaN   2160      576          576   
mean                         NaN    12.500000    NaN      NaN          NaN   
std                          NaN     6.922988    NaN      NaN          NaN   
min                          NaN     1.000000    NaN      NaN          NaN   
25%                          NaN     6.750000    NaN      NaN          NaN   
50%                          NaN    12.500000    NaN      NaN          NaN   
75%                          NaN    18.250000    NaN      NaN          NaN   
max                          NaN    24.000000    NaN      NaN          NaN   

           duration     loudness  voice_rati

In [21]:
len(df.columns.to_list())

74