In [None]:
from tqdm import tqdm
import os
import librosa
import numpy as np

def load_and_process_audio(file_path, max_pad_len=None):
    """Load an audio file and return its log-mel spectrogram with a channel axis.

    The clip is loaded at 22 050 Hz, turned into a 128-band mel power
    spectrogram (``fmax=8000``) and converted to decibels relative to the
    clip's own maximum (``ref=np.max``), so the loudest bin sits at 0 dB and
    every other bin is negative.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.
        max_pad_len: Target number of time frames (second axis). Shorter
            spectrograms are right-padded, longer ones are truncated. A falsy
            value (``None`` or ``0``) keeps the natural length.

    Returns:
        ``numpy.ndarray`` of shape ``(n_mels, n_frames, 1)``; ``n_frames``
        equals ``max_pad_len`` when it is given.
    """
    # load the audio, resampled to a fixed rate so every clip is comparable
    audio, sr = librosa.load(file_path, sr=22050)

    # mel power spectrogram, then decibels relative to the clip's peak
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # padding & trimming the time axis to a fixed width
    if max_pad_len:
        pad_width = max_pad_len - S_dB.shape[1]
        if pad_width > 0:
            # With ref=np.max, 0 dB is the loudest value in the clip, so
            # zero-padding would append frames of peak energy. Pad with the
            # clip's quietest value so the padded tail reads as silence.
            S_dB = np.pad(
                S_dB,
                pad_width=((0, 0), (0, pad_width)),
                mode='constant',
                constant_values=S_dB.min(),
            )
        else:
            # already long enough: keep only the first max_pad_len frames
            S_dB = S_dB[:, :max_pad_len]

    # trailing channel axis so the array matches a Conv2D input
    return S_dB[..., np.newaxis]

In [None]:
# Fixed time-axis width (in spectrogram frames) that every clip is padded or
# cropped to; the value was picked from the dataset.
max_pad_len = 228

In [None]:
def load_data(directory, max_pad_len):
    """Load every ``.wav`` file found one level below ``directory``.

    Only immediate sub-directories of ``directory`` are scanned; ``.wav``
    files sitting directly in ``directory`` are ignored. Each file is turned
    into a spectrogram with ``load_and_process_audio`` and labelled with the
    third dash-separated field of its file name.

    Directory entries are visited in sorted order. ``os.listdir`` returns
    names in arbitrary, platform-dependent order, so sorting keeps the sample
    order reproducible across machines and runs.

    Args:
        directory: Root folder containing one sub-folder per group of clips.
        max_pad_len: Number of time frames each spectrogram is padded or
            trimmed to (forwarded to ``load_and_process_audio``).

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays: the stacked spectrograms
        and the matching emotion codes as strings taken from the file names.

    Raises:
        IndexError: If a ``.wav`` file name has fewer than three
            dash-separated fields.
    """
    data = []
    labels = []

    for subdir in tqdm(sorted(os.listdir(directory))):
        subdir_path = os.path.join(directory, subdir)
        if not os.path.isdir(subdir_path):
            continue

        for filename in sorted(os.listdir(subdir_path)):
            file_path = os.path.join(subdir_path, filename)
            if not (os.path.isfile(file_path) and file_path.endswith('.wav')):
                continue

            spectrogram = load_and_process_audio(file_path, max_pad_len)
            # based on filename format from dataset: third field is the emotion
            emotion = filename.split('-')[2]

            data.append(spectrogram)
            labels.append(emotion)

    return np.array(data), np.array(labels)

In [None]:
import tensorflow as tf

def build_model(input_shape, dimensions, num_classes=None):
    """Build the CNN that maps a spectrogram to an embedding and class scores.

    Architecture: two Conv2D + MaxPooling2D stages, a Flatten, a
    ``Dense(dimensions)`` embedding layer and a softmax output layer. The
    embedding layer is the second-to-last layer of the returned model.

    Args:
        input_shape: Shape of one input sample, e.g. ``(n_mels, n_frames, 1)``.
        dimensions: Number of units in the embedding (penultimate) layer.
        num_classes: Number of output classes. When omitted, the notebook-level
            global ``num_classes`` is used, which is what this function relied
            on implicitly before the argument existed.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.

    Raises:
        NameError: If ``num_classes`` is neither passed nor defined globally.
    """
    if num_classes is None:
        # Backward compatibility: older call sites pass only two arguments and
        # expect the class count to come from the notebook namespace.
        try:
            num_classes = globals()['num_classes']
        except KeyError:
            raise NameError(
                "num_classes is not defined; pass it to build_model() "
                "or define it before calling"
            ) from None

    model = tf.keras.Sequential([
        # explicit input spec instead of input_shape= on the first layer
        tf.keras.Input(shape=input_shape),

        # first round of feature extraction
        # capture low-level spatial features (ie textures)
        tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),

        # reduces spatial dimensions for next layer
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),

        # second round of feature extraction, more complex
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),

        # reduce spatial dimensions again
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),

        # convert 2d feature maps into 1d
        tf.keras.layers.Flatten(),

        # 1D representation -> embedding vector of size `dimensions`
        tf.keras.layers.Dense(dimensions, activation='relu'),

        # one softmax score per emotion class
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Build the spectrogram tensor and the raw emotion codes for everything under
# ravdess/, then show the stacked shape next to a single sample's shape.
X, y = load_data('ravdess/', max_pad_len)
(X.shape, X[0].shape)

In [None]:
# Turn the raw emotion codes into training targets: first consecutive integer
# class ids, then one-hot rows to match the softmax output and the
# categorical cross-entropy loss used for training.
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)
y_categorical = to_categorical(y_encoded)

In [None]:
# Model hyper-parameters: one output unit per distinct encoded label, a
# 256-unit embedding layer, and inputs of 128 mel bands x max_pad_len frames
# with a single channel.
num_classes = np.unique(y_encoded).size
dimensions = 256
input_shape = (128, max_pad_len, 1)

# Build, compile and train on the full data set.
model = build_model(input_shape, dimensions)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y_categorical, batch_size=32, epochs=10)

In [None]:
# Emotion code -> emotion name lookup, as given by the dataset (codes start at 1).
emotion_mapping = dict(enumerate(
    (
        'neutral',
        'calm',
        'happy',
        'sad',
        'angry',
        'fearful',
        'disgust',
        'surprised',
    ),
    start=1,
))

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# One throw-away forward pass so the Sequential model has been called before
# its symbolic input and layer outputs are referenced below.
_ = model.predict(X)

# The second-to-last layer is the Dense embedding layer.
embeddings = model.layers[-2].output

# Sub-model that stops at the embedding layer; run it on the training data.
model_embedding = tf.keras.Model(inputs=model.input, outputs=embeddings)
X_embed = model_embedding.predict(X)

# reduce down to 2dim space
tsne = TSNE(n_components=2, random_state=42)
X_reduced = tsne.fit_transform(X_embed)

# Name each encoded class id from the encoder's own class list, so the
# colorbar labels always line up with the ids that actually occur in
# y_encoded (a hard-coded list of all eight emotions would misalign or fail
# if any class were missing from the data).
class_names = [emotion_mapping[int(code)] for code in encoder.classes_]

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_encoded, cmap='viridis')

cbar = plt.colorbar(scatter, ticks=range(len(class_names)))
cbar.ax.set_yticklabels(class_names)

plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('EMBEDDINGS REDUCED TO 2 DIMENSIONS')
plt.show()

In [None]:
import pandas as pd
import plotly.express as px

# Same embeddings, projected to three t-SNE dimensions for an interactive plot.
tsne_3d = TSNE(n_components=3, random_state=42)
X_reduced_3d = tsne_3d.fit_transform(X_embed)

# Emotion name for every encoded class id. `emotion_labels` was referenced
# below without ever being defined, which raised NameError. encoder.classes_
# holds the original string codes in encoded-id order, so indexing this array
# with y_encoded yields one emotion name per sample.
emotion_labels = np.array([emotion_mapping[int(code)] for code in encoder.classes_])

fig = px.scatter_3d(
    x=X_reduced_3d[:, 0],
    y=X_reduced_3d[:, 1],
    z=X_reduced_3d[:, 2],
    color=emotion_labels[y_encoded],
    title='3D NOW???'
)

fig.update_traces(marker=dict(size=5))
fig.show()