In [2]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift


# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define Residual Block
def residual_block(x, filters, kernel_size, strides=1, activation='relu'):
    """Apply a two-convolution 1-D residual block to tensor ``x``.

    Main path: Conv1D -> BatchNorm -> activation -> Conv1D -> BatchNorm.
    The input is added back to the main path and the sum is activated.

    Args:
        x: Input Keras tensor; its last axis is treated as the channel axis.
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel width of both convolutions.
        strides: Stride of the first convolution (and of the projection).
        activation: Activation name used inside the block and after the merge.

    Returns:
        The activated sum of the shortcut and the main path.
    """
    main_path = layers.Conv1D(filters, kernel_size, strides=strides, padding='same')(x)
    main_path = layers.BatchNormalization()(main_path)
    main_path = layers.Activation(activation)(main_path)

    main_path = layers.Conv1D(filters, kernel_size, padding='same')(main_path)
    main_path = layers.BatchNormalization()(main_path)

    # The identity shortcut only works when length and channel count are
    # untouched; otherwise project the input with a 1x1 convolution.
    shortcut = x
    shapes_match = strides == 1 and x.shape[-1] == filters
    if not shapes_match:
        shortcut = layers.Conv1D(filters, 1, strides=strides, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    # Merge the two paths, then activate
    merged = layers.Add()([shortcut, main_path])
    return layers.Activation(activation)(merged)

# Define the 1-D residual classifier
# NOTE(review): the original heading calls this "VQ-MAE-S ResNet"; nothing in
# this function performs vector quantisation or masked auto-encoding.
def build_resnet(input_shape, num_classes):
    """Assemble an uncompiled 1-D residual CNN classifier.

    Layout: Conv1D(32) stem + pooling, two residual stages (64 and 128
    filters) each followed by 2x max pooling, then Flatten -> Dense(128) ->
    Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Stem
    net = layers.Conv1D(32, 3, activation='relu', padding='same')(inputs)
    net = layers.MaxPooling1D(2)(net)

    # Residual stages, each halving the sequence length afterwards
    for stage_filters in (64, 128):
        net = residual_block(net, filters=stage_filters, kernel_size=3)
        net = layers.MaxPooling1D(2)(net)

    # Classification head
    net = layers.Flatten()(net)
    net = layers.Dense(128, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_model = build_resnet(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
)

# Compile for multi-class classification against one-hot targets
resnet_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary
resnet_model.summary()

# Train the model, holding out 20% of the training arrays for validation
resnet_history = resnet_model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
)

# Evaluate the model on test data
test_loss, test_acc = resnet_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128, 1)]             0         []                            
                                                                                                  
 conv1d (Conv1D)             (None, 128, 32)              128       ['input_1[0][0]']             
                                                                                                  
 max_pooling1d (MaxPooling1  (None, 64, 32)               0         ['conv1d[0][0]']              
 D)                                                                                               
                                                                                                  
 conv1d_1 (Conv1D)           (None, 64, 64)               6208      ['max_pooling1d[0][0]']   

In [3]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift
import tensorflow as tf

# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define Residual Block
def residual_block(x, filters, kernel_size, strides=1, activation='relu'):
    """Apply a two-convolution 1-D residual block to tensor ``x``.

    Main path: Conv1D -> BatchNorm -> activation -> Conv1D -> BatchNorm.
    The input is added back to the main path and the sum is activated.

    Args:
        x: Input Keras tensor; its last axis is treated as the channel axis.
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel width of both convolutions.
        strides: Stride of the first convolution (and of the projection).
        activation: Activation name used inside the block and after the merge.

    Returns:
        The activated sum of the shortcut and the main path.
    """
    main_path = layers.Conv1D(filters, kernel_size, strides=strides, padding='same')(x)
    main_path = layers.BatchNormalization()(main_path)
    main_path = layers.Activation(activation)(main_path)

    main_path = layers.Conv1D(filters, kernel_size, padding='same')(main_path)
    main_path = layers.BatchNormalization()(main_path)

    # The identity shortcut only works when length and channel count are
    # untouched; otherwise project the input with a 1x1 convolution.
    shortcut = x
    shapes_match = strides == 1 and x.shape[-1] == filters
    if not shapes_match:
        shortcut = layers.Conv1D(filters, 1, strides=strides, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    # Merge the two paths, then activate
    merged = layers.Add()([shortcut, main_path])
    return layers.Activation(activation)(merged)

# Define the 1-D residual classifier (wider variant: 64 / 128 / 256 filters)
# NOTE(review): the original heading calls this "VQ-MAE-S ResNet"; nothing in
# this function performs vector quantisation or masked auto-encoding.
def build_resnet(input_shape, num_classes):
    """Assemble an uncompiled 1-D residual CNN classifier.

    Layout: Conv1D(64) stem + pooling, two residual stages (128 and 256
    filters) each followed by 2x max pooling, then Flatten -> Dense(256) ->
    Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Stem
    net = layers.Conv1D(64, 3, activation='relu', padding='same')(inputs)
    net = layers.MaxPooling1D(2)(net)

    # Residual stages, each halving the sequence length afterwards
    for stage_filters in (128, 256):
        net = residual_block(net, filters=stage_filters, kernel_size=3)
        net = layers.MaxPooling1D(2)(net)

    # Classification head
    net = layers.Flatten()(net)
    net = layers.Dense(256, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_model = build_resnet(input_shape=X_train_reshaped.shape[1:], num_classes=num_classes)

# Adjust learning rate.
# The keyword must be `learning_rate`: `lr` is a deprecated alias that newer
# Keras optimizers warn about and drop (or reject outright), which would leave
# the optimizer at its default rate instead of the intended 1e-4.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Early stopping: halt after 5 epochs without val_loss improvement and roll
# the model back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Compile the model for multi-class classification against one-hot targets
resnet_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
resnet_model.summary()

# Train the model; 20% of the training arrays are held out for validation,
# which is what the early-stopping callback monitors
resnet_history = resnet_model.fit(
    X_train_reshaped,
    y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluate the model on test data
test_loss, test_acc = resnet_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)




Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 128, 1)]             0         []                            
                                                                                                  
 conv1d_7 (Conv1D)           (None, 128, 64)              256       ['input_2[0][0]']             
                                                                                                  
 max_pooling1d_3 (MaxPoolin  (None, 64, 64)               0         ['conv1d_7[0][0]']            
 g1D)                                                                                             
                                                                                                  
 conv1d_8 (Conv1D)           (None, 64, 128)              24704     ['max_pooling1d_3[0][0]'

In [4]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift
import tensorflow as tf

# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define a learning rate scheduler: halve the rate after 3 epochs without
# improvement of the monitored quantity (the callback's default monitor is used)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3)

# Early stopping: halt after 5 epochs without val_loss improvement and roll
# the model back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define Residual Block
def residual_block(x, filters, kernel_size, strides=1, activation='relu'):
    """Apply a two-convolution 1-D residual block to tensor ``x``.

    Main path: Conv1D -> BatchNorm -> activation -> Conv1D -> BatchNorm.
    The input is added back to the main path and the sum is activated.

    Args:
        x: Input Keras tensor; its last axis is treated as the channel axis.
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel width of both convolutions.
        strides: Stride of the first convolution (and of the projection).
        activation: Activation name used inside the block and after the merge.

    Returns:
        The activated sum of the shortcut and the main path.
    """
    main_path = layers.Conv1D(filters, kernel_size, strides=strides, padding='same')(x)
    main_path = layers.BatchNormalization()(main_path)
    main_path = layers.Activation(activation)(main_path)

    main_path = layers.Conv1D(filters, kernel_size, padding='same')(main_path)
    main_path = layers.BatchNormalization()(main_path)

    # The identity shortcut only works when length and channel count are
    # untouched; otherwise project the input with a 1x1 convolution.
    shortcut = x
    shapes_match = strides == 1 and x.shape[-1] == filters
    if not shapes_match:
        shortcut = layers.Conv1D(filters, 1, strides=strides, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    # Merge the two paths, then activate
    merged = layers.Add()([shortcut, main_path])
    return layers.Activation(activation)(merged)

# Define the 1-D residual classifier (deeper variant: three residual stages)
# NOTE(review): the original heading calls this "VQ-MAE-S ResNet"; nothing in
# this function performs vector quantisation or masked auto-encoding.
def build_resnet(input_shape, num_classes):
    """Assemble an uncompiled 1-D residual CNN classifier.

    Layout: Conv1D(64) stem + pooling, three residual stages (128, 256 and
    512 filters) each followed by 2x max pooling, then Flatten ->
    Dense(512) -> Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Stem
    net = layers.Conv1D(64, 3, activation='relu', padding='same')(inputs)
    net = layers.MaxPooling1D(2)(net)

    # Residual stages, each halving the sequence length afterwards
    for stage_filters in (128, 256, 512):
        net = residual_block(net, filters=stage_filters, kernel_size=3)
        net = layers.MaxPooling1D(2)(net)

    # Classification head
    net = layers.Flatten()(net)
    net = layers.Dense(512, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_model = build_resnet(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
)

# Compile the model with Adam at a 1e-4 learning rate.
# (This is tf.keras.optimizers.Adam, not the tf.keras.optimizers.legacy variant.)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
resnet_model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Print model summary
resnet_model.summary()

# Train the model with the learning rate scheduler and early stopping;
# 20% of the training arrays are held out for validation
resnet_history = resnet_model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=100,
    callbacks=[lr_scheduler, early_stopping],
)

# Evaluate the model on test data
test_loss, test_acc = resnet_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)




Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 128, 1)]             0         []                            
                                                                                                  
 conv1d_14 (Conv1D)          (None, 128, 64)              256       ['input_3[0][0]']             
                                                                                                  
 max_pooling1d_6 (MaxPoolin  (None, 64, 64)               0         ['conv1d_14[0][0]']           
 g1D)                                                                                             
                                                                                                  
 conv1d_15 (Conv1D)          (None, 64, 128)              24704     ['max_pooling1d_6[0][0]'

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift
import tensorflow as tf

# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define the 1-D convolutional classifier
# NOTE(review): despite the name (and the original "VQ-MAE-S ResNet" heading),
# this network has no residual/skip connections; it is a plain Conv1D stack.
def build_resnet(input_shape, num_classes):
    """Assemble an uncompiled 1-D CNN classifier.

    Layout: four Conv1D(relu, kernel 3, 'same') + 2x max-pooling stages with
    64, 128, 256 and 512 filters, then Flatten -> Dense(512) ->
    Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Convolutional trunk: every stage doubles the filters and halves the length
    net = inputs
    for stage_filters in (64, 128, 256, 512):
        net = layers.Conv1D(stage_filters, 3, activation='relu', padding='same')(net)
        net = layers.MaxPooling1D(2)(net)

    # Classification head
    net = layers.Flatten()(net)
    net = layers.Dense(512, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_model = build_resnet(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
)

# Compile for multi-class classification against one-hot targets
resnet_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary
resnet_model.summary()

# Train the model, holding out 20% of the training arrays for validation
history = resnet_model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
)

# Evaluate the model on test data
test_loss, test_acc = resnet_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 128, 1)]          0         
                                                                 
 conv1d_24 (Conv1D)          (None, 128, 64)           256       
                                                                 
 max_pooling1d_10 (MaxPooli  (None, 64, 64)            0         
 ng1D)                                                           
                                                                 
 conv1d_25 (Conv1D)          (None, 64, 128)           24704     
                                                                 
 max_pooling1d_11 (MaxPooli  (None, 32, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_26 (Conv1D)          (None, 32, 256)           9856

In [6]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
import tensorflow as tf

# The augmentation classes are used below but were missing from this cell's
# imports, so the cell only ran when an earlier cell had already imported them.
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift

# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define the 1-D convolutional classifier with a dropout-regularised head
# NOTE(review): despite the name (and the original "VQ-MAE-S ResNet" heading),
# this network has no residual/skip connections, and early stopping is
# configured where the model is trained, not in this function.
def build_resnet_dropout(input_shape, num_classes):
    """Assemble an uncompiled 1-D CNN classifier with dropout before the output.

    Layout: four Conv1D(relu, kernel 3, 'same') + 2x max-pooling stages with
    64, 128, 256 and 512 filters, then Flatten -> Dense(512) ->
    Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Convolutional trunk: every stage doubles the filters and halves the length
    net = inputs
    for stage_filters in (64, 128, 256, 512):
        net = layers.Conv1D(stage_filters, 3, activation='relu', padding='same')(net)
        net = layers.MaxPooling1D(2)(net)

    # Classification head; dropout regularises the dense layer
    net = layers.Flatten()(net)
    net = layers.Dense(512, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_dropout_model = build_resnet_dropout(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
)

# Compile for multi-class classification against one-hot targets
resnet_dropout_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary
resnet_dropout_model.summary()

# Define early stopping callback: halt after 5 epochs without val_loss
# improvement and roll the model back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train the model with early stopping; 20% of the training arrays are held
# out for validation
history = resnet_dropout_model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
)

# Evaluate the model on test data
test_loss, test_acc = resnet_dropout_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 128, 1)]          0         
                                                                 
 conv1d_28 (Conv1D)          (None, 128, 64)           256       
                                                                 
 max_pooling1d_14 (MaxPooli  (None, 64, 64)            0         
 ng1D)                                                           
                                                                 
 conv1d_29 (Conv1D)          (None, 64, 128)           24704     
                                                                 
 max_pooling1d_15 (MaxPooli  (None, 32, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_30 (Conv1D)          (None, 32, 256)           9856

In [7]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
import tensorflow as tf

# The augmentation classes are used below but were missing from this cell's
# imports, so the cell only ran when an earlier cell had already imported them.
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift

# Load the dataset manifest; the 'path' and 'labels' columns are read below
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Collect the labels up front so the train/test membership of every clip is
# known before any audio is loaded.
labels = df['labels'].tolist()
y = np.array(labels)

# Initialize a LabelEncoder object and map class names to integer ids
lb = LabelEncoder()
label_ids = lb.fit_transform(y)

# Perform one-hot encoding on the labels
num_classes = len(np.unique(y))
y_encoded = to_categorical(label_ids, num_classes=num_classes)

# Split row positions between train and test, stratified on the class ids.
# Splitting positions (rather than already-extracted features) lets the
# extraction loop augment training clips only, so the test accuracy is
# measured on unmodified audio instead of randomly perturbed audio.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.25, stratify=label_ids, random_state=42
)
is_train = np.zeros(len(labels), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset (rows stay in df order)
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Apply augmentation to training clips only
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1 so each clip is described by one value per Mel band
    features.append(np.mean(log_mel_spectrogram, axis=1))

# Convert features to a NumPy array
X = np.array(features)

# Materialise the split decided above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data only, then transform both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis so each sample has shape (n_features, 1) for Conv1D
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define the 1-D convolutional classifier with a dropout-regularised head
# NOTE(review): despite the name (and the original "VQ-MAE-S ResNet" heading),
# this network has no residual/skip connections, and early stopping is
# configured where the model is trained, not in this function.
def build_resnet_dropout(input_shape, num_classes):
    """Assemble an uncompiled 1-D CNN classifier with dropout before the output.

    Layout: four Conv1D(relu, kernel 3, 'same') + 2x max-pooling stages with
    64, 128, 256 and 512 filters, then Flatten -> Dense(512) ->
    Dropout(0.5) -> softmax.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras functional ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Convolutional trunk: every stage doubles the filters and halves the length
    net = inputs
    for stage_filters in (64, 128, 256, 512):
        net = layers.Conv1D(stage_filters, 3, activation='relu', padding='same')(net)
        net = layers.MaxPooling1D(2)(net)

    # Classification head; dropout regularises the dense layer
    net = layers.Flatten()(net)
    net = layers.Dense(512, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(num_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

# Build the model; the per-sample input shape is read off the training tensor
resnet_dropout_model = build_resnet_dropout(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
)

# Compile for multi-class classification against one-hot targets
resnet_dropout_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary
resnet_dropout_model.summary()

# Define early stopping callback: halt after 5 epochs without val_loss
# improvement and roll the model back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train the model with early stopping; 20% of the training arrays are held
# out for validation
history = resnet_dropout_model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
)

# Evaluate the model on test data
test_loss, test_acc = resnet_dropout_model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)


Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 128, 1)]          0         
                                                                 
 conv1d_32 (Conv1D)          (None, 128, 64)           256       
                                                                 
 max_pooling1d_18 (MaxPooli  (None, 64, 64)            0         
 ng1D)                                                           
                                                                 
 conv1d_33 (Conv1D)          (None, 64, 128)           24704     
                                                                 
 max_pooling1d_19 (MaxPooli  (None, 32, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_34 (Conv1D)          (None, 32, 256)           9856

In [8]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
import tensorflow as tf
# Imported here too so the augmentation pipeline below does not rely on an
# earlier cell having been executed in the same kernel.
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift

# Load the table that maps each audio file ('path' column) to its class ('labels' column)
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Labels are encoded BEFORE feature extraction: the train/test membership of
# every row must be known up front so augmentation can be limited to training rows.
labels = list(df['labels'])
y = np.array(labels)

# Map class names to integer ids, then to one-hot vectors
lb = LabelEncoder()
y_encoded = lb.fit_transform(y)
num_classes = len(np.unique(y))
y_encoded = to_categorical(y_encoded, num_classes=num_classes)

# Split row indices instead of the feature matrix. The partition depends only on
# the number of samples, the stratification target and random_state, so the same
# rows land in train/test as when the features themselves are split.
sample_indices = np.arange(len(y))
train_idx, test_idx, y_train, y_test = train_test_split(
    sample_indices, y_encoded, test_size=0.25, stratify=y_encoded, random_state=42
)
is_train = np.zeros(len(y), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Augment training clips only; test clips stay untouched so that test
    # accuracy is measured on unmodified audio rather than on random perturbations.
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1, leaving one value per mel band as the feature vector
    feature = np.mean(log_mel_spectrogram, axis=1)
    features.append(feature)

# Convert features to a NumPy array and slice out the two partitions
X = np.array(features)
X_train, X_test = X[train_idx], X[test_idx]

# Standardise the features with statistics learned from the training split only,
# then apply that same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Append a trailing channel axis, (samples, features) -> (samples, features, 1),
# which is the layout the Conv1D input layer is declared with.
X_train_reshaped = X_train_scaled[:, :, np.newaxis]
X_test_reshaped = X_test_scaled[:, :, np.newaxis]

# Assemble the 1-D CNN layer by layer. The stack is: two 256-filter convolutions,
# three 128-filter convolutions, two 64-filter convolutions (all kernel size 10,
# 'same' padding), with batch normalisation and 3x max-pooling closing the first
# two groups, followed by a dense classification head.
input_length = X_train_reshaped.shape[1]

model = models.Sequential()

# Group 1: 256 filters
model.add(layers.Conv1D(filters=256, kernel_size=10, padding='same', input_shape=(input_length, 1)))
model.add(layers.Activation('relu'))
model.add(layers.Conv1D(filters=256, kernel_size=10, padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
model.add(layers.MaxPooling1D(pool_size=3))

# Group 2: 128 filters
model.add(layers.Conv1D(filters=128, kernel_size=10, padding='same'))
model.add(layers.Activation('relu'))
model.add(layers.Conv1D(filters=128, kernel_size=10, padding='same'))
model.add(layers.Activation('relu'))
model.add(layers.Conv1D(filters=128, kernel_size=10, padding='same'))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
model.add(layers.MaxPooling1D(pool_size=3))

# Group 3: 64 filters
model.add(layers.Conv1D(filters=64, kernel_size=10, padding='same'))
model.add(layers.Activation('relu'))
model.add(layers.Conv1D(filters=64, kernel_size=10, padding='same'))
model.add(layers.Activation('relu'))

# Classification head: flatten, one hidden dense layer with dropout, softmax over classes
model.add(layers.Flatten())
model.add(layers.Dense(units=128, activation='relu'))
model.add(layers.Dropout(rate=0.5))
model.add(layers.Dense(units=num_classes, activation='softmax'))

# Use Adam optimizer with a lower learning rate.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# TF 2.x Keras and is rejected outright by newer Keras releases.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Compile the model for multi-class classification on one-hot targets
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Train for 50 epochs, holding out 20% of the training arrays for validation
history = model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_36 (Conv1D)          (None, 128, 256)          2816      
                                                                 
 activation_14 (Activation)  (None, 128, 256)          0         
                                                                 
 conv1d_37 (Conv1D)          (None, 128, 256)          655616    
                                                                 
 batch_normalization_21 (Ba  (None, 128, 256)          1024      
 tchNormalization)                                               
                                                                 
 activation_15 (Activation)  (None, 128, 256)          0         
                                                                 
 max_pooling1d_22 (MaxPooli  (None, 42, 256)           0         
 ng1D)                                                  

In [9]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras import layers, models
import tensorflow as tf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift

# Load the table that maps each audio file ('path' column) to its class ('labels' column)
df = pd.read_csv("/Users/roshanscaria/Desktop/Audio Emotion/Data_path.csv")

# Define augmentation pipeline (every transform is configured with p=0.5)
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
    TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
    PitchShift(min_semitones=-5, max_semitones=5, p=0.5)
])

# Labels are encoded BEFORE feature extraction: the train/test membership of
# every row must be known up front so augmentation can be limited to training rows.
labels = list(df['labels'])
y = np.array(labels)

# Map class names to integer ids, then to one-hot vectors
lb = LabelEncoder()
y_encoded = lb.fit_transform(y)
num_classes = len(np.unique(y))
y_encoded = to_categorical(y_encoded, num_classes=num_classes)

# Split row indices instead of the feature matrix. The partition depends only on
# the number of samples, the stratification target and random_state, so the same
# rows land in train/test as when the features themselves are split.
sample_indices = np.arange(len(y))
train_idx, test_idx, y_train, y_test = train_test_split(
    sample_indices, y_encoded, test_size=0.25, stratify=y_encoded, random_state=42
)
is_train = np.zeros(len(y), dtype=bool)
is_train[train_idx] = True

# Loop feature extraction over the entire dataset
features = []
for row, path in enumerate(df['path']):
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    signal, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Augment training clips only; test clips stay untouched so that test
    # accuracy is measured on unmodified audio rather than on random perturbations.
    if is_train[row]:
        signal = augment(samples=signal, sample_rate=sample_rate)

    # Compute Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)

    # Convert to decibel scale (log-mel spectrogram)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Average along axis 1, leaving one value per mel band as the feature vector
    feature = np.mean(log_mel_spectrogram, axis=1)
    features.append(feature)

# Convert features to a NumPy array and slice out the two partitions
X = np.array(features)
X_train, X_test = X[train_idx], X[test_idx]

# Learn per-feature mean/variance on the training split alone, then run both
# splits through that one fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing channel axis, (samples, features) -> (samples, features, 1),
# matching the input shape the Conv1D stack is declared with.
X_train_reshaped = X_train_scaled[:, :, np.newaxis]
X_test_reshaped = X_test_scaled[:, :, np.newaxis]

# Describe the network as an ordered list of layers first, then hand the list to
# Sequential. Same 1-D CNN as before (256/128/64-filter convolution groups with
# kernel size 10), but the dense head here carries an L2 weight penalty and a
# higher dropout rate.
input_length = X_train_reshaped.shape[1]

layer_stack = [
    # Group 1: 256 filters, closed by batch norm + 3x max-pooling
    layers.Conv1D(filters=256, kernel_size=10, padding='same', input_shape=(input_length, 1)),
    layers.Activation('relu'),
    layers.Conv1D(filters=256, kernel_size=10, padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling1D(pool_size=3),

    # Group 2: 128 filters, closed by batch norm + 3x max-pooling
    layers.Conv1D(filters=128, kernel_size=10, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(filters=128, kernel_size=10, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(filters=128, kernel_size=10, padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling1D(pool_size=3),

    # Group 3: 64 filters
    layers.Conv1D(filters=64, kernel_size=10, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(filters=64, kernel_size=10, padding='same'),
    layers.Activation('relu'),

    # Classification head: L2-penalised dense layer, dropout, softmax over classes
    layers.Flatten(),
    layers.Dense(units=128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    layers.Dropout(rate=0.6),
    layers.Dense(units=num_classes, activation='softmax'),
]

model = models.Sequential(layer_stack)

# Use Adam optimizer with a lower learning rate.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# TF 2.x Keras and is rejected outright by newer Keras releases.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Compile the model for multi-class classification on one-hot targets
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Implement Early Stopping: give up after 10 epochs without a val_loss
# improvement and restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train the model with early stopping, holding out 20% of the training arrays
# for validation (this is the data val_loss is computed on).
history = model.fit(
    X_train_reshaped, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping]
)

# Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test_reshaped, y_test)
print("Test accuracy:", test_acc)




Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_43 (Conv1D)          (None, 128, 256)          2816      
                                                                 
 activation_21 (Activation)  (None, 128, 256)          0         
                                                                 
 conv1d_44 (Conv1D)          (None, 128, 256)          655616    
                                                                 
 batch_normalization_23 (Ba  (None, 128, 256)          1024      
 tchNormalization)                                               
                                                                 
 activation_22 (Activation)  (None, 128, 256)          0         
                                                                 
 max_pooling1d_24 (MaxPooli  (None, 42, 256)           0         
 ng1D)                                                