# **ELEC 378 FINAL PROJECT: SPEECH EMOTION CLASSIFICATION**
* Team JARL
* Jasmine Lee, Arielle Sanford, Robert Heeter, Lindsey Russ
* ELEC 378: Machine Learning: Concepts & Techniques
* Rice University

* Submitted 2 May 2023

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt

import librosa


## **Get training dataset and calculate MFCCs**

In [None]:
# Folder that holds the labelled training clips (relative to the working directory).
directory = os.path.join(os.getcwd(),'elec-378-sp2023-speech-emotion-classification/data/data/')

# Emotion name -> integer class id used as the classification target.
emotion_to_id = {
    "angry" : 0,
    "calm" : 1,
    "disgust" : 2,
    "fearful" : 3,
    "happy" : 4,
    "neutral" : 5,
    "sad" : 6,
    "surprised" : 7
}

# Collect one ("/<filename>", class id) pair per regular file. The listing is
# sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the later seeded train/test split differ between machines.
rows = []
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)

    if os.path.isfile(f):
        # The emotion label is the file name without its last 7 characters
        # (presumably a numeric suffix plus the extension -- confirm against the dataset).
        emotion = filename[:len(filename)-7]
        rows.append(("/"+filename, int(emotion_to_id[emotion])))

# Size the table from the files actually found instead of a fixed row count,
# so a different number of clips neither overflows it nor leaves empty rows.
data = np.empty((len(rows), 2), dtype=object)
for i, (name, label) in enumerate(rows):
    data[i][0] = name
    data[i][1] = label

def make_mfcc(file, n_mfcc):
    """Return the MFCCs of an audio file averaged along axis 1.

    Args:
        file: path of the audio file, passed to ``librosa.load``.
        n_mfcc: number of MFCC coefficients requested from librosa.

    Returns:
        The mean of the MFCC matrix along axis 1 (one value per coefficient,
        assuming librosa's (n_mfcc, frames) layout).
    """
    waveform, sample_rate = librosa.load(file)
    coefficients = librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        S=None,
        htk=True,
    )
    return np.mean(coefficients, axis=1)

# Number of MFCC coefficients kept per clip (also the feature-vector length).
n_mfcc = 38

num_clips = len(data)
X = np.empty((num_clips, n_mfcc), dtype=float)
y = np.empty(num_clips, dtype=int)

# Each row of `data` is (relative file name, class id): turn the clip into its
# averaged MFCC vector and copy the label across.
for row, (rel_name, label) in enumerate(data):
    X[row] = make_mfcc(directory + rel_name, n_mfcc=n_mfcc)
    y[row] = label

X_train, y_train = X, y


## **Get testing dataset and calculate MFCCs (FOR KAGGLE)**

In [None]:
# directory = os.path.join(os.getcwd(),'elec-378-sp2023-speech-emotion-classification/test/test/')
# data = np.empty((315, 2), dtype=object)

# emotion_to_id = {
#     "angry" : 0,
#     "calm" : 1,
#     "disgust" : 2,
#     "fearful" : 3,
#     "happy" : 4,
#     "neutral" : 5,
#     "sad" : 6,
#     "surprised" : 7
# }

# i = 0
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)

#     if os.path.isfile(f):
#         data[i][0] = "/"+filename
#         i += 1

# def make_mfcc(file, n_mfcc):
#     sig, sr = librosa.load(file)
#     sig_mfcc = librosa.feature.mfcc(y=sig, sr=sr, n_mfcc=n_mfcc, S=None, htk=True)
#     sig_mfcc_avg = np.mean(sig_mfcc, axis=1)

#     return sig_mfcc_avg

# n_mfcc = 3

# X = np.empty((len(data), n_mfcc), dtype=float)

# for i in range(len(data)):
#     file = directory + data[i][0]
#     X[i] = make_mfcc(file, n_mfcc=n_mfcc)

# X_test = X


## **Split train/test data (NOT FOR KAGGLE)**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled clips for evaluation, with a fixed seed for the shuffle.
split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split


## **Support vector machine (SVM)**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM penalty parameter. It is kept in a named variable because the summary
# line at the end of this cell reports it.
c = 20

# Robust-scale the features, then fit an SVC with the penalty above.
clf = make_pipeline(RobustScaler(), SVC(C=c, tol=0.001))
clf.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"n_mfcc: {n_mfcc}, c: {c}, acc: {accuracy*100}%")


## **Multilayer perceptron (MLP)**

In [None]:
from sklearn.neural_network import MLPClassifier

# Hyperparameters handed to MLPClassifier as keyword arguments.
mlp_params = dict(
    activation='relu',
    solver='lbfgs',
    hidden_layer_sizes=1283,
    alpha=0.3849485717707319,
    batch_size=163,
    learning_rate='constant',
    max_iter=1000,
)

# Robust-scale the features, train the perceptron, then score it on the held-out split.
clf = make_pipeline(RobustScaler(), MLPClassifier(**mlp_params))
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{accuracy*100}%")


## **Convolutional neural network (CNN)**

In [None]:
import tensorflow as tf
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import confusion_matrix

# Start from an empty Sequential model; its layers are appended with model.add(...) later in this cell.
model = Sequential()

# model2.add(layers.Conv2D(64, (4, 4), activation='relu', kernel_regularizer=regularizers.l2(l=0.01)))
# model2.add(layers.MaxPooling2D((2, 2)))# Hidden Layer 2
# model2.add(layers.Conv2D(128, (3, 3), activation='relu', kernel_regularizer=regularizers.l2(l=0.01)))
# model2.add(layers.MaxPooling2D((2,2)))

# model3.add(layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', input_shape=(96, 96, 3)))

# random.seed(123) # Establish consistency in results
# model4 = Sequential() # Instantiate the 4th model
# model4.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)))
# model4.add(layers.MaxPooling2D((2, 2)))
# model4.add(Dropout(0.4))
# model4.add(layers.Conv2D(64, (4, 4), activation='relu'))
# model4.add(layers.MaxPooling2D((2, 2)))
# model4.add(Dropout(0.4)) # Flattening- Convert 2D matrix to a 1D vector
# model4.add(layers.Flatten())
# model4.add(layers.Dense(512, activation = 'relu'))
# model4.add(Dropout(0.2))
# model4.add(layers.Dense(1, activation='sigmoid'))

# model5.add(layers.Conv2D(32, (3, 3), activation='relu', kernel_constraint=unit_norm(), input_shape=(96, 96, 3)))


# model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# model.add(Activation('relu'))

# model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# model.add(BatchNormalization())
# model.add(Activation('relu'))
# # model.add(Dropout(0.4))

# model.add(MaxPooling1D(pool_size=8))

# model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# model.add(Activation('relu'))

# model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# model.add(Activation('relu'))

# model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# model.add(Activation('relu'))

# # model.add(Conv1D(128, 16, padding='same', input_shape=(40,1)))
# # model.add(BatchNormalization())
# # model.add(Activation('relu'))

# # model.add(MaxPooling1D(pool_size=5))

# # model.add(Conv1D(128, 5, padding='same', input_shape=(40,1)))
# # model.add(Activation('relu'))

# # model.add(Conv1D(128, 5, padding='same', input_shape=(40,1)))
# # model.add(Activation('relu'))

# # model.add(Dropout(0.2))
# model.add(Flatten())
# # model.add(Dense(10, kernel_regularizer='l2', bias_regularizer='l2'))
# model.add(Dense(8))
# # model.add(Activation('softmax'))

# opt = keras.optimizers.RMSprop(learning_rate=0.0001, rho=0.9, epsilon=None, decay=0.0)


# model.add(Conv1D(128, 5, padding='same', input_shape=(40,1)))
# model.add(Activation('relu'))
# model.add(Dropout(0.1))
# model.add(MaxPooling1D(pool_size=(8)))
# model.add(Conv1D(128, 5, padding='same',))
# model.add(Activation('relu'))
# model.add(Dropout(0.1))
# model.add(Flatten())
# model.add(Dense(10))
# model.add(Activation('softmax'))
# opt = keras.optimizers.RMSprop(learning_rate=0.0005, rho=0.9, epsilon=None, decay=0.0)

# The network consumes one averaged MFCC vector per clip, so the input length
# is tied to n_mfcc instead of a separate hard-coded number that can drift
# out of sync with the extracted features.
model.add(Conv1D(128, 5, padding='same', input_shape=(n_mfcc, 1)))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(128, 5, padding='same',))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Flatten())
# One output unit per emotion class: extra units could win the argmax and
# yield a class id that has no entry in the emotion mapping.
model.add(Dense(len(emotion_to_id)))
model.add(Activation('softmax'))
# `learning_rate` replaces the deprecated `lr` keyword; the former
# `epsilon=None` / `decay=0.0` arguments only restated the defaults.
opt = keras.optimizers.RMSprop(learning_rate=0.00005, rho=0.9)


In [None]:
# Compile with the sparse categorical cross-entropy loss, the optimizer built
# above, and accuracy as the reported metric.
compile_kwargs = {
    'loss': 'sparse_categorical_crossentropy',
    'optimizer': opt,
    'metrics': ['accuracy'],
}
model.compile(**compile_kwargs)


In [None]:
# Append a trailing axis of length 1 to the feature matrices
# (assumes X_train / X_test are 2-D arrays).
X_train_cnn = X_train[:, :, np.newaxis]
X_test_cnn = X_test[:, :, np.newaxis]

# Train for 60 epochs in mini-batches of 4, validating on the held-out split.
cnnhistory = model.fit(
    X_train_cnn,
    y_train,
    validation_data=(X_test_cnn, y_test),
    batch_size=4,
    epochs=60,
)


In [None]:
# Training loss first, validation loss second, matching the legend order.
for curve in ('loss', 'val_loss'):
    plt.plot(cnnhistory.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
# Training accuracy first, validation accuracy second, matching the legend order.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(cnnhistory.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('acc')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
# Accuracy as reported by Keras on the held-out split.
loss, accuracy = model.evaluate(X_test_cnn, y_test)
print(f"{accuracy*100}%")

# Per-class scores from the network, reduced to the highest-scoring class id per clip.
class_scores = model.predict(X_test_cnn)
print(np.shape(class_scores))
y_pred = np.argmax(class_scores, axis=1)
print(y_pred)

# The same accuracy recomputed with scikit-learn from the predicted class ids.
accuracy = accuracy_score(y_test, y_pred)
print(f"{accuracy*100}%")


## **Logistic regression**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression

# Robust-scale the features, then fit a logistic regression with a tight
# tolerance and a generous iteration budget.
logreg = LogisticRegression(tol=0.00001, max_iter=10000)
clf = make_pipeline(RobustScaler(), logreg)
clf.fit(X_train, y_train)


## **k-nearest neighbors**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier

# Robust-scale the features, then fit a k-nearest-neighbours classifier with k = 8.
knn = KNeighborsClassifier(n_neighbors = 8)
clf = make_pipeline(RobustScaler(), knn)
clf.fit(X_train, y_train)


## **Other models**

In [None]:
import pandas as pd
import numpy as np
import os
import random
import sys
import glob 
import librosa
import librosa.display
import matplotlib.pyplot as plt
# import seaborn as sns

# from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import unique_labels
from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# import lightgbm as lgb
# import xgboost as xgb
# import optuna
# from tqdm import tqdm

# import warnings
# warnings.filterwarnings('ignore')


In [None]:
def extract_feature(file_name):
    """Build one feature vector for an audio file.

    The vector is the concatenation, in this order, of the frame-averaged
    MFCCs (40 coefficients requested), the frame-averaged chroma computed
    from the STFT magnitude, and the frame-averaged mel spectrogram.

    Args:
        file_name: path of the audio file, passed to ``librosa.load``.

    Returns:
        A 1-D numpy array holding the concatenated features.
    """
    signal, sample_rate = librosa.load(file_name)
    magnitude = np.abs(librosa.stft(signal))

    def frame_mean(feature_matrix):
        # Average over axis 0 of the transposed matrix, i.e. across its columns.
        return np.mean(feature_matrix.T, axis=0)

    mfccs = frame_mean(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40))
    chroma = frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))
    mel = frame_mean(librosa.feature.melspectrogram(y=signal, sr=sample_rate))

    # The empty array seeds the stack so the result keeps numpy's default float dtype.
    return np.hstack((np.array([]), mfccs, chroma, mel))


In [None]:
# Emotion name -> integer class id; ids follow the alphabetical order of the names.
emotions = {
    label: index
    for index, label in enumerate(
        (
            "angry",
            "calm",
            "disgust",
            "fearful",
            "happy",
            "neutral",
            "sad",
            "surprised",
        )
    )
}


In [None]:
def load_data(test_size=0.2, directory=None):
    """Extract features for every clip in a folder and split them into train/test sets.

    Args:
        test_size: fraction of the clips held out for testing, forwarded to
            ``train_test_split``.
        directory: folder containing the labelled clips. Defaults to the
            project's training-data folder under the current working directory.

    Returns:
        The ``train_test_split`` result for (features, labels) with
        ``random_state=9``: X_train, X_test, y_train, y_test.

    Raises:
        KeyError: if a file name does not start with a known emotion label.
    """
    if directory is None:
        directory = os.path.join(os.getcwd(),'elec-378-sp2023-speech-emotion-classification/data/data/')

    x,y=[],[]

    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # split reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        file = os.path.join(directory, filename)

        # Skip sub-directories and other non-file entries, as the first
        # data-loading cell of this notebook does.
        if not os.path.isfile(file):
            continue

        # The emotion label is the file name without its last 7 characters.
        emotion=emotions[filename[:len(filename)-7]]
        feature=extract_feature(file)
        x.append(feature)
        y.append(emotion)

    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)


In [None]:
# Rebuild the train/test split from the richer MFCC + chroma + mel features.
X_train, X_test, y_train, y_test = load_data()

train_count, test_count = X_train.shape[0], X_test.shape[0]
print((train_count, test_count))

# np.set_printoptions(threshold=np.inf)
print(X_test.shape)
print(X_train[0])

feature_count = X_train.shape[1]
print(f'Features extracted: {feature_count}')


In [None]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


In [None]:
# Hyperparameters handed to MLPClassifier as keyword arguments.
mlp_params = dict(
    activation='relu',
    solver='lbfgs',
    hidden_layer_sizes=1283,
    alpha=0.3849485717707319,
    batch_size=163,
    learning_rate='constant',
    max_iter=1000,
)


In [None]:
# clf_model = MLPClassifier(**mlp_params)
# clf_model.fit(X_train, y_train)
# y_pred = clf_model.predict(X_test)

from sklearn import preprocessing

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score


# Learn the standardization statistics from the training features only and
# apply that same transform to the held-out features. Fitting a second scaler
# on the test set would put the two sets on different scales and leak test
# statistics into the evaluation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the perceptron on the standardized features and score the held-out split.
clf = MLPClassifier(**mlp_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(y_pred)
print(f"{accuracy*100}%")


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the MLP and XGBoost estimators.
# NOTE(review): `models` is not defined anywhere in the visible notebook; it
# must map 'mlp' and 'xgb' to estimators before this cell can run -- confirm
# where it is built.
v4_params = {'estimators':[('mlp', models['mlp']),
                          ('xgb', models['xgb'])],
            'voting':'soft'}

# Standardize with statistics learned from the training features only, and
# reuse the same transform for the held-out features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = make_pipeline(RobustScaler(), VotingClassifier(**v4_params))
clf.fit(X_train, y_train)
# Predict with the ensemble fitted above (the previous `clf_model` name is
# never assigned in this notebook).
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(y_pred)
print(f"{accuracy*100}%")


## **Exporting predictions (FOR KAGGLE)**

In [None]:
import pandas as pd

# Invert the label mapping so predicted class ids become emotion names.
id_to_emotion = dict((v, k) for k, v in emotion_to_id.items())
y_pred = [id_to_emotion[x] for x in y_pred]

print(np.shape(y_pred))
print(y_pred)

# Entries of data[:, 0] look like "/<name>.wav": drop the leading slash and
# the last four characters to recover the bare clip name.
names = [x[1:len(x)-4] for x in data[:,0]]

# zip() stops at the shorter sequence, so a length mismatch (e.g. predictions
# made on a held-out split instead of the full Kaggle test set) would silently
# pair file names with the wrong labels.
if len(names) != len(y_pred):
    raise ValueError(
        f"Cannot export predictions: {len(names)} file names but {len(y_pred)} predictions"
    )

df = pd.DataFrame(list(zip(names, y_pred)), columns=['filename', 'label'])
df.to_csv("y_kaggle_svm16.csv", index=False)
