# Project: Speech Emotion Recognition

## Requirements
1. openSMILE

## 1. Preprocessing input data

### 1.1. Copying input wav files to data_path/input

Import the required modules.

In [3]:
import os
import shutil
from sklearn.model_selection import train_test_split
import csv
import numpy as np

Assuming all required files are within the "data" directory

In [6]:
# Root directory: the IEMOCAP corpus is read from here and label.csv is written here
data_path = "/work/peiyun/data"
# Staging directory that the utterance wav files are copied into
input_path = os.path.join(data_path, "input")

Create the input directory (data_path/input) if it does not exist.

In [17]:
# Create the staging directory on the first run; later runs find it already present.
input_dir_missing = not os.path.exists(input_path)
if input_dir_missing:
    os.makedirs(input_path)

Copying all input files to input_path. (10,039 utterances in total)

In [18]:
# Show where the wav files will be copied (parenthesised form runs on Python 2 and 3)
print(input_path)

/work/peiyun/data/input


In [19]:
# Walk Session1..Session5; inside each session the wav files are grouped per dialog
for session_id in range(1, 6):
    wav_root = os.path.join(data_path, "IEMOCAP", "Session" + str(session_id), "sentences", "wav")

    for dialog_name in os.listdir(wav_root):
        dialog_dir = os.path.join(wav_root, dialog_name)

        # keep only the utterance recordings and copy them flat into input_path
        wav_names = [entry for entry in os.listdir(dialog_dir) if entry.endswith(".wav")]
        for wav_name in wav_names:
            shutil.copy(os.path.join(dialog_dir, wav_name), os.path.join(input_path, wav_name))

### 1.2. Getting a dictionary of utterance labels

In [20]:
# Maps utterance name -> label string (e.g. "Ses05M_script03_2_M040" -> "ang")
label_dict = {}

# for each session
for session in range(1, 6):
    path = os.path.join(data_path, "IEMOCAP/Session" + str(session), "dialog", "EmoEvaluation")

    # for file in the session
    for filename in os.listdir(path):

        # only interested in "summary" txt files
        if not filename.endswith(".txt"):
            continue

        # `with` closes the file even when a line fails to parse
        with open(os.path.join(path, filename), "r") as f:
            for line in f:
                # lines starting with "[" are tab-separated records:
                # field 1 is the utterance name, field 2 its label
                if line.startswith("["):
                    name, label = line.split("\t")[1:3]
                    label_dict[name] = label

### 1.3. Split files into training, dev, and test sets (70% training, 15% test, 15% dev, seed = 100)
(seed = 100 for reproducibility)

In [21]:
# get filename list
# sorted: os.listdir returns entries in arbitrary order, and the seeded split below
# is only reproducible when its input order is stable
filename_ls = sorted(
    filename[:-4]  # [:-4] for removing .wav
    for filename in os.listdir(input_path)
    if filename.endswith(".wav")
)

# get corresponding label list (same order as filename_ls)
label_ls = [label_dict[filename] for filename in filename_ls]

# splitting into train (70%) and the remainder (30%)
filename_train, filename_remain, label_train, label_remain = train_test_split(filename_ls, label_ls,
                                                                              train_size=0.7, random_state=100,
                                                                              shuffle = True)

# splitting the remainder evenly into dev and test (15% of the data each)
filename_dev, filename_test, label_dev, label_test = train_test_split(filename_remain, label_remain,
                                                                      test_size=0.5, random_state=100,
                                                                      shuffle = True)

### 1.4. Writing labels into csv file (7027 train instances, 1506 test instances, 1506 dev instances)

In [22]:
# One row per utterance: name, split ("train" / "test" / "dev"), label
with open(os.path.join(data_path, "label.csv") , mode='w') as label_file:
    writer = csv.writer(label_file, delimiter=",")

    # rows are written split by split: train first, then test, then dev
    for data_type, filenames in (("train", filename_train),
                                 ("test", filename_test),
                                 ("dev", filename_dev)):
        for filename in filenames:
            writer.writerow([filename, data_type, label_dict[filename]])
    # no explicit close(): leaving the with-block closes label_file

### 1.5. Move files into test, dev, and train directories

Create the train, test, and dev directories for input instances if they do not exist.

In [23]:
# One sub-directory of input_path per split; existing directories are left alone
for split_name in ("train", "test", "dev"):
    split_dir = os.path.join(input_path, split_name)
    if os.path.exists(split_dir):
        continue
    os.makedirs(split_dir)

Move each file into the directory of its split.

In [24]:
# Relocate every staged wav from input_path into the sub-directory of its split
# (train first, then test, then dev)
for split_name, split_files in (("train", filename_train),
                                ("test", filename_test),
                                ("dev", filename_dev)):
    for utterance in split_files:
        wav_name = utterance + ".wav"
        shutil.move(os.path.join(input_path, wav_name), os.path.join(input_path, split_name, wav_name))

## 2. Feature Extraction with openSMILE

Script file for feature extraction.

In [4]:
%%file feature.py

# Import the required modules
import argparse
import os
from subprocess import call
import csv
import sys
import numpy as np
from time import gmtime, strftime, time

# Global variables
data_path = "/work/peiyun/data"

# Get the ground-truth label of the file
def get_label(label_file, filename):
    """Look up the label recorded for `filename` in the label CSV.

    Args:
        label_file: path of a CSV file whose rows are (name, data_type, label).
        filename: utterance name, compared against the first column.

    Returns:
        The label string of the first matching row, or None when no row matches.
    """
    with open(label_file, mode = "r") as f:
        for row in csv.reader(f):
            # blank or malformed rows are skipped instead of breaking the unpack below
            if len(row) != 3:
                continue
            name, data_type, label = row
            if name == filename:
                return label
    return None

# Check input and output directories
def check_dirs(args):
    """Validate the input directory tree and create the output directory tree.

    Args:
        args: parsed command-line namespace; `input_dir`, `output_dir` and `config`
            are read. All directories are resolved relative to `data_path`.

    Returns:
        False (after printing an error) when the input directory or one of its
        train/test/dev sub-directories is missing; True otherwise, after making sure
        `<output_dir>/<split>/<config name>` exists for every split.
    """
    split_names = ["train", "test", "dev"]

    # Check input directory (if not exist -> error)
    input_root = os.path.join(data_path, args.input_dir)
    if not os.path.exists(input_root):
        print("Error: input directory not exist")
        return False
    for data_type in split_names:
        if not os.path.exists(os.path.join(input_root, data_type)):
            print("Error: input directory missing train, test or dev directories")
            return False

    # Check output directory (if not exist -> create one)
    config_name = args.config[:-5]  # [:-5] for removing .conf
    for data_type in split_names:
        config_path = os.path.join(data_path, args.output_dir, data_type, config_name)
        if not os.path.exists(config_path):
            # makedirs also creates the missing parents (output_dir and output_dir/<split>)
            os.makedirs(config_path)

    return True

# Function for extracting features with openSMILE (Return whether succeeded)
def extract_features(args):
    """Run SMILExtract on every wav file under the train/test/dev input directories.

    Args:
        args: parsed command-line namespace; `input_dir`, `output_dir`, `config`
            and `label` are read. All paths are resolved relative to `data_path`.

    Returns:
        False when check_dirs() rejects the directories, True once all files have
        been visited. Files without a label and non-zero SMILExtract exit codes are
        reported with a warning and do not change the return value.
    """
    if not check_dirs(args):
        return False   # failed to read inputs

    config_name = args.config[:-5]  # [:-5] for removing .conf
    config = os.path.join(data_path, "config", args.config)

    # Read the label file once instead of re-scanning it for every wav file
    labels = {}
    with open(os.path.join(data_path, args.label), mode = "r") as f:
        for row in csv.reader(f):
            if len(row) == 3:
                labels.setdefault(row[0], row[2])  # first occurrence of a name wins

    # Iterate over wav audio files in input directory
    for data_type in ["train", "test", "dev"]:
        path_in = os.path.join(data_path, args.input_dir, data_type)
        path_out = os.path.join(data_path, args.output_dir, data_type, config_name)

        for wav_name in os.listdir(path_in):

            # Only interested in wav files
            if not wav_name.endswith(".wav"):
                continue

            file_in = os.path.join(path_in, wav_name)
            filename = wav_name[:-4]  # [:-4] for removing .wav

            # call() needs string arguments, so a missing label cannot be passed on
            label = labels.get(filename)
            if label is None:
                print("Warning: no label found for " + filename + ", file skipped")
                continue

            # out
            csv_out = os.path.join(path_out, filename + "_" + config_name + ".csv")
            arff_out = os.path.join(path_out, filename + "_" + config_name + ".arff")

            # use openSMILE
            status = call(["SMILExtract", "-l", "0", "-noconsoleoutput", "-I", file_in,
                           "-C", config, "-D", csv_out, "-O", arff_out, "-instname", filename, "-label", label])
            if status != 0:
                print("Warning: SMILExtract exited with status " + str(status) + " for " + filename)

    return True

# Obtaining args from terminal
def get_args():
    """Parse the command line of the feature-extraction script.

    Returns:
        argparse namespace with the string attributes `input_dir`, `output_dir`,
        `config` and `label`; all four options are required.
    """
    parser = argparse.ArgumentParser(description='Extract features for files in the directory using openSMILE')

    # (short flag, long flag, help text) -- the long flag without "--" becomes the
    # attribute name; every option stores one required string value
    option_specs = [
        ("-i", "--input_dir", "The directory of input audio files (wav)"),
        ("-o", "--output_dir", "The directory of results"),
        ("-c", "--config", "Configuration filename"),
        ("-l", "--label", "Label filename"),
    ]
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, type = str, required = True, help = help_text)

    return parser.parse_args()

def main():
    """Parse the command line, run the feature extraction and report the outcome."""
    # Obtaining terminal args
    args = get_args()

    start_time = time()

    # Extracting features according to args
    if not extract_features(args):
        print("Failed to extract features")
    else:
        end_time = time()
        # a single pre-formatted string: on Python 2, print("a", b) prints a tuple
        print("Time taken for extracting features: " + strftime("%H:%M:%S", gmtime(end_time - start_time)))
        print("Successfully extracted features")

# If running the file directly
if __name__ == "__main__":
    main()

Overwriting feature.py


Running script for extracting features. 

In [3]:
%%!
python feature.py -i "input" -o "output" -c "IS09_emotion.conf" -l "label.csv"

["('Time taken for extracting features:', '00:00:00')",
 'Successfully extracted features']

In [83]:
%%!
python feature.py -i "input" -o "output" -c "IS10_paraling.conf" -l "label.csv"

["('Time taken for extracting features:', '00:35:29')",
 'Successfully extracted features']

## 3. Gated Convolutional Network for emotion recognition

Import the required modules.

In [1]:
from keras.models import Sequential
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing import sequence
from keras.layers import Conv1D, Input, MaxPooling1D
from keras.layers import add, multiply
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.models import Model, Sequential, load_model
from keras.optimizers import Adam, SGD
import os
import numpy as np
import csv

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [1]:
# Data root (same value as assigned in section 1); the openSMILE outputs are read from <data_path>/output
data_path = "/work/peiyun/data"

### 3.1. Set parameters for GCNN

In [26]:
# Parameters
# Length every utterance is padded to (passed as maxlen to pad_sequences in extract_Xy)
MAX_LEN = 22500 # 22499 (for each utterance)
# Passed to model.fit as batch_size and epochs
BATCH_SIZE = 4
NUM_EPOCH = 32
# NOTE(review): not referenced in the visible cells -- presumably the decision
# threshold on the sigmoid output; confirm
THRESHOLD = 0.5

# NOTE(review): IS_DILATED is not referenced in the visible cells
IS_DILATED = False
# Number of filters and dilation rate of the Conv1D layers in residual_block
NUM_FILTER = 64
DILATION_RATE = 1
# Total number of residual_block calls stacked in generate_model
N_STACK = 1

### 3.2. Extracting the matrices and labels for all data types and store as numpy files

Function for obtaining label according to the label.csv file.

In [115]:
# Get the ground-truth label of the file
def get_label(filename):
    """Look up the label recorded for `filename` in <data_path>/label.csv.

    Args:
        filename: utterance name, compared against the first CSV column.

    Returns:
        The label string of the first matching row, or None when no row matches.
    """
    label_file = os.path.join(data_path, "label.csv")

    with open(label_file, mode = "r") as f:
        for row in csv.reader(f):
            # blank or malformed rows are skipped instead of breaking the unpack below
            if len(row) != 3:
                continue
            name, data_type, label = row
            if name == filename:
                return label
    return None

Function for extracting the corresponding normalised input and the ground truth output of a file.

In [113]:
def extract_Xy(file_path):
    """Load one openSMILE csv and return its scaled, padded features with its label.

    Args:
        file_path: path of a ';'-separated csv holding one utterance, with the
            columns "name" and "frameTime" next to the feature columns.

    Returns:
        (x_data, y_data): x_data is the feature matrix, min-max scaled per column
        and padded at the end to MAX_LEN rows; y_data is the result of get_label()
        for the utterance name stored in the csv.
    """
    frames = pd.read_csv(file_path, sep = ";")

    # the "name" column stores the utterance name wrapped in single quotes
    utterance_name = frames["name"][0][1:-1]

    # keep only the feature columns, as a numpy array
    features = frames.drop(columns = ["name", "frameTime"]).values

    # scale each feature (each column) to range: (0,1)
    scaled = MinMaxScaler().fit_transform(features)

    # pad_sequences pads each row, so pad the transposed matrix and transpose back
    x_data = sequence.pad_sequences(scaled.T, padding='post', dtype='float64', maxlen=MAX_LEN).T

    return x_data, get_label(utterance_name)

In [114]:
# Try extract_Xy on a single dev utterance; displays the (padded feature matrix, label) pair
extract_Xy("/work/peiyun/data/output/dev/IS09_emotion/Ses05M_script03_2_M040_IS09_emotion.csv")

(array([[0.09267241, 0.96610406, 0.54997998, ..., 0.31054128, 0.40723506,
         0.48860733],
        [0.08859053, 0.98569684, 0.55554785, ..., 0.31111108, 0.35876168,
         0.48860733],
        [0.08139378, 1.        , 0.52159077, ..., 0.30940166, 0.29020727,
         0.48860733],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]), 'ang')

In [21]:
# IS09 has 32 LLDs and IS10 has 76 LLDs
def get_feature_num(config):
    """Return the number of feature columns openSMILE produced for `config`.

    Args:
        config: configuration name, i.e. the folder name under <data_path>/output/train.

    Returns:
        Number of csv columns other than "name" and "frameTime".

    Raises:
        IOError: when the config folder holds no csv file.
    """
    # enter the folder for the config under train (can also use test/dev, same result)
    config_dir = os.path.join(data_path, "output", "train", config)

    # any csv file of the config will do: pick the first one listed
    csv_path = None
    for filename in os.listdir(config_dir):
        if filename.endswith(".csv"):
            csv_path = os.path.join(config_dir, filename)
            break
    if csv_path is None:
        raise IOError("no csv file found in " + config_dir)

    # nrows = 0: only the header row is needed to count the columns
    header = pd.read_csv(csv_path, sep = ";", nrows = 0)

    # Clean unnecessary columns
    return len(header.drop(columns = ["name", "frameTime"]).columns)

Extract the required matrix and label pairs for each data type and store as numpy files. 
<br>
NOTE: np.load(filename.npy) to load data

In [22]:
def get_file_num(data_type):
    """Return the number of csv files extracted for `data_type`.

    Args:
        data_type: split name, i.e. a folder name under <data_path>/output.

    Returns:
        Number of ".csv" files (as an int) in the first config folder listed
        under the split.
    """
    # enter the folder for the data_type
    path = os.path.join(data_path, "output", data_type)

    # enter any config folder (nums of files are the same for all configs)
    for config in os.listdir(path):
        path = os.path.join(path, config)
        break

    # count the csv files themselves: stays an int on Python 3 and does not rely on
    # every csv having exactly one arff sibling
    return sum(1 for filename in os.listdir(path) if filename.endswith(".csv"))

In [None]:
# Scratch directory holding the np.memmap backing files used during extraction
temp_dir = os.path.join(data_path, "temp")
temp_dir_missing = not os.path.exists(temp_dir)
if temp_dir_missing:
    os.makedirs(temp_dir)

In [124]:
# Sub-directory of data_path where the extracted X / Y arrays are stored as .npy files
numpy_dir = "numpy_var"

In [None]:
# shutil is not among the imports of this section; import it here so the clean-up
# at the end also works on a freshly started kernel
import shutil

# np.save does not create directories, so make sure the destination exists
numpy_path = os.path.join(data_path, numpy_dir)
if not os.path.exists(numpy_path):
    os.makedirs(numpy_path)

for config in ["IS09_emotion", "IS10_paraling"]:

    # the number of features depends on the config only, so obtain it once per config
    feature_num = get_feature_num(config)

    for data_type in ["train", "test", "dev"]:

        # temp dat file for storing X
        X_data_filename = os.path.join(temp_dir, data_type + ".dat")

        # path for the directory of each data_type
        path = os.path.join(data_path, "output", data_type, config)

        # only interested in csv files; sizing X from this list keeps X and Y the
        # same length even if the configs ended up with different file counts
        csv_files = [filename for filename in os.listdir(path) if filename.endswith(".csv")]
        file_num = len(csv_files)

        # initialise lists (using memory-map for accessing small segments of large files on disk)
        X = np.memmap(X_data_filename, dtype='float64', mode='w+', shape=(file_num, MAX_LEN, feature_num))
        Y = []

        # iterate through all files
        for i, filename in enumerate(csv_files):

            # extract data
            x_data, y_data = extract_Xy(os.path.join(path, filename))
            X[i] = x_data
            Y.append(y_data)

            if (i + 1) % 100 == 0:
                print("num of file processed: " + str(i + 1))

        # save as numpy file
        np.save(os.path.join(numpy_path, data_type + "_" + config + "_X.npy"), X)
        np.save(os.path.join(numpy_path, data_type + "_" + config + "_Y.npy"), Y)

        # delete temp data
        del X

# delete temp dir
shutil.rmtree(temp_dir)

### 3.3. Read all the required data from the numpy files.

Function for obtaining the required data for the configuration. 

In [136]:
def get_variables(config):
    """Open the saved .npy arrays of one feature configuration.

    Args:
        config: configuration name used in the .npy file names.

    Returns:
        (X_train, y_train, X_test, y_test, X_dev, y_dev), each loaded with
        np.load(..., mmap_mode='r') from <data_path>/<numpy_dir>.
    """
    def load_array(data_type, suffix):
        # file names follow "<data_type>_<config>_<X or Y>.npy"
        npy_name = data_type + "_" + config + "_" + suffix + ".npy"
        return np.load(os.path.join(data_path, numpy_dir, npy_name), mmap_mode='r')

    loaded = []
    for data_type in ("train", "test", "dev"):
        loaded.append(load_array(data_type, "X"))
        loaded.append(load_array(data_type, "Y"))

    X_train, y_train, X_test, y_test, X_dev, y_dev = loaded
    return X_train, y_train, X_test, y_test, X_dev, y_dev

Reading all data for config: IS09_emotion.

In [137]:
# Memory-mapped train / test / dev arrays for the IS09_emotion features
X_train, y_train, X_test, y_test, X_dev, y_dev = get_variables("IS09_emotion")

### 3.4. Define pre-activation residual block

In [None]:
def residual_block(input_layer):   # containing: convolution + gated linear unit
    """Build one gated convolution block on top of `input_layer`.

    Two parallel Conv1D + BatchNormalization branches read the same input. One
    branch goes through a sigmoid and acts as a gate that multiplies the other
    branch element-wise; the product is max-pooled with pool_size 2.

    NOTE(review): despite the names, no skip connection is added and the branch
    called `tanh_out` receives no tanh activation -- confirm this is intended.

    Args:
        input_layer: Keras tensor the block is applied to.

    Returns:
        Keras tensor produced by the pooling layer.
    """
    # Obtain convolution layer (1D, temporal convolution) by creating a convolution kernel
    tanh_out = Conv1D(NUM_FILTER,
                      kernel_size = 2,  # length of 1D convolution window
                      kernel_initializer = "random_uniform",  # initialization of filters
                      dilation_rate = DILATION_RATE,  # module-level constant, same as the gate branch
                      padding = "same")(input_layer)   # Conv1D "depending" on the input layer

    # Normalise the result after Conv1D
    tanh_out = BatchNormalization()(tanh_out)

    # Obtain the convolutional layer with sigmoid transformation (activation)
    sigmoid_out = Conv1D(NUM_FILTER,
                         kernel_size = 2,
                         kernel_initializer = 'random_uniform',
                         dilation_rate = DILATION_RATE,
                         padding = "same")(input_layer)

    # Normalise the layer
    sigmoid_out = BatchNormalization()(sigmoid_out)

    # Activation function for the layer
    sigmoid_out = Activation("sigmoid")(sigmoid_out)

    # Element-wise multiplication
    merged = multiply([tanh_out, sigmoid_out])

    # Max Pooling for the merged result
    merged = MaxPooling1D(pool_size = 2)(merged)

    return merged

### 3.5. Define a function for generating the whole model

In [None]:
def generate_model(input_shape):  # input shape: num_files x time x LLD
    """Build and compile the gated convolutional network.

    Args:
        input_shape: shape of the full input array, (num_files, time, LLD); only
            the last two entries are used for the model input.

    Returns:
        Compiled Keras Model: N_STACK gated blocks, Flatten, a 256-unit dense
        layer (BatchNormalization, relu, Dropout 0.5) and a single sigmoid output.

    NOTE(review): `precision`, `recall` and `fscore` are not defined in the visible
    notebook; they must exist as metric functions before this is called.
    NOTE(review): the stored labels are strings such as 'ang', while the output is
    one sigmoid unit with binary cross-entropy -- confirm how labels are binarised.
    """
    # Input takes the per-sample shape as one `shape` tuple: time x LLD
    input_layer = Input(shape = (input_shape[1], input_shape[2]))
    gated_cnn = residual_block(input_layer)  # first block

    for i in range(0, N_STACK - 1):  # the rest blocks
        gated_cnn = residual_block(gated_cnn)

    # Flattening
    gated_cnn = Flatten()(gated_cnn)

    # Fully Connected Layer
    gated_cnn = Dense(256, kernel_initializer='random_uniform')(gated_cnn)
    gated_cnn = BatchNormalization()(gated_cnn)
    gated_cnn = Activation('relu')(gated_cnn)
    gated_cnn = Dropout(0.5)(gated_cnn)

    # Fully Connected Layer then sigmoid
    gated_cnn = Dense(1, activation='sigmoid')(gated_cnn)

    # This model will include all layers required in the computation of "outputs" given "inputs"
    all_model = Model(inputs = input_layer, outputs = gated_cnn)

    # Compiling model (an SGD optimizer with lr 0.01, decay 1e-6 and Nesterov
    # momentum 0.9 was tried as an alternative to adam)
    all_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy', precision, recall, fscore])

    return all_model

### 3.6. Training a GCNN model with the feature configuration

Function for training a GCNN model with the given feature configuration.

In [None]:
def training(X_train, y_train, X_test, y_test, X_dev, y_dev, model_name):
    """Train a GCNN on the training data and save it to disk.

    Args:
        X_train, y_train: training inputs and targets passed to model.fit.
        X_test, y_test: accepted for symmetry with get_variables(); not used here.
        X_dev, y_dev: validation data passed to model.fit.
        model_name: the model is saved as model_name + ".model".
    """
    # generate_model needs the input shape (num_files x time x LLD) to size the input layer
    model = generate_model(X_train.shape)  # model after compilation

    # fitting training data
    model.fit(X_train, y_train,
              batch_size = BATCH_SIZE,
              epochs = NUM_EPOCH,
              validation_data = (X_dev, y_dev),
              shuffle = True)

    # saving model
    model.save(model_name + ".model")  # save model

    # to load a saved model again: load_model(model_name + ".model")
    

Begin training model by calling the training function.

In [None]:
# model_name is not assigned in any earlier cell, so define it before the call;
# training() saves the model as model_name + ".model"
# NOTE(review): the name below is a placeholder -- adjust to the intended file name
model_name = "gcnn_IS09_emotion"
training(X_train, y_train, X_test, y_test, X_dev, y_dev, model_name)