In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!pip install ekphrasis

Collecting ekphrasis
[?25l  Downloading https://files.pythonhosted.org/packages/92/e6/37c59d65e78c3a2aaf662df58faca7250eb6b36c559b912a39a7ca204cfb/ekphrasis-0.5.1.tar.gz (80kB)
[K     |████████████████████████████████| 81kB 2.6MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/4f/a6/728666f39bfff1719fc94c481890b2106837da9318031f71a8424b662e12/colorama-0.4.1-py2.py3-none-any.whl
Collecting ujson
[?25l  Downloading https://files.pythonhosted.org/packages/16/c4/79f3409bc710559015464e5f49b9879430d8f87498ecdc335899732e5377/ujson-1.35.tar.gz (192kB)
[K     |████████████████████████████████| 194kB 8.2MB/s 
Collecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/75/ca/2d9a5030eaf1bcd925dab392762b9709a7ad4bd486a90599d93cd79cb188/ftfy-5.6.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 8.5MB/s 
Building wheels for collected packages: ekphrasis, ujson, ftfy
  Building wheel for ekphrasis (setup.py) ... [?25l[?25hdone
  Created w

In [0]:
%tensorflow_version 1.x

In [0]:
import re
import io
import os
import json
import argparse
import numpy as np
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from keras.models import load_model
from keras.models import Model
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import KFold
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Input, Dense, Embedding, LSTM, Concatenate, Reshape, GRU, Bidirectional, Dropout, Conv1D, Flatten, MaxPool1D, TimeDistributed, Add
from keras.models import Sequential
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from keras.layers import LeakyReLU
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
%cd contextual_emotion_detection/glove

In [0]:
!unzip "../../drive/My Drive/glove.840B.300d.zip" -d "contextual_emotion_detection/data"

In [0]:
!unzip "../../drive/My Drive/embedding-results.zip" -d "contextual_emotion_detection/data"

In [0]:
projectpath = '..'
datapath = os.path.join(projectpath, 'data')

train_path = os.path.join(datapath, 'train.txt')
test_path =  os.path.join(datapath, 'test.txt')
validation_path = os.path.join(datapath, 'dev.txt')
result_file_name = 'predictions.txt'
glove_path = datapath
sswe_path = os.path.join(datapath, 'embedding-results')

In [0]:
np.random.seed(7)

In [0]:
# Callback function to compute F1 score for every epoch
class ComputeMetricsCallback(Callback):
    def __init__(self, test_data):
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        r = self.model.predict(x)
        compute_metrics(r, y)

In [0]:
# Model Configurations

# 4 emotion classes: Happy, Sad, Angry, Others
NUM_OF_CLASSES = 4

# Maximum length of input phrase sequence
MAX_SEQUENCE_LENGTH = 64

# Maximum number of words to keep based on word frequency.
MAX_NB_WORDS = 20000

# The encoded vector dimension
EMBEDDING_DIM = 300

# Batch size for training - helps prevent overfitting
BATCH_SIZE = 200

# LSTM layer size
LSTM_DIM = 300

# Lower learning rate helps the model to converge. But it
# should be high enough that the model can converge in the given
# number of iterations.
LEARNING_RATE = 0.001

# Number of epochs in each iteration
NUM_EPOCHS = 10

# The proportion of neural units that should be randomly dropped
# for regularization technique helping to reduce overfitting 
DROPOUT_RATIO = 0.2

label_to_emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion_to_label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
angry_identifier = {"others": 0, "happy": 0, "sad": 0, "angry": 3}
sad_identifier = {"others": 0, "happy": 0, "sad": 2, "angry": 0}
happy_identifier = {"others": 0, "happy": 1, "sad": 0, "angry": 0}

In [0]:
def generate_result_file(model, test_sequences, result_file_name = result_file_name):
    test_data = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)
    predictions = model.predict(test_data, batch_size = BATCH_SIZE)
    predictions = predictions.argmax(axis = 1)

    with io.open(result_file_name, "w", encoding = "utf8") as f:
        f.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
        with io.open(test_path, encoding = "utf8") as fin:
            fin.readline()
            for line_number, line in enumerate(fin):
                f.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                f.write(label_to_emotion[predictions[line_number]] + '\n')
    print("Model parameters: LSTM Dim : %d, Dropout : %.1f, Batch_size : %d, Learning rate : %.3f"
          % (LSTM_DIM, DROPOUT_RATIO, BATCH_SIZE, LEARNING_RATE))
    return

In [0]:
# Create the embedding matrix
def create_embedding_matrix(word_index, embedding_file_path):
    embeddings_vectors = {}
    with io.open(embedding_file_path, encoding = "utf8") as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embedding_vector = np.array([float(val) for val in values[1:]])
            embeddings_vectors[word] = embedding_vector
    print('%s embedding vectors' % len(embeddings_vectors))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_vectors.get(word)
        if embedding_vector is not None:
            # Assign the vector for words seen in the training dataset
            embedding_matrix[i] = embedding_vector
        else:
            # Handle out of vocabulary words
            out_of_vocabulary_word_vector = [np.random.normal(size = EMBEDDING_DIM)]
            out_of_vocabulary_word_vector /= np.linalg.norm(out_of_vocabulary_word_vector)
            embedding_matrix[i] = out_of_vocabulary_word_vector

    return embedding_matrix

In [0]:
# Compute performance metrics like precision, recall and F1-score

def compute_metrics(predictions, labels):
    # Predictions to classes
    class_predictions = to_categorical(predictions.argmax(axis = 1))
    true_positives = np.sum(class_predictions * labels, axis = 0)
    false_positives = np.sum(np.clip(class_predictions - labels, 0, 1), axis = 0)
    false_negatives = np.sum(np.clip(labels - class_predictions, 0, 1), axis = 0)

    print("True Positives per class : ", true_positives)
    print("False Positives per class : ", false_positives)
    print("False Negatives per class : ", false_negatives)

    # *************************************** Macro metrics ***************************************
    macro_precision = 0
    macro_recall = 0
    for category in range(1, NUM_OF_CLASSES):
        precision = true_positives[category] / (true_positives[category] + false_positives[category])
        macro_precision += precision
        recall = true_positives[category] / (true_positives[category] + false_negatives[category])
        macro_recall += recall
        f1_score = (2 * recall * precision) / (precision + recall) if (precision + recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1-Score : %.3f" % (label_to_emotion[category], precision, recall, f1_score))

    macro_precision /= 3
    macro_recall /= 3
    macro_f1_score = (2 * macro_recall * macro_precision ) / (macro_precision + macro_recall) if (macro_precision + macro_recall) > 0 else 0
    print("Macro Precision : %.3f, Macro Recall : %.3f, Macro F1-Score : %.3f" % (macro_precision, macro_recall, macro_f1_score))

    # *************************************** Micro metrics ***************************************
    true_positives = true_positives[1 : ].sum()
    false_positives = false_positives[1 : ].sum()
    false_negatives = false_negatives[1 : ].sum()

    micro_precision = true_positives / (true_positives + false_positives)
    micro_recall = true_positives / (true_positives + false_negatives)
    micro_f1_score = (2 * micro_recall * micro_precision) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

    predictions = predictions.argmax(axis = 1)
    labels = labels.argmax(axis = 1)
    accuracy = np.mean(predictions == labels)
    print("Accuracy : %.3f, Micro Precision : %.3f, Micro Recall : %.3f, Micro F1-Score : %.3f" % (accuracy, micro_precision, micro_recall, micro_f1_score))

    return {
        "macro": [accuracy, macro_precision, macro_recall, macro_f1_score],
        "micro": [accuracy, micro_precision, micro_recall, micro_f1_score]
    }

### Models

#### Bidirectional LSTM

In [0]:
def bidirectional_lstm(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    bilstm = Sequential()
    bilstm.add(embedding_layer)
    bilstm.add(Dropout(DROPOUT_RATIO))
    bilstm.add(Bidirectional(LSTM(LSTM_DIM)))
    bilstm.add(Dropout(DROPOUT_RATIO))
    bilstm.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    bilstm.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    bilstm.summary()
    return bilstm

#### Gated Recurrent Unit Network

In [0]:
def gated_recurrent_unit_network(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    gru = Sequential()
    gru.add(embedding_layer)
    gru.add(Dropout(DROPOUT_RATIO))
    gru.add(GRU(128))
    gru.add(Dropout(DROPOUT_RATIO))
    gru.add(Dense(NUM_OF_CLASSES * 8, activation = 'relu'))
    gru.add(Dropout(DROPOUT_RATIO))
    gru.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    gru.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    gru.summary()
    return gru

#### Convolutional Neural Network

In [0]:
def convolutional_neural_network(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    cnn = Sequential()
    cnn.add(embedding_layer)
    cnn.add(Dropout(DROPOUT_RATIO))
    cnn.add(Conv1D(64, 3, padding = 'same'))
    cnn.add(Conv1D(32, 3, padding = 'same'))
    cnn.add(Conv1D(16, 3, padding = 'same'))
    cnn.add(Flatten())
    cnn.add(Dropout(DROPOUT_RATIO))
    cnn.add(Dense(180, activation = 'relu'))
    cnn.add(Dropout(DROPOUT_RATIO))
    cnn.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    cnn.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    cnn.summary()
    return cnn

#### CNN-LSTM

In [0]:
#CNN-LSTM Model
def cnn_lstm(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    cnn_lstm = Sequential()
    cnn_lstm.add(embedding_layer)
    cnn_lstm.add(Conv1D(32, 3, padding = 'same', activation = 'relu'))
    cnn_lstm.add(MaxPool1D(2))
    cnn_lstm.add(LSTM(LSTM_DIM))
    cnn_lstm.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    cnn_lstm.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    cnn_lstm.summary()
    return cnn_lstm

#### LSTM-CNN

In [0]:
def lstm_cnn(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    lstm_cnn = Sequential()
    lstm_cnn.add(embedding_layer)
    lstm_cnn.add(LSTM(LSTM_DIM, return_sequences = True))
    lstm_cnn.add(Conv1D(32, 3, padding = 'same', activation = 'relu'))
    lstm_cnn.add(MaxPool1D(2))
    lstm_cnn.add(Flatten())
    lstm_cnn.add(Dropout(DROPOUT_RATIO))
    lstm_cnn.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    lstm_cnn.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    lstm_cnn.summary()
    return lstm_cnn

#### CNN-BiLSTM

In [0]:
def cnn_bilstm(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    cnn_bilstm = Sequential()
    cnn_bilstm.add(embedding_layer)
    cnn_bilstm.add(Conv1D(32, 3, padding = 'same', activation = 'relu'))
    cnn_bilstm.add(MaxPool1D(2))
    cnn_bilstm.add(Dropout(DROPOUT_RATIO))
    cnn_bilstm.add(Bidirectional(LSTM(LSTM_DIM)))
    cnn_bilstm.add(Dropout(DROPOUT_RATIO))
    cnn_bilstm.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    cnn_bilstm.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    cnn_bilstm.summary()
    return cnn_bilstm

#### BiLSTM-CNN

In [0]:
def bilstm_cnn(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    bilstm_cnn = Sequential()
    bilstm_cnn.add(embedding_layer)
    bilstm_cnn.add(Bidirectional(LSTM(LSTM_DIM, return_sequences = True)))
    bilstm_cnn.add(Dropout(DROPOUT_RATIO))
    bilstm_cnn.add(Conv1D(32, 3, padding = 'same', activation = 'relu'))
    bilstm_cnn.add(MaxPool1D(2))
    bilstm_cnn.add(Flatten())
    bilstm_cnn.add(Dropout(DROPOUT_RATIO))
    bilstm_cnn.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    bilstm_cnn.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    bilstm_cnn.summary()
    return bilstm_cnn

#### BiLSTM without dropout

In [0]:
def bilstm_without_dropout(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    bilstm = Sequential()
    bilstm.add(embedding_layer)
    # No dropout layer
    bilstm.add(Bidirectional(LSTM(LSTM_DIM, dropout = 0.6)))
    bilstm.add(Dropout(0.9))
    bilstm.add(Dense(100, activation = 'tanh'))
    bilstm.add(Dropout(0.9))
    bilstm.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    bilstm.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adadelta',
                  metrics = ['acc'])
    bilstm.summary()
    return bilstm

#### CNN-GRU

In [0]:
def cnn_gru(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    cnn_gru = Sequential()
    cnn_gru.add(embedding_layer)
    cnn_gru.add(Conv1D(32, 3, activation = 'relu', padding = 'same'))
    cnn_gru.add(MaxPool1D(2))
    cnn_gru.add(GRU(LSTM_DIM))
    cnn_gru.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    cnn_gru.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    cnn_gru.summary()
    return cnn_gru

#### GRU_CNN

In [0]:
def gru_cnn(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    gru_cnn = Sequential()
    gru_cnn.add(embedding_layer)
    gru_cnn.add(GRU(LSTM_DIM, return_sequences = True))
    gru_cnn.add(Conv1D(32, 3, activation = 'relu', padding = 'same'))
    gru_cnn.add(MaxPool1D(2))
    gru_cnn.add(Flatten())
    gru_cnn.add(Dropout(0.4))
    gru_cnn.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    gru_cnn.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    gru_cnn.summary()
    return gru_cnn

#### LSTM for first layer

In [0]:
def lstm_first_model(embedding_matrix):
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    lstm = Sequential()
    lstm.add(embedding_layer)
    lstm.add(LSTM(64))
    lstm.add(Dense(NUM_OF_CLASSES, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    lstm.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    lstm.summary()
    return lstm

#### Custom Model using GloVe and Sentiment-Specific word embeddings

In [0]:
def custom_model(glove_embedding_matrix, sswe_embedding_matrix):
    glove_embedding_layer = Embedding(glove_embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [glove_embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)
    glove_model = Sequential()
    glove_model.add(glove_embedding_layer)
    glove_model.add(GRU(LSTM_DIM, return_sequences = True))
    glove_model.add(Conv1D(32, 3, activation = 'relu', padding = 'same'))
    glove_model.add(MaxPool1D(2))
    glove_model.add(Flatten())
    glove_model.add(Dropout(0.4))
    glove_model.add(Dense(64, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    glove_model.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])

 
    sswe_embedding_layer = Embedding(sswe_embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights = [sswe_embedding_matrix],
                                input_length = MAX_SEQUENCE_LENGTH,
                                trainable = True)    
    sswe_model = Sequential()
    sswe_model.add(sswe_embedding_layer)
    sswe_model.add(GRU(LSTM_DIM, return_sequences = True))
    sswe_model.add(Conv1D(32, 3, activation = 'relu', padding = 'same'))
    sswe_model.add(MaxPool1D(2))
    sswe_model.add(Flatten())
    sswe_model.add(Dropout(0.4))
    sswe_model.add(Dense(64, activation = 'softmax'))
    adam = optimizers.adam(lr = LEARNING_RATE)
    sswe_model.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    
    
    merged_layers = Add()([glove_model.output, sswe_model.output])
    output_layer = Dense(4, activation = 'softmax', name = 'output_layer')(merged_layers)
    merged_model = Model([glove_model.input, sswe_model.input], output_layer)
    merged_model.compile(loss = 'categorical_crossentropy',
                  optimizer = adam,
                  metrics = ['acc'])
    merged_model.summary()
    return merged_model

### Pre-processing

In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Stopword removal
# Negation should not be removed
stopwords = set(stopwords.words('english')) - set(('not', 'no'))
# Tag removal
tags = ['<url>', '<email>', '<user>', '<hashtag>', '</hashtag>',
        '<elongated>', '</elongated>', '<repeated>', '</repeated>']
text_pre_processor = TextPreProcessor(
    normalize  = ['url', 'email', 'user'],
    annotate = {'hashtag', 'elongated', 'repeated'},
    segmenter = "twitter",
    corrector = "twitter",
    unpack_hashtags = True,
    unpack_contractions = True,
    tokenizer=SocialTokenizer(lowercase = True).tokenize
)
def pre_process(text):
    pre_processed_text = text_pre_processor.pre_process_doc(text)
    return list(filter(lambda term: term not in tags and
                                 term not in stopwords and
                                 term not in punctuation, pre_processed_text))

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


### Load data

In [0]:
def load_data(data_path, is_training):
    data = pd.read_csv(data_path, encoding = 'utf-8', sep = '\t')
    concatenated_conversation = data[['turn1', 'turn2', 'turn3']].apply(lambda converation: ' '.join(converation), axis = 1)
    if not is_training:
        return data['id'], concatenated_conversation
    else:
        return data['id'], concatenated_conversation, data['label']

def load_pre_processed_data(data_path, is_training = True):
    if not is_training:
        id, concatenated_conversation = load_data(data_path, is_training)
        pre_processed_concatenated_conversation = concatenated_conversation.apply(lambda converation: pre_process(converation))
        return id.values.tolist(), pre_processed_concatenated_conversation.values.tolist()
    else:
        id, concatenated_conversation, class_label = load_data(data_path, is_training)
        pre_processed_concatenated_conversation = concatenated_conversation.apply(lambda converation: pre_process(converation))
        emotion_label = class_label.apply(lambda emotion: emotion_to_label[emotion])
        return id.values.tolist(), pre_processed_concatenated_conversation.values.tolist(), emotion_label.values.tolist()

### Train models

In [0]:
print("Process training data")
train_indices, pre_processed_train, labels = load_pre_processed_data(train_path)
print("Process test data")
_, pre_processed_test = load_pre_processed_data(test_path, is_training = False)
print("Process validation data")
_, pre_processed_validation, validation_classes = load_pre_processed_data(validation_path)

print("Tokenize")
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(pre_processed_train)
train_sequences = tokenizer.texts_to_sequences(pre_processed_train)
test_sequences = tokenizer.texts_to_sequences(pre_processed_test)
validation_sequences = tokenizer.texts_to_sequences(pre_processed_validation)
word_index = tokenizer.word_index
print("%s unique tokens found." % len(word_index))

Process training data
Process test data
Process validation data
Tokenize
14162 unique tokens found.


#### GloVe Embeddings

In [0]:
print("Generate GloVe embedding matrix")
glove_embedding_matrix = create_embedding_matrix(word_index, os.path.join(glove_path, 'glove.840B.300d.txt'))
padded_training_sequences = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
padded_validation_sequences = pad_sequences(validation_sequences, maxlen = MAX_SEQUENCE_LENGTH)
validation_labels = to_categorical(np.asarray(validation_classes))
np.random.shuffle(train_indices)
padded_training_sequences = padded_training_sequences[train_indices]
labels = labels[train_indices]

Generate GloVe embedding matrix
2196016 embedding vectors


##### GRU-CNN

In [0]:
print("Building GRU-CNN model")
cbks = [ModelCheckpoint('./gru_cnn_glove_model.h5', verbose = 1, monitor = 'val_loss', save_best_only = True, mode = 'auto'),
        EarlyStopping(monitor = 'val_loss', patience = 2),
        ComputeMetricsCallback((padded_validation_sequences, validation_labels))]
gru_cnn_model = gru_cnn(glove_embedding_matrix)
gru_cnn_model.fit(padded_training_sequences, labels, validation_data=(padded_validation_sequences, validation_labels), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
gru_cnn_model = load_model('./gru_cnn_glove_model.h5')
print("Generating prediction file")
generate_result_file(gru_cnn_model, test_sequences, result_file_name = "gru_cnn_glove_predictions.txt")

Generate embedding matrix
2196016 embedding vectors
Building model











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
gru_1 (GRU)                  (None, 64, 300)           540900    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 64, 32)            28832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1024)              0         
_________________________________________________________

##### LSTM

In [0]:
print("Building LSTM model")
callback = [ModelCheckpoint('./lstm_glove_model.h5', verbose = 1, monitor = 'val_loss', save_best_only = True, mode = 'auto'),
        EarlyStopping(monitor = 'val_loss', patience = 2),
        ComputeMetricsCallback((padded_validation_sequences, validation_labels))]
model = lstm_first_model(glove_embedding_matrix)
model.fit(padded_training_sequences, labels, validation_data = (padded_validation_sequences, validation_labels), epochs = NUM_EPOCHS, batch_size = BATCH_SIZE, shuffle = True, callbacks = callback)
model = load_model('./lstm_glove_model.h5')
print("Generating prediction file")
generate_result_file(model, test_sequences, result_file_name = "lstm_glove_predictions.txt")

Building LSTM model











Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 4,342,600
Trainable params: 4,342,600
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 30160 samples, validate on 2755 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.36243, saving model to ./lstm_glove_model.h5
True Positives per class :  [2078.   98.   94.  130.]
False Positives per class :  [ 81.  73.  8

#### Sentiment-Specific embedding

In [0]:
print("Generating Sentiment-Specific word embedding matrix")
sswe_embedding_matrix = create_embedding_matrix(word_index, os.path.join(sswe_path, 'sswe-r.txt'))
padded_training_sequences = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
padded_validation_sequences = pad_sequences(validation_sequences, maxlen = MAX_SEQUENCE_LENGTH)
validation_labels = to_categorical(np.asarray(validation_classes))
np.random.shuffle(train_indices)
padded_training_sequences = padded_training_sequences[train_indices]
labels = labels[train_indices]

Generating Sentiment-Specific word embedding matrix
137052 embedding vectors


##### GRU-CNN

In [0]:
print("Building GRU-CNN model")
callback = [ModelCheckpoint('./gru_cnn_sswe_model.h5', verbose = 1, monitor = 'val_loss', save_best_only = True, mode = 'auto'),
        EarlyStopping(monitor = 'val_loss', patience = 2),
        ComputeMetricsCallback((padded_validation_sequences, validation_labels))]
gru_cnn_sswe_model = gru_cnn(sswe_embedding_matrix)
gru_cnn_sswe_model.fit(padded_training_sequences, labels, validation_data = (padded_validation_sequences, validation_labels), epochs = NUM_EPOCHS, batch_size = BATCH_SIZE, shuffle = True, callbacks = callback)
gru_cnn_sswe_model = load_model('./gru_cnn_sswe_model.h5')
print("Generating prediction file")
generate_result_file(model, test_sequences, result_file_name = "gru_cnn_sswe_predictions.txt")

Building GRU-CNN model
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
gru_3 (GRU)                  (None, 64, 300)           540900    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 64, 32)            28832     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1024)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (N

##### LSTM

In [0]:
print("Building LSTM model")
callback = [ModelCheckpoint('./lstm_sswe_model.h5', verbose = 1, monitor = 'val_loss', save_best_only = True, mode='auto'),
        EarlyStopping(monitor = 'val_loss', patience = 2),
        ComputeMetricsCallback((padded_validation_sequences, validation_labels))]
lstm_sswe_model = lstm_first_model(sswe_embedding_matrix)
lstm_sswe_model.fit(padded_training_sequences, labels, validation_data = (padded_validation_sequences, validation_labels), epochs = NUM_EPOCHS, batch_size = BATCH_SIZE, shuffle = True, callbacks = callback)
lstm_sswe_model = load_model('./lstm_sswe_model.h5')
print("Generating prediction file")
generate_result_file(model, test_sequences, result_file_name = "lstm_sswe_predictions.txt")

Building LSTM model
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
Total params: 4,342,600
Trainable params: 4,342,600
Non-trainable params: 0
_________________________________________________________________


ValueError: ignored

#### Combined GloVe and SSWE

In [0]:
model = load_model('./lstm_glove_model.h5')
print(model.layers[2].get_weights()[0].shape)
lstm_glove_weights = np.zeros(shape = (64,1))
x = model.layers[2].get_weights()[0]
lstm_glove_weights = np.sum(x, axis = 1)
print(lstm_glove_weights.shape)
lstm_glove_weights = lstm_glove_weights.reshape(lstm_glove_weights.shape[0], 1)
print(lstm_glove_weights.shape)












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


(64, 4)
(64,)
(64, 1)


In [0]:
model = load_model('./lstm_sswe_model.h5')
print(model.layers[2].get_weights()[0].shape)
lstm_sswe_weights = np.zeros(shape = (64,1))
x = model.layers[2].get_weights()[0]
lstm_sswe_weights = np.sum(x, axis = 1)
print(lstm_sswe_weights.shape)
lstm_sswe_weights = lstm_sswe_weights.reshape(lstm_sswe_weights.shape[0], 1)
print(lstm_sswe_weights.shape)

(64, 4)
(64,)
(64, 1)


In [0]:
glove_sswe_combined = np.concatenate((lstm_sswe_weights, lstm_glove_weights))
print(glove_sswe_combined.shape)

(128, 1)


In [0]:
print("Generating GloVe and SSWE combined embedding matrix")
glove_embedding_matrix = create_embedding_matrix(word_index, os.path.join(glove_path, 'glove.840B.300d.txt'))
sswe_embedding_matrix = create_embedding_matrix(word_index, os.path.join(sswe_path, 'sswe-r.txt'))
padded_training_sequences = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
padded_validation_sequences = pad_sequences(validation_sequences, maxlen = MAX_SEQUENCE_LENGTH)
validation_labels = to_categorical(np.asarray(validation_classes))
np.random.shuffle(train_indices)
padded_training_sequences = padded_training_sequences[train_indices]
labels = labels[train_indices]

Generating GloVe and SSWE combined embedding matrix
2196016 embedding vectors
137052 embedding vectors


##### Custom model

In [0]:
print("Building custom model")
callback = [ModelCheckpoint('./custom_glovesswecombined_model.h5', verbose = 1, monitor = 'val_loss', save_best_only = True, mode = 'auto'),
        EarlyStopping(monitor = 'val_loss', patience = 2),
        ComputeMetricsCallback(([np.array(padded_validation_sequences),np.array(padded_validation_sequences)], validation_labels))]
custom_glovesswecombined_model = custom_model(glove_embedding_matrix, sswe_embedding_matrix)
custom_glovesswecombined_model.fit([np.array(padded_training_sequences),np.array(padded_training_sequences)], np.array(labels), validation_data = ([np.array(padded_validation_sequences),np.array(padded_validation_sequences)], np.array(validation_labels)), epochs = NUM_EPOCHS, batch_size = BATCH_SIZE, shuffle = True, callbacks = callback)
custom_glovesswecombined_model = load_model('./custom_glovesswecombined_model.h5')
print("Generating prediction file")
generate_result_file(model, test_sequences, result_file_name = "custom_glovesswecombined_predictions.txt")

Building custom model
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_5_input (InputLayer)  (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_6_input (InputLayer)  (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 64, 300)      4248900     embedding_5_input[0][0]          
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 64, 300)      4248900     embedding_6_input[0][0]          
______________________________________________________________________




Epoch 00005: val_loss improved from 0.53626 to 0.50210, saving model to ./custom_glovesswecombined_model.h5
True Positives per class :  [2012.  113.    0.  133.]
False Positives per class :  [ 62. 143.   0. 292.]
False Negatives per class :  [326.  29. 125.  17.]
Class happy : Precision : 0.441, Recall : 0.796, F1-Score : 0.568
Class sad : Precision : nan, Recall : 0.000, F1-Score : 0.000
Class angry : Precision : 0.313, Recall : 0.887, F1-Score : 0.463
Macro Precision : nan, Macro Recall : 0.561, Macro F1-Score : 0.000
Accuracy : 0.820, Micro Precision : 0.361, Micro Recall : 0.590, Micro F1-Score : 0.448
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.50210
True Positives per class :  [1952.  119.    0.  130.]
False Positives per class :  [ 53. 182.   0. 319.]
False Negatives per class :  [386.  23. 125.  20.]
Class happy : Precision : 0.395, Recall : 0.838, F1-Score : 0.537
Class sad : Precision : nan, Recall : 0.000, F1-Score : 0.000
Class angry : Precision : 0.290, Recal

### Test phase

In [0]:
test_data = pd.read_csv(test_path, encoding = 'utf-8', sep = '\t')

In [0]:
print("Process training data")
train_indices, train_conversation, labels = load_pre_processed_data(train_path)
print("Process testing data")
_, test_conversation = load_pre_processed_data(test_path, is_training = False)
print("Tokenization")
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_conversation)
train_sequences = tokenizer.texts_to_sequences(train_conversation)
test_sequences = tokenizer.texts_to_sequences(test_conversation)

Process training data
Process testing data
Tokenization


In [0]:
evaluation_model = load_model('./gru_cnn_sswe_model.h5')
print("Generating prediction file")
generate_result_file(evaluation_model, test_sequences, result_file_name = 'test_gru_cnn_swe_predictions.txt')

Generating prediction file
Model parameters: LSTM Dim : 300, Dropout : 0.2, Batch_size : 200, Learning rate : 0.001


In [0]:
_, _, test_label = load_pre_processed_data(test_path)
_, _, test_predicted = load_pre_processed_data('test_gru_cnn_swe_predictions.txt')

In [0]:
actual = (to_categorical(np.array(test_label)))
predicted = (to_categorical(np.array(test_predicted)))

In [0]:
compute_metrics(predicted, actual)

True Positives per class :  [4214.  200.  187.  234.]
False Positives per class :  [174. 180. 171. 149.]
False Negatives per class :  [463.  84.  63.  64.]
Class happy : Precision : 0.526, Recall : 0.704, F1-Score : 0.602
Class sad : Precision : 0.522, Recall : 0.748, F1-Score : 0.615
Class angry : Precision : 0.611, Recall : 0.785, F1-Score : 0.687
Macro Precision : 0.553, Macro Recall : 0.746, Macro F1-Score : 0.635
Accuracy : 0.878, Micro Precision : 0.554, Micro Recall : 0.746, Micro F1-Score : 0.636


{'macro': [0.8776547467779996,
  0.5532094240188599,
  0.7458201050758362,
  0.635235306834371],
 'micro': [0.8776547467779996, 0.5539697, 0.7463942, 0.6359447029308016]}