# EmoSense at SemEval-2019 Task 3: Bidirectional LSTM Network for Contextual Emotion Detection in Textual Conversations

## 1. Loading Data

In [1]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np

import re
import io

# Class ids used by the classifier and their emotion names; the second
# mapping is the exact inverse of the first.
label2emotion = dict(enumerate(["others", "happy", "sad", "angry"]))
emotion2label = {emotion: label for label, emotion in label2emotion.items()}

emoticons_additional = {
    '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>', ":'‑)": '<happy>', ':‑d': '<laugh>',
    ':‑(': '<sad>', ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>', 'd=<': '<annoyed>',
    ':‑/': '<annoyed>', ';‑]': '<happy>', '(^�^)': '<happy>', 'angru': 'angry', "d‑':":
        '<annoyed>', ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>', 'x‑d': '<laugh>',
}

# Shared ekphrasis pre-processing pipeline used by tokenize() below.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once; the original
    # list named 'url' twice)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries used to replace tokens extracted from the text
    # with other expressions. More than one dictionary can be passed.
    dicts=[emoticons, emoticons_additional]
)


def tokenize(text):
    """Run `text` through the shared `text_processor` and return the
    resulting tokens joined by single spaces."""
    return " ".join(text_processor.pre_process_doc(text))


def preprocessData(dataFilePath, mode):
    """Read a tab-separated conversation file and tokenize its three turns.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on tabs; columns 1-3 are the conversation
    turns and, when `mode == "train"`, column 4 is the emotion name.

    Args:
        dataFilePath: path of the UTF-8 encoded data file.
        mode: "train" to also read and return labels; any other value
            returns the conversations only.

    Returns:
        `(conversations, labels)` as numpy arrays when `mode == "train"`,
        otherwise just the `conversations` array. Each conversation is a
        list of three tokenized turns.

    Raises:
        KeyError: if a label is not a key of `emotion2label`.
        IndexError: if a non-blank line has fewer columns than required.
    """
    with_labels = mode == "train"
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for raw_line in finput:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no conversation; skipping them avoids an
                # IndexError on the column access below.
                continue
            fields = stripped.split('\t')
            turns = [tokenize(turn) for turn in fields[1:4]]
            if len(turns) < 3:
                raise IndexError("expected 3 turns, got %d" % len(turns))
            if with_labels:
                labels.append(emotion2label[fields[4]])
            conversations.append(turns)
    if with_labels:
        return np.array(conversations), np.array(labels)
    return np.array(conversations)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...


In [2]:
# All three splits are read with mode="train" so that the label column is
# parsed and returned alongside the tokenized conversations.
texts_train, labels_train = preprocessData(dataFilePath='data/train.txt', mode="train")
texts_dev, labels_dev = preprocessData(dataFilePath='data/dev.txt', mode="train")
texts_test, labels_test = preprocessData(dataFilePath='data/test.txt', mode="train")

In [3]:
emo_mapping = {
    '🙂':'',
    '🙁':'😕',
    '🤣':'😂',
    '🤐':'😬',
    '🙄':'😏',
    '🍾':'🍹',
    '🤗':'☺',
    '🤔':'😏',
    '🤡':'🃏',
    '🛰':'',
    '🤑':'💰',
    '\u200d':'',
    '🤥':'😢',
    '🤕':'',
    '🖕':'',
    '🤦':'',
    '🕺':'',
    '🏕':'',
    '🙃':'',
    '🤒':'',
    '🏣':'',
    '🤷':'💁',
    '🤢':'',
    '🏖':'',
   '🏋':'',
    '🤘':'',
    '🤖':'',
    '⏸':''
}

def _apply_emo_mapping(texts, mapping):
    """Replace every key of `mapping` with its value in each turn of `texts`.

    `texts` is a 2-D array of strings (one row per conversation). The
    result is written back into `texts`; the original loops only rebound
    their loop variable, so the arrays were never actually modified.
    """
    for conversation in texts:
        for turn_idx, turn in enumerate(conversation):
            for source, target in mapping.items():
                turn = turn.replace(source, target)
            # Store the substituted string back into the array row.
            conversation[turn_idx] = turn


for _texts_split in (texts_train, texts_dev, texts_test):
    _apply_emo_mapping(_texts_split, emo_mapping)

In [4]:
# Spot-check a handful of preprocessed training conversations.
texts_train[20:25]

array([['do you dance ?', 'yes i love to dance 😻',
        '😂 😂 😂 so you have legs too'],
       ['i hate it too', 'guess what , i do not .', 'even i do not'],
       ['not always', 'what about yesterday',
        'do u know what <number> is ?'],
       ['bcoz u dont know wat is to miss someone',
        'but sometimes one can not express the same', '😢'],
       ['yeah', 'i will ask around', 'which is your favourite movie']],
      dtype='<U625')

## 2. Loading Word Embeddings

In [87]:
def _readEmbeddingFile(path, embeddingsIndex, skip_header=False):
    """Parse `token v1 v2 ...` lines from `path` into `embeddingsIndex`.

    Blank lines are ignored. When `skip_header` is true the first line of
    the file is discarded before parsing.
    """
    with io.open(path, encoding="utf8") as f:
        if skip_header:
            next(f, None)
        for line in f:
            values = line.split()
            if not values:
                continue
            embeddingsIndex[values[0]] = np.asarray(values[1:], dtype='float32')


def getEmbeddings(file, emoji_file=r'D:\Downloads\DP\emoji2vec300.txt', dim=300):
    """Load word vectors from `file` and emoji vectors from `emoji_file`.

    Both files are whitespace-separated text with one token per line
    followed by its vector components. The first line of `emoji_file` is
    skipped as a header; `file` is read from its first line. Entries read
    from `emoji_file` overwrite entries with the same token from `file`.

    Args:
        file: path of the word-embedding text file.
        emoji_file: path of the emoji-embedding text file. The default is
            the location the notebook was originally run with, written as a
            raw string so the backslashes are not treated as escapes.
        dim: embedding dimensionality reported to the caller (default 300,
            the value the original implementation always returned).

    Returns:
        `(embeddingsIndex, dim)` where `embeddingsIndex` maps each token to
        a float32 numpy vector.
    """
    embeddingsIndex = {}
    _readEmbeddingFile(file, embeddingsIndex)
    _readEmbeddingFile(emoji_file, embeddingsIndex, skip_header=True)
    return embeddingsIndex, dim


def getEmbeddingMatrix(wordIndex, embeddings, dim):
    """Build the embedding matrix for the tokenizer vocabulary.

    Args:
        wordIndex: mapping of word -> integer row index.
        embeddings: mapping of word -> embedding vector.
        dim: number of columns of the matrix.

    Returns:
        A `(len(wordIndex) + 1, dim)` float array whose row `i` holds the
        vector of the word indexed `i`. Rows of words that are missing from
        `embeddings` (and row 0, which no word maps to when indices start
        at 1) stay all-zero.
    """
    embeddingMatrix = np.zeros((len(wordIndex) + 1, dim))
    for word, i in wordIndex.items():
        vector = embeddings.get(word)
        if vector is not None:
            # Only copy known vectors; assigning the `None` returned for an
            # unknown word would not produce a valid embedding row.
            embeddingMatrix[i] = vector
    return embeddingMatrix

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Raw string: the Windows path contains backslashes that must not be read
# as (invalid) escape sequences. The resulting value is unchanged.
embeddings, dim = getEmbeddings(r'D:\Downloads\DP\emosense.300d.txt')

# The tokenizer vocabulary is fitted on the embedding vocabulary itself, so
# only tokens that have a pretrained vector receive an index.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([' '.join(embeddings)])

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

embeddings_matrix = getEmbeddingMatrix(wordIndex, embeddings, dim)

Found 658845 unique tokens.


## 3. Texts Tokenization

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from emosent import get_emoji_sentiment_rank


# Number of token positions kept per conversation turn.
MAX_SEQUENCE_LENGTH = 24

# Hold out 20% of the training conversations for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    texts_train,
    labels_train,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the integer labels of every split.
(labels_categorical_train,
 labels_categorical_val,
 labels_categorical_dev,
 labels_categorical_test) = (
    to_categorical(np.asarray(label_ids))
    for label_ids in (y_train, y_val, labels_dev, labels_test)
)


def add_lexical_feature(texts, message, seq):
    """Append a mean emoji-sentiment column to `message`.

    For every row `i`, each character of `texts[i][seq]` is looked up with
    `get_emoji_sentiment_rank`; the "sentiment_score" values of the
    characters that could be looked up are averaged. Rows without any such
    character get 0.

    Args:
        texts: indexable as `texts[i][seq]`, yielding the text of one turn.
        message: 2-D array with one row per entry of `texts`.
        seq: index of the turn inside each conversation.

    Returns:
        A new array equal to `message` with one extra trailing column that
        holds the per-row mean score. `message` itself is not modified.
    """
    sent_arr = np.zeros((len(message), 1))
    for row in range(len(message)):
        scores = []
        for char in texts[row][seq]:
            try:
                scores.append(get_emoji_sentiment_rank(char)["sentiment_score"])
            except Exception:
                # Best effort: characters without a sentiment entry are
                # skipped. Unlike a bare `except`, this lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
        if scores:
            sent_arr[row] = sum(scores) / len(scores)

    return np.append(message, sent_arr, axis=1)
    

def get_sequances(texts, sequence_length):
    """Turn the three conversation turns into model-ready arrays.

    Each column of `texts` (turn 1, 2 and 3) is converted to index
    sequences with the module-level `tokenizer`, passed through
    `pad_sequences` with `sequence_length`, and extended by
    `add_lexical_feature` with one extra column.

    Returns:
        A tuple `(first, second, third)` with one array per turn.
    """
    padded_turns = [
        pad_sequences(tokenizer.texts_to_sequences(texts[:, turn_idx]), sequence_length)
        for turn_idx in range(3)
    ]
    message_first, message_second, message_third = (
        add_lexical_feature(texts, padded, turn_idx)
        for turn_idx, padded in enumerate(padded_turns)
    )
    return message_first, message_second, message_third


# Build the per-turn input arrays for every split.
(message_first_message_train,
 message_second_message_train,
 message_third_message_train) = get_sequances(X_train, MAX_SEQUENCE_LENGTH)

(message_first_message_val,
 message_second_message_val,
 message_third_message_val) = get_sequances(X_val, MAX_SEQUENCE_LENGTH)

(message_first_message_dev,
 message_second_message_dev,
 message_third_message_dev) = get_sequances(texts_dev, MAX_SEQUENCE_LENGTH)

(message_first_message_test,
 message_second_message_test,
 message_third_message_test) = get_sequances(texts_test, MAX_SEQUENCE_LENGTH)

## 3. Bidirectional LSTM 

In [93]:
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Activation, \
    Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GaussianNoise
from tensorflow.keras.models import Model


def buildModel(embeddings_matrix, sequence_length, lstm_dim, hidden_layer_dim, num_classes,
               noise=0.1, dropout_lstm=0.2, dropout=0.2):
    """Build and compile the three-turn bidirectional LSTM classifier.

    Each of the three integer inputs goes through one shared, frozen
    embedding layer initialised from `embeddings_matrix`, then through its
    own GaussianNoise layer and a bidirectional LSTM. Turns 1 and 3 share
    one encoder; turn 2 uses a separate one. The three encodings are
    concatenated, passed through dropout and a ReLU dense layer, and
    classified with a softmax layer.

    Args:
        embeddings_matrix: 2-D array of pretrained embedding weights.
        sequence_length: number of positions of every input sequence.
        lstm_dim: units of each LSTM direction.
        hidden_layer_dim: units of the dense hidden layer.
        num_classes: size of the softmax output.
        noise: standard deviation passed to GaussianNoise.
        dropout_lstm: `dropout` argument of the LSTM layers.
        dropout: rate of the dropout layer after concatenation.

    Returns:
        The compiled Keras model (categorical cross-entropy, Adam, accuracy).
    """
    turn_inputs = [Input(shape=(sequence_length,), dtype='int32') for _ in range(3)]

    vocab_size, embedding_dim = embeddings_matrix.shape
    embeddingLayer = Embedding(vocab_size,
                               embedding_dim,
                               weights=[embeddings_matrix],
                               input_length=sequence_length,
                               trainable=False)
    embedded_turns = [embeddingLayer(turn) for turn in turn_inputs]

    noise_shape = (None, sequence_length, embedding_dim)
    noisy_turns = [GaussianNoise(noise, input_shape=noise_shape)(turn)
                   for turn in embedded_turns]

    # The first and third turn are encoded by the same layer instance.
    outer_turns_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    middle_turn_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    encoders = (outer_turns_encoder, middle_turn_encoder, outer_turns_encoder)
    encoded_turns = [encoder(turn) for encoder, turn in zip(encoders, noisy_turns)]

    merged = Concatenate(axis=-1)(encoded_turns)
    regularized = Dropout(dropout)(merged)
    hidden = Dense(hidden_layer_dim, activation='relu')(regularized)
    output = Dense(num_classes, activation='softmax')(hidden)

    model = Model(inputs=turn_inputs, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

# The inputs are one position longer than MAX_SEQUENCE_LENGTH because
# add_lexical_feature appends a sentiment column to every padded turn.
# NOTE(review): that extra column reaches the Embedding layer through an
# int32 input like a token id - confirm this is the intended use.
model = buildModel(
    embeddings_matrix,
    MAX_SEQUENCE_LENGTH + 1,
    lstm_dim=64,
    hidden_layer_dim=30,
    num_classes=4,
)

In [94]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 300)      197653800   input_7[0][0]                    
                                                                 input_8[0][0]         

In [95]:
from sklearn.metrics import f1_score, precision_score, recall_score
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

def _f1_selected(y_test, y_pred):
    """Micro-averaged F1 restricted to the happy, sad and angry classes."""
    emotion_labels = [emotion2label[name] for name in ('happy', 'sad', 'angry')]
    return f1_score(y_test, y_pred, average='micro', labels=emotion_labels)


# Named scoring functions taking (true class ids, predicted class ids).
metrics = {"f1_selected": _f1_selected}

# Inputs and one-hot labels of the evaluation splits, keyed by split name.
_datasets = {
    "dev": [
        [message_first_message_dev, message_second_message_dev, message_third_message_dev],
        np.array(labels_categorical_dev),
    ],
    "val": [
        [message_first_message_val, message_second_message_val, message_third_message_val],
        np.array(labels_categorical_val),
    ],
}

In [99]:
# Train on the training split and monitor loss/accuracy on the held-out
# validation split after every epoch.
model.fit(
    x=[message_first_message_train, message_second_message_train, message_third_message_train],
    y=np.array(labels_categorical_train),
    validation_data=(
        [message_first_message_val, message_second_message_val, message_third_message_val],
        np.array(labels_categorical_val),
    ),
    epochs=10,
    batch_size=200,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21024d13898>

In [100]:
# Class probabilities for every dev-set conversation.
y_pred = model.predict([
    message_first_message_dev,
    message_second_message_dev,
    message_third_message_dev,
])

In [101]:
from sklearn.metrics import classification_report

# Compare true and predicted class ids on the dev set.
dev_true_ids = labels_categorical_dev.argmax(axis=1)
dev_pred_ids = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(dev_true_ids, dev_pred_ids))
print(classification_report(dev_true_ids, dev_pred_ids))

f1_selected 0.7023686920700309
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2338
           1       0.66      0.75      0.70       142
           2       0.66      0.84      0.74       125
           3       0.56      0.86      0.68       150

    accuracy                           0.90      2755
   macro avg       0.71      0.84      0.76      2755
weighted avg       0.92      0.90      0.91      2755



## 4. Performance Evaluation

In [102]:
# Predict the test set and compare true and predicted class ids.
y_pred = model.predict([
    message_first_message_test,
    message_second_message_test,
    message_third_message_test,
])
test_true_ids = labels_categorical_test.argmax(axis=1)
test_pred_ids = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(test_true_ids, test_pred_ids))
print(classification_report(test_true_ids, test_pred_ids))

f1_selected 0.701500258665287
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      4677
           1       0.70      0.71      0.71       284
           2       0.61      0.86      0.71       250
           3       0.57      0.88      0.69       298

    accuracy                           0.90      5509
   macro avg       0.71      0.84      0.76      5509
weighted avg       0.92      0.90      0.91      5509



In [104]:
# NOTE(review): this cell runs the same test-set evaluation as the cell
# before it, yet the recorded scores differ - presumably the model was
# retrained in between; confirm which run should be reported.
y_pred = model.predict([
    message_first_message_test,
    message_second_message_test,
    message_third_message_test,
])
test_true_ids = labels_categorical_test.argmax(axis=1)
test_pred_ids = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(test_true_ids, test_pred_ids))
print(classification_report(test_true_ids, test_pred_ids))

f1_selected 0.719832109129066
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      4677
           1       0.60      0.80      0.68       284
           2       0.70      0.84      0.76       250
           3       0.63      0.84      0.72       298

    accuracy                           0.91      5509
   macro avg       0.73      0.85      0.78      5509
weighted avg       0.92      0.91      0.91      5509



In [41]:
def saveSubmissionFile(solution_path, test_data_path, predictions):
    """Write the predicted emotions next to the rows of the test file.

    The output is a tab-separated file with the header
    `id, turn1, turn2, turn3, label`. Every non-blank line of
    `test_data_path` after its header contributes its first four columns,
    followed by the emotion name of the matching prediction.

    Args:
        solution_path: path of the file to write (overwritten).
        test_data_path: path of the tab-separated test file.
        predictions: sequence of class ids, one per data row, in file order.

    Raises:
        IndexError: if there are fewer predictions than data rows.
        KeyError: if a prediction is not a key of `label2emotion`.
    """
    with io.open(solution_path, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
        with io.open(test_data_path, encoding="utf8") as fin:
            fin.readline()  # skip the header row of the test file
            row_idx = 0
            for line in fin:
                stripped = line.strip()
                if not stripped:
                    # Blank lines have no prediction; counting them would
                    # misalign (or overrun) `predictions`.
                    continue
                fout.write('\t'.join(stripped.split('\t')[:4]) + '\t')
                fout.write(label2emotion[predictions[row_idx]] + '\n')
                row_idx += 1
                
                
# Write the latest test-set predictions (class id per row) to results.txt.
# NOTE(review): the rows are read from './starterkitdata/test.txt' while the
# data-loading cell reads 'data/test.txt' - confirm both files contain the
# same rows in the same order so predictions line up.
saveSubmissionFile('results.txt', './starterkitdata/test.txt', y_pred.argmax(axis=1))