# NLP Project: EmoContext

Here we have explored models based on DeepMoji embeddings.   
**DeepMoji needs to be installed for using it**

In [2]:
import sys
sys.path.append('../')

In [3]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.datasets import imdb
from deepmoji.model_def import deepmoji_architecture
import io
import random
from copy import deepcopy

Using TensorFlow backend.


In [87]:
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.create_vocab import extend_vocab, VocabBuilder
from deepmoji.word_generator import WordGenerator
from deepmoji.tokenizer import tokenize
from sklearn.model_selection import train_test_split

In [5]:
from utills import *
from keras.layers import *
from keras.models import Model, Sequential
from keras.optimizers import Adam
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH

## Data read

In [6]:
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in  "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '
                line = cSpace.join(lineSplit)

            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)

#             conv = ' <eos> '.join(line[1:4])

            # Remove any duplicate spaces
#             duplicateSpacePattern = re.compile(r'\ +')
#             conv = re.sub(duplicateSpacePattern, ' ', conv)
            
#             for l in line[1:4]:
                
            indices.append(int(line[0]))
            conversations.append(line[1:4])

    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations

In [7]:
_ , conv_train = preprocessData('../data/train.txt', 'test')
_ , conv_test = preprocessData('../data/devwithoutlabels.txt', 'test')

In [8]:
convs = conv_train + conv_test

## Extending Vocab

In [17]:
# loading deafult voacb
with open('./../DeepMoji/model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

In [18]:
new_words = set()

In [19]:
for conv in convs:
    for line in conv:
        words = tokenize(line)
        for word in words:
            if word.lower() in vocabulary:
                pass
            else:
                new_words.add(word.lower())

In [20]:
len(new_words)

5197

In [21]:
word_gen = WordGenerator(list(new_words))
vb = VocabBuilder(word_gen)
vb.count_all_words()

In [22]:
print(len(vocabulary))
print(len(vb.word_counts.keys()))

50000
4517


In [23]:
# Hyperparameters
extend_vocab(vocabulary, vb, max_tokens=3000)

3000

In [24]:
print(len(vocabulary))

53000


In [25]:
vb.save_vocab(path='../data/new_vocab')

Saved dict to ../data/new_vocab


## Models

In [27]:
MAX_LEN = 10

### Model 1: Simple MLP model

In [216]:
model = Sequential()

deep_feature_extractor = deepmoji_feature_encoding(MAX_LEN, PRETRAINED_PATH)

model.add(
    TimeDistributed(
        deep_feature_extractor,
        input_shape=(3,MAX_LEN)
    ))

model.add(Flatten())
model.add(Dropout(rate=0.5))
# model.add(Dense(2304, activation='relu'))
# model.add(Dropout(rate=0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(4,activation='softmax'))

Loading weights for embedding
Loading weights for bi_lstm_0
Loading weights for bi_lstm_1
Loading weights for attlayer
Ignoring weights for softmax


In [217]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_17 (TimeDis (None, 3, 2304)           22247680  
_________________________________________________________________
flatten_16 (Flatten)         (None, 6912)              0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 6912)              0         
_________________________________________________________________
dense_35 (Dense)             (None, 512)               3539456   
_________________________________________________________________
dropout_44 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 4)                 2052      
Total params: 25,789,188
Trainable params: 25,789,188
Non-trainable params: 0
________________________________________________________________

### Model 2: 1-D Conv-> LSTM

In [219]:
model = Sequential()

# deep_feature_extractor = deepmoji_feature_encoding(MAX_LEN, PRETRAINED_PATH)

model.add(
    TimeDistributed(
        deep_feature_extractor,
        input_shape=(3,MAX_LEN)
    ))

In [220]:
model.add(Conv1D(32,kernel_size=3,padding='same',activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.4))
model.add(LSTM(62,return_sequences=True))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.45))
model.add(Dense(4,activation='softmax'))

In [221]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_18 (TimeDis (None, 3, 2304)           22247680  
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 3, 32)             221216    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
dropout_45 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
lstm_36 (LSTM)               (None, 1, 62)             23560     
_________________________________________________________________
dropout_46 (Dropout)         (None, 1, 62)             0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 62)                0         
__________

### Model 3: 1-D Conv-> Desne

In [222]:
model = Sequential()

# deep_feature_extractor = deepmoji_feature_encoding(MAX_LEN, PRETRAINED_PATH)

model.add(
    TimeDistributed(
        deep_feature_extractor,
        input_shape=(3,MAX_LEN)
    ))
def model_cnn():
    # 1D Conv Layer with multiple possible kernel sizes
    inputs = Input(shape=(3, 2304))

    model = Conv1D(filters=300,
                   kernel_size=3,
                   padding='valid',
                   activation='relu',
                   kernel_regularizer=regularizers.l2(0.001),
                   strides=1)(inputs)
    
    model = GlobalMaxPooling1D()(model)

    flat_input = Flatten()(inputs)
    flat_input = Dense(512, activation='relu',
                       kernel_regularizer=regularizers.l2(0.01),
                       activity_regularizer=regularizers.l2(0.01))(flat_input)
    flat_input = Dropout(0.5)(flat_input)


    model = Concatenate()([model, flat_input])

    model = Dense(264, activation='relu', kernel_regularizer=regularizers.l2(0.01))(model)
    model = Dropout(0.3)(model)
    predictions = Dense(4, activation='softmax')(model)
    model = Model(inputs=inputs, outputs=predictions)
    return model

In [223]:
model.add(model_cnn())

In [225]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_19 (TimeDis (None, 3, 2304)           22247680  
_________________________________________________________________
model_4 (Model)              (None, 4)                 5829048   
Total params: 28,076,728
Trainable params: 28,076,728
Non-trainable params: 0
_________________________________________________________________


In [None]:
# common to all stuffs

In [211]:
adam = Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [215]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_16 (TimeDis (None, 3, 2304)           22247680  
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 3, 200)            2765000   
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 1, 200)            0         
_________________________________________________________________
dropout_40 (Dropout)         (None, 1, 200)            0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 1, 62)             65224     
_________________________________________________________________
dropout_41 (Dropout)         (None, 1, 62)             0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 62)                0         
__________

In [152]:
# from keras.utils import plot_model
# plot_model(model, to_file='model.png')

## Data preprocessing

In [32]:
_, X, Y = preprocessData('../data/train.txt',mode='train')

In [33]:
st = SentenceTokenizer(vocabulary, MAX_LEN)

In [34]:
for i in range(len(X)):
    X[i], infos, stats = st.tokenize_sentences(X[i])

In [35]:
X = np.array(X)
X.shape

(30160, 3, 10)

In [36]:
Y = to_categorical(Y)
Y.shape

(30160, 4)

## Data Augmentation

- random remove stopwords
- random switch words

In [62]:
indexes = [ i for i in range(MAX_LEN)]

def switch_words(X):
    p = random.random()
    if p < 0.3:
        new_x = deepcopy(X)
        
        try:
            last_index = np.where(new_x == 0)[0][0]
        except:
            last_index = MAX_LEN - 1
        
        first = random.randint(0,last_index) 
        second = random.randint(0,last_index) 
        new_x[first],new_x[second] = new_x[second],new_x[first]
        return new_x
    return X

def remove_random_word(X):
    p = random.random()
    if p < 0.5:
        where = random.sample(indexes,1)[0]
        where = int(where)
        new_x = deepcopy(X)
        if new_x[where] is 0:
            pass
        else:
            new_x[where] = 1
            return new_x 
    return X

In [65]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, 
                                                    test_size=0.1, 
                                                    shuffle=True,
                                                    stratify=Y,
                                                    random_state=32)

In [66]:
x_train = list(x_train)
y_train = list(y_train)

n_sample = len(x_train)
for sample in range(n_sample):
    new_sample = []
    is_new_sample = False
    for line in x_train[sample]:
        x_1 = switch_words(line)
        x_2 = remove_random_word(x_1)
        new_sample.append(x_2)
        
        if x_2 is not line:
            is_new_sample = True
    
    if is_new_sample:
        new_sample = np.array(new_sample).reshape(3,MAX_LEN)
        x_train.append(new_sample)
        y_train.append(y_train[sample])

In [67]:
x_train = np.array(x_train)
print(x_train.shape)
y_train = np.array(y_train)
print(y_train.shape)

(53154, 3, 10)
(53154, 4)


## Train

In [213]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
earlystop = EarlyStopping(monitor='val_acc', min_delta=0.01, patience=4, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath='./weights_all_cnn_lstm.hdf5', verbose=1, save_best_only=True, save_weights_only=True, monitor='val_loss')

In [214]:
history = model.fit(x_train,
          y_train,
          validation_data=(x_test,y_test),
          shuffle=True,
          batch_size=32,
          epochs=2,
          callbacks=[earlystop,checkpointer],
          initial_epoch=0)  

Train on 53154 samples, validate on 3016 samples
Epoch 1/2
Epoch 2/2


In [202]:
# model.load_weights('./weights_all_cnn_lstm.hdf5')

## Testing Upload 

A file called `test.txt` is created which needs to be zipped  and uploaded to the the platform

In [203]:
_, x_dev = preprocessData('../data/devwithoutlabels.txt',mode='test')

st = SentenceTokenizer(vocabulary, MAX_LEN)

for i in range(len(x_dev)):
    x_dev[i], infos, stats = st.tokenize_sentences(x_dev[i])

x_dev = np.array(x_dev)

In [204]:
predictions = model.predict(x_dev, batch_size=64,verbose=1)
predictions = predictions.argmax(axis=1)



In [205]:
with io.open('../test.txt', "w", encoding="utf8") as fout:
    fout.write(unicode('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n'))        
    with io.open('../data/devwithoutlabels.txt', encoding="utf8") as fin:
        fin.readline()
        for lineNum, line in enumerate(fin):
            fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
            fout.write(unicode(label2emotion[predictions[lineNum]] + '\n'))

## Let's see how this looks on the test model

In [206]:
pred_Y = model.predict_classes(x_test)
pred_Y = to_categorical(pred_Y)



In [207]:
getMetrics(pred_Y,y_test)

True Positives per class :  [ 1380.   358.   479.   471.]
False Positives per class :  [ 140.   70.   63.   55.]
False Negatives per class :  [ 115.   66.   67.   80.]
Class happy : Precision : 0.836, Recall : 0.844, F1 : 0.840
Class sad : Precision : 0.884, Recall : 0.877, F1 : 0.881
Class angry : Precision : 0.895, Recall : 0.855, F1 : 0.875
Ignoring the Others class, Macro Precision : 0.8719, Macro Recall : 0.8588, Macro F1 : 0.8653
Ignoring the Others class, Micro TP : 1308, FP : 188, FN : 213
Accuracy : 0.8912, Micro Precision : 0.8743, Micro Recall : 0.8600, Micro F1 : 0.8671


(0.89124668435013266,
 0.87433155080213909,
 0.85996055226824453,
 0.86708650977792501)

In [125]:
def getMetrics(predictions, ground):
    """
    FROM: Baseline/starting_kit

    Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    """
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))

    truePositives = np.sum(discretePredictions * ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground - discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, 4):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = (2 * recall * precision) / (precision + recall) if (precision + recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision) / (macroPrecision + macroRecall) if (
                                                                                             macroPrecision + macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (
        macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (
        truePositives, falsePositives, falseNegatives))

    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)

    microF1 = (2 * microRecall * microPrecision) / (microPrecision + microRecall) if (
                                                                                             microPrecision + microRecall) > 0 else 0
    # -----------------------------------------------------

    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions == ground)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (
        accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1


## Notes

```
MAX_LEN = 10

model = Sequential()

deep_feature_extractor = deepmoji_feature_encoding(MAX_LEN, PRETRAINED_PATH)

model.add(
    TimeDistributed(
        deep_feature_extractor,
        input_shape=(3,MAX_LEN)
    ))

model.add(Flatten())
model.add(Dropout(rate=0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(4,activation='softmax'))

adam = Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()
```

```_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
time_distributed_2 (TimeDist (None, 3, 2304)           22247680  
_________________________________________________________________
flatten_2 (Flatten)          (None, 6912)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 6912)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               3539456   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 2052      
=================================================================
Total params: 25,789,188
Trainable params: 25,789,188
Non-trainable params: 0
```

On 0.1% stratified test
```
True Positives per class :  [ 1393.   346.   486.   478.]
False Positives per class :  [ 145.   44.   71.   53.]
False Negatives per class :  [ 102.   78.   60.   73.]
Class happy : Precision : 0.887, Recall : 0.816, F1 : 0.850
Class sad : Precision : 0.873, Recall : 0.890, F1 : 0.881
Class angry : Precision : 0.900, Recall : 0.868, F1 : 0.884
Ignoring the Others class, Macro Precision : 0.8866, Macro Recall : 0.8579, Macro F1 : 0.8720
Ignoring the Others class, Micro TP : 1310, FP : 168, FN : 211
Accuracy : 0.8962, Micro Precision : 0.8863, Micro Recall : 0.8613, Micro F1 : 0.8736
```


## CNN on embeddings

```
def model_cnn():
    # 1D Conv Layer with multiple possible kernel sizes
    inputs = Input(shape=(3, 2304))

    model = Conv1D(filters=300,
                   kernel_size=3,
                   padding='valid',
                   activation='relu',
                   kernel_regularizer=regularizers.l2(0.001),
                   strides=1)(inputs)
    
    model = GlobalMaxPooling1D()(model)

    flat_input = Flatten()(inputs)
    flat_input = Dense(512, activation='relu',
                       kernel_regularizer=regularizers.l2(0.01),
                       activity_regularizer=regularizers.l2(0.01))(flat_input)
    flat_input = Dropout(0.5)(flat_input)


    model = Concatenate()([model, flat_input])

    model = Dense(264, activation='relu', kernel_regularizer=regularizers.l2(0.01))(model)
    model = Dropout(0.3)(model)
    predictions = Dense(4, activation='softmax')(model)
    model = Model(inputs=inputs, outputs=predictions)
    return model
    
 ````
 
On 0.1% stratified test
```
True Positives per class :  [ 1385.   312.   473.   458.]
False Positives per class :  [ 201.   47.   78.   62.]
False Negatives per class :  [ 110.  112.   73.   93.]
Class happy : Precision : 0.869, Recall : 0.736, F1 : 0.797
Class sad : Precision : 0.858, Recall : 0.866, F1 : 0.862
Class angry : Precision : 0.881, Recall : 0.831, F1 : 0.855
Ignoring the Others class, Macro Precision : 0.8694, Macro Recall : 0.8111, Macro F1 : 0.8393
Ignoring the Others class, Micro TP : 1243, FP : 187, FN : 278
Accuracy : 0.8714, Micro Precision : 0.8692, Micro Recall : 0.8172, Micro F1 : 0.8424
```

## CNN_LSTM Small

```
model = Sequential()

deep_feature_extractor = deepmoji_feature_encoding(MAX_LEN, PRETRAINED_PATH)

model.add(
    TimeDistributed(
        deep_feature_extractor,
        input_shape=(3,MAX_LEN)
    ))

model.add(Conv1D(32,kernel_size=3,padding='same',activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.4))
model.add(LSTM(50,return_sequences=True))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.45))
model.add(Dense(4,activation='softmax'))
```

```
True Positives per class :  [ 1372.   360.   474.   472.]
False Positives per class :  [ 148.   77.   59.   54.]
False Negatives per class :  [ 123.   64.   72.   79.]
Class happy : Precision : 0.824, Recall : 0.849, F1 : 0.836
Class sad : Precision : 0.889, Recall : 0.868, F1 : 0.879
Class angry : Precision : 0.897, Recall : 0.857, F1 : 0.877
Ignoring the Others class, Macro Precision : 0.8701, Macro Recall : 0.8579, Macro F1 : 0.8640
Ignoring the Others class, Micro TP : 1306, FP : 190, FN : 215
Accuracy : 0.8879, Micro Precision : 0.8730, Micro Recall : 0.8586, Micro F1 : 0.8658
```