In [138]:
#from load import load_preprocessed_data, label2emotion
import numpy as np
import json
import io
import os
import argparse
from keras.models import load_model
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import KFold
from keras.layers import Input, Dense, Embedding, LSTM, Concatenate, Reshape, GRU, Bidirectional, Dropout, Conv1D, Flatten, MaxPool1D, TimeDistributed
from keras.models import Sequential
from keras import optimizers
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from keras.layers import LeakyReLU
np.random.seed(7)
from keras.models import Model

#Keras Callback to compute F1 score for each epoch
class TestCallback(Callback):
    def __init__(self, test_data):
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        r = self.model.predict(x)
        getMetrics(r, y)


validationDataPath = ""
trainDataPath = ""
testDataPath = ""
solutionPath = ""
gloveDir = ""

trainDataPath = '../data/starterkit/train.txt'
#testDataPath = '../data/starterkit/devwithoutlabels.txt'
groundTruthTestData =  "../data/test.txt"
validationDataPath = '../data/devsetwithlabels.txt'
solutionPath = 'glovelstmtest.txt'
gloveDir = '../data/'
ssweDir = '../data/'


NUM_CLASSES = 4 
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 64
EMBEDDING_DIM = 300
BATCH_SIZE = 200
LSTM_DIM = 300
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
DROPOUT = 0.2


def create_solution_file(model,u_testSequences):
    u_testData = pad_sequences(u_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(u_testData, batch_size=BATCH_SIZE)
    predictions = predictions.argmax(axis=1)

    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
        with io.open(groundTruthTestData, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(label2emotion[predictions[lineNum]] + '\n')
    print("Completed. Model parameters: ")
    print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d"
          % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))
    return

def create_solution_file2(model,u_testSequences):
    u_testData = pad_sequences(u_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict([u_testData,u_testData], batch_size=BATCH_SIZE)
    predictions = predictions.argmax(axis=1)

    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
        with io.open(groundTruthTestData, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(label2emotion[predictions[lineNum]] + '\n')
    print("Completed. Model parameters: ")
    print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d"
          % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))
    return

#Create the embedding matrix
def getEmbeddingMatrix(wordIndex, preEmbeddingDir, preEmbeddingFile):
    embeddingsIndex = {}
    with io.open(os.path.join(preEmbeddingDir, preEmbeddingFile), encoding="utf8") as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embeddingVector = np.array([float(val) for val in values[1:]])
            embeddingsIndex[word] = embeddingVector

    print('Found %s word vectors.' % len(embeddingsIndex))

    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            embeddingMatrix[i] = embeddingVector
        else:
            oov = [np.random.normal(size = EMBEDDING_DIM)]
            oov /= np.linalg.norm(oov)
            embeddingMatrix[i] = oov

    return embeddingMatrix


#Compute the micro F1 score, the average accuracy, the micro precision and the micro recall
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    """
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))

    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)

    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    # -----------------------------------------------------

    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))

    return accuracy, microPrecision, microRecall, microF1


#Bidirectional LSTM Model
def model1(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Dropout(DROPOUT))
    model.add(Bidirectional(LSTM(LSTM_DIM)))
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#GRU Model
def model2(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Dropout(DROPOUT))
    model.add(GRU(128))
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES * 8, activation='relu'))
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#CNN Model
def model3(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Dropout(DROPOUT))
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Conv1D(32, 3, padding='same'))
    model.add(Conv1D(16, 3, padding='same'))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(180, activation='relu'))
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#CNN-LSTM Model
def model4(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(LSTM(LSTM_DIM))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#LSTM-CNN Model
def model5(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, return_sequences=True))
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#CNN-BiLSTM Model
def model6(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(Dropout(DROPOUT))
    model.add(Bidirectional(LSTM(LSTM_DIM)))
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#BiLSTM-CNN Model
def model7(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Bidirectional(LSTM(LSTM_DIM, return_sequences=True)))
    model.add(Dropout(DROPOUT))
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#BiLSTM Model without dropout after embedding layer
def model8(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Bidirectional(LSTM(LSTM_DIM, dropout=0.6)))
    model.add(Dropout(0.9))
    model.add(Dense(100, activation='tanh'))
    model.add(Dropout(0.9))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['acc'])
    model.summary()
    return model


#CNN-GRU Model
def model9(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(GRU(LSTM_DIM))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model


#GRU-CNN Model
def model10(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(GRU(LSTM_DIM, return_sequences=True))
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(2))
    model.add(Flatten())
    model.add(Dropout(0.4))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    return model

#lstm Model for first layer refer to "A Sentiment-and-Semantics-Based Approach for Emotion Detection in Textual Conversations"
def modelLstm(embeddingMatrix):
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(64))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    
    return model

#FC Model for second layer refer to "A Sentiment-and-Semantics-Based Approach for Emotion Detection in Textual Conversations"
def modelFC(embeddingMatrix):
    model = Sequential()
    model.add(Dense(NUM_CLASSES, input_dim=64 * 2))
    model.add(LeakyReLU(alpha=0.05))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    model.summary()
    
    return model

from keras.layers import *

def custom_arch(gloveEmbeddingMatrix,ssweEmbeddingMatrix):

    
    gloveembeddingLayer = Embedding(gloveEmbeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[gloveEmbeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model_glove = Sequential()
    model_glove.add(gloveembeddingLayer)
    model_glove.add(GRU(LSTM_DIM, return_sequences=True))
    model_glove.add(Conv1D(32, 3, padding='same', activation='relu'))
    model_glove.add(MaxPool1D(2))
    model_glove.add(Flatten())
    model_glove.add(Dropout(0.4))
    model_glove.add(Dense(64, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model_glove.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    
    ssweembeddingLayer = Embedding(ssweEmbeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[ssweEmbeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
        
    model_sswe = Sequential()
    model_sswe.add(ssweembeddingLayer)
    model_sswe.add(GRU(LSTM_DIM, return_sequences=True))
    model_sswe.add(Conv1D(32, 3, padding='same', activation='relu'))
    model_sswe.add(MaxPool1D(2))
    model_sswe.add(Flatten())
    model_sswe.add(Dropout(0.4))
    model_sswe.add(Dense(64, activation='softmax'))
    adam = optimizers.adam(lr=LEARNING_RATE)
    model_sswe.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    
    
    #merged = concatenate([model_glove,model_sswe])
    merged = Add()([model_glove.output,model_sswe.output])
    #concatenated = concatenate([model1_out, model2_out])
    out = Dense(4, activation='softmax', name='output_layer')(merged)

    merged_model = Model([model_glove.input,model_sswe.input], out)
    
    
    
    merged_model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    merged_model.summary()
    
    return merged_model


In [126]:
# %load preprocessing.py
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re


stopwords = set(stopwords.words('english')) - set(('not', 'no'))


tags = ['<url>', '<email>', '<user>', '<hashtag>', '</hashtag>',
        '<elongated>', '</elongated>', '<repeated>', '</repeated>']


text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user'],
    annotate={'hashtag', 'elongated', 'repeated'},
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize
)


def preprocess(text):
    txt = text_processor.pre_process_doc(text)
    return list(filter(lambda x: x not in tags and
                                 x not in stopwords and
                                 x not in punctuation, txt))

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [127]:
# %load load.py
import pandas as pd


label2emotion = {0:"others", 1:"happy", 2:"sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

emotion2label_angry = {"others":0, "happy":0, "sad":0, "angry":3}
emotion2label_sad = {"others":0, "happy":0, "sad":2, "angry":0}
emotion2label_happy = {"others":0, "happy":1, "sad":0, "angry":0}


def load_data(path, training):
    data = pd.read_csv(path, encoding='utf-8', sep='\t')
    text = data[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
    if not training:
        return data['id'], text
    else:
        return data['id'], text, data['label']
def load_preprocessed_data(path, training=True):
    if not training:
        id, text = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        return id.values.tolist(), t.values.tolist()
    else:
        id, text, label = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        l = label.apply(lambda x: emotion2label[x])
        return id.values.tolist(), t.values.tolist(), l.values.tolist()

In [128]:
print("Processing training data...")
trainIndices, text_train, labels = load_preprocessed_data(trainDataPath)
print("Processing test data...")
_, text_test = load_preprocessed_data(groundTruthTestData, training=False)
print("Processing validation data...")
_, X_validation, y_validation = load_preprocessed_data(validationDataPath)

print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text_train)
u_trainSequences = tokenizer.texts_to_sequences(text_train)
u_testSequences = tokenizer.texts_to_sequences(text_test)
u_validationSequences = tokenizer.texts_to_sequences(X_validation)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

Processing training data...
Processing test data...
Processing validation data...
Extracting tokens...
Found 14162 unique tokens.


In [90]:

#glove
print("Populating embedding matrix...")
#embeddingMatrix = getEmbeddingMatrix(wordIndex,gloveDir, 'glove.840B.300d.txt')
u_data = pad_sequences(u_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
u_validation = pad_sequences(u_validationSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_validation = to_categorical(np.asarray(y_validation))

np.random.shuffle(trainIndices)
u_data = u_data[trainIndices]
labels = labels[trainIndices]

print("Building model...")
cbks = [ModelCheckpoint('./model1.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback((u_validation, labels_validation))]
model = model10(embeddingMatrix)
model.fit(u_data, labels, validation_data=(u_validation, labels_validation), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
model = load_model('./model1.h5')
print("Creating solution file...")
create_solution_file(model, u_testSequences)


Populating embedding matrix...
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_49 (Embedding)     (None, 64, 300)           4248900   
_________________________________________________________________
gru_47 (GRU)                 (None, 64, 300)           540900    
_________________________________________________________________
conv1d_47 (Conv1D)           (None, 64, 32)            28832     
_________________________________________________________________
max_pooling1d_47 (MaxPooling (None, 32, 32)            0         
_________________________________________________________________
flatten_47 (Flatten)         (None, 1024)              0         
_________________________________________________________________
dropout_47 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_55 (Dense)           

KeyboardInterrupt: 

In [75]:
#glove - lstm
'''
print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex,gloveDir, 'glove.840B.300d.txt')
u_data = pad_sequences(u_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
u_validation = pad_sequences(u_validationSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_validation = to_categorical(np.asarray(y_validation))

np.random.shuffle(trainIndices)
u_data = u_data[trainIndices]
labels = labels[trainIndices]
'''
print("Building model...")
cbks = [ModelCheckpoint('./modellstmglove.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback((u_validation, labels_validation))]
model = modelLstm(embeddingMatrix)
model.fit(u_data, labels, validation_data=(u_validation, labels_validation), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
print("Done training")
model = load_model('./modellstmglove.h5')
#print("Creating solution file...")
#model.layers[0].get_weights()[0]
print(len(model.layers))
#create_solution_file(model, u_testSequences)

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 260       
Total params: 4,342,600
Trainable params: 4,342,600
Non-trainable params: 0
_________________________________________________________________
Train on 30160 samples, validate on 2755 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.32864, saving model to ./modellstmglove.h5
True Positives per class :  [2111.  104.   94.  124.]
False Positives per class :  [84. 81. 75. 82.]
False Negatives per class :  [227.  38.  31.  26.]
Class happy : Precision : 0.562, Recall : 0.732, F1 : 0.636
Class sad : Preci

In [9]:
model = load_model('./modellstmglove.h5')
print(model.layers[2].get_weights()[0].shape)
#print(len(model.layers[1].get_weights()))

glove_lstm_weights = np.zeros(shape = (64,1))

x = model.layers[2].get_weights()[0]
glove_lstm_weights = np.sum(x, axis = 1)
print(glove_lstm_weights.shape)
glove_lstm_weights = glove_lstm_weights.reshape(glove_lstm_weights.shape[0],1)
print(glove_lstm_weights.shape)

(64, 4)
(64,)
(64, 1)


In [46]:
#sswe
print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex,ssweDir, "sswe-r.txt")
u_data = pad_sequences(u_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
u_validation = pad_sequences(u_validationSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_validation = to_categorical(np.asarray(y_validation))

np.random.shuffle(trainIndices)
u_data = u_data[trainIndices]
labels = labels[trainIndices]

print("Building model...")
cbks = [ModelCheckpoint('./modelsswe.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback((u_validation, labels_validation))]
model = model10(embeddingMatrix)
model.fit(u_data, labels, validation_data=(u_validation, labels_validation), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
model = load_model('./modelsswe.h5')
print("Creating solution file...")
create_solution_file(model, u_testSequences)

Populating embedding matrix...
Found 137052 word vectors.
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
gru_1 (GRU)                  (None, 64, 300)           540900    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 64, 32)            28832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________


In [5]:
#sswe lstm
print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex,ssweDir, "sswe-r.txt")
u_data = pad_sequences(u_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
u_validation = pad_sequences(u_validationSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_validation = to_categorical(np.asarray(y_validation))

np.random.shuffle(trainIndices)
u_data = u_data[trainIndices]
labels = labels[trainIndices]

print("Building model...")
cbks = [ModelCheckpoint('./modelsswelstm.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback((u_validation, labels_validation))]
model = modelLstm(embeddingMatrix)
model.fit(u_data, labels, validation_data=(u_validation, labels_validation), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
model = load_model('./modelsswelstm.h5')


Populating embedding matrix...
Found 137052 word vectors.
Building model...
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 300)           4248900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 4,342,600
Trainable params: 4,342,600
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 30160 samples, validate on 2755 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.35367, saving model to ./modelsswelstm.h5
True Positives per class :  [2105.  108.   91.  116.

In [8]:
model = load_model('./modelsswelstm.h5')
print(model.layers[2].get_weights()[0].shape)
#print(len(model.layers[1].get_weights()))

sswe_lstm_weights = np.zeros(shape = (64,1))

x = model.layers[2].get_weights()[0]
sswe_lstm_weights = np.sum(x, axis = 1)
print(sswe_lstm_weights.shape)
sswe_lstm_weights = sswe_lstm_weights.reshape(sswe_lstm_weights.shape[0],1)
print(sswe_lstm_weights.shape)

(64, 4)
(64,)
(64, 1)


In [19]:
glove_sswe_combined = np.concatenate((sswe_lstm_weights, glove_lstm_weights))
print(glove_sswe_combined.shape)

(128, 1)


In [22]:
print(u_data.shape)

(30160, 64)


In [129]:
#combined
print("Populating embedding matrix...")
print("Populating embedding matrix...")
#ssweembeddingMatrix = getEmbeddingMatrix(wordIndex,ssweDir, "sswe-r.txt")
#gloveembeddingMatrix = getEmbeddingMatrix(wordIndex,gloveDir, 'glove.840B.300d.txt')
u_data = pad_sequences(u_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
u_validation = pad_sequences(u_validationSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_validation = to_categorical(np.asarray(y_validation))

np.random.shuffle(trainIndices)
u_data = u_data[trainIndices]
labels = labels[trainIndices]

print("Building model...")
cbks = [ModelCheckpoint('./modelcombined.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback(([np.array(u_validation),np.array(u_validation)], labels_validation))]
model = custom_arch(gloveembeddingMatrix,ssweembeddingMatrix)
model.fit([np.array(u_data),np.array(u_data)], np.array(labels), validation_data=([np.array(u_validation),np.array(u_validation)], np.array(labels_validation)), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
#model = load_model('./model99991.h5')
#print("Creating solution file...")
#create_solution_file(model, u_testSequences)

Populating embedding matrix...
Populating embedding matrix...
Building model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_64_input (InputLayer) (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_65_input (InputLayer) (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_64 (Embedding)        (None, 64, 300)      4248900     embedding_64_input[0][0]         
__________________________________________________________________________________________________
embedding_65 (Embedding)        (None, 64, 300)      4248900     embedding_65_input[0][0]         
_____________________________

<keras.callbacks.History at 0x1fbe7924278>

In [132]:
model = load_model('./modelcombined.h5')
print("Creating solution file...")
create_solution_file2(model, u_testSequences)

Creating solution file...
Completed. Model parameters: 
Learning rate : 0.001, LSTM Dim : 300, Dropout : 0.200, Batch_size : 200


In [56]:
print("Building model...")
cbks = [ModelCheckpoint('./modelsswelstm.h5', verbose=1, monitor='val_loss', save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=2),
        TestCallback((u_validation, labels_validation))]
model = model10(gloveembeddingMatrix )
model.fit(u_data, labels, validation_data=(u_validation, labels_validation), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=True, callbacks=cbks)
model = load_model('./modelsswelstm.h5')
print("Creating solution file...")
create_solution_file(model, u_testSequences)

Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 64, 300)           4248900   
_________________________________________________________________
gru_26 (GRU)                 (None, 64, 300)           540900    
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 64, 32)            28832     
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 32, 32)            0         
_________________________________________________________________
flatten_26 (Flatten)         (None, 1024)              0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_34 (Dense)             (None, 4)                 410

ValueError: Error when checking target: expected dense_34 to have shape (4,) but got array with shape (1,)

In [133]:
def get_test_metrics(predictions,ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    """
    
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    #discretePredictions = to_categorical(predictions.argmax(axis=1))

    truePositives = np.sum(predictions*ground, axis=0)
    falsePositives = np.sum(np.clip(predictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-predictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(0, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    macroPrecision /= 4
    macroRecall /= 4
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Not Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Not Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)

    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    # -----------------------------------------------------

    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))

    

In [134]:

groundTruthTestData = pd.read_csv(groundTruthTestData, encoding='utf-8', sep='\t')

In [135]:
print(groundTruthTestData.shape)

(5509, 5)


In [139]:
print("Processing training data...")
trainIndices, text_train, labels = load_preprocessed_data(trainDataPath)
print("Processing test data...")
_, text_test = load_preprocessed_data(groundTruthTestData, training=False)
print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text_train)
u_trainSequences = tokenizer.texts_to_sequences(text_train)
u_testSequences = tokenizer.texts_to_sequences(text_test)

Processing training data...
Processing test data...
Extracting tokens...


In [52]:

model = load_model('./modelsswe.h5')
print("Creating solution file...")
create_solution_file(model, u_testSequences)

Creating solution file...
Completed. Model parameters: 
Learning rate : 0.001, LSTM Dim : 300, Dropout : 0.200, Batch_size : 200


In [140]:
_, _, y_test_groundtruth = load_preprocessed_data(groundTruthTestData)
_, _, y_test_predicted = load_preprocessed_data(solutionPath)


In [141]:
ground_truth = (to_categorical(np.array(y_test_groundtruth)))
predicted = (to_categorical(np.array(y_test_predicted)))

In [142]:
get_test_metrics(predicted, ground_truth)

True Positives per class :  [4677.  284.  250.  298.]
False Positives per class :  [ 832. 5225. 5259. 5211.]
False Negatives per class :  [0. 0. 0. 0.]
Class others : Precision : 0.849, Recall : 1.000, F1 : 0.918
Class happy : Precision : 0.052, Recall : 1.000, F1 : 0.098
Class sad : Precision : 0.045, Recall : 1.000, F1 : 0.087
Class angry : Precision : 0.054, Recall : 1.000, F1 : 0.103
Not Ignoring the Others class, Macro Precision : 0.2500, Macro Recall : 1.0000, Macro F1 : 0.4000
Not Ignoring the Others class, Micro TP : 832, FP : 15695, FN : 0
Accuracy : 0.8490, Micro Precision : 0.0503, Micro Recall : 1.0000, Micro F1 : 0.0959


In [143]:
#alternative function for getting metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def get_metrics_alternative(y_true, y_pred):
         """Given predicted labels and the respective ground truth labels, display some metrics
         Input: shape [# of samples]
            predictions : Model output labels eg. [1 2 3 0 1 2...]
            ground : Ground truth labels eg. [1 2 3 0 1 2...]]
        """
         p_r_fbeta = (precision_recall_fscore_support(y_true, y_pred, average='macro'))
         acc = accuracy_score(y_true, y_pred)
         print("Macro Precision = ", p_r_fbeta[0])
         print("Macro Recall = ", p_r_fbeta[1])
         print("Macro F1 = ", p_r_fbeta[2])
         print("Accuracy = ", acc)

         p_r_fbeta = (precision_recall_fscore_support(y_true, y_pred, average='micro'))
         print("Micro Precision = ", p_r_fbeta[0])
         print("Micro Recall = ", p_r_fbeta[1])
         print("Micro F1 = ", p_r_fbeta[2])


In [144]:
get_metrics_alternative(y_test_groundtruth,y_test_predicted)

Macro Precision =  0.21224360137956072
Macro Recall =  0.25
Macro F1 =  0.22957981543294717
Accuracy =  0.8489744055182429
Micro Precision =  0.8489744055182429
Micro Recall =  0.8489744055182429
Micro F1 =  0.8489744055182429


  'precision', 'predicted', average, warn_for)
