In [2]:
import emoji
import numpy as np

In [3]:
#define emoji dictionary used
emoji_dictionary = {"0" : "\u2764\uFE0F",
                    "1" : ":baseball:",
                    "2" : ":grinning_face_with_big_eyes:",
                    "3" : ":disappointed_face:",
                    "4" : ":fork_and_knife:"
                   }


In [4]:
for e in emoji_dictionary:
    print(emoji.emojize(emoji_dictionary[e]))

❤️
⚾
😃
😞
🍴


In [5]:
def process_x(path):
    f = open(path)
    text = f.read()
    f.close()
    text = text.replace("' " , "',")
    text = text.replace("\n", ',')
    sep = text.split(",")
    for i,sent in enumerate(sep):
        sep[i] = sent.replace("'", "")
        sep[i] = sep[i].split()
    x = np.array(sep)
    return x

def process_y(path):
    f = open(path)
    y = f.read()
    f.close()
    y = y.split(" ")
    y_ = []
    for i, yval in enumerate(y):
        y[i] = yval.replace("\n", "")
        y_.append(y[i])
    return np.array(y_)

In [6]:
X_TRAIN_PATH = 'emojify_train_x.csv'
Y_TRAIN_PATH = 'Emojify_Y_train.csv'
X_TEST_PATH =  'emojiy_test_x.csv'
Y_TEST_PATH =  'emojiy_y_test.csv'

In [7]:
x_train = process_x(X_TRAIN_PATH)
y_train = process_y(Y_TRAIN_PATH)

x_test = process_x(X_TEST_PATH)
y_test = process_y(Y_TEST_PATH)

  # This is added back by InteractiveShellApp.init_path()


In [8]:
x_train[:5]

array([list(['never', 'talk', 'to', 'me', 'again']),
       list(['I', 'am', 'proud', 'of', 'your', 'achievements']),
       list(['It', 'is', 'the', 'worst', 'day', 'in', 'my', 'life']),
       list(['Miss', 'you', 'so', 'much']), list(['food', 'is', 'life'])],
      dtype=object)

In [9]:
for i in range(5):
    print(x_train[i], emoji.emojize(emoji_dictionary[y_train[i]]))

['never', 'talk', 'to', 'me', 'again'] 😞
['I', 'am', 'proud', 'of', 'your', 'achievements'] 😃
['It', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'] 😞
['Miss', 'you', 'so', 'much'] ❤️
['food', 'is', 'life'] 🍴


In [10]:
# get word embeddings 

f = open('glove.6B.50d.txt', encoding = 'utf-8')
embeddings = {}

for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float')
    embeddings[word] = coefs

In [11]:
#convert sentence to word vectors and create 3d tensor input for rnn
#length of word vectors to form tensor
emb_dim = list(embeddings.values())[0].shape[0]

def embedding_output(x):
    maxLen = 10
    embedding_out = np.zeros((x.shape[0], maxLen, emb_dim))
    
    #iterate over sentences
    for ix in range(x.shape[0]):
        
        #iterate over list of words for every sentence
        for ij in range(len(x[ix])):
            try:
                embedding_out [ix][ij] = embeddings[x[ix][ij].lower()]
            except:
                embedding_out[ix][ij] = np.zeros((50,))
    return embedding_out

In [12]:
#form the matrix from input data

x_train_embedded_matrix = embedding_output(x_train)
x_test_embedded_matrix = embedding_output(x_test)

In [13]:
print(x_train_embedded_matrix.shape, x_test_embedded_matrix.shape)

(132, 10, 50) (56, 10, 50)


In [14]:
#convert y to one hot
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)


Using TensorFlow backend.


In [15]:
print(y_train.shape, y_test.shape)

(132, 5) (56, 5)


In [16]:
#create model
from keras.models import Sequential
from keras.layers import *

In [24]:
model = Sequential()
model.add(LSTM(64, input_shape = (10, 50)))
model.add(Dropout(0.5))

model.add(Dense(5))
model.add(Activation('softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
_________________________________________________________________
activation_2 (Activation)    (None, 5)                 0         
Total params: 29,765
Trainable params: 29,765
Non-trainable params: 0
_________________________________________________________________


In [25]:
modelstack = Sequential()
modelstack.add(LSTM(64, input_shape = (10, 50), return_sequences = True))
modelstack.add(Dropout(0.5))
modelstack.add(LSTM(64, return_sequences = False))
modelstack.add(Dropout(0.5))
modelstack.add(Dense(5))
modelstack.add(Activation('softmax'))

modelstack.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
modelstack.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 10, 64)            29440     
_________________________________________________________________
dropout_4 (Dropout)          (None, 10, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
_________________________________________________________________
activation_3 (Activation)    (None, 5)                 0         
Total params: 62,789
Trainable params: 62,789
Non-trainable params: 0
__________________________________________________

In [27]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("best_model.h5", monitor = 'val_loss', verbose = True, save_best_only = True)
earlystop = EarlyStopping(monitor = 'val_acc', patience = 5)

model.fit(x_train_embedded_matrix, y_train, epochs = 100, batch_size= 64, shuffle = True, validation_split = 0.2, callbacks = [checkpoint, earlystop])

Train on 105 samples, validate on 27 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.17761, saving model to best_model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 1.17761 to 1.03707, saving model to best_model.h5
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.03707
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.03707
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.03707
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.03707
Epoch 7/100





Epoch 00007: val_loss did not improve from 1.03707
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.03707
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.03707
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.03707
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.03707
Epoch 12/100

Epoch 00012: val_loss did not improve from 1.03707
Epoch 13/100

Epoch 00013: val_loss did not improve from 1.03707
Epoch 14/100

Epoch 00014: val_loss did not improve from 1.03707
Epoch 15/100

Epoch 00015: val_loss did not improve from 1.03707
Epoch 16/100

Epoch 00016: val_loss did not improve from 1.03707
Epoch 17/100

Epoch 00017: val_loss did not improve from 1.03707
Epoch 18/100

Epoch 00018: val_loss did not improve from 1.03707
Epoch 19/100

Epoch 00019: val_loss did not improve from 1.03707
Epoch 20/100

Epoch 00020: val_loss did not improve from 1.03707
Epoch 21/100

Epoch 00021: val_loss did not improve from 1.03707
Epoch 22/100

Epoch 00022: val_loss did 


Epoch 00049: val_loss did not improve from 1.03707
Epoch 50/100

Epoch 00050: val_loss did not improve from 1.03707
Epoch 51/100

Epoch 00051: val_loss did not improve from 1.03707
Epoch 52/100

Epoch 00052: val_loss did not improve from 1.03707
Epoch 53/100

Epoch 00053: val_loss did not improve from 1.03707
Epoch 54/100

Epoch 00054: val_loss did not improve from 1.03707
Epoch 55/100

Epoch 00055: val_loss did not improve from 1.03707
Epoch 56/100

Epoch 00056: val_loss did not improve from 1.03707
Epoch 57/100

Epoch 00057: val_loss did not improve from 1.03707
Epoch 58/100

Epoch 00058: val_loss did not improve from 1.03707
Epoch 59/100

Epoch 00059: val_loss did not improve from 1.03707
Epoch 60/100

Epoch 00060: val_loss did not improve from 1.03707
Epoch 61/100

Epoch 00061: val_loss did not improve from 1.03707
Epoch 62/100

Epoch 00062: val_loss did not improve from 1.03707
Epoch 63/100

Epoch 00063: val_loss did not improve from 1.03707
Epoch 64/100

Epoch 00064: val_loss di


Epoch 00090: val_loss did not improve from 1.03707
Epoch 91/100

Epoch 00091: val_loss did not improve from 1.03707
Epoch 92/100

Epoch 00092: val_loss did not improve from 1.03707
Epoch 93/100

Epoch 00093: val_loss did not improve from 1.03707
Epoch 94/100

Epoch 00094: val_loss did not improve from 1.03707
Epoch 95/100

Epoch 00095: val_loss did not improve from 1.03707
Epoch 96/100

Epoch 00096: val_loss did not improve from 1.03707
Epoch 97/100

Epoch 00097: val_loss did not improve from 1.03707
Epoch 98/100

Epoch 00098: val_loss did not improve from 1.03707
Epoch 99/100

Epoch 00099: val_loss did not improve from 1.03707
Epoch 100/100

Epoch 00100: val_loss did not improve from 1.03707


<keras.callbacks.callbacks.History at 0x27b7e68ae48>

In [28]:


checkpoint = ModelCheckpoint("best_model_stack.h5", monitor = 'val_loss', verbose = True, save_best_only = True)
earlystop = EarlyStopping(monitor = 'val_acc', patience = 5)

modelstack.fit(x_train_embedded_matrix, y_train, epochs = 100, batch_size= 64, shuffle = True, validation_split = 0.2, callbacks = [checkpoint, earlystop])

Train on 105 samples, validate on 27 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.03850, saving model to best_model_stack.h5
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.03850
Epoch 3/100

Epoch 00003: val_loss improved from 1.03850 to 0.95968, saving model to best_model_stack.h5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.95968
Epoch 5/100





Epoch 00005: val_loss did not improve from 0.95968
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.95968
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.95968
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.95968
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.95968
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.95968
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.95968
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.95968
Epoch 13/100

Epoch 00013: val_loss did not improve from 0.95968
Epoch 14/100

Epoch 00014: val_loss did not improve from 0.95968
Epoch 15/100

Epoch 00015: val_loss did not improve from 0.95968
Epoch 16/100

Epoch 00016: val_loss did not improve from 0.95968
Epoch 17/100

Epoch 00017: val_loss did not improve from 0.95968
Epoch 18/100

Epoch 00018: val_loss did not improve from 0.95968
Epoch 19/100

Epoch 00019: val_loss did not improve from 0.95968
Epoch 20/100

Epoch 00020: val_loss did no


Epoch 00047: val_loss did not improve from 0.95968
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.95968
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.95968
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.95968
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.95968
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.95968
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.95968
Epoch 54/100

Epoch 00054: val_loss did not improve from 0.95968
Epoch 55/100

Epoch 00055: val_loss did not improve from 0.95968
Epoch 56/100

Epoch 00056: val_loss did not improve from 0.95968
Epoch 57/100

Epoch 00057: val_loss did not improve from 0.95968
Epoch 58/100

Epoch 00058: val_loss did not improve from 0.95968
Epoch 59/100

Epoch 00059: val_loss did not improve from 0.95968
Epoch 60/100

Epoch 00060: val_loss did not improve from 0.95968
Epoch 61/100

Epoch 00061: val_loss did not improve from 0.95968
Epoch 62/100

Epoch 00062: val_loss di


Epoch 00088: val_loss did not improve from 0.95968
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.95968
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.95968
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.95968
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.95968
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.95968
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.95968
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.95968
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.95968
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.95968
Epoch 98/100

Epoch 00098: val_loss did not improve from 0.95968
Epoch 99/100

Epoch 00099: val_loss did not improve from 0.95968
Epoch 100/100

Epoch 00100: val_loss did not improve from 0.95968


<keras.callbacks.callbacks.History at 0x27c79bd30f0>

In [30]:
model.load_weights("best_model.h5")
modelstack.load_weights("best_model_stack.h5")

In [31]:
pred = model.predict_classes(x_test_embedded_matrix)
predstack = modelstack.predict_classes(x_test_embedded_matrix)

In [32]:

acc = model.evaluate(x_test_embedded_matrix, y_test)
print('normal model gives -  ', round(acc[1],2) *100)

accstack = modelstack.evaluate(x_test_embedded_matrix, y_test)
print('stacked model gives -  ', round(accstack[1], 2)*100)

normal model gives -   68.0
stacked model gives -   56.99999999999999


In [27]:
pred

array([3, 3, 2, 0, 2, 2, 3, 2, 1, 2, 1, 2, 0, 3, 1, 3, 2, 2, 3, 3, 0, 0,
       4, 2, 3, 3, 1, 0, 1, 2, 0, 1, 3, 2, 3, 1, 2, 4, 1, 0, 1, 0, 2, 0,
       2, 0, 3, 2, 3, 1, 3, 0, 3, 2, 0, 0], dtype=int64)

In [61]:
for i in pred:
    print(emoji.emojize(emoji_dictionary[str(i)]))

😞
😞
😃
❤️
😃
😃
😞
😃
⚾
😃
⚾
😃
❤️
😞
⚾
😞
😃
😃
😞
😞
❤️
😞
🍴
😃
😞
😞
❤️
❤️
⚾
😃
❤️
⚾
😞
😃
😞
⚾
😃
🍴
⚾
❤️
⚾
❤️
❤️
❤️
😃
❤️
😞
😃
😞
⚾
😞
❤️
😞
😃
❤️
❤️
ERROR! Session/line number was not unique in database. History logging moved to new session 89


In [66]:
with open('emojiy_y_test.csv') as f:
    y_t = f.read().split()
for i in pred:
    print(emoji.emojize(emoji_dictionary[str(i)]))

😞
😞
😃
❤️
😃
😃
😞
😃
⚾
😃
⚾
😃
❤️
😞
⚾
😞
😃
😃
😞
😞
❤️
😞
🍴
😃
😞
😞
❤️
❤️
⚾
😃
❤️
⚾
😞
😃
😞
⚾
😃
🍴
⚾
❤️
⚾
❤️
❤️
❤️
😃
❤️
😞
😃
😞
⚾
😞
❤️
😞
😃
❤️
❤️


In [64]:
sent = [[' '.join(i)] for i in x_test]
for idx in range(len(sent)):
    sent[idx][0] = sent[idx][0].replace('\\t', '')

In [69]:
print("Sentence")
print("actual")
print("predicted")
for idx,[line] in enumerate(sent):
    print(line)
    print(emoji.emojize(emoji_dictionary[y_t[idx]]))
    print(emoji.emojize(emoji_dictionary[str(pred[idx])]))

Sentence
actual
predicted
I want to eat
🍴
😞
he did not answer
😞
😞
he got a very nice raise
😃
😃
she got me a nice present
😃
❤️
ha ha ha it was so funny
😃
😃
he is a good friend
😃
😃
I am upset
😞
😞
We had such a lovely dinner tonight
😃
😃
where is the food
🍴
⚾
Stop making this joke ha ha ha
😃
😃
where is the ball
⚾
⚾
work is hard
😞
😃
This girl is messing with me
😞
❤️
are you serious
😞
😞
Let us go play baseball
⚾
⚾
This stupid grader is not working 
😞
😞
work is horrible
😞
😃
Congratulation for having a baby
😃
😃
stop pissing me off
😞
😞
any suggestions for dinner
🍴
😞
I love taking breaks
❤️
❤️
you brighten my day
😃
😞
I boiled rice
🍴
🍴
she is a bully
😞
😃
Why are you feeling bad
😞
😞
I am upset
😞
😞
give me the ball
⚾
❤️
My grandmother is the love of my life
❤️
❤️
enjoy your game
⚾
⚾
valentine day is near
😃
😃
I miss you so much
❤️
❤️
throw the ball
⚾
⚾
My life is so boring
😞
😞
she said yes
😃
😃
will you be my valentine
😃
😞
he can pitch really well
⚾
⚾
dance with me
😃
😃
I am hungry
🍴
🍴
See you at the 