# Given a sentence find emoji best describing the sentence

You have a tiny dataset (X, Y) where:
- X contains 127 sentences (strings)
- Y contains a integer label between 0 and 4 corresponding to an emoji for each sentence

<img src="data/emoji.png" style="width:700px;height:300px;">
<caption><center> **Figure 1**: EMOJISET - a classification problem with 5 classes. A few examples of sentences are given here. </center></caption>



In [1]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors



In [2]:
file = '../data/w2vec.6B.50d.txt'
tmodel = KeyedVectors.load_word2vec_format(file, binary=False)

In [3]:
glove_vector_length = 50

## Sentence to avg for converting sentence to avg vector which will be fed to softmax classifier

In [4]:
def sentence_to_avg(sentence, word_to_vec_model):
    """ Given a sentence break it down into words find its embedding vector and avg them out
    return the avg
    """
    
    avg = np.zeros(word_to_vec_model['man'].shape) # avg size same as word_to_vec_model dim
    
    words = str(sentence).lower().split()
    
    for w in words:
        avg += word_to_vec_model[w]
    
    
    avg = avg/len(words)
    
    return avg

In [5]:
avg = sentence_to_avg("Morrocan couscous is my favorite dish", tmodel)
print(avg)


[-0.008005    0.56370833 -0.50427333  0.258865    0.55131104  0.03104983
 -0.21013718  0.16893933 -0.09590267  0.141784   -0.15708966  0.18525867
  0.64957852  0.38371118  0.21102167  0.11301667  0.02613967  0.26037766
  0.05820667 -0.01578167 -0.12078834 -0.02471267  0.41284552  0.5152061
  0.38756166 -0.89866098 -0.535145    0.33501166  0.68806935 -0.2156265
  1.79715503  0.10476932 -0.36775333  0.750785    0.10282583  0.34892499
 -0.27262834  0.66767999 -0.10706167 -0.28363501  0.59580119  0.28747334
 -0.33666349  0.23393817  0.34349183  0.178405    0.1166155  -0.076433
  0.1445417   0.09808667]


In [6]:
import pandas as pd

In [29]:
train = pd.read_csv('data/train_emoji.csv')

In [30]:
test = pd.read_csv('data/test_emoji.csv')

In [9]:
X_train = train.drop('class', axis=1)
y_train = train['class']

In [10]:
X_test = test.drop('class', axis=1)
y_test = test['class']

In [48]:
onehot_y_train = pd.get_dummies(y_train).as_matrix()
onehot_y_test = pd.get_dummies(y_test).as_matrix()

In [12]:
# convert input x to array of embedding vectors
def convert_to_e (x):
    x_c = np.zeros((x.size, 50))

    for i in range(x.size):

        x_c[i] = sentence_to_avg(x['sentence'][i], tmodel)

    return x_c

In [13]:
X_train_m = convert_to_e(X_train)
X_test_m = convert_to_e(X_test)

In [14]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [15]:
num_outputs = 5 #classes
learning_rate = 0.01
num_hidden1 = 20

In [16]:
X = tf.placeholder(dtype=tf.float32, shape=(None, 50))
y = tf.placeholder(dtype=tf.float32, shape=(None, 5))

In [17]:
hidden1 = fully_connected(X,num_hidden1,activation_fn=tf.nn.relu)
output = fully_connected(hidden1, num_outputs)

In [18]:
loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=output)

In [19]:
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [20]:
init = tf.global_variables_initializer()

In [21]:
training_steps = 500

with tf.Session() as sess:
    
    sess.run(init)

    for i in range(training_steps):
        
        sess.run(train, feed_dict={X:X_train_m, y:onehot_y_train})
        
    
    logits = output.eval(feed_dict={X:X_test_m})
    pred = tf.argmax(logits, axis=1)
    results = pred.eval()
        
    

In [22]:
results

array([4, 3, 2, 2, 2, 2, 3, 2, 4, 2, 1, 2, 3, 3, 1, 3, 3, 2, 3, 4, 3, 2, 4,
       3, 3, 3, 1, 0, 1, 2, 2, 1, 2, 2, 2, 1, 2, 4, 4, 2, 1, 3, 0, 1, 2, 1,
       2, 2, 3, 3, 3, 3, 3, 2, 2], dtype=int64)

In [23]:
from sklearn.metrics import classification_report
print(classification_report(results, y_test))

             precision    recall  f1-score   support

          0       0.29      1.00      0.44         2
          1       1.00      0.89      0.94         9
          2       1.00      0.86      0.92        21
          3       0.88      0.82      0.85        17
          4       1.00      1.00      1.00         6

avg / total       0.94      0.87      0.89        55



# Solve emojify problem using LSTM

In [24]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation, Embedding

Using TensorFlow backend.


In [25]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

## convert words in input sentences to indices
## pad the vectors to max_sentence_length
## create a embedding matrix of shape (vocab_size, dim)

vocab_size -> number of unique words in input sentences,
dim -> glove word embedding dim (length of word vector)

## use these in embedding layer for Keras

In [26]:
# create tokenizer on total data, this will be used to represent words to integers

In [31]:
token = Tokenizer()
X = train.drop('class', axis=1).append(test.drop('class', axis=1))
token.fit_on_texts(X['sentence'])

In [32]:
def convert_word_to_indices_pad(X, max_length):
    """ converts a word into a vector of integers
    
    X -> an array of sentences
    max_length -> max_length of sentence for padding purposes
    
    returns a tuple of tokenzier and padded vector of integers
    """
   
    X_indices = token.texts_to_sequences(X)
    
    # return tokenizer and padded vector of indices
    return pad_sequences(X_indices, maxlen=max_length, padding='post')

In [33]:
def get_max_len(X):
    """ array of sentences in X, return max length of sentence"""
    
    max_len = 0
    for i in range(len(X)):
        
        len1 = len(X[i].split())
        
        if len1 > max_len:
            max_len = len1
            
    return max_len

In [34]:
max_len = get_max_len(X_train['sentence'])

In [35]:
X_train_indices = convert_word_to_indices_pad(X_train['sentence'], max_len)

In [36]:
def create_embedding_matrix(t, model):
    """ Create embedding matrix to be passed to Keras Embedding layer.
    t -> tokenizer
    model -> glove model
    
    returns a matrix of shape vocab_size, glove_vector_length
    """
    
    vocab_size = len(t.word_index) + 1
    
    
    e_matrix = np.zeros((vocab_size, glove_vector_length))
    
    for word in t.word_index:
        e_matrix[t.word_index[word]] = model[word]
    
    return e_matrix

In [38]:
e_matrix = create_embedding_matrix(token, tmodel)

In [33]:
# Create keras layers now

In [40]:
vocab_size = len(token.word_index) + 1

In [41]:
model = Sequential()

In [42]:
model.add(Embedding(vocab_size, glove_vector_length, trainable=False, weights=[e_matrix]))

In [43]:
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(num_outputs))
model.add(Activation('softmax'))


In [44]:
model.compile(optimizer='adam', 
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [45]:
model.fit(X_train_indices, onehot_y_train, epochs=50, batch_size=32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1d8ec61f898>

In [None]:
# test data

In [46]:
X_test_indices = convert_word_to_indices_pad(X_test['sentence'], get_max_len(X_test['sentence']))

In [49]:
loss, acc = model.evaluate(X_test_indices, onehot_y_test)



In [50]:
print("accuracy: ", acc)

accuracy:  0.854545442625


In [61]:
# Let's see sentences where the model did not do well

In [52]:
pred = model.predict(X_test_indices)

In [58]:
for i in range(len(X_test_indices)):
    predictedclass = np.argmax(pred[i])
    
    if (predictedclass != y_test[i]):
        print('For sentence: ' + X_test['sentence'][i] + ", actual class: " + str(y_test[i])
              + ", predicted class: " + str(predictedclass))

For sentence: she got me a nice present, actual class: 2, predicted class: 0
For sentence: work is hard, actual class: 3, predicted class: 2
For sentence: This girl is messing with me, actual class: 3, predicted class: 0
For sentence: you brighten my day, actual class: 2, predicted class: 0
For sentence: she is a bully, actual class: 3, predicted class: 0
For sentence: My life is so boring, actual class: 3, predicted class: 0
For sentence: go away, actual class: 3, predicted class: 1
For sentence: yesterday we lost again, actual class: 3, predicted class: 1


In [66]:
x_dummy = np.array(['i rock'])
x_dummy_indices = convert_word_to_indices_pad(x_dummy, get_max_len(x_dummy))

print(np.argmax(model.predict(x_dummy_indices)))

2
