# Given a sentence find emoji best describing the sentence

In [2]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [3]:
file = '../data/w2vec.6B.50d.txt'
model = KeyedVectors.load_word2vec_format(file, binary=False)

In [4]:
glove_vector_length = 50

## Sentence to avg for converting sentence to avg vector which will be fed to softmax classifier

In [5]:
def sentence_to_avg(sentence, word_to_vec_model):
    """ Given a sentence break it down into words find its embedding vector and avg them out
    return the avg
    """
    
    avg = np.zeros(word_to_vec_model['man'].shape) # avg size same as word_to_vec_model dim
    
    words = str(sentence).lower().split()
    
    for w in words:
        avg += word_to_vec_model[w]
    
    
    avg = avg/len(words)
    
    return avg

In [6]:
avg = sentence_to_avg("Morrocan couscous is my favorite dish", model)
print(avg)


[-0.008005    0.56370833 -0.50427333  0.258865    0.55131104  0.03104983
 -0.21013718  0.16893933 -0.09590267  0.141784   -0.15708966  0.18525867
  0.64957852  0.38371118  0.21102167  0.11301667  0.02613967  0.26037766
  0.05820667 -0.01578167 -0.12078834 -0.02471267  0.41284552  0.5152061
  0.38756166 -0.89866098 -0.535145    0.33501166  0.68806935 -0.2156265
  1.79715503  0.10476932 -0.36775333  0.750785    0.10282583  0.34892499
 -0.27262834  0.66767999 -0.10706167 -0.28363501  0.59580119  0.28747334
 -0.33666349  0.23393817  0.34349183  0.178405    0.1166155  -0.076433
  0.1445417   0.09808667]


In [7]:
import pandas as pd

In [56]:
train = pd.read_csv('data/train_emoji.csv')

In [57]:
test = pd.read_csv('data/test_emoji.csv')

In [58]:
X_train = train.drop('class', axis=1)
y_train = train['class']

In [59]:
X_test = test.drop('class', axis=1)
y_test = test['class']

In [60]:
onehot_y_train = pd.get_dummies(y_train).as_matrix()
one_hot_y_test = pd.get_dummies(y_test).as_matrix()

In [13]:
# convert input x to array of embedding vectors
def convert_to_e (x):
    x_c = np.zeros((x.size, 50))

    for i in range(x.size):

        x_c[i] = sentence_to_avg(x['sentence'][i], model)

    return x_c

In [14]:
X_train_m = convert_to_e(X_train)
X_test_m = convert_to_e(X_test)

In [15]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [16]:
num_outputs = 5 #classes
learning_rate = 0.01
num_hidden1 = 20

In [17]:
X = tf.placeholder(dtype=tf.float32, shape=(None, 50))
y = tf.placeholder(dtype=tf.float32, shape=(None, 5))

In [18]:
hidden1 = fully_connected(X,num_hidden1,activation_fn=tf.nn.relu)
output = fully_connected(hidden1, num_outputs)

In [19]:
loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=output)

In [20]:
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [21]:
init = tf.global_variables_initializer()

In [22]:
training_steps = 500

with tf.Session() as sess:
    
    sess.run(init)

    for i in range(training_steps):
        
        sess.run(train, feed_dict={X:X_train_m, y:onehot_y_train})
        
    
    logits = output.eval(feed_dict={X:X_test_m})
    pred = tf.argmax(logits, axis=1)
    results = pred.eval()
        
    

In [23]:
results

array([4, 3, 0, 0, 2, 2, 3, 2, 4, 2, 1, 2, 3, 3, 1, 3, 3, 2, 3, 4, 3, 0, 4,
       3, 3, 3, 1, 0, 1, 4, 0, 1, 0, 2, 0, 1, 2, 4, 4, 2, 1, 0, 0, 1, 2, 0,
       2, 2, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [24]:
from sklearn.metrics import classification_report
print(classification_report(results, y_test))

             precision    recall  f1-score   support

          0       0.71      0.50      0.59        10
          1       1.00      1.00      1.00         8
          2       0.72      0.87      0.79        15
          3       0.88      0.88      0.88        16
          4       0.86      0.86      0.86         7

avg / total       0.82      0.82      0.82        56



# Solve emojify problem using LSTM

In [25]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation, Embedding

Using TensorFlow backend.


In [26]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

## convert words in input sentences to indices
## pad the vectors to max_sentence_length
## create a embedding matrix of shape (vocab_size, dim)

vocab_size -> number of unique words in input sentences,
dim -> glove word embedding dim (length of word vector)

## use these in embedding layer for Keras

In [27]:
def convert_word_to_indices_pad(X, max_length):
    """ convertes a word into a vector of integers
    
    X -> a vector of words
    max_length -> max_length of sentence for padding purposes
    
    returns a tuple of tokenzier and padded vector of integers
    """
    t = Tokenizer()
    
    t.fit_on_texts(X)
    X_indices = t.texts_to_sequences(X)
    
    # return tokenizer and padded vector of indices
    return t, pad_sequences(X_indices, maxlen=max_length, padding='post')

In [52]:
def get_max_len(X):
    """ array of sentences in X, return max length of sentence"""
    
    max_len = 0
    for i in range(len(X)):
        
        len1 = len(X[i].split())
        
        if len1 > max_len:
            max_len = len1
            
    return max_len

In [29]:
max_len = get_max_len(X_train['sentence'])

In [30]:
t, X_train_indices = convert_word_to_indices_pad(X_train['sentence'], max_len)

In [31]:
def create_embedding_matrix(t, model):
    """ Create embedding matrix to be passed to Keras Embedding layer.
    t -> tokenizer
    model -> glove model
    
    returns a matrix of shape vocab_size, glove_vector_length
    """
    
    vocab_size = len(t.word_index) + 1
    
    
    e_matrix = np.zeros((vocab_size, glove_vector_length))
    
    for word in t.word_index:
        e_matrix[t.word_index[word]] = model[word]
    
    return e_matrix

In [32]:
e_matrix = create_embedding_matrix(t, model)

In [33]:
# Create keras layers now

In [34]:
vocab_size = len(t.word_index) + 1

In [44]:
model = Sequential()

In [45]:
model.add(Embedding(vocab_size, glove_vector_length, trainable=False, weights=[e_matrix]))

In [46]:
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(num_outputs))
model.add(Activation('softmax'))


In [47]:
model.compile(optimizer='adam', 
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [48]:
model.fit(X_train_indices, onehot_y_train, epochs=50, batch_size=32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a0d3b8c748>

In [None]:
# test data

In [None]:
t, X_train_indices = convert_word_to_indices_pad(X_test['sentence'], get_max_len(X_test['sentence']))

In [61]:
get_max_len(X_test['sentence'])

I want to eat
he did not answer
he got a very nice raise
she got me a nice present
ha ha ha it was so funny
he is a good friend
I am upset
We had such a lovely dinner tonight
where is the food
Stop making this joke ha ha ha
where is the ball
work is hard
This girl is messing with me
are you serious
Let us go play baseball
This stupid grader is not working 
work is horrible
Congratulation for having a baby
stop pissing me off
any suggestions for dinner
I love taking breaks
you brighten my day
I boiled rice
she is a bully
Why are you feeling bad
I am upset
give me the ball
My grandmother is the love of my life
enjoy your game
valentine day is near
I miss you so much
throw the ball
My life is so boring
she said yes
will you be my valentine
he can pitch really well
dance with me
I am hungry
See you at the restaurant
I like to laugh
I will run
I like your jacket 
i miss her
what is your favorite baseball game
Good job
I love you to the stars and back
What you did was awesome
ha ha ha lol
I 

8