In [None]:
import numpy as np
import pandas as pd
import emoji

from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, SimpleRNN,LSTM, Activation
from keras.utils import np_utils

import matplotlib.pyplot as plt

In [None]:
# Load the train and test emoji datasets (both CSV files are attached in the repo).
# Neither file has a header row, so columns are addressed by position (0, 1, ...).
train, test = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('train_emoji.csv', 'test_emoji.csv')
)

In [None]:
# Sanity check: show the first 5 rows of the training frame.
train.head(5)

In [None]:
# Sanity check: show the first 5 rows of the test frame.
test.head(5)

In [None]:
# Lookup table from numeric class label to the emoji alias string for that class.
emoji_dict = {
    0: ":heart:",
    1: ":baseball:",
    2: ":smile:",
    3: ":disappointed:",
    4: ":fork_and_knife:",
}

In [None]:
# Print each class label followed by its rendered emoji, one pair per line.
# NOTE(review): `use_aliases` is presumably only accepted by older `emoji`
# releases -- confirm the pinned package version before upgrading.
for label, alias in emoji_dict.items():
    print(label, emoji.emojize(alias, use_aliases=True))

In [None]:
# Split both frames into inputs (column 0, the sentence) and targets (column 1, the label).
X_train, Y_train = train[0], train[1]
X_test, Y_test = test[0], test[1]

# Show the size of every split, then one raw (sentence, label) example.
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))
print("-------------------------")
print(X_train[0], Y_train[0])

In [None]:
# Tokenise every sentence into a list of whitespace-separated words.
#
# The token Series are derived from the raw frames rather than by assigning into
# X_train / X_test element by element. Element-wise assignment writes into Series
# that were pulled out of `train` / `test`, which can silently modify the raw
# frames and breaks when the cell is executed a second time (a list has no
# `.split`). Building new Series keeps `train` / `test` intact and makes the
# cell safe to re-run.
X_train = train[0].str.split()
X_test = test[0].str.split()

# One-hot encode the training labels. Reading the labels from the raw frame
# (instead of from Y_train itself) keeps a re-run from encoding them twice.
Y_train = np_utils.to_categorical(train[1])

In [None]:
# Inspect the first training sample after preprocessing, then the container type.
first_tokens, first_label = X_train[0], Y_train[0]
print(first_tokens, first_label)
type(X_train)

In [None]:
# Distinct sentence lengths (in words) in the training data and how often each
# occurs; the last entry of the first array is the maximum length.
train_lengths = [len(tokens) for tokens in X_train]
np.unique(train_lengths, return_counts=True)

In [None]:
# Distinct sentence lengths (in words) in the testing data and how often each
# occurs; the last entry of the first array is the maximum length.
test_lengths = [len(tokens) for tokens in X_test]
np.unique(test_lengths, return_counts=True)

In [None]:
# Build the embedding lookup: key = word, value = that word's vector as a
# float32 NumPy array (every token after the first on the word's line).
embeddings_index = {}

# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default codec.
# NOTE(review): assumes the GloVe file is UTF-8 encoded -- confirm for other dumps.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Dimensionality check: shape of the vector stored for the word "i".
np.shape(embeddings_index["i"])

In [None]:
# SciPy's spatial module supplies the vector distance functions used below.
from scipy import spatial

# Cosine *distance* between "happy" and "sad". SciPy returns
# 1 - cosine similarity, so smaller values mean more similar vectors.
happy_vec, sad_vec = embeddings_index["happy"], embeddings_index["sad"]
spatial.distance.cosine(happy_vec, sad_vec)

In [None]:
# Cosine *distance* (1 - cosine similarity) between "india" and "delhi";
# smaller values mean more similar vectors.
india_vec, delhi_vec = embeddings_index["india"], embeddings_index["delhi"]
spatial.distance.cosine(india_vec, delhi_vec)

In [None]:
# Cosine *distance* (1 - cosine similarity) between "france" and "paris";
# smaller values mean more similar vectors.
france_vec, paris_vec = embeddings_index["france"], embeddings_index["paris"]
spatial.distance.cosine(france_vec, paris_vec)

In [None]:
# Convert every tokenised sentence into a fixed-size block of word vectors.


def _embed_sentences(sentences, max_len=10, embedding_dim=50):
    """Return an array of shape (len(sentences), max_len, embedding_dim).

    Row ``i`` holds the vectors of the first ``max_len`` words of sentence ``i``,
    looked up (lower-cased) in ``embeddings_index``. Positions past the end of a
    sentence stay zero. Words missing from ``embeddings_index`` also stay zero
    instead of raising ``KeyError``, and sentences longer than ``max_len`` are
    truncated instead of raising ``IndexError``.

    The defaults (10, 50) match the shapes this notebook used originally.
    """
    matrix = np.zeros((len(sentences), max_len, embedding_dim))
    for row, words in enumerate(sentences):
        for col, word in enumerate(words[:max_len]):
            vector = embeddings_index.get(word.lower())
            if vector is not None:
                matrix[row, col] = vector
    return matrix


embedding_matrix_train = _embed_sentences(X_train)
embedding_matrix_test = _embed_sentences(X_test)

In [None]:
# Confirm the shapes of the train and test embedding arrays.
print(*(matrix.shape for matrix in (embedding_matrix_train, embedding_matrix_test)))

### Using RNN 

In [None]:
# Simple RNN classifier that maps an input sentence to an emoji class:
# two stacked SimpleRNN layers (64 units each), each followed by 50% dropout,
# then a 5-unit dense layer with softmax.
# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final output.
model = Sequential([
    SimpleRNN(64, input_shape=(10, 50), return_sequences=True),
    Dropout(0.5),
    SimpleRNN(64, return_sequences=False),
    Dropout(0.5),
    Dense(5),
    Activation('softmax'),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the targets
# are one-hot encoded), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 50 epochs in shuffled mini-batches of 32 samples;
# `hist` keeps the object returned by fit().
hist = model.fit(
    embedding_matrix_train,
    Y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

In [None]:
# Predict a class index for every test sentence.
# `Sequential.predict_classes` is not available in newer TensorFlow/Keras
# releases. Taking the argmax over the softmax outputs gives the same class
# indices and works on old and new versions alike.
pred = np.argmax(model.predict(embedding_matrix_test), axis=-1)

In [None]:
# Test accuracy: number of predictions equal to the true label, divided by
# the number of test samples.
n_correct = (pred == Y_test).sum()
float(n_correct) / embedding_matrix_test.shape[0]

In [None]:
# For every misclassified test sample print its row number, then the sentence
# followed by the predicted emoji and the labelled emoji.
for row in range(embedding_matrix_test.shape[0]):
    predicted, actual = pred[row], Y_test[row]
    if predicted != actual:
        print(row)
        print(
            test[0][row],
            emoji.emojize(emoji_dict[predicted], use_aliases=True),
            emoji.emojize(emoji_dict[actual], use_aliases=True),
        )

In [None]:
# Embed a hand-written sentence so the model can classify it: a single-sample
# batch of shape (1, 10, 50) whose unused trailing positions stay zero.
x = ['i', 'do', 'think', 'this', 'class', 'is', 'very', 'interesting']

x_ = np.zeros((1, 10, 50))
for position, word in enumerate(x):
    x_[0, position] = embeddings_index[word.lower()]

In [None]:
# Class index predicted for the custom sentence.
# argmax over the softmax outputs replaces `predict_classes`, which is not
# available in newer TensorFlow/Keras releases.
np.argmax(model.predict(x_), axis=-1)

### Using LSTM 

In [None]:

# LSTM classifier: two stacked LSTM layers (128 units each), each followed by
# 50% dropout, then a 5-unit dense layer with softmax.
# The first LSTM returns the full sequence so the second one can consume it;
# the second returns only its final output.
model = Sequential([
    LSTM(128, input_shape=(10, 50), return_sequences=True),
    Dropout(0.5),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(5),
    Activation('softmax'),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM model for 50 epochs in shuffled mini-batches of 32 samples;
# `hist` keeps the object returned by fit().
hist = model.fit(
    embedding_matrix_train,
    Y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

In [None]:
# Predict a class index for every test sentence with the LSTM model.
# argmax over the softmax outputs replaces `predict_classes`, which is not
# available in newer TensorFlow/Keras releases.
pred = np.argmax(model.predict(embedding_matrix_test), axis=-1)

In [None]:
# Test accuracy of the LSTM model: number of predictions equal to the true
# label, divided by the number of test samples.
n_correct = (pred == Y_test).sum()
float(n_correct) / embedding_matrix_test.shape[0]

In [None]:
# For every test sample the LSTM model misclassified, print its row number,
# then the sentence followed by the predicted emoji and the labelled emoji.
for row in range(embedding_matrix_test.shape[0]):
    predicted, actual = pred[row], Y_test[row]
    if predicted != actual:
        print(row)
        print(
            test[0][row],
            emoji.emojize(emoji_dict[predicted], use_aliases=True),
            emoji.emojize(emoji_dict[actual], use_aliases=True),
        )