In [9]:
import json
import pandas as pd
import numpy as np
import os
import pickle
import warnings
import requests
import re
# Silence all Python warnings and lift pandas' display limits on columns/rows.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

# Data extraction and processing

In [2]:
def extract_text():
    """Download the PsychExp pickle and dump it to ``data.txt``.

    The raw pickle is fetched from the DeepMoji repository and saved as
    ``raw.pickle``. Each sample is then written to ``data.txt`` on one line:
    the label array rendered with ``np.array2string`` immediately followed by
    the text. Any existing ``data.txt`` is removed first.

    Errors raised while converting or writing the samples are printed rather
    than propagated (best-effort behaviour kept from the original cell).

    Raises:
        requests.HTTPError: if the download returns a non-success status.
    """
    file = 'raw.pickle'
    response = requests.get(
        "https://raw.githubusercontent.com/bfelbo/DeepMoji/master/data/PsychExp/raw.pickle",
        timeout=60,  # requests has no default timeout; avoid hanging the kernel forever
    )
    # Fail loudly on 4xx/5xx instead of trying to unpickle an error page.
    response.raise_for_status()
    with open(file, 'wb') as raw_file:
        raw_file.write(response.content)

    # NOTE(security): pickle.load can execute arbitrary code; this is only
    # acceptable because the source repository is trusted.
    with open(file, 'rb') as raw_file:
        data = pickle.load(raw_file, encoding='latin1')

    if os.path.exists('data.txt'):
        os.remove('data.txt')
    try:
        texts = [str(x) for x in data['texts']]
        labels = [x['label'] for x in data['info']]
        with open("data.txt", 'w') as txtfile:
            for i, text in enumerate(texts):
                txtfile.write(np.array2string(labels[i]))
                txtfile.write(text + '\n')

    except Exception as e:
        print(e)

# Download the PsychExp pickle and (re)generate data.txt in the working directory.
extract_text()


In [2]:
def read_text_file(file_name):
    """Parse a labelled text file into a list of ``[label, text]`` pairs.

    Every line is stripped and split at its first ``]``. The part before it
    (with whitespace runs collapsed to single spaces) becomes the label; the
    part after it (stripped) becomes the text.
    """
    pairs = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            bracket_pos = stripped.find("]")
            label_part = stripped[:bracket_pos]
            text_part = stripped[bracket_pos + 1:]
            # str.split() with no arguments already discards outer whitespace.
            pairs.append([' '.join(label_part.split()), text_part.strip()])

    return pairs

def extract_labels(text_list):
    """Turn the label string of each ``[label, text]`` pair into a list of floats.

    The leading ``[`` is removed from the label and the remaining
    space-separated numbers are parsed with ``np.fromstring``.
    """
    parsed_labels = []
    for pair in text_list:
        numbers_only = pair[0].replace('[', '')
        parsed_labels.append(list(np.fromstring(numbers_only, dtype=float, sep=' ')))
    return parsed_labels

def extract_text_msgs(text_list):
    """Return the text component (second element) of every ``[label, text]`` pair."""
    return [pair[1] for pair in text_list]

In [3]:
# Parse data.txt into [label, text] pairs, then split them into two parallel lists.
data_list = read_text_file('data.txt')
label_list = extract_labels(data_list)
msg_list = extract_text_msgs(data_list)

# Tokenize the data

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
max_words = 10000 # the max number of words to extract into the vocabulary
maxlen = 50  # the max number of words a text sample can have

# Fit the tokenizer on the messages (text is lower-cased).
tokenizer = Tokenizer(num_words=max_words,lower=True)
tokenizer.fit_on_texts(msg_list)
msg_tokenized = tokenizer.texts_to_sequences(msg_list) # tokenize the msg_list
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

msg_tokenized = pad_sequences(msg_tokenized, maxlen=maxlen) # make every text sample the same length


Found 9121 unique tokens.


# save the tokenizer

In [5]:
# Persist the fitted tokenizer; the context manager closes the file even if
# pickling raises.
with open('tokenizer.pickle','wb') as f:
    pickle.dump(tokenizer,f)
#tokenizer = pickle.load(open('tokenizer.pickle','rb')) # load it back
#tokenizer.texts_to_sequences([text]) # tokenize new text

# Split data into a training set and a validation set

In [6]:
# Shuffle the data.
# A single permutation is applied to both samples and labels so they stay
# aligned by construction, and the inputs are not mutated in place, so
# re-running this cell always yields the same split.
assert len(msg_tokenized) == len(label_list), "samples and labels must have the same length"
seed = 1
rng = np.random.RandomState(seed)
shuffled_indices = rng.permutation(len(msg_tokenized))
msg_shuffled = np.asarray(msg_tokenized)[shuffled_indices]
label_shuffled = [label_list[idx] for idx in shuffled_indices]

test_split = 0.2
num_validation_samples = int(test_split * len(msg_shuffled))
# Split on an explicit index: negative slices ([:-0] / [-0:]) would put every
# sample in the test set when num_validation_samples is 0.
split_index = len(msg_shuffled) - num_validation_samples
X_train = msg_shuffled[:split_index]
X_test = msg_shuffled[split_index:]
y_train = label_shuffled[:split_index]
y_test = label_shuffled[split_index:]

# Read GloVe word embeddings

In [7]:
def read_glove_vector(glove_file):
    """Load word vectors from a GloVe-format text file.

    Each non-blank line is ``token v1 v2 ...``. Tokens are reduced to their
    ASCII letters (everything matching ``[^a-zA-Z]`` is removed); tokens that
    contain no letters are skipped.

    Several raw tokens can reduce to the same key (e.g. ``us`` and ``u.s.``).
    In that case the token that was already purely alphabetic wins, so the
    real word's vector is never overwritten by a punctuation variant. Among
    punctuation variants only, the first one in the file is kept.

    Args:
        glove_file: path to a UTF-8 encoded GloVe text file.

    Returns:
        A tuple ``(word_to_index, index_to_word, word_to_vec)``. Indices start
        at 1 and follow the sorted order of the cleaned words; vectors are
        ``np.float64`` arrays.
    """
    word_to_vec = {}
    with open(glove_file,'r',encoding='UTF-8') as file:
        for line in file:
            fields = line.strip().split()
            if not fields:
                # Blank line: nothing to parse (would otherwise raise IndexError).
                continue
            raw_token = fields[0]
            word = re.sub('[^a-zA-Z]', '', raw_token)
            if len(word) == 0:
                continue
            # An exact (already alphabetic) token always takes the slot; a
            # cleaned variant only fills a slot that is still empty.
            if word == raw_token or word not in word_to_vec:
                word_to_vec[word] = np.array(fields[1:],dtype=np.float64)

    word_to_index = {}
    index_to_word = {}
    for i, word in enumerate(sorted(word_to_vec), start=1):
        word_to_index[word] = i
        index_to_word[i] = word
    return word_to_index,index_to_word,word_to_vec

In [10]:
# Load the 50-dimensional GloVe 6B vectors from the working directory.
word_to_index,index_to_word,word_to_vec = read_glove_vector("glove.6B.50d.txt")

# create embedding layer

In [11]:
from tensorflow import keras
from keras import layers


In [12]:
# Infer the embedding size from any stored vector instead of relying on the
# specific token 'word' being present in the loaded vocabulary.
EMBEDDING_DIM = next(iter(word_to_vec.values())).shape[0]
# One row per tokenizer index plus row 0, which is never assigned below.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = word_to_vec.get(word)
    # Words not found in the embedding index keep their all-zeros row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
from keras.layers import Embedding
# Embedding layer initialised from embedding_matrix; trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

# Bi-directional RNN model

In [14]:
def create_lstm_model(input_shape,embedding_layer,num_classes=7):
    """Build the bidirectional-LSTM classifier.

    Architecture: embedding -> Bidirectional LSTM(128, sequences) ->
    BatchNormalization -> Dropout(0.5) -> LSTM(64) -> Dropout(0.5) ->
    Dense(num_classes, softmax).

    Args:
        input_shape: shape of one input sample, e.g. ``(maxlen,)``.
        embedding_layer: layer applied to the integer token indices.
        num_classes: size of the output layer (defaults to 7, the value the
            original cell hard-coded).

    Returns:
        An uncompiled ``keras.Model`` mapping token indices to class
        probabilities.
    """
    sentence_indices = keras.Input(shape=input_shape, dtype=np.int32)
    embeddings = embedding_layer(sentence_indices)
    reg = keras.regularizers.L1L2(0.01, 0.01)

    X = layers.Bidirectional(layers.LSTM(128, return_sequences=True,bias_regularizer=reg,kernel_initializer='he_uniform'))(embeddings)
    X = layers.BatchNormalization()(X)
    X = layers.Dropout(0.5)(X)
    X = layers.LSTM(64)(X)
    X = layers.Dropout(0.5)(X)
    # Softmax is applied exactly once. The original stacked an extra
    # Activation('softmax') on top of this already-softmaxed Dense output,
    # which flattens the predicted distribution and weakens the gradients.
    X = layers.Dense(num_classes, activation='softmax')(X)
    model = keras.Model(sentence_indices, X)

    return model

# Build the model for sequences of length maxlen on top of the embedding layer.
model = create_lstm_model((maxlen,),embedding_layer)


In [15]:
# Show the layer-by-layer summary of the model.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            456100    
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 256)           183296    
_________________________________________________________________
batch_normalization (BatchNo (None, 50, 256)           1024      
_________________________________________________________________
dropout (Dropout)            (None, 50, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                82176     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)               

# Train the model

In [16]:
# Compile with categorical cross-entropy and Adam, then train for 30 epochs.
# The held-out split (X_test, y_test) is passed as validation data.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, np.array(y_train),validation_data=(X_test, np.array(y_test)), epochs = 30, batch_size = 32, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2cdd7a2a488>

In [17]:
# Persist the trained model to emoji_model.h5.
model.save('emoji_model.h5')


# load the model back

In [79]:
# Reload the model from emoji_model.h5.
model = keras.models.load_model('emoji_model.h5')