This notebook is for training emoji embeddings as proposed in this paper: https://arxiv.org/abs/1609.08359

In [3]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import tensorflow as tf
from tensorflow.python.framework import ops
import pickle as pk
import gensim.models as gs
import numpy as np
import random
import re

# Internal dependencies
from model import Emoji2Vec
from trainer import Trainer
from batcher import BatchNegSampler

In [4]:
# File locations and query settings used throughout the notebook.
args = {
    # Pre-trained embeddings: Google News word2vec and the emoji2vec vectors (binary format).
    "word2vec_file": "../data/word2vec/GoogleNews-vectors-negative300.bin",
    "emoj2vec_file": "../data/emoji2vec/emoji2vec_proposed.bin",
    # Width of the phrase-sum vectors built from word2vec.
    "dimension": 300,
    # Tab-separated training examples (emoji, phrase, truth), positive and negative.
    "train_data_true": "../data/emojipedia/emojipedia_positive.txt",
    "train_data_false": "../data/emojipedia/emojipedia_negative.txt",
#     "dev_data": "../data/training/dev.txt",
#     "test_data": "../data/training/test.txt",
    # Pickles written by process_data and read back by the loading cell.
    "ind_to_emoj_file": "../data/proposed/ind_to_emoj.pk",
    # Was "..data/proposed/ind_to_phr.pk": the missing slash pointed at a different directory.
    "ind_to_phr_file": "../data/proposed/ind_to_phr.pk",
    "embeddings_file": "../data/proposed/phrase_embeddings.pk",
    # Folder that receives the TensorFlow checkpoint and the gensim export.
    "model_path": "../data/proposed/model",

    # Not read by any cell visible here -- presumably a query phrase and the number of
    # nearest emoji to show for it; verify against the cells that use them.
    "item": "happy face",
    "top_n": 8,
}

# Model hyperparameters: passed to Emoji2Vec and BatchNegSampler and echoed in the training banner.
hp = dict(
    in_dim=300,
    out_dim=300,
    max_epochs=10,
    batch_size=8,
    neg_ratio=1,
    learning_rate=0.001,
    dropout=0.0,
)

Load pre-trained Google word2vec:

In [3]:
# Read the pre-trained word2vec vectors (binary format) from the path configured in `args`.
w2v = gs.KeyedVectors.load_word2vec_format(
    fname=args["word2vec_file"],
    binary=True,
)

In [4]:
# Characters a cleaned phrase may start or end with; process_data and load_batch strip
# anything else from both ends. The previous literal read "...ghigklm..." (no "j", "g" twice),
# so a leading or trailing "j" was stripped from phrases. Re-run process_data after this
# change so the pickled phrase indices match the corrected cleaning.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

Build the `ind_to_emoj` and `ind_to_phr` index lists and the sum of word embeddings for each phrase in the data:

In [1]:
def process_data(args):
    """Build the emoji/phrase index lists and per-phrase embedding sums, then pickle them.

    Reads the files named by args["train_data_true"] and args["train_data_false"].
    Each line is expected to hold three tab-separated fields (emoji, phrase, truth);
    lines with a different field count are printed and skipped. Phrases are lower-cased
    and stripped of leading/trailing characters that are not in the notebook-level
    `alphabet`; phrases left empty are skipped.

    Relies on the notebook-level names `w2v` (token -> vector lookup) and `alphabet`.

    Args:
        args: dict providing "dimension", "train_data_true", "train_data_false",
            "ind_to_emoj_file", "ind_to_phr_file" and "embeddings_file".

    Side effects:
        Prints a progress counter every 10000 lines and writes three pickles:
        the list of distinct emoji (first-seen order), the list of distinct cleaned
        phrases (first-seen order), and a dict mapping each cleaned phrase to the sum
        of the vectors of its tokens found in `w2v`.
    """

    def phrase_vec_model(item):
        """Sum the `w2v` vectors of the space-separated tokens of `item`; unknown tokens add nothing."""
        phr_sum = np.zeros(args["dimension"], np.float32)
        for token in item.split(' '):
            if token in w2v:
                phr_sum += w2v[token]
        return phr_sum

    def strip_non_letters(text):
        """Drop leading and trailing characters not in `alphabet`; may return ''."""
        start, end = 0, len(text)
        while start < end and text[start] not in alphabet:
            start += 1
        while end > start and text[end - 1] not in alphabet:
            end -= 1
        return text[start:end]

    phrase_vector_sums = dict()
    ind_to_emoj = []
    ind_to_phr = []
    # Set mirror of ind_to_emoj: membership tests on the list were linear per line.
    seen_emoj = set()
    for file in [args["train_data_true"], args["train_data_false"]]:
        with open(file, 'r') as f:
            # Stream the file instead of materialising every line with readlines().
            for i, line in enumerate(f):
                if i % 10000 == 0:
                    print(i/10000)
                fields = line.rstrip().split('\t')
                if len(fields) != 3:
                    # Malformed line: show it and move on.
                    print(fields)
                    continue
                em, phrase, _truth = fields
                phrase = strip_non_letters(phrase.lower())
                if not phrase:
                    # Nothing alphabetic left, so there is no phrase to index.
                    continue
                # phrase_vector_sums holds exactly the phrases already in ind_to_phr,
                # so one dict lookup replaces the list scan and avoids recomputing the sum.
                if phrase not in phrase_vector_sums:
                    phrase_vector_sums[phrase] = phrase_vec_model(phrase)
                    ind_to_phr.append(phrase)
                if em not in seen_emoj:
                    seen_emoj.add(em)
                    ind_to_emoj.append(em)

    # Context managers make sure each pickle is flushed and its handle closed.
    with open(args["ind_to_emoj_file"], 'wb') as out:
        pk.dump(ind_to_emoj, out)
    with open(args["ind_to_phr_file"], 'wb') as out:
        pk.dump(ind_to_phr, out)
    with open(args["embeddings_file"], 'wb') as out:
        pk.dump(phrase_vector_sums, out)


In [6]:
# Build the emoji/phrase index lists and per-phrase embedding sums and pickle them to the
# paths configured in `args`; prints a progress counter every 10000 input lines.
process_data(args)

0.0
0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0


Load the data produced by the `process_data` function:

In [7]:
# Read back the pickles written by process_data. pickle can execute arbitrary code while
# loading, so only point these paths at files this notebook produced itself.
with open(args["ind_to_phr_file"], 'rb') as f:
    ind_to_phr = pk.load(f)
with open(args["ind_to_emoj_file"], 'rb') as f:
    ind_to_emoj = pk.load(f)

# Reverse lookups: phrase / emoji -> position in the corresponding index list.
phr_to_ind = {phr: ind for ind, phr in enumerate(ind_to_phr)}
emoj_to_ind = {em: ind for ind, em in enumerate(ind_to_emoj)}

with open(args["embeddings_file"], 'rb') as f:
    phrase_vector_sums = pk.load(f)

# One row per phrase, ordered like ind_to_phr; the width comes from the same setting
# process_data used to build the sums instead of a hard-coded 300.
embeddings_array = np.zeros(shape=[len(ind_to_phr), args["dimension"]], dtype=np.float32)
for ind, phr in enumerate(ind_to_phr):
    embeddings_array[ind] = phrase_vector_sums[phr]

In [8]:
def restore_sess():
    """Rebuild the model graph, restore the saved checkpoint, and load both vector sets.

    Uses the notebook-level `hp`, `ind_to_emoj` and `args`.

    Returns:
        A 4-tuple: the two values returned by `from_2vec_paths` (called with the
        word2vec and emoji2vec file paths), the TensorFlow session holding the
        restored variables, and the Emoji2Vec model.
    """
    # NOTE(review): `from_2vec_paths` is neither defined nor imported in the cells
    # visible here -- confirm where it comes from, otherwise this raises NameError.
    ops.reset_default_graph()
    restored_model = Emoji2Vec(hp, len(ind_to_emoj), embeddings_array=None, use_embeddings=False)

    sess = tf.Session()
    checkpoint_saver = tf.train.Saver()
    checkpoint_file = args["model_path"] + "/model.ckpt"
    checkpoint_saver.restore(sess, checkpoint_file)

    word_vectors, emoji_vectors = from_2vec_paths(args["word2vec_file"], args["emoj2vec_file"])
    return word_vectors, emoji_vectors, sess, restored_model

In [9]:
def load_batch(datafile):
    """Read a tab-separated example file into example tuples and parallel index lists.

    Each line is expected to hold three tab-separated fields (emoji, phrase, truth).
    Phrases are lower-cased and stripped of leading/trailing characters that are not in
    the notebook-level `alphabet`. Lines with a different field count are printed and
    skipped, and lines whose phrase is empty after cleaning are skipped, matching what
    process_data does with the same input.

    Relies on the notebook-level lookups `emoj_to_ind` and `phr_to_ind`.

    Args:
        datafile: path of the example file to read.

    Returns:
        (batch, (rows, cols, targets)) where `batch` is a list of
        (phrase, emoji, truth-string) tuples, `rows` holds the phrase indices, `cols`
        the emoji indices, and `targets` is 1 where the truth field equals 'True', else 0.

    Raises:
        KeyError: if an emoji or cleaned phrase is missing from the lookup tables.
    """
    rows = list()
    cols = list()
    targets = list()
    batch = []
    with open(datafile, 'r') as f:
        # Stream the file instead of materialising every line with readlines().
        for line in f:
            fields = line.rstrip().split('\t')
            if len(fields) != 3:
                # Report and skip malformed lines instead of aborting on the unpack.
                print(fields)
                continue
            em, phrase, truth = fields
            phrase = phrase.lower()
            # Trim non-alphabet characters from both ends by moving two indices inwards.
            start, end = 0, len(phrase)
            while start < end and phrase[start] not in alphabet:
                start += 1
            while end > start and phrase[end - 1] not in alphabet:
                end -= 1
            if start == end:
                # Nothing alphabetic left, so there is no phrase to look up.
                continue
            phrase = phrase[start:end]
            batch.append((phrase, em, truth))
            cols.append(emoj_to_ind[em])
            rows.append(phr_to_ind[phrase])
            targets.append(1 if truth == 'True' else 0)
    return batch, (rows, cols, targets)

In [10]:
def TRAIN(args):
    """Train an Emoji2Vec model, save a checkpoint, and export the gensim files.

    Uses the notebook-level `hp`, `ind_to_emoj`, `phr_to_ind`, `emoj_to_ind` and
    `embeddings_array`, and reads the training examples through `load_batch`.

    Args:
        args: dict providing "train_data_true", "train_data_false" and "model_path".

    Side effects:
        Prints the training configuration, writes "<model_path>/model.ckpt", and calls
        model.create_gensim_files with args["model_path"] as the output folder.
    """
    # Clear the default graph so the model is built into a fresh one. The original also
    # called ops.reset_default_graph(), which performs the same reset; once is enough.
    tf.reset_default_graph()

    # Create the model based on the given model parameters
    print('Training: k={}, batch={}, epochs={}, ratio={}, dropout={}'.format(
        hp["out_dim"], hp["batch_size"], hp["max_epochs"], hp["neg_ratio"], hp["dropout"]))
    model = Emoji2Vec(hp=hp, num_emoji=len(ind_to_emoj), embeddings_array=embeddings_array)

    dsets = {
        'train_true': load_batch(args["train_data_true"])[0],
        'train_false': load_batch(args["train_data_false"])[0],
        #'dev': load_batch(args["dev_data"])[0]
        }

    # corpus is the body from which we sample
    corpus = BatchNegSampler(phr_to_ind, emoj_to_ind, hp["batch_size"],
                             hp["neg_ratio"], dsets["train_true"], dsets["train_false"])

    # The context manager closes the session (and frees what it holds) even when
    # training or saving raises; previously the session was never closed.
    with tf.Session() as sess:
        model.train(corpus, session=sess, datasets=dsets)

        # Save a checkpoint with the trained model
        saver = tf.train.Saver()
        saver.save(sess, args["model_path"]+"/model.ckpt")

        # Generate the gensim structures (the return value was never used, so it is not kept)
        model.create_gensim_files(sess=sess, model_folder=args["model_path"], ind2emoj=ind_to_emoj,
                                  out_dim=hp["out_dim"])

In [None]:
# Train on the positive/negative example files, then save the checkpoint and call the
# gensim export, both targeting args["model_path"].
TRAIN(args)