In this notebook we learn emoji2vec (e2v) embeddings from emoji descriptions, using the pre-trained Google word2vec (w2v) vectors for the words.

In [None]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import tensorflow as tf
from tensorflow.python.framework import ops
import pickle as pk
import gensim.models as gs
import numpy as np
import random
import itertools
# Internal dependencies
import nltk
from nltk.corpus import wordnet as wn
from model import Emoji2Vec
from trainer import Trainer

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tfnn.layers import *
import tfnn
import math
import mt.bleu
import sys
import numpy as np
import collections
from mt.strutils import tokenize
import time

In [None]:
# Batching hyper-parameters:
#   minibatch_len - upper bound on the (emoji, word) pairs drawn per emoji
#   batch_size    - number of pairs in one training batch
minibatch_len, batch_size = 4, 8

In [None]:
# Load the pre-trained GoogleNews word2vec vectors (binary format) with gensim.
w2v = gs.KeyedVectors.load_word2vec_format("../data/word2vec/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Characters a vocabulary word may consist of; used below to filter the
# word2vec vocabulary and to trim punctuation from description tokens.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

In [None]:
# Map every word2vec vocabulary entry to the count stored on its vocab item.
counts = {token: entry.count for token, entry in w2v.vocab.items()}

In [None]:
# Collapse case variants of the vocabulary onto their lowercase form.
#   lower_counts[lowercase]    -> highest count among the case variants
#   lower_to_origin[lowercase] -> the original spelling that has that count
# Words containing anything other than a-z (after lowercasing) are dropped.
lower_counts = {}
lower_to_origin = {}
for word, value in counts.items():
    lowered = word.lower()
    if not all(ch in alphabet for ch in lowered):
        continue

    # Only replace the stored variant when this one is strictly more frequent,
    # so the count and the original spelling always describe the same variant
    # (previously the spelling was overwritten by whichever variant came last).
    if lowered not in lower_counts or value > lower_counts[lowered]:
        lower_counts[lowered] = value
        lower_to_origin[lowered] = word

In [None]:
def freq_and_word(item):
    """Sort key: reverse a (word, count) pair into (count, word)."""
    return item[::-1]


# Lowercase words ordered from highest to lowest count (ties broken by word).
most_frequent = sorted(lower_counts.items(), key=freq_and_word, reverse=True)

In [None]:
# Keep only the 80000 highest-count words; the same constant is used as the
# index offset for emojis further down.
most_frequent = most_frequent[:80000]

In [None]:
# Lowercase word -> vector of the original-cased vocabulary entry it stands for.
word2vec = {
    word.lower(): w2v[lower_to_origin[word]]
    for word, _ in most_frequent
}

In [None]:
def make_data(datafile):
    """Read an emoji description file and collect the in-vocabulary words.

    Each usable line has exactly three tab-separated fields:
    ``emoji<TAB>phrase<TAB>label``; lines with a different number of fields
    are skipped. Every whitespace-separated token of the phrase is lowercased,
    stripped of leading/trailing characters that are not in ``alphabet`` and
    kept only if it is a key of ``word2vec`` (rejected tokens are printed).

    Args:
        datafile: path of the tab-separated description file.

    Returns:
        A tuple ``(smiles_to_words, emojis, words_from_emojis)``:
        * smiles_to_words: dict emoji -> list of unique normalized words,
        * emojis: list of unique emojis,
        * words_from_emojis: list of unique normalized words over all emojis.
        All lists are in first-seen order, so repeated runs on the same file
        give the same ordering (and therefore the same emoji indices).
    """

    def normalize_word(word):
        """Return the trimmed lowercase token, or None if it is unusable."""
        word = word.lower()
        try:
            while word[0] not in alphabet:
                word = word[1:]
            while word[-1] not in alphabet:
                word = word[:-1]
        except IndexError:
            # Nothing left after trimming (token had no a-z characters).
            return None
        if word not in word2vec:
            print(word)
            return None
        return word

    smiles_to_words = {}
    emojis = []
    words_from_emojis = []
    seen_words = set()
    with open(datafile, 'r') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            if len(fields) != 3:
                continue
            em, phrase = fields[0], fields[1]

            if em not in smiles_to_words:
                smiles_to_words[em] = []
                emojis.append(em)
            emoji_words = smiles_to_words[em]

            for token in phrase.lower().split():
                # Normalize each token once; normalization is idempotent, so
                # no second pass over the collected words is needed.
                word = normalize_word(token)
                if word is None:
                    continue
                if word not in emoji_words:
                    emoji_words.append(word)
                if word not in seen_words:
                    seen_words.add(word)
                    words_from_emojis.append(word)
    return smiles_to_words, emojis, words_from_emojis

In [None]:
# Parse the emoji descriptions, then build the token -> row-index lookups.
# Words occupy rows [0, 80000); emojis follow, starting at row 80000.
smiles_to_words, emojis, words_from_emojis = make_data('../data/emojipedia_positive.txt')
words = list(word2vec)
emojis_to_ind = {emoji: position + 80000 for position, emoji in enumerate(emojis)}
words_to_ind = {entry[0].lower(): rank for rank, entry in enumerate(most_frequent)}

In [None]:
# Row index -> token: the 80000 words first, then every emoji.
reverse_dictionary = [most_frequent[rank][0].lower() for rank in range(80000)] + list(emojis)

In [None]:
# Sizes of the two halves of the joint vocabulary and their total.
words_voc_size = 80000
emoji_voc_size = len(emojis)
vocabulary_size = words_voc_size + emoji_voc_size

In [None]:
# Cursor into `smiles`; get_minibatch() advances it on every call.
smile_ind = 0
# Fixed iteration order over the emojis that have descriptions.
smiles = list(smiles_to_words)

def get_minibatch():
    """Build the training pairs for the next emoji that has description words.

    Starting at the global cursor ``smile_ind``, emojis without words are
    skipped. For the selected emoji, ``min(minibatch_len, number of words)``
    *distinct* words are drawn at random. The first half of the pairs is
    ``(emoji_index, word_index)``, the remainder ``(word_index, emoji_index)``.
    The cursor is then moved to the following emoji, wrapping around.

    Returns:
        list of (input_index, label_index) tuples.

    Raises:
        ValueError: if no emoji has any words (previously an endless loop).
    """
    global smile_ind

    # Bounded search for an emoji with at least one word.
    for _ in range(len(smiles)):
        curr_words = smiles_to_words[smiles[smile_ind]]
        if curr_words:
            break
        smile_ind = (smile_ind + 1) % len(smiles)
    else:
        raise ValueError('no emoji has any in-vocabulary description words')

    smile = smiles[smile_ind]
    n_pairs = min(minibatch_len, len(curr_words))

    # random.sample picks distinct positions, so a word is never repeated
    # within one minibatch. The earlier code compared integer positions with
    # a list of words, so its duplicate check could never match.
    minibatch = []
    for i, word in enumerate(random.sample(curr_words, n_pairs)):
        if i < n_pairs / 2:
            minibatch.append((emojis_to_ind[smile], words_to_ind[word]))
        else:
            minibatch.append((words_to_ind[word], emojis_to_ind[smile]))

    smile_ind = (smile_ind + 1) % len(smiles)
    return minibatch

In [None]:
def generate_batch():
    """Assemble one shuffled training batch of exactly ``batch_size`` pairs.

    Minibatches are pulled from get_minibatch() until at least ``batch_size``
    pairs are available; the pool is shuffled and the surplus is discarded.

    Returns:
        (inputs, labels): ``inputs`` is a list of ``batch_size`` indices and
        ``labels`` a list of ``batch_size`` single-element lists.
    """
    pairs = []
    while len(pairs) < batch_size:
        pairs += get_minibatch()
    random.shuffle(pairs)

    selected = pairs[:batch_size]
    inputs = [pair[0] for pair in selected]
    labels = [[pair[1]] for pair in selected]
    return inputs, labels

In [None]:
# Smoke test: draw one batch and display it. Note that this advances the
# global `smile_ind` cursor used by get_minibatch().
generate_batch()

In [None]:
# Model / evaluation hyper-parameters.
embedding_size = 300
num_sampled = 32    # Number of negative examples to sample.
valid_size = 32     # Random set of words to evaluate similarity on.

# First half: random row indices below len(smiles); second half: the same
# indices shifted by 80000, i.e. into the emoji part of the vocabulary.
valid_examples = list(np.random.choice(len(smiles), valid_size // 2, replace=False))
valid_examples += [word_id + 80000 for word_id in valid_examples]

In [None]:
# Per-dimension mean of the 80000 word vectors, used as the placeholder vector
# for emoji rows. Computed with a single vectorized reduction instead of one
# Python-level pass over all words per dimension; the sum is accumulated in
# float64 and cast back so the entries keep the float32 type of the vectors.
unk_vector = list(
    np.mean(
        [word2vec[most_frequent[j][0]] for j in range(80000)],
        axis=0,
        dtype=np.float64,
    ).astype(np.float32)
)

In [None]:
# One row per vocabulary index: the word vectors in `most_frequent` order,
# followed by one copy of `unk_vector` for every emoji.
full_embeddings = [word2vec[entry[0]] for entry in most_frequent]
full_embeddings.extend(unk_vector for _ in emojis)

In [None]:
# Sanity check: display the (rows, columns) shape of the stacked embeddings.
np.array(full_embeddings).shape

In [None]:
graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # Pre-trained embedding matrix (word rows, then emoji rows). It is supplied
    # through feed_dict by the training loop; this placeholder was commented
    # out before, which left `embeddings_words` undefined below.
    embeddings_words = tf.placeholder(tf.float32, shape=[vocabulary_size, embedding_size])

    with tf.device('/cpu:0'):
        # Randomly initialised trainable table covering every row. (A second
        # variable initialised from `full_embeddings` used to be created here
        # and immediately overwritten; it was never used and has been removed.)
        trainable_embeddings = tf.Variable(
            tf.random_uniform([len(full_embeddings), embedding_size], -1.0, 1.0))
        # Row mask: False for the word rows, True for the emoji rows.
        is_emoji_row = tf.constant([False] * words_voc_size + [True] * len(emojis))
        # With a rank-1 condition tf.where selects whole rows: emoji rows come
        # from the trainable table, word rows from the fed pre-trained matrix.
        embeddings = tf.where(is_emoji_row, trainable_embeddings, embeddings_words)
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size)
                              )

        # Construct the SGD optimizer using a learning rate of 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)

        # Add variable initializer.
        init = tf.global_variables_initializer()

In [None]:
# Begin training.
num_steps = 10000001

config = tf.ConfigProto(
    device_count = {'GPU': 1}
)

# The pre-trained matrix never changes, so convert it to an array once rather
# than rebuilding it from the Python list on every training step.
embedding_matrix = np.array(full_embeddings)

with tf.Session(graph=graph, config=config) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch()

        feed_dict = {
            train_inputs: batch_inputs,
            train_labels: batch_labels,
            embeddings_words: embedding_matrix
        }

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 50000 == 0:
            sim = session.run(similarity, feed_dict=feed_dict)
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Position 0 of the ranking is skipped (expected to be the
                # query token itself).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to {}:'.format(valid_word)
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            # Checkpoint the current normalized embeddings; the context manager
            # makes sure the file is flushed and closed after every dump.
            final_embeddings = session.run(normalized_embeddings, feed_dict=feed_dict)
            with open('e2v_smiles_only', 'wb') as checkpoint_file:
                pk.dump(final_embeddings, checkpoint_file)

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
    final_embeddings = session.run(normalized_embeddings, feed_dict=feed_dict)