This notebook trains emoji2vec (e2v) embeddings jointly with word2vec (w2v) embeddings, using the same method that Google's word2vec was trained with. Read more about it: https://www.tensorflow.org/tutorials/word2vec

In [None]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import tensorflow as tf
from tensorflow.python.framework import ops
import pickle as pk
import gensim.models as gs
import numpy as np
import random
import itertools
# Internal dependencies
import nltk
from nltk.corpus import wordnet as wn
from model import Emoji2Vec
from trainer import Trainer

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import zipfile
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tfnn.layers import *
import tfnn
import math
import mt.bleu
import sys
import numpy as np
import collections
from mt.strutils import tokenize
import time

In [None]:
# Upper bound on the number of (emoji, word) pairs produced by one emoji minibatch.
emoji_minibatch_len = 16
# Number of training pairs fed to the model at every step.
batch_size = 128

In [None]:
# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated tokens of the first member of a zip archive."""
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Decode the raw bytes to a native string before tokenising.
    text = tf.compat.as_str(raw_bytes)
    return text.split()

In [None]:
# Load the corpus as a flat list of word tokens (expects text8.zip in the working directory).
vocabulary = read_data('text8.zip')
print('Data size', len(vocabulary))
# Number of entries (including the 'UNK' slot) kept in the word vocabulary.
words_voc_size = 80000

In [None]:
def _load_embeddings(path):
    """Read a tab-separated embeddings file into a ``{token: [float, ...]}`` dict.

    Every non-blank line is expected to look like
    ``<token><TAB><space-separated floats>``.
    """
    vectors = {}
    with open(path, "r") as f:
        # Stream the file line by line instead of loading it all with readlines().
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            fields = line.split('\t')
            vectors[fields[0]] = [float(x) for x in fields[1].split()]
    return vectors


# Pretrained emoji and word vectors, keyed by emoji / word.
e2v = _load_embeddings("./e2v_full.txt")
w2v = _load_embeddings("./w2v_full.txt")

In [None]:
def build_dataset(words, n_words, known_words=None):
    """Process raw inputs into a dataset.

    Args:
        words: list of corpus tokens, in corpus order.
        n_words: vocabulary size, including the 'UNK' entry at index 0.
        known_words: container of words that have a pretrained vector; it must
            support ``in`` and iteration. Defaults to the module-level ``w2v``.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary,
        extended_dictionary)`` where
        * ``data`` is the corpus as a list of vocabulary ids (0 for unknown),
        * ``count`` is a list of ``(word, count)`` entries; entry 0 is the
          mutable list ``['UNK', unk_count]``,
        * ``dictionary`` maps word -> id,
        * ``reversed_dictionary`` maps id -> word,
        * ``extended_dictionary`` maps every corpus word (in or out of the
          vocabulary) to the list of corpus positions where it occurs.

    Raises:
        IndexError: if there are not enough words in ``known_words`` to replace
            the frequent corpus words that are missing from it.
    """
    if known_words is None:
        known_words = w2v

    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Set membership keeps this filter linear in len(known_words).
    selected = set(entry[0] for entry in count)
    replacements = [word for word in known_words if word not in selected]

    next_replacement = 0
    # Start at 1: slot 0 is reserved for 'UNK' and must stay a mutable list so
    # that its count can be filled in below (and so that id 0 keeps meaning
    # "unknown word").
    for i in range(1, len(count)):
        if count[i][0] not in known_words:
            if next_replacement >= len(replacements):
                raise IndexError(
                    'not enough words in known_words to replace vocabulary '
                    'entries that have no pretrained vector')
            # Swap in a word that only exists in known_words; 4 is the
            # placeholder count used for such entries.
            count[i] = (replacements[next_replacement], 4)
            next_replacement += 1

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    extended_dictionary = dict()
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        # Remember every corpus position of every word, known or not.
        extended_dictionary.setdefault(word, []).append(len(data))
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary, extended_dictionary

# Index the corpus: `data` holds one vocabulary id per corpus token, and
# `extended_dictionary` maps each corpus word to the positions where it occurs.
data, count, dictionary_w2v, reverse_dictionary_w2v, extended_dictionary = build_dataset(vocabulary,
                                                            words_voc_size)

In [None]:
# Sanity check: show the first vocabulary entries and the first corpus tokens,
# both as ids and mapped back to words.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary_w2v[i] for i in data[:10]])

In [None]:
def make_data(datafile, vocab=None):
    """Parse an emoji-description file into emoji/word collections.

    Every line of ``datafile`` is expected to hold exactly three tab-separated
    fields: the emoji, a descriptive phrase, and a third field that is read but
    not used. Lines that do not split into three fields are reported and
    skipped.

    Args:
        datafile: path of the tab-separated description file.
        vocab: container of accepted words (must support ``in``). Defaults to
            the module-level ``dictionary_w2v``.

    Returns:
        A tuple ``(smiles_to_words, emojis, words_from_emojis)``:
        * ``smiles_to_words`` maps each emoji to the de-duplicated list of
          normalised vocabulary words from its phrases,
        * ``emojis`` is the de-duplicated list of emojis,
        * ``words_from_emojis`` is the de-duplicated list of all such words.
        The order inside the de-duplicated lists is unspecified (set order).
    """
    if vocab is None:
        vocab = dictionary_w2v

    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def normalize_word(word):
        """Lower-case ``word`` and strip leading/trailing non a-z characters.

        Returns None when nothing is left or the result is not in ``vocab``
        (out-of-vocabulary words are printed).
        """
        word = word.lower()
        try:
            while word[0] not in alphabet:
                word = word[1:]
            while word[-1] not in alphabet:
                word = word[:-1]
        except IndexError:
            # Every character was stripped: the token held no letters.
            return None
        if word not in vocab:
            print(word)
            return None
        return word

    smiles_to_words = {}
    emojis = []
    words_from_emojis = []
    with open(datafile, 'r') as f:
        for line in f:
            try:
                em, phrase, _truth = line.rstrip().split('\t')
            except ValueError:
                # Report the offending line itself; the unpacked names are not
                # (re)bound when the split fails.
                print('Skipping malformed line:', repr(line))
                continue
            # Normalise each token once and keep only in-vocabulary words.
            new_words = []
            for raw_word in phrase.lower().split():
                norm_word = normalize_word(raw_word)
                if norm_word is not None:
                    new_words.append(norm_word)
            smiles_to_words.setdefault(em, []).extend(new_words)
            emojis.append(em)
            words_from_emojis.extend(new_words)

    # The stored words are already normalised and in-vocabulary, so only
    # duplicates need to be removed here.
    for key in smiles_to_words:
        smiles_to_words[key] = list(set(smiles_to_words[key]))
    emojis = list(set(emojis))
    words_from_emojis = list(set(words_from_emojis))
    return smiles_to_words, emojis, words_from_emojis

In [None]:
smiles_to_words, emojis, words_from_emojis = make_data('../data/emojipedia_positive.txt')
# Emojis take ids [0, len(emojis)); every word id is shifted up by len(emojis)
# so that both kinds of token share one id space.
# NOTE(review): this cell is not idempotent - `data` and `dictionary_w2v` are
# rebuilt from themselves, so re-running it shifts the ids a second time.
data = [x+len(emojis) for x in data]
dictionary_w2v = {k:v+len(emojis) for k, v in dictionary_w2v.items()}
emojis_to_ind = {emojis[i]: i for i in range(len(emojis))}

In [None]:
# Combined id -> token lookup: emojis first, then the word vocabulary.
reverse_dictionary = []
reverse_dictionary.extend(emojis)
# NOTE(review): relies on reverse_dictionary_w2v.values() being in id order
# (dict insertion order) - confirm for the Python version in use.
reverse_dictionary.extend(list(reverse_dictionary_w2v.values())[:words_voc_size])

In [None]:
# Size of the joint (emoji + word) id space.
emoji_voc_size = len(emojis)
vocabulary_size = emoji_voc_size + words_voc_size

In [None]:
# Mutable sampling state and sampling parameters shared by the minibatch helpers.
smiles = list(smiles_to_words.keys())
smile_ind = 0          # cursor into `smiles`; advanced by get_emoji_minibatch()
word_window = 2        # declared global in get_words_minibatch() but not read in this notebook
word_w2v_ind = 2       # cursor into `data`; starts at 2 so that offsets down to -2 stay in range
samples_per_word = 2   # pairs generated per selected word
samples_per_emoji = 4  # upper bound on pairs generated per emoji

def get_emoji_minibatch():
    """Build a list of (id, id) training pairs linking emojis to corpus words.

    For each of ``emoji_minibatch_len // samples_per_emoji`` consecutive emojis
    (starting at the global cursor ``smile_ind``), picks description words, looks
    up a random corpus occurrence of each word and pairs the emoji with a word
    found near that occurrence. Advances ``smile_ind`` as a side effect.

    Returns:
        A list of 2-tuples; each tuple holds one emoji id and one word id, in
        either order. The list may be shorter than ``emoji_minibatch_len`` when
        an emoji has fewer distinct words than requested.
    """
    global smile_ind
    minibatch = []
    
    for i in range(emoji_minibatch_len // samples_per_emoji):
        curr_words = smiles_to_words[smiles[smile_ind]]
        # Skip forward to the next emoji that has at least one usable word.
        # NOTE(review): the short-word / stop-word filter is only applied to
        # emojis reached inside this loop, not to the emoji the cursor started
        # on - confirm whether that is intended.
        while len(curr_words) < 1:
            smile_ind += 1
            smile_ind %= len(smiles)
            curr_words = smiles_to_words[smiles[smile_ind]]
            curr_words = [word for word in curr_words if len(word)>=3 and word not in ["the", "this", "that", "are"]]
        smile = smiles[smile_ind]
        
        # Indices into curr_words that were already used for this emoji.
        targets_to_awoid = []
        for k in range(samples_per_emoji // samples_per_word):
            # Random start, then linear probing to the next unused word.
            word_ind = random.randint(0, len(curr_words)-1)
            while word_ind in targets_to_awoid:
                word_ind += 1
                word_ind %= len(curr_words)

            num_labels_added = 0
            # Corpus positions at which the chosen word occurs.
            # NOTE(review): raises KeyError if the word is in the vocabulary but
            # never occurs in the corpus - verify this cannot happen.
            labels = extended_dictionary[curr_words[word_ind]]
            for j in range(samples_per_word):   
                label_ind = random.randint(0, len(labels)-1)
                label = labels[label_ind]
                
                # Draw a non-zero offset: randint is inclusive, so the values
                # 0..5 without 2 give offsets -2, -1, +1, +2, +3 after the "- 2".
                # NOTE(review): the window is asymmetric (-2..+3) - confirm.
                label_ind = random.randint(0, 5)
                while label_ind == 2:
                    label_ind = random.randint(0, 5)
                # Clamp the neighbouring corpus position to the valid range.
                label_ind = min(max(label+label_ind-2, 0), len(data)-1)
                
                # randint(0, 2) yields 0, 1 or 2, so the emoji comes first in
                # roughly one pair out of three.
                what_first = random.randint(0, 2)
                # The word id is obtained by mapping the corpus id back to its
                # word and then to that word's id in dictionary_w2v.
                if what_first == 1:
                    minibatch.append((emojis_to_ind[smile], dictionary_w2v[reverse_dictionary_w2v[data[label_ind]-len(emojis)]])) 
                else:
                    minibatch.append((dictionary_w2v[reverse_dictionary_w2v[data[label_ind]-len(emojis)]], emojis_to_ind[smile]))
                num_labels_added += 1
                # Only true on the last iteration of this loop.
                if num_labels_added == samples_per_word:
                    break
            targets_to_awoid.append(word_ind)
            # Stop early once every word of this emoji has been used.
            if len(targets_to_awoid) == len(curr_words):
                break
        # Move the cursor to the next emoji, wrapping around at the end.
        smile_ind += 1
        if smile_ind >= len(smiles):
            smile_ind = 0
    return minibatch

def get_words_minibatch():
    """Build skip-gram (centre id, context id) pairs from the word corpus.

    Walks the corpus from the global cursor ``word_w2v_ind`` and, for each of
    ``(batch_size - emoji_minibatch_len) // samples_per_word`` centre words,
    emits ``samples_per_word`` pairs with distinct context words drawn from the
    offsets -2..2 around the centre (the centre itself excluded). Advances
    ``word_w2v_ind`` as a side effect, wrapping back to 2 near the end of the
    corpus so that every offset stays in range.

    Returns:
        A list of ``(data[centre], data[centre + offset])`` tuples.
    """
    global word_w2v_ind
    minibatch = []
    for _ in range((batch_size-emoji_minibatch_len) // samples_per_word):
        # Offsets that may not be drawn (again). Offset 0 is the centre word
        # itself; the list must hold relative offsets, not the corpus index.
        offsets_to_avoid = [0]
        for _ in range(samples_per_word):
            offset = random.randint(-2, 2)
            while offset in offsets_to_avoid:
                offset = random.randint(-2, 2)
            offsets_to_avoid.append(offset)
            minibatch.append((data[word_w2v_ind], data[word_w2v_ind+offset]))
        word_w2v_ind += 1
        if word_w2v_ind >= len(data)-2:
            word_w2v_ind = 2
    return minibatch

def get_minibatch():
    """Return the emoji pairs followed by the word pairs as a single list."""
    emoji_pairs = get_emoji_minibatch()
    word_pairs = get_words_minibatch()
    return emoji_pairs + word_pairs

In [None]:
#get_minibatch()

In [None]:
def generate_batch():
    """Return ``(inputs, labels)`` for one training step.

    Minibatches are accumulated until at least ``batch_size`` pairs exist, the
    pairs are shuffled, and the first ``batch_size`` of them are split into a
    flat list of inputs and a list of single-element label lists.
    """
    pairs = []
    while len(pairs) < batch_size:
        pairs += get_minibatch()
    random.shuffle(pairs)

    inputs = []
    labels = []
    for first, second in pairs[:batch_size]:
        inputs.append(first)
        labels.append([second])
    return inputs, labels

In [None]:
#generate_batch()

In [None]:
# Dimensionality of every embedding vector.
embedding_size = 300
valid_size = 32     # Random set of words to evaluate similarity on.
# First half of the validation ids: random ids in [0, len(smiles)).
valid_examples = list(np.random.choice(len(smiles), valid_size // 2, replace=False))
# Second half: 16 of the first 500 ids that follow, offset by len(smiles).
# NOTE(review): presumably len(smiles) == len(emojis), so these are word ids - confirm.
words_val_indices = np.random.choice(500, valid_size // 2, replace=False)
valid_examples.extend([x+len(smiles) for x in words_val_indices])
num_sampled = 64    # Number of negative examples to sample.

In [None]:
# True if some checkpoint already exists, False if you want to start over
# True: initialise the embedding matrix from the vectors loaded into `e2v` / `w2v`.
# False: start over from random vectors.
begin_from_checkpoint = True

In [None]:
def _random_embedding():
    """Return a fresh vector of `embedding_size` values drawn uniformly from [-1, 1]."""
    return [random.uniform(-1, 1) for _ in range(embedding_size)]


# Initial embedding matrix as a list of rows; row i belongs to token id i
# (emojis first, then words).
full_embeddings = []
if begin_from_checkpoint:
    for emoji in emojis:
        full_embeddings.append(e2v[emoji])
    for i in range(len(emojis), len(reverse_dictionary)):
        token = reverse_dictionary[i]
        if token in w2v:
            full_embeddings.append(w2v[token])
        else:
            # Report the token, but still add a row for it: skipping the row
            # would shift every later row away from its token id and leave the
            # matrix smaller than the vocabulary.
            print(token)
            full_embeddings.append(_random_embedding())
else:
    for _ in range(words_voc_size + len(emojis)):
        full_embeddings.append(_random_embedding())

In [None]:
# Sanity check: display the shape of the initial embedding matrix.
np.array(full_embeddings).shape

In [None]:
# Build the TensorFlow graph: an embedding lookup trained with NCE loss, plus a
# cosine-similarity op used to inspect nearest neighbours of the validation ids.
graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Pin the ops below to the CPU.
    with tf.device('/cpu:0'):
        # Trainable embedding matrix, initialised from `full_embeddings`.
        embeddings = tf.Variable(np.array(full_embeddings), dtype=tf.float32)
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
        # Compute the average NCE loss for the batch.
        # tf.nn.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size)
                              )
        # Construct the SGD optimizer using a learning rate of 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
        # Compute the cosine similarity between the validation examples and all embeddings.
        # Each row is divided by its L2 norm, so the matmul below yields cosine similarities.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)

        # Add variable initializer.
        init = tf.global_variables_initializer()

In [None]:
def write_embeddings(embeddings):
    """Write the emoji and word rows of ``embeddings`` to their text files.

    Row ``i`` of ``embeddings`` is taken to belong to token id ``i``: the first
    ``len(emojis)`` rows go to ``../data/emoji2vec/e2v_full.txt`` and the next
    ``words_voc_size`` rows to ``../data/word2vec/w2v_full.txt``. Each output
    line is ``<token><TAB><values separated and followed by a single space>``.

    Args:
        embeddings: indexable sequence of rows, each an iterable of numbers.
    """
    def _write_rows(path, tokens, offset, n_rows):
        """Write rows [offset, offset + n_rows) labelled with tokens[offset + k]."""
        with open(path, "w") as out:
            for k in range(n_rows):
                out.write(tokens[offset + k])
                out.write('\t')
                # Write every column of the row rather than a hard-coded 300.
                for value in embeddings[offset + k]:
                    out.write(str(value))
                    out.write(' ')
                out.write('\n')

    # Use the `embeddings` argument, not a global, so that the function writes
    # exactly what the caller passed in.
    _write_rows("../data/emoji2vec/e2v_full.txt", emojis, 0, len(emojis))
    _write_rows("../data/word2vec/w2v_full.txt", reverse_dictionary,
                len(emojis), words_voc_size)

In [None]:
# Begin training.
# Total number of training steps.
num_steps = 10000001
# Hide all GPUs from this session; training runs on the CPU.
config = tf.ConfigProto(
    device_count = {'GPU': 0}
)
with tf.Session(graph=graph, config=config) as session:
    # Build the Saver once: every tf.train.Saver() call adds new save/restore
    # ops to the graph, so creating one per checkpoint makes the graph grow.
    saver = tf.train.Saver()
    # To resume from a TensorFlow checkpoint instead of initialising, use:
    #     saver.restore(session, 'session'+"/model.ckpt")

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch()

        feed_dict = {
            train_inputs: batch_inputs,
            train_labels: batch_labels,
        }

        # Periodically print the nearest neighbours of the validation ids.
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 50000 == 0:
            sim = session.run(similarity, feed_dict=feed_dict)
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Index 0 of the sorted order is skipped: it is the best match,
                # normally the query token itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

        # Periodically persist the embeddings, the id -> token list and a checkpoint.
        if step % 150000 == 0:
            final_embeddings = session.run(normalized_embeddings, feed_dict=feed_dict)
            write_embeddings(final_embeddings)

            # Close the file deterministically instead of leaking the handle.
            with open('../data/reverse_dictionary.txt', 'wb') as dictionary_file:
                pk.dump(reverse_dictionary, dictionary_file)

            # Save a checkpoint with the trained model
            saver.save(session, '../data/session'+"/model.ckpt")

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
    final_embeddings = session.run(normalized_embeddings, feed_dict=feed_dict)

In [None]:
# Display the normalised embedding of token id 0 as a quick sanity check.
final_embeddings[0]