En->emoji translation pipeline

In [1]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import tensorflow as tf
from tensorflow.python.framework import ops
import pickle as pk
import gensim.models as gs
import numpy as np
import random
import itertools
# Internal dependencies
import nltk
from nltk.corpus import wordnet as wn
from model import Emoji2Vec
from trainer import Trainer
#from batcher import BatchNegSampler

In [5]:
emoji_to_ind = {}
ind_to_emoji = []
e2v = {}
embeddings_array_emoji = []

with open("../data/emoji2vec/e2v_as_average.txt", "r") as f:
    for i, line in enumerate(f.readlines()):
        line = line.split('\t')
        emoji_to_ind[line[0]] = i
        e2v[line[0]] = [float(x) for x in line[1].split()]
        ind_to_emoji.append(line[0])
        embeddings_array_emoji.append([float(x) for x in line[1].split()])

In [4]:
w2v = gs.KeyedVectors.load_word2vec_format("../data/word2vec/GoogleNews-vectors-negative300.bin", binary=True)

In [16]:
w2v = {}
word_to_ind = {}
ind_to_word = []
embeddings_array_words = []
with open("../data/word2vec/w2v_as_average.txt", "r") as f:
    for i, line in enumerate(f.readlines()):
        line = line.split('\t')
        word_to_ind[line[0]] = i
        w2v[line[0]] = [float(x) for x in line[1].split()]
        ind_to_word.append(line[0])
        embeddings_array_words.append([float(x) for x in line[1].split()])

In [20]:
# get vector representation of a single token
def V(token):
    if token in w2v:
        return w2v[token]
    elif token in e2v:
        return e2v[token]
    raise Exception("given token {} is not in dictionary".format(token))

In [17]:
# compute sum of vectors for tokens in item.split()
def phraseVecModel(item):
    tokens = item.lower().split(' ')
    phr_sum = None

    for token in tokens:
        try:
            phr_sum += V(token)
        except Exception as e:
            pass
    return phr_sum

In [18]:
# get unicode description of emoji
import unicodedata
def n(smile):
    try: 
        return unicodedata.name(smile)
    except Exception as e:
        pass

In [19]:
# sort emojis in order of ascending similarity to total vector
# P.S. it is not effective code
def sim(total):
    if total is None:
        return ()
    ops.reset_default_graph()
    # Emoji indices in current batch
    V = tf.constant(embeddings_array_emoji)
    
    col = tf.placeholder(tf.int32, shape=[1], name='col')

    orig_vec = tf.placeholder(tf.float32, shape=[1, 300], name='orig_vec')
    v_row = orig_vec

    v_col = tf.nn.embedding_lookup(V, col) # [1, 300]

    # Calculate the predicted score, a.k.a. dot product (here)
    score = tf.reduce_sum(tf.multiply(v_row, v_col), 1) # [1]

    # Probability of match
    #prob = tf.sigmoid(score) # [1]
    prob = score / (tf.norm(v_row) * tf.norm(v_col))
    
    res = list()
    session = tf.Session()
    for colIx in range(0, len(ind_to_emoji)):
        predict = session.run(prob, feed_dict={
            col: np.array([colIx]),
            orig_vec: np.array([total]),
        })
        res.append(predict[0])
        
    ems = sorted(range(len(res)), key=lambda i: res[i], reverse=True)[:4]
    res = sorted(res, reverse=True)
    
    return ([(ind_to_emoji[em], n(ind_to_emoji[em][0]), res_) for em, res_ in zip(ems, res)])

In [21]:
# compute similarity between 2 vectors: a, b
# P.S. it's also not an effective code ><
def similarity(a, b):
    v_row = tf.placeholder(tf.float32, shape=[1, 300], name='v_row')

    v_col = tf.placeholder(tf.float32, shape=[1, 300], name='v_col')
    
    # Calculate the predicted score, a.k.a. dot product (here)
    score = tf.reduce_sum(tf.multiply(v_row, v_col), 1) # [1]

    # Probability of match
    #prob = tf.sigmoid(score) # [1]
    prob = score / (tf.norm(v_row) * tf.norm(v_col))

    res = list()
    session = tf.Session()
    predict = session.run(prob, feed_dict={
        v_col: np.array([a]),
        v_row: np.array([b]),
    })
    return predict[0]

In [22]:
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
# find set of emojis best representing the given phrase (in some way)
def find_sims(phrase, threshold):
    sims = []
    collected_score = 0
    phrase_emb = phraseVecModel(phrase)
    
    while collected_score < threshold:
        smile, descr, score = sim(phrase_emb)[0]
        collected_score += score
        sims.append((smile, descr, score))
        if phrase in nouns:
            break
        if score < 0.25:
            break
        smile_emb = phraseVecModel(descr)
        print(smile, descr, score)
        phrase_emb = [phrase_emb[i] - score*smile_emb[i] for i in range(300)]
        
#         if score > 0.7:
#             break
        
        if len(sims) > 2:
            break
        
    return sims

In [27]:
print(sim(phraseVecModel('automobile')))

[('🚗', 'AUTOMOBILE', 0.71176922), ('🚘', 'ONCOMING AUTOMOBILE', 0.63140714), ('🏍️', 'RACING MOTORCYCLE', 0.49713874), ('\U0001f6f4', None, 0.4655095)]


In [13]:
bad_tokens = ['the', 'for', 'are']
alphabet = 'abcdefghigklmnopqrstuvwxyz'

In [14]:
import itertools

In [30]:
def get_subphrase_score(tokens, begin_ind, end_ind):
    if end_ind > len(tokens) or begin_ind < 0:
        return 0
    subphrase = ' '.join([tokens[i] for i in range(begin_ind, end_ind)])
    print(subphrase)
    translation = ' '.join([x[0] for x in find_sims(subphrase, 0.7)])
    simil = similarity(phraseVecModel(subphrase), phraseVecModel(translation))
    return translation, simil

translations = []
scores = []

# try to split phrase into subsets of words so that this split is best for translating each 
# subphrase into emojis using find_sims function 
def tokenize(phrase):
    tokens = []
    for token in phrase.lower().split():
        while token[0] not in alphabet:
            token = token[1:]
        while token[-1] not in alphabet:
            token = token[:-1]
        if len(token) > 2 and token not in bad_tokens:
            tokens.append(token)
    for m in range(len(tokens)//2, len(tokens)):
        borders_sets = list(itertools.combinations(list(range(len(tokens))), m))
        for borders_ in borders_sets:
            borders = []
            borders_ = list(sorted(borders_))
            if len(borders_) > 0 and borders_[0] != 0:
                borders = [0]
            borders.extend(borders_)
            borders.append(len(tokens))
            begin_ind = 0
            end_ind = 1
            
            score = 0
            translation = []
            
            while end_ind < len(borders):
                try:
                    tr, new_score = get_subphrase_score(tokens, borders[begin_ind], borders[end_ind])
                    translation.append(tr)
                    score += new_score
                except Exception as e:
                    pass
                begin_ind += 1
                end_ind += 1
            
            translations.append('|'.join(translation))
            scores.append(score)
            
            print(translations[-1], scores[-1])
            print("===========================")
                  

In [16]:
tokenize('drunk driver hit me out of the road')

drunk
driver hit out road
🍸|🚘 🚓 1.00471621752
drunk driver
hit out road
🚘 🍸|🚘 🆙 🚵 1.05803227425
drunk driver hit
out road
🚘 🕴️|⤵️ 🚏 🛤️ 0.900391936302
drunk driver hit out
road
🚘 🕴️|🚃 0.932773083448
drunk
driver
hit out road
🍸|🚘|🚘 🆙 🚵 1.48281171918
drunk
driver hit
out road
🍸|🚘 🕴️|⤵️ 🚏 🛤️ 1.33551338315
drunk
driver hit out
road
🍸|🚘 🕴️|🚃 1.36438435316
drunk driver
hit
out road
🚘 🍸|🎯|⤵️ 🚏 🛤️ 1.31914708018
drunk driver
hit out
road
🚘 🍸|🚃 0.927494198084
drunk driver hit
out
road
🚘 🕴️|🔙|🚃 1.34232139587
drunk
driver
hit out road
🍸|🚘|🚘 🆙 🚵 1.48281171918
drunk
driver hit
out road
🍸|🚘 🕴️|⤵️ 🚏 🛤️ 1.33551332355
drunk
driver hit out
road
🍸|🚘 🕴️|🚃 1.36438435316
drunk driver
hit
out road
🚘 🍸|🎯|⤵️ 🚏 🛤️ 1.31914713979
drunk driver
hit out
road
🚘 🍸|🚃 0.927494198084
drunk driver hit
out
road
🚘 🕴️|🔙|🚃 1.34232139587
drunk
driver
hit
out road
🍸|🚘|🎯|⤵️ 🚏 🛤️ 1.74392658472
drunk
driver
hit out
road
🍸|🚘|🚃 1.35227379203
drunk
driver hit
out
road
🍸|🚘 🕴️|🔙|🚃 1.77744281292
drunk driver
hit
out
road
🚘 🍸|🎯|🔙|🚃 1.76107

In [31]:
tokenize('Russian rocket landed in pacific ocean')

russian
rocket landed pacific ocean
🇷🇺|🚀 🌊 1.11947250366
russian rocket
landed pacific ocean
🚀 🇷🇺|🌊 🇮🇴 1.19825345278
russian rocket landed
pacific ocean
🚀 🇷🇺|🌊 🇮🇴 1.21570831537
russian rocket landed pacific
ocean
🚀 🇷🇺|🌊 1.22547090054
russian
rocket
landed pacific ocean
🇷🇺|🚀|🌊 🇮🇴 1.85984379053
russian
rocket landed
pacific ocean
🇷🇺|🚀 🚀|🌊 🇮🇴 1.80308133364
russian
rocket landed pacific
ocean
🇷🇺|🚀 🚀|🌊 1.72357535362
russian rocket
landed
pacific ocean
🚀 🇷🇺|🌊 🇮🇴 1.28981137276
russian rocket
landed pacific
ocean
🚀 🇷🇺|🌊 1.33253782988
russian rocket landed
pacific
ocean
🚀 🇷🇺|🇮🇴|🌊 1.64197978377
russian
rocket
landed pacific ocean
🇷🇺|🚀|🌊 🇮🇴 1.85984390974
russian
rocket landed
pacific ocean
🇷🇺|🚀 🚀|🌊 🇮🇴 1.80308133364
russian
rocket landed pacific
ocean
🇷🇺|🚀 🚀|🌊 1.72357529402
russian rocket
landed
pacific ocean
🚀 🇷🇺|🌊 🇮🇴 1.28981119394
russian rocket
landed pacific
ocean
🚀 🇷🇺|🌊 1.33253788948
russian rocket landed
pacific
ocean
🚀 🇷🇺|🇮🇴|🌊 1.64197978377
russian
rocket
landed
pacific ocean
🇷🇺|🚀|🌊 🇮🇴 1.95

In [28]:
tokenize('killing people: instruction for cats')

killing
people instruction cats
💀|🐈 🚷 0.893984466791
killing people
instruction cats
💀 👵 🚷|🐈 🐾 0.975699305534
killing people instruction
cats
🙇 🚸|🐈 1.10461840034
killing
people
instruction cats
💀|🚷|🐈 🐾 1.32512834668
killing
people instruction
cats
💀|🚷 🏫 ℹ️|🐈 1.56678208709
killing people
instruction
cats
💀 👵 🚷|🏫|🐈 1.50554850698
killing
people
instruction cats
💀|🚷|🐈 🐾 1.32512831688
killing
people instruction


KeyboardInterrupt: 

In [16]:
def translate(phrase):
    tokens = tokenize(phrase)
    print(tokens)
    translation = ''
    sep = ''
    for token in tokens:
        translation += sep + ' '.join([x[0] for x in find_sims(token)])
        sep = ' ||| '
    return translation

In [65]:
print(translate('killing people: instruction for cats'))

['killing', 'people instruction', 'cats']
💀 ||| 🏫 🎏 👥 ||| 🐈


In [66]:
print(translate('Russian rocket landed in pacific ocean'))

['russian rocket', 'landed', 'pacific', 'ocean']
🚀 🇷🇺 ||| 🛬 ||| 🇮🇴 ||| 🌊


In [None]:
print(translate('two men were cruelly killed last night'))

['two men', 'killed', 'last night']
👨‍👨‍👦‍👦 ||| 💀 🤕 ||| 🌗
