Build corpora from data obtained from the emojidictionary and emojipedia web pages

In [1]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import tensorflow as tf
from tensorflow.python.framework import ops
import pickle as pk
import gensim.models as gs
import numpy as np
import random
import re
# Internal dependencies
from model import Emoji2Vec
from trainer import Trainer
from batcher import BatchNegSampler

In [7]:
# Input data files; each is meant to be passed to get_pos_samples / make_corpora.
emojipedia_data = "../data/emojis_all.txt"
emojidict_data = "../data/emojis_emojidictionary_new.txt"

# Filename prefixes for the generated corpora; make_corpora appends
# "_positive.txt" and "_negative.txt" to them.
emojipedia_corpora_name = "../data/emojipedia"
emojidict_corpora_name = "../data/emojidict"

# Word2vec vectors, loaded below in binary word2vec format.
word2vec_file = "../data/word2vec/GoogleNews-vectors-negative300.bin"

In [3]:
def get_pos_samples(file):
    """Parse an emoji data file into a name lookup and positive samples.

    Each non-blank line is split on whitespace. The first token is taken as
    the emoji. The following tokens form the emoji name, up to (but not
    including) the first later token that contains ``[``. Every
    single-quoted string on the line is treated as a description of the emoji.

    Args:
        file: Path of the data file to read (decoded as UTF-8).

    Returns:
        A tuple ``(names_to_emojis, positive_samples)`` where
        ``names_to_emojis`` maps the lowercased name to the lowercased emoji
        token and ``positive_samples`` is a list of
        ``(emoji, description, True)`` tuples in file order.
    """
    names_to_emojis = {}
    positive_samples = []
    # Emoji text needs an explicit encoding; the platform default is not
    # guaranteed to be able to decode it.
    with open(file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if not tokens:
                # Blank line: nothing to parse (and no tokens[0] to index).
                continue
            emoji = tokens[0].lower()
            defs = re.findall(r"\'(.+?)\'", raw_line)

            name_words = []
            for i in range(1, len(tokens)):
                name_words.append(tokens[i].lower())
                # Stop once the next token opens the bracketed list. The bounds
                # check covers lines that have no such token at all.
                if i + 1 < len(tokens) and '[' in tokens[i + 1]:
                    break

            names_to_emojis[' '.join(name_words)] = emoji
            positive_samples.extend((emoji, def_, True) for def_ in defs)
    return names_to_emojis, positive_samples

In [4]:
# Parse the Emojipedia data into a name -> emoji mapping and a list of
# (emoji, description, True) positive samples.
names_to_emojis, positive_samples = get_pos_samples(emojipedia_data)

In [8]:
# Load the word2vec vectors from the binary word2vec-format file.
w2v = gs.KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

In [15]:
# Map every word in the word2vec vocabulary to the count stored with it.
# NOTE(review): this relies on the `.vocab` attribute of KeyedVectors; newer
# gensim releases may expose vocabulary data differently - confirm the
# installed version.
counts = {token: entry.count for token, entry in w2v.vocab.items()}

In [16]:
# Characters a lowercased word may consist of to be kept in the vocabulary.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

In [19]:
# Collapse the word2vec vocabulary to lowercase, purely alphabetic words.
#   lower_counts    : lowercase word -> highest count among its case variants
#   lower_to_origin : lowercase word -> the original spelling holding that count
#   words           : lowercase words in first-seen order
lower_counts = {}
lower_to_origin = {}
words = []
for word, value in counts.items():
    lowered = word.lower()
    # Skip anything containing a character outside `alphabet`.
    if any(ch not in alphabet for ch in lowered):
        continue

    if lowered not in lower_counts:
        lower_counts[lowered] = value
        lower_to_origin[lowered] = word
        words.append(lowered)
    elif value > lower_counts[lowered]:
        # Replace the remembered spelling only when this variant has a
        # strictly higher count, so lower_to_origin stays in sync with
        # lower_counts instead of tracking the last-seen variant.
        lower_counts[lowered] = value
        lower_to_origin[lowered] = word

In [18]:
def freq_and_word(item):
    """Sort key: reverse a (word, count) pair so it compares as (count, word)."""
    return item[::-1]


# Vocabulary entries ordered from highest to lowest count; ties are broken by
# the word itself, also in descending order.
most_frequent = list(lower_counts.items())
most_frequent.sort(key=freq_and_word, reverse=True)

In [20]:
# Number of distinct lowercase alphabetic words kept from the word2vec vocabulary.
len(words)

699957

In [28]:
def get_neg_samples(names_to_emojis, positive_samples, vocabulary=None,
                    rounds=200, max_words=3):
    """Generate random negative ``(emoji, phrase, False)`` samples.

    For every emoji in ``names_to_emojis.values()`` the function draws,
    ``rounds`` times, one random phrase of each length ``1..max_words``.
    Phrase words are picked uniformly from ``vocabulary``. Words that occur
    (with more than two characters) in any positive description of that
    emoji are rejected and redrawn.

    Args:
        names_to_emojis: Mapping of emoji name to emoji; only the values are used.
        positive_samples: Iterable of ``(emoji, description, label)`` tuples.
            It is only read, never reordered.
        vocabulary: Sequence of candidate words. Defaults to the notebook-level
            ``words`` list.
        rounds: How many phrases of each length to draw per emoji.
        max_words: Longest phrase length to generate.

    Returns:
        A list of ``(emoji, phrase, False)`` tuples, grouped by emoji in
        ``names_to_emojis`` order.

    Raises:
        ValueError: If the vocabulary is empty, or if every vocabulary word is
            a keyword of some emoji (rejection sampling could never finish).
    """
    if vocabulary is None:
        # Fall back to the word list built from the word2vec vocabulary.
        vocabulary = words
    if not vocabulary:
        raise ValueError("vocabulary must contain at least one word")
    vocabulary_set = set(vocabulary)

    # Index description keywords by emoji once, instead of rescanning every
    # positive sample for every emoji. Sets give O(1) rejection checks.
    keywords_by_emoji = {}
    for sample in positive_samples:
        bucket = keywords_by_emoji.setdefault(sample[0], set())
        bucket.update(w for w in sample[1].split() if len(w) > 2)

    no_keywords = frozenset()
    negative_samples = []
    for emoji in names_to_emojis.values():
        emoji_keywords = keywords_by_emoji.get(emoji, no_keywords)
        if vocabulary_set <= emoji_keywords:
            # Every candidate would be rejected; fail instead of looping forever.
            raise ValueError(
                "no vocabulary word is usable as a negative sample for {!r}".format(emoji)
            )
        for _ in range(rounds):
            for length in range(1, max_words + 1):
                negative_sample = []
                while len(negative_sample) < length:
                    word = random.choice(vocabulary)
                    if word not in emoji_keywords:
                        negative_sample.append(word)
                negative_samples.append((emoji, ' '.join(negative_sample), False))
    return negative_samples

In [25]:
# Number of distinct emoji names parsed from the Emojipedia data.
len(names_to_emojis)

1452

In [27]:
# Expected number of negative samples: 1452 emoji names x 600 phrases each
# (get_neg_samples draws 200 x 3 phrases per emoji).
1452*600

871200

In [29]:
def _write_samples(path, samples):
    """Write samples to `path`, one per line, each of the three fields followed by a tab."""
    # Samples contain emoji, so the encoding is set explicitly instead of
    # relying on the platform default.
    with open(path, "w", encoding="utf-8") as out:
        for sample in samples:
            out.write("".join(str(field) + "\t" for field in sample[:3]))
            out.write("\n")


def make_corpora(data, corpora_filename):
    """Build shuffled positive and negative corpora and write them to disk.

    Positive samples are parsed from `data` with get_pos_samples, negative
    samples are generated with get_neg_samples, and both lists are shuffled.
    They are written to "<corpora_filename>_positive.txt" and
    "<corpora_filename>_negative.txt". Each output line holds the three
    sample fields, each followed by a tab character.

    Args:
        data: Path of the emoji data file to parse.
        corpora_filename: Path prefix for the two output files.

    Returns:
        A tuple ``(names_to_emojis, positive_samples, negative_samples)``.
    """
    names_to_emojis, positive_samples = get_pos_samples(data)
    random.shuffle(positive_samples)
    negative_samples = get_neg_samples(names_to_emojis, positive_samples)
    random.shuffle(negative_samples)

    _write_samples("{}_positive.txt".format(corpora_filename), positive_samples)
    _write_samples("{}_negative.txt".format(corpora_filename), negative_samples)

    return names_to_emojis, positive_samples, negative_samples

In [30]:
# Build and write the Emojipedia corpora, keeping the samples for later cells.
names_to_emojis, positive_samples, negative_samples = make_corpora(emojipedia_data, emojipedia_corpora_name)
# The emojidict corpora are currently not generated:
#x = make_corpora(emojidict_data, emojidict_corpora_name)

({'tractor': '🚜',
  'pirate flag': '🏴\u200d☠️',
  'flag burkina faso': '🇧🇫',
  'flag sri lanka': '🇱🇰',
  'middle finger': '🖕',
  'robot face': '🤖',
  'couple with heart: man, man': '👨\u200d❤️\u200d👨',
  'flag in hole': '⛳',
  'hot pepper': '🌶️',
  'carp streamer': '🎏',
  'waving hand sign': '👋',
  'clock face six o’clock': '🕕',
  'honeybee': '🐝',
  'small blue diamond': '🔹',
  'beach with umbrella': '🏖️',
  'lady beetle': '🐞',
  'flag belgium': '🇧🇪',
  'anchor': '⚓',
  'man police officer': '👮\u200d♂️',
  'flag são tomé &amp; príncipe': '🇸🇹',
  'sagittarius': '♐',
  'bookmark tabs': '📑',
  'flag somalia': '🇸🇴',
  'drum with drumsticks': '\U0001f941',
  'horse racing': '🏇',
  'worried face': '😟',
  'peanuts': '\U0001f95c',
  'bacon': '\U0001f953',
  'chipmunk': '🐿️',
  'white sun behind cloud with rain': '🌦️',
  'lock with ink pen': '🔏',
  'boxing glove': '\U0001f94a',
  'neutral face': '😐',
  'clock face eleven o’clock': '🕚',
  'convenience store': '🏪',
  'money with wings': '💸',
  'te

In [48]:
# Reshuffle both sample lists in place and write them out as reference
# corpora, one sample per line with each of the three fields followed by a tab.
random.shuffle(positive_samples)
random.shuffle(negative_samples)
for ref_path, ref_samples in (
    ("../data/ref_positive.txt", positive_samples),
    ("../data/ref_negative.txt", negative_samples),
):
    # Samples contain emoji, so do not rely on the platform default encoding.
    with open(ref_path, "w", encoding="utf-8") as file:
        for sample in ref_samples:
            file.write("".join(str(field) + "\t" for field in sample[:3]))
            file.write("\n")