In [1]:
import numpy as np
import gensim
import emoji



In [2]:
e2v = gensim.models.KeyedVectors.load_word2vec_format("emojional.bin", binary=True)

In [3]:
vocabulary = e2v.key_to_index.keys()

In [4]:
# Sample 10 random emojis from the data set.
example_emojis = np.random.choice(list(vocabulary), 10)
print(example_emojis)

['🇹🇲' '🇬🇺' '🔇' '🟦' '🔊' '🏹' '🔝' '🪔' '🏅' '👨\u200d💼']


In [5]:
e2v.most_similar('🦎')

[('🦖', 0.5565454363822937),
 ('🐶', 0.5542377829551697),
 ('🐕', 0.5361674427986145),
 ('🐁', 0.5336870551109314),
 ('🕋', 0.52545166015625),
 ('🐭', 0.5247642993927002),
 ('🐍', 0.5186745524406433),
 ('🦓', 0.5150848627090454),
 ('🐟', 0.5091385245323181),
 ('🐸', 0.5007147192955017)]

In [6]:
list(vocabulary)[1]

'👩\u200d🎨'

### Tokenizer

In [7]:
import transformers

2022-12-01 20:43:45.173318: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [8]:
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)

In [9]:
sample = "Don't you love 🤗 Transformers? We sure do."

In [10]:
tokenizer.tokenize(sample)

['don',
 "'",
 't',
 'you',
 'love',
 '[UNK]',
 'transformers',
 '?',
 'we',
 'sure',
 'do',
 '.']

In [11]:
# add huggingface emoji
tokenizer.add_tokens("🤗")

1

In [12]:
tokenizer.tokenize(sample)

['don',
 "'",
 't',
 'you',
 'love',
 '🤗',
 'transformers',
 '?',
 'we',
 'sure',
 'do',
 '.']

In [14]:
emoji.is_emoji('🤗')

True

In [15]:
emoji.emoji_count(sample)

1

In [16]:
emoji.emoji_list(sample)

[{'match_start': 15, 'match_end': 16, 'emoji': '🤗'}]

In [17]:
sample[15:16]

'🤗'

### play with goemo dataset

In [29]:
from datasets import load_dataset
from tqdm import tqdm

In [20]:
go_emotions = load_dataset("go_emotions")
data = go_emotions.data

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (/jet/home/zzhou5/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
train, valid, test = data["train"].to_pandas(), data["validation"].to_pandas(), data["test"].to_pandas()

In [30]:
all_emojis = set()

In [38]:
for phase in [train, valid, test]:
    for txt in tqdm(phase['text'][:100]):
        if emoji.emoji_count(txt) > 0:
            print(txt)
            # emojis = emoji.emoji_list(txt)
            # for emoji_pair in emojis:
            #     all_emojis.add(txt[emoji_pair['match_start']:emoji_pair['match_end']])

100%|██████████| 100/100 [00:00<00:00, 75099.44it/s]


Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝
sorry [NAME]! 😘😘😘


100%|██████████| 100/100 [00:00<00:00, 65906.73it/s]


[NAME] is such a legendary daddy 😩
Hahahah thank you so much, username does not check out, you don't seem sketchy at all! 😅


100%|██████████| 100/100 [00:00<00:00, 67682.81it/s]

I’m so sorry 🤪
Eff your video - love Canada 🇨🇦 Stupid geolock





In [36]:
all_emojis = list(all_emojis)

In [37]:
tokenizer.add_tokens(all_emojis)

266

In [39]:
tokenizer.tokenize('Eff your video - love Canada 🇨🇦 Stupid geolock')

['e',
 '##ff',
 'your',
 'video',
 '-',
 'love',
 'canada',
 '🇨🇦',
 'stupid',
 'geo',
 '##lock']

### Add all emojis in training data to tokenizier