In [1]:
import numpy as np
import gensim
import emoji
import torch



In [2]:
e2v = gensim.models.KeyedVectors.load_word2vec_format("../emoji2vec/results/unicode/k-768_pos-4_rat-1_ep-40_dr-0/emoji2vec.bin", binary=True)

In [3]:
vocabulary = e2v.key_to_index.keys()

In [4]:
# Sample 10 random emojis from the data set.
example_emojis = np.random.choice(list(vocabulary), 10)
print(example_emojis)

['📏' '⛽' '👉🏿' '👢' '◽' '👘' '💅🏿' '2⃣️' '📟' '🐇']


In [5]:
e2v.most_similar('🛁')

[('🚿', 0.702677309513092),
 ('🛀', 0.6778178811073303),
 ('🚾', 0.41962745785713196),
 ('🚽', 0.40616124868392944),
 ('🚰', 0.3953370451927185),
 ('🛀🏼', 0.39066949486732483),
 ('🐃', 0.39057111740112305),
 ('💧', 0.3904171288013458),
 ('🏊', 0.3754548728466034),
 ('👙', 0.3627477288246155)]

In [6]:
e2v.most_similar('😙')

[('😗', 0.5419405698776245),
 ('😘', 0.4886314272880554),
 ('😆', 0.470554381608963),
 ('💋', 0.4581233263015747),
 ('😄', 0.3989715874195099),
 ('🤗', 0.39804166555404663),
 ('😏', 0.3912205100059509),
 ('😍', 0.3868739604949951),
 ('👄', 0.3836819529533386),
 ('😀', 0.37199661135673523)]

In [10]:
list(vocabulary)[1]

'👔'

### Tokenizer

In [5]:
import transformers

2022-12-03 22:39:58.060151: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [6]:
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)
bert_model = transformers.SqueezeBertModel.from_pretrained(
    "squeezebert/squeezebert-uncased"
)

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
print(len(tokenizer))

30528


In [17]:
sample = "Don't you love 🤗 Transformers? We sure do."

In [18]:
tokenizer.tokenize(sample)

['don',
 "'",
 't',
 'you',
 'love',
 '[UNK]',
 'transformers',
 '?',
 'we',
 'sure',
 'do',
 '.']

In [8]:
# add huggingface emoji
tokenizer.add_tokens("🥰")

1

In [20]:
print(len(tokenizer))

30529


In [21]:
tokenizer.tokenize(sample)

['don',
 "'",
 't',
 'you',
 'love',
 '🤗',
 'transformers',
 '?',
 'we',
 'sure',
 'do',
 '.']

In [22]:
emoji.is_emoji('🤗')

True

In [23]:
emoji.emoji_count(sample)

1

In [24]:
emoji.emoji_list(sample)

[{'match_start': 15, 'match_end': 16, 'emoji': '🤗'}]

In [25]:
sample[15:16]

'🤗'

In [26]:
e2v['🤗'].shape

(768,)

In [87]:
len(tokenizer)

30757

In [27]:
bert_model.resize_token_embeddings(
        len(tokenizer)
    )

Embedding(30529, 768)

In [28]:
tokenizer.encode(sample)

[101,
 2123,
 1005,
 1056,
 2017,
 2293,
 30528,
 19081,
 1029,
 2057,
 2469,
 2079,
 1012,
 102]

In [29]:
sample='🤗'

In [31]:
bert_model.embeddings.word_embeddings(torch.Tensor(tokenizer.encode(sample)).int())

tensor([[-0.0305,  0.0102,  0.0096,  ..., -0.0291, -0.0119,  0.0164],
        [ 0.0561, -0.0042,  0.0043,  ..., -0.0027, -0.0398,  0.0091],
        [ 0.0098, -0.0378, -0.0410,  ...,  0.0054, -0.0812, -0.0202]],
       grad_fn=<EmbeddingBackward0>)

### play with goemo dataset

In [8]:
from datasets import load_dataset
from tqdm import tqdm

In [9]:
go_emotions = load_dataset("go_emotions")
data = go_emotions.data

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (/jet/home/zzhou5/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
train, valid, test = data["train"].to_pandas(), data["validation"].to_pandas(), data["test"].to_pandas()

In [11]:
all_emojis = set()

In [12]:
for phase in [train, valid, test]:
    for txt in tqdm(phase['text']):
        if emoji.emoji_count(txt) > 0:
            # print(txt)
            emojis = emoji.emoji_list(txt)
            for emoji_pair in emojis:
                all_emojis.add(txt[emoji_pair['match_start']:emoji_pair['match_end']])

100%|██████████| 43410/43410 [00:00<00:00, 78472.27it/s]
100%|██████████| 5426/5426 [00:00<00:00, 78636.32it/s]
100%|██████████| 5427/5427 [00:00<00:00, 79951.70it/s]


In [13]:
all_emojis = list(all_emojis)

In [14]:
error_emojis = []
health_emojis = []

In [15]:
for emoji in all_emojis:
    try:
        tmp_emoji = e2v[emoji[0]]
        health_emojis.append(emoji[0])
    except:
        error_emojis.append(emoji[0])

In [16]:
len(error_emojis)

41

In [17]:
len(health_emojis)

229

In [37]:
tokenizer.add_tokens(all_emojis)

266

In [39]:
tokenizer.tokenize('Eff your video - love Canada 🇨🇦 Stupid geolock')

['e',
 '##ff',
 'your',
 'video',
 '-',
 'love',
 'canada',
 '🇨🇦',
 'stupid',
 'geo',
 '##lock']

### Add error emojis

In [22]:
for i, emoji in enumerate(error_emojis):
        
    # import pdb; pdb.set_trace()
    tokenizer.add_tokens(emoji)
    bert_model.resize_token_embeddings(
        len(tokenizer)
    )
    print(i, emoji, len(tokenizer))

0 🅱 30529
1 🦗 30530
2 ™ 30530
3 ☁ 30531
4 🤫 30532
5 ♀ 30533
6 🧛 30534
7 🇨 30535
8 🤲 30536
9 🤪 30537
10 🇲 30538
11 🦵 30539
12 © 30539
13 🇹 30540
14 ⬆ 30541
15 ✌ 30542
16 ❤ 30543
17 🤯 30544
18 🇦 30545
19 ✌ 30545
20 ☺ 30546
21 🤮 30547
22 ☺ 30547
23 🇫 30548
24 ▫ 30549
25 ✔ 30550
26 🥺 30551
27 🥰 30552
28 ❤ 30552
29 ♂ 30553
30 ☝ 30554
31 ™ 30554
32 🧡 30555
33 🤭 30556
34 🤩 30557
35 ♥ 30557
36 ♥ 30557
37 ❄ 30558
38 🥛 30559
39 🤨 30560
40 🧖 30561


### Add error emojis

In [23]:
for i, emoji in enumerate(health_emojis):
    emoji_embd = torch.from_numpy(e2v[emoji])
    tokenizer.add_tokens(emoji)
    bert_model.resize_token_embeddings(
        len(tokenizer)
    )
    with torch.no_grad():
        bert_model.embeddings.word_embeddings.weight[-1, :] = emoji_embd

  emoji_embd = torch.from_numpy(e2v[emoji])


In [24]:
len(tokenizer)

30757

In [None]:
bert_model.embeddings.word_embeddings.weight.shape

### Add all emojis in training data to tokenizier

In [30]:
bert_model.embeddings.word_embeddings.weight.shape

torch.Size([30757, 768])

In [17]:
len(tokenizer)

30528

In [40]:
tokenizer.add_tokens('🧛')

0

In [18]:
for i, emoji in enumerate(all_emojis):
    emoji = emoji[0]
    if emoji in health_emojis:
        emoji_embd = torch.Tensor(e2v[emoji])
        tokenizer.add_tokens(emoji)
        bert_model.resize_token_embeddings(
            len(tokenizer)
        )
        with torch.no_grad():
            bert_model.embeddings.word_embeddings.weight[-1, :] = emoji_embd
    else:
        
        # import pdb; pdb.set_trace()
        tokenizer.add_tokens(emoji)
        bert_model.resize_token_embeddings(
            len(tokenizer)
        )
        print(i, emoji, len(tokenizer))

  emoji_embd = torch.Tensor(e2v[emoji])


2 🥺 30531
10 🧛 30539
16 ☺ 30545
33 ✔ 30562
52 ™ 30579
59 🇨 30585
62 ⬆ 30588
76 🥰 30601
90 © 30611
94 ✌ 30614
95 🤮 30615
97 🦵 30617
106 🅱 30625
116 🤪 30633
125 🤭 30641
126 ☝ 30642
135 ♥ 30649
153 🇦 30667
155 🥛 30669
158 ♂ 30671
160 ♥ 30672
167 ❤ 30677
174 🧡 30683
176 🤯 30685
187 🇫 30695
191 ▫ 30698
192 🧖 30699
209 ♀ 30714
210 ✌ 30714
215 🤫 30718
218 🤩 30721
223 🇲 30724
225 🦗 30726
226 ☺ 30726
227 🤲 30727
244 🇹 30739
258 ™ 30748
262 ❤ 30751
263 ☁ 30752
265 ❄ 30754
267 🤨 30756


In [19]:
len(tokenizer)

30757

In [21]:
sample = ''.join(error_emojis)

In [22]:
sample

'🥺🧛☺✔™🇨⬆🥰©✌🤮🦵🅱🤪🤭☝♥🇦🥛♂♥❤🧡🤯🇫▫🧖♀✌🤫🤩🇲🦗☺🤲🇹™❤☁❄🤨'

In [24]:
tokenizer.tokenize(sample)

['🥺',
 '🧛',
 '☺',
 '✔',
 '™',
 '🇨',
 '⬆',
 '🥰',
 '©',
 '✌',
 '🤮',
 '🦵',
 '🅱',
 '🤪',
 '🤭',
 '☝',
 '♥',
 '🇦',
 '🥛',
 '♂',
 '♥',
 '❤',
 '🧡',
 '🤯',
 '🇫',
 '▫',
 '🧖',
 '♀',
 '✌',
 '🤫',
 '🤩',
 '🇲',
 '🦗',
 '☺',
 '🤲',
 '🇹',
 '™',
 '❤',
 '☁',
 '❄',
 '🤨']

In [28]:
healthy_sample = ''.join(health_emojis)

In [34]:
healthy_sample

'🎉🤓🙄😐😒😝🤠👍💋🕺😓🤤🙊💚🎏💥🐇😀😏🍭😟🤦😃😵🙏😘💯🤷👴💸👍🙏💔🚀👌😰🤦😂👸🐊🙃💰💖💙💓😳🔥🎂😜😎👏👌🙌⚡😬💪👌❣💭🤙💨💃🤦😠☠😄😥🤔✊🤷😛😹👉🤢🤘🌟🙌💜😪🥕🌿😸😊🗑😇💅🐃✨🍩😨🤚💩👏💪🙆💡👍🦀🤷🤜😔🖕🙏❣😕🤷👁😆🥘💆🙋💀🤷👓😡🏊🙌💎🍰😞🔪👻🖥🛡🗿🍁🙏😭🍆💄🍺🤕🍧🙌⭐🌊😩🤷☹😗💍👀🖤👨🔑😾🤞🎾🚫🙁🦈🙂😻🤞⛏🏳😫😧💲🎵👊🚓😉🙈🥀☕🎣💛🍍😤☹👌😦👏🤦👋💕👍🐢🍿🐕👅🇵🤷🍻🌈😖👎🤗👩✊😱💗🤞💁👑🙎📚🍕😣💦🚒🤰💁🍀😮🌹⚔🌱🎶🤣😴🎖😢😿🏈😑🖐🥂⚰😁😋😍💪⛑👐😅🤦🤷'

In [33]:
tokenizer.encode(healthy_sample)

[101,
 30561,
 30562,
 30563,
 30564,
 30565,
 30566,
 30567,
 30568,
 30569,
 30570,
 30571,
 30572,
 30573,
 30574,
 30575,
 30576,
 30577,
 30578,
 30579,
 30580,
 30581,
 30582,
 30583,
 30584,
 30585,
 30586,
 30587,
 30588,
 30589,
 30590,
 30568,
 30585,
 30591,
 30592,
 30593,
 30594,
 30582,
 30595,
 30596,
 30597,
 30598,
 30599,
 30600,
 30601,
 30602,
 30603,
 30604,
 30605,
 30606,
 30607,
 30608,
 30593,
 30609,
 30610,
 30611,
 30612,
 30593,
 30613,
 30614,
 30615,
 30616,
 30617,
 30582,
 30618,
 30619,
 30620,
 30621,
 30622,
 30623,
 30588,
 30624,
 30625,
 30626,
 30627,
 30628,
 30629,
 30609,
 30630,
 30631,
 30632,
 30633,
 30634,
 30635,
 30636,
 30637,
 30638,
 30639,
 30640,
 30641,
 30642,
 30643,
 30644,
 30608,
 30612,
 30645,
 30646,
 30568,
 30647,
 30588,
 30648,
 30649,
 30650,
 30585,
 30613,
 30651,
 30588,
 30652,
 30653,
 30654,
 30655,
 30656,
 30657,
 30588,
 30658,
 30659,
 30660,
 30609,
 30661,
 30662,
 30663,
 30664,
 30665,
 30666,
 30667,
 3