In [2]:
import torch 
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn 
import torch.optim as optim

In [3]:
# Load the question/emoji pairs from a CSV file located in the working directory.
df=pd.read_csv('emoji_QA_data.csv')
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,Question,Emoji
0,0,Which emoji is used to express sarcasm or dumb?,🤡
1,1,Which emoji signifies being dead from laughing?,💀
2,2,Which emoji represents flexing or showing off?,💪
3,3,Which emoji is used for dramatic crying or bei...,😭
4,4,Which emoji indicates someone or something is ...,🔥


In [4]:
# Remove the "Unnamed: 0" column (in the preview it just mirrors the row index).
# errors="ignore" keeps this cell idempotent: running it again after the column
# is already gone is a no-op instead of raising KeyError.
df=df.drop(columns="Unnamed: 0",errors="ignore")

In [5]:
df.sample(5)

Unnamed: 0,Question,Emoji
155,Which emoji is used to indicate frustration or...,😡
117,Which emoji is used to represent excitement or...,🎊
134,Which emoji indicates frustration or facepalm ...,🤦
130,Which emoji indicates celebration or a party m...,🥳
119,Which emoji shows shyness or modesty?,☺️


In [6]:
def tokenizer(que):
    """Split a question into lowercase word tokens.

    Question marks and commas are turned into spaces first, then the text is
    lowercased and split on whitespace. Other punctuation is left attached
    to its word.
    """
    cleaned=que
    for mark in ("?",","):
        cleaned=cleaned.replace(mark," ")
    return cleaned.lower().split()

In [7]:
# Token -> integer id map for question words.
# Id 0 is reserved for "<UNK>", the placeholder for tokens missing from the map.
input_vocab={
    "<UNK>":0
}

In [8]:
def build_vocab(row):
    """Register every unseen token of row["Question"] in the global input_vocab.

    Each new token receives the next free integer id (the vocabulary size at
    the moment it is added). Mutates ``input_vocab`` in place; returns None.
    """
    for word in tokenizer(row["Question"]):
        # len() is evaluated before setdefault inserts, so a new word gets the
        # current size as its id and known words keep the id they already have.
        input_vocab.setdefault(word,len(input_vocab))

In [9]:
# Run build_vocab on every row purely for its side effect of filling input_vocab;
# the function returns nothing, so the resulting Series contains only None.
df.apply(build_vocab,axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
255    None
256    None
257    None
258    None
259    None
Length: 260, dtype: object

In [10]:
input_vocab

{'<UNK>': 0,
 'which': 1,
 'emoji': 2,
 'is': 3,
 'used': 4,
 'to': 5,
 'express': 6,
 'sarcasm': 7,
 'or': 8,
 'dumb': 9,
 'signifies': 10,
 'being': 11,
 'dead': 12,
 'from': 13,
 'laughing': 14,
 'represents': 15,
 'flexing': 16,
 'showing': 17,
 'off': 18,
 'for': 19,
 'dramatic': 20,
 'crying': 21,
 'overwhelmed': 22,
 'indicates': 23,
 'someone': 24,
 'something': 25,
 'hot': 26,
 'laughter': 27,
 'shows': 28,
 'love': 29,
 'affection': 30,
 'approval': 31,
 'agreement': 32,
 'expresses': 33,
 'surprise': 34,
 'shock': 35,
 'embarrassment': 36,
 'blushing': 37,
 'confusion': 38,
 'disbelief': 39,
 'sadness': 40,
 'anger': 41,
 'thinking': 42,
 'pondering': 43,
 'excitement': 44,
 'celebration': 45,
 'tiredness': 46,
 'exhaustion': 47,
 'rolling': 48,
 'eyes': 49,
 'annoyance': 50,
 'fear': 51,
 'worry': 52,
 'kissing': 53,
 'show': 54,
 'partying': 55,
 'fun': 56,
 'clapping': 57,
 'congratulations': 58,
 'hands': 59,
 'together': 60,
 'prayer': 61,
 'thanks': 62,
 'shame': 63,
 

In [11]:
# Emoji -> class id map; id 0 is kept for the "<UNK>" placeholder class.
output_vocab={"<UNK>":0}

def build_output(row):
    """Give the emoji in row["Emoji"] the next free class id if it is new.

    Mutates the module-level ``output_vocab`` in place; returns None.
    """
    # len() is evaluated before setdefault inserts, so an unseen emoji gets the
    # current vocabulary size as its id; emojis already present are untouched.
    output_vocab.setdefault(row["Emoji"],len(output_vocab))

In [12]:
# Run build_output on every row purely for its side effect of filling output_vocab;
# the function returns nothing, so the resulting Series contains only None.
df.apply(build_output,axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
255    None
256    None
257    None
258    None
259    None
Length: 260, dtype: object

In [13]:
output_vocab

{'<UNK>': 0,
 '🤡': 1,
 '💀': 2,
 '💪': 3,
 '😭': 4,
 '🔥': 5,
 '😂': 6,
 '❤️': 7,
 '👍': 8,
 '😮': 9,
 '😊': 10,
 '🤔': 11,
 '😢': 12,
 '😡': 13,
 '🧐': 14,
 '🎉': 15,
 '😩': 16,
 '🙄': 17,
 '😱': 18,
 '😘': 19,
 '🥳': 20,
 '👏': 21,
 '🙏': 22,
 '😳': 23,
 '😴': 24,
 '😎': 25,
 '💸': 26,
 '🥂': 27,
 '🤨': 28,
 '☺️': 29,
 '🥺': 30,
 '🤮': 31,
 '❄️': 32,
 '💔': 33,
 '🤦': 34,
 '🎊': 35,
 '😉': 36,
 '💃': 37,
 '🥰': 38,
 '🛌': 39,
 '🙌': 40,
 '🤣': 41,
 '😲': 42,
 '😬': 43,
 '😔': 44,
 '😏': 45,
 '😈': 46,
 '🤬': 47,
 '😍': 48,
 '🤢': 49,
 '🥶': 50,
 '☀️': 51,
 '😕': 52,
 '😅': 53,
 '😪': 54,
 '😑': 55,
 '😨': 56,
 '😤': 57,
 '💰': 58,
 '😄': 59,
 '😵': 60,
 '😌': 61,
 '🤩': 62,
 '😁': 63,
 '😵\u200d💫': 64,
 '😓': 65,
 '🤯': 66,
 '😼': 67,
 '😛': 68,
 '😜': 69,
 '🎀': 70,
 '🙂': 71,
 '🥲': 72,
 '🫠': 73,
 '🗿': 74,
 '💅🏻': 75,
 '🛐': 76,
 '🙈': 77,
 '😚': 78,
 '🫡': 79,
 '🤧': 80,
 '🌚': 81,
 '🌝': 82,
 '👽': 83,
 '💩': 84,
 '✨': 85,
 '👀': 86,
 '🫦': 87,
 '💫': 88,
 '❤️\u200d🩹': 89,
 '⚡': 90}

In [14]:
def text_to_indices(text,vocab):
    """Encode ``text`` as a list of integer ids using ``vocab``.

    The text is split with ``tokenizer``; each token found in ``vocab`` is
    replaced by its id, and any other token by the id stored under '<UNK>'.
    """
    # The '<UNK>' entry is only looked up when a token is actually missing.
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokenizer(text)
    ]

In [15]:
class QADataset(Dataset):
    """Map-style dataset yielding (question token ids, emoji class id) pairs.

    Args:
        df: DataFrame with a 'Question' and an 'Emoji' column, one example per row.
        input_vocab: dict mapping question tokens to integer ids.
        output_vocab: dict mapping emoji strings to integer class ids.
    """

    def __init__(self, df, input_vocab, output_vocab):
        self.df = df
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab

    def __len__(self):
        """Return the number of rows (examples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the example at positional index ``idx`` as two long tensors.

        Returns:
            (question_ids, emoji_id): a 1-D tensor with one id per question
            token, and a 0-D tensor holding the emoji's class id.
        """
        # Fetch the row once instead of once per column.
        row=self.df.iloc[idx]

        que_numeric_list=text_to_indices(row['Question'],self.input_vocab)

        # Look the label up by its raw string, exactly as output_vocab was keyed.
        # Sending it through the question tokenizer (lowercasing, stripping
        # "?"/"," and whitespace splitting) could turn a known emoji into
        # '<UNK>' or, for an empty label, raise IndexError.
        emoji=row['Emoji']
        if emoji in self.output_vocab:
            emoji_id=self.output_vocab[emoji]
        else:
            emoji_id=self.output_vocab['<UNK>']

        # Explicit long dtype: an empty token list would otherwise produce a
        # float tensor, which cannot be used as embedding indices.
        return (torch.tensor(que_numeric_list,dtype=torch.long),
                torch.tensor(emoji_id,dtype=torch.long))

In [16]:
# Wrap the DataFrame and both vocabularies in the QADataset defined above.
QA_dataset=QADataset(df,input_vocab,output_vocab)

In [17]:
# batch_size=1: QADataset returns question tensors whose length depends on the
# number of tokens, and no collate_fn/padding is supplied here - presumably why
# examples are fed one at a time. shuffle=True requests a reshuffled order.
QA_DataLoader=DataLoader(QA_dataset,batch_size=1,shuffle=True)

In [18]:
class EmojiChooser(nn.Module):
    """Embedding -> RNN -> linear classifier producing one logit per emoji class.

    Args:
        input_vocab_size: number of distinct question token ids (rows of the
            embedding table).
        output_vocab_size: number of output classes (length of the logit vector).
        embedding_dim: size of each token embedding. Defaults to 50.
        hidden_size: size of the RNN hidden state. Defaults to 64.
    """
    def __init__(self,input_vocab_size,output_vocab_size,embedding_dim=50,hidden_size=64):
        super().__init__()

        self.embedding=nn.Embedding(input_vocab_size,embedding_dim=embedding_dim)
        self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
        self.fc=nn.Linear(hidden_size,output_vocab_size)

    def forward(self,question):
        """Return class logits for a batch-first tensor of token ids."""
        embedded_question=self.embedding(question)
        # Only the final hidden state is used; the per-step outputs are discarded.
        _,final=self.rnn(embedded_question)
        # Drop the leading layer dimension of the final hidden state before the
        # linear layer.
        output=self.fc(final.squeeze(0))

        return output

In [19]:
# Seed torch's global RNG before the weights are drawn so that a fresh
# Restart & Run All starts from the same initial parameters every time.
torch.manual_seed(42)
model=EmojiChooser(len(input_vocab),len(output_vocab))

In [20]:
# Per-class loss weights: total_samples / (num_classes * occurrences of the class),
# so emojis that appear less often receive a larger weight.

from collections import Counter

counter=Counter(df["Emoji"])

total_samples=len(df)
# "<UNK>" is not a real class, so it is left out of the class count.
num_classes=len(output_vocab)-1

# Position 0 belongs to "<UNK>" and gets a fixed weight of 0.01; the remaining
# weights follow output_vocab's iteration order.
class_weights=[0.01]+[
    total_samples/(num_classes*counter[label])
    for label in output_vocab
    if label != "<UNK>"
]

class_weights=torch.tensor(class_weights,dtype=torch.float)


In [21]:
# Step size handed to the Adam optimizer.
learning_rate=0.001
# Number of full passes over QA_DataLoader performed by the training loop.
epochs=40

In [22]:
# Cross-entropy over the emoji classes, scaled per class by class_weights.
criterion=nn.CrossEntropyLoss(weight=class_weights)
# Adam updates every parameter of the model with the configured learning rate.
optimizer=optim.Adam(model.parameters(),lr=learning_rate)

In [23]:
for epoch in range(epochs):
    # Explicitly enter training mode: if the model was switched to eval mode
    # earlier in the session (e.g. for inference), re-running this cell would
    # otherwise keep training in eval mode.
    model.train()

    total_loss=0
    for question,emoji in QA_DataLoader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        output=model(question)

        loss=criterion(output,emoji)

        loss.backward()

        optimizer.step()

        total_loss+=loss.item()

    # total_loss is the SUM of the per-batch losses of this epoch, not a mean.
    print(f"Epoch {epoch+1} , Loss = {total_loss:.4f}")

Epoch 1 , Loss = 1156.9091
Epoch 2 , Loss = 969.9625
Epoch 3 , Loss = 793.1639
Epoch 4 , Loss = 612.8641
Epoch 5 , Loss = 451.9846
Epoch 6 , Loss = 322.9764
Epoch 7 , Loss = 221.0387
Epoch 8 , Loss = 150.1371
Epoch 9 , Loss = 103.9292
Epoch 10 , Loss = 72.7847
Epoch 11 , Loss = 52.5946
Epoch 12 , Loss = 38.1287
Epoch 13 , Loss = 29.0928
Epoch 14 , Loss = 22.2820
Epoch 15 , Loss = 17.6952
Epoch 16 , Loss = 14.1517
Epoch 17 , Loss = 11.5550
Epoch 18 , Loss = 9.4444
Epoch 19 , Loss = 7.8540
Epoch 20 , Loss = 6.6005
Epoch 21 , Loss = 5.5681
Epoch 22 , Loss = 4.7055
Epoch 23 , Loss = 3.9816
Epoch 24 , Loss = 3.4150
Epoch 25 , Loss = 2.9174
Epoch 26 , Loss = 2.5087
Epoch 27 , Loss = 2.1579
Epoch 28 , Loss = 1.8608
Epoch 29 , Loss = 1.6064
Epoch 30 , Loss = 1.3882
Epoch 31 , Loss = 1.2062
Epoch 32 , Loss = 1.0460
Epoch 33 , Loss = 0.9094
Epoch 34 , Loss = 0.7907
Epoch 35 , Loss = 0.6878
Epoch 36 , Loss = 0.5994
Epoch 37 , Loss = 0.5216
Epoch 38 , Loss = 0.4547
Epoch 39 , Loss = 0.3972
Epoch 4

In [24]:
def search_emoji(model,question,threshold=0.4):
    """Print the emoji predicted for ``question``, or "I Don't Know!".

    Args:
        model: nn.Module that maps a (1, seq_len) tensor of token ids to a
            (1, num_classes) tensor of logits.
        question: free-text question; encoded with the notebook-level
            ``input_vocab`` via ``text_to_indices``.
        threshold: minimum softmax probability the top class must reach to be
            reported.

    Returns:
        None. The answer is printed.
    """
    numerical_que=text_to_indices(question,input_vocab)

    # A question with no tokens (empty or punctuation only) cannot be embedded:
    # torch.tensor([]) is a float tensor, which nn.Embedding rejects.
    if not numerical_que:
        print("I Don't Know!")
        return

    que_tensor=torch.tensor(numerical_que,dtype=torch.long).unsqueeze(0)

    # Run inference in eval mode without building an autograd graph, then put
    # the model back into whatever mode the caller had it in.
    was_training=model.training
    model.eval()
    try:
        with torch.no_grad():
            output=model(que_tensor)
    finally:
        model.train(was_training)

    emoji_probs=torch.nn.functional.softmax(output,dim=1)

    value,index=torch.max(emoji_probs,dim=1)
    # Convert the one-element tensors to plain Python numbers before comparing
    # and indexing.
    confidence=value.item()
    predicted_id=index.item()

    # The "<UNK>" placeholder is not an emoji, so it is treated as "no answer".
    if confidence<threshold or predicted_id==output_vocab.get("<UNK>"):
        print("I Don't Know!")

    else:
        # Resolve the class id through the id values stored in output_vocab
        # instead of relying on the position of the key in the dict.
        id_to_emoji={class_id:symbol for symbol,class_id in output_vocab.items()}
        print(id_to_emoji[predicted_id])

In [25]:
search_emoji(model,"Which emoji is used for sigma male")

🗿


In [32]:
search_emoji(model,"Which emoji is used for silliness")

💩


In [38]:
search_emoji(model,"Which emoji is used for strong blushing")

🙈


In [40]:
search_emoji(model,"Which emoji represents smile in pain")

🙂


In [47]:
search_emoji(model,"Which emoji indicates flustered or shocked emotion")

😵
