In [804]:
import torch 
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn 
import torch.optim as optim

In [805]:
# Load the question -> emoji pairs and preview the first five rows.
df = pd.read_csv('emoji_QA_data.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Question,Emoji
0,0,Which emoji is used to express sarcasm or dumb?,🤡
1,1,Which emoji signifies being dead from laughing?,💀
2,2,Which emoji represents flexing or showing off?,💪
3,3,Which emoji is used for dramatic crying or bei...,😭
4,4,Which emoji indicates someone or something is ...,🔥


In [806]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [807]:
# Spot-check five random rows.
df.sample(n=5)

Unnamed: 0,Question,Emoji
248,Which emoji indicates a new moon or mysterious...,🌚
106,Which emoji represents laughter or funny moments?,😂
128,Which emoji represents cold weather or freezing?,🥶
143,Which emoji is used when impressed or mind blown?,🤯
59,Which emoji represents pleading eyes?,🥺


In [808]:
def tokenizer(que):
    """Split a question into lowercase whitespace-separated tokens.

    Question marks and commas are turned into spaces first, so they act as
    separators; every other character is kept as part of its token.
    """
    separators = str.maketrans({"?": " ", ",": " "})
    return que.translate(separators).lower().split()

In [809]:
# Token -> index map for questions; index 0 is reserved for unknown tokens.
input_vocab = {"<UNK>": 0}

In [810]:
def build_vocab(row):
    """Register every unseen token of ``row["Question"]`` in ``input_vocab``.

    Each new token receives the next free index (the vocabulary's current
    size); tokens that are already present keep their index.
    """
    for word in tokenizer(row["Question"]):
        # setdefault only inserts when the key is absent, and the default is
        # evaluated before insertion, so it equals the next free index.
        input_vocab.setdefault(word, len(input_vocab))

In [811]:
# Populate input_vocab from every question. The call is made purely for its
# side effect; the trailing semicolon hides the resulting Series of None.
df.apply(build_vocab, axis=1);

0      None
1      None
2      None
3      None
4      None
       ... 
255    None
256    None
257    None
258    None
259    None
Length: 260, dtype: object

In [812]:
input_vocab

{'<UNK>': 0,
 'which': 1,
 'emoji': 2,
 'is': 3,
 'used': 4,
 'to': 5,
 'express': 6,
 'sarcasm': 7,
 'or': 8,
 'dumb': 9,
 'signifies': 10,
 'being': 11,
 'dead': 12,
 'from': 13,
 'laughing': 14,
 'represents': 15,
 'flexing': 16,
 'showing': 17,
 'off': 18,
 'for': 19,
 'dramatic': 20,
 'crying': 21,
 'overwhelmed': 22,
 'indicates': 23,
 'someone': 24,
 'something': 25,
 'hot': 26,
 'laughter': 27,
 'shows': 28,
 'love': 29,
 'affection': 30,
 'approval': 31,
 'agreement': 32,
 'expresses': 33,
 'surprise': 34,
 'shock': 35,
 'embarrassment': 36,
 'blushing': 37,
 'confusion': 38,
 'disbelief': 39,
 'sadness': 40,
 'anger': 41,
 'thinking': 42,
 'pondering': 43,
 'excitement': 44,
 'celebration': 45,
 'tiredness': 46,
 'exhaustion': 47,
 'rolling': 48,
 'eyes': 49,
 'annoyance': 50,
 'fear': 51,
 'worry': 52,
 'kissing': 53,
 'show': 54,
 'partying': 55,
 'fun': 56,
 'clapping': 57,
 'congratulations': 58,
 'hands': 59,
 'together': 60,
 'prayer': 61,
 'thanks': 62,
 'shame': 63,
 

In [813]:
# Emoji -> class index map; index 0 is reserved for an unknown label.
output_vocab = {"<UNK>": 0}


def build_output(row):
    """Register ``row["Emoji"]`` in ``output_vocab`` if it is not there yet.

    A new emoji receives the next free index (the vocabulary's current size).
    """
    output_vocab.setdefault(row["Emoji"], len(output_vocab))

In [814]:
# Populate output_vocab from every emoji label. The call is made purely for
# its side effect; the trailing semicolon hides the resulting Series of None.
df.apply(build_output, axis=1);

0      None
1      None
2      None
3      None
4      None
       ... 
255    None
256    None
257    None
258    None
259    None
Length: 260, dtype: object

In [815]:
output_vocab

{'<UNK>': 0,
 '🤡': 1,
 '💀': 2,
 '💪': 3,
 '😭': 4,
 '🔥': 5,
 '😂': 6,
 '❤️': 7,
 '👍': 8,
 '😮': 9,
 '😊': 10,
 '🤔': 11,
 '😢': 12,
 '😡': 13,
 '🧐': 14,
 '🎉': 15,
 '😩': 16,
 '🙄': 17,
 '😱': 18,
 '😘': 19,
 '🥳': 20,
 '👏': 21,
 '🙏': 22,
 '😳': 23,
 '😴': 24,
 '😎': 25,
 '💸': 26,
 '🥂': 27,
 '🤨': 28,
 '☺️': 29,
 '🥺': 30,
 '🤮': 31,
 '❄️': 32,
 '💔': 33,
 '🤦': 34,
 '🎊': 35,
 '😉': 36,
 '💃': 37,
 '🥰': 38,
 '🛌': 39,
 '🙌': 40,
 '🤣': 41,
 '😲': 42,
 '😬': 43,
 '😔': 44,
 '😏': 45,
 '😈': 46,
 '🤬': 47,
 '😍': 48,
 '🤢': 49,
 '🥶': 50,
 '☀️': 51,
 '😕': 52,
 '😅': 53,
 '😪': 54,
 '😑': 55,
 '😨': 56,
 '😤': 57,
 '💰': 58,
 '😄': 59,
 '😵': 60,
 '😌': 61,
 '🤩': 62,
 '😁': 63,
 '😵\u200d💫': 64,
 '😓': 65,
 '🤯': 66,
 '😼': 67,
 '😛': 68,
 '😜': 69,
 '🎀': 70,
 '🙂': 71,
 '🥲': 72,
 '🫠': 73,
 '🗿': 74,
 '💅🏻': 75,
 '🛐': 76,
 '🙈': 77,
 '😚': 78,
 '🫡': 79,
 '🤧': 80,
 '🌚': 81,
 '🌝': 82,
 '👽': 83,
 '💩': 84,
 '✨': 85,
 '👀': 86,
 '🫦': 87,
 '💫': 88,
 '❤️\u200d🩹': 89,
 '⚡': 90}

In [816]:
def text_to_indices(text,vocab):
    """Tokenize ``text`` and map each token to its index in ``vocab``.

    Tokens missing from ``vocab`` are mapped to ``vocab['<UNK>']``.
    Returns a list of integer indices, one per token.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenizer(text)
    ]

In [817]:
class QADataset(Dataset):
    """Dataset of (question token indices, emoji class index) pairs.

    Args:
        df: DataFrame with a ``Question`` and an ``Emoji`` column.
        input_vocab: token -> index map used to encode questions.
        output_vocab: emoji -> class index map; must contain ``'<UNK>'``.
    """

    def __init__(self, df, input_vocab, output_vocab):
        self.df = df
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(question_indices, emoji_index)`` for the row at position ``idx``.

        ``question_indices`` is a 1-D long tensor with one entry per token;
        ``emoji_index`` is a 0-D long tensor.
        """
        # Fetch the row once instead of once per column.
        row=self.df.iloc[idx]

        que_numeric_list=text_to_indices(row['Question'],self.input_vocab)

        # The label is a single class, not text: look it up directly rather
        # than pushing it through the question tokenizer (which lowercases,
        # strips punctuation and splits on whitespace, and would raise
        # IndexError on a blank label).
        emoji_index=self.output_vocab.get(row['Emoji'],self.output_vocab['<UNK>'])

        # Explicit long dtype: torch.tensor([]) would otherwise be float32,
        # which nn.Embedding rejects.
        return (
            torch.tensor(que_numeric_list,dtype=torch.long),
            torch.tensor(emoji_index,dtype=torch.long),
        )

In [818]:
# Wrap the DataFrame and both vocabularies in a torch Dataset.
QA_dataset = QADataset(df=df, input_vocab=input_vocab, output_vocab=output_vocab)

In [819]:
# One question per batch, reshuffled every epoch. NOTE(review): batch_size=1
# looks deliberate -- questions encode to different lengths and no padding /
# collate_fn is supplied; confirm before raising it.
QA_DataLoader = DataLoader(dataset=QA_dataset, batch_size=1, shuffle=True)

In [820]:
class EmojiChooser(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over emoji classes.

    Args:
        input_vocab_size: number of entries in the token embedding table.
        output_vocab_size: number of emoji classes (width of the logits).
        embedding_dim: size of each token embedding (default 50).
        hidden_dim: size of the LSTM hidden state (default 64).
        dropout: probability for dropout applied to the final hidden state
            before the classifier. Defaults to 0.0, i.e. no dropout, which
            matches what the network effectively did before.
    """

    def __init__(self,input_vocab_size,output_vocab_size,embedding_dim=50,hidden_dim=64,dropout=0.0):
        super().__init__()

        self.embedding=nn.Embedding(input_vocab_size,embedding_dim=embedding_dim)
        # nn.LSTM's own `dropout` argument only acts between stacked layers;
        # on a single-layer LSTM it does nothing and triggers a UserWarning,
        # so it is not passed here. Regularisation is opt-in via self.dropout.
        self.lstm=nn.LSTM(embedding_dim,hidden_dim,batch_first=True)
        self.dropout=nn.Dropout(dropout)
        self.fc=nn.Linear(hidden_dim,output_vocab_size)

    def forward(self,question):
        """Return class logits of shape (batch, output_vocab_size).

        ``question`` is a (batch, seq_len) tensor of token indices
        (the LSTM is built with ``batch_first=True``).
        """
        embedded_question=self.embedding(question)
        # Only the final hidden state is needed; per-step outputs and the
        # cell state are discarded.
        _,(hidden,_)=self.lstm(embedded_question)

        # hidden[-1] is the last layer's final hidden state: (batch, hidden_dim).
        return self.fc(self.dropout(hidden[-1]))

In [821]:
# Size the embedding table and the classifier from the two vocabularies.
model = EmojiChooser(
    input_vocab_size=len(input_vocab),
    output_vocab_size=len(output_vocab),
)



In [822]:
# Per-class loss weights, inversely proportional to label frequency:
#     weight = total_samples / (num_classes * count)
# Slot 0 belongs to "<UNK>", which gets a fixed weight of 0.01. The remaining
# weights follow output_vocab's iteration order.

from collections import Counter

counter = Counter(df["Emoji"])

total_samples = len(df)
num_classes = len(output_vocab) - 1  # "<UNK>" is not counted as a class

class_weights = torch.tensor(
    [0.01]
    + [
        total_samples / (num_classes * counter[label])
        for label in output_vocab
        if label != "<UNK>"
    ],
    dtype=torch.float,
)


In [823]:
# Training hyperparameters.
learning_rate = 1e-3
epochs = 45

In [824]:
# Cross-entropy weighted by class_weights (the unweighted variant is not used),
# optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [825]:
# One optimizer step per batch. The figure printed for each epoch is the SUM
# of the per-batch losses, not their mean.
for epoch in range(epochs):
    batch_losses = []

    for question, emoji in QA_DataLoader:
        optimizer.zero_grad()

        loss = criterion(model(question), emoji)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1} , Loss = {total_loss:.4f}")

Epoch 1 , Loss = 1167.9453
Epoch 2 , Loss = 1024.5178
Epoch 3 , Loss = 749.4415
Epoch 4 , Loss = 507.3767
Epoch 5 , Loss = 338.7415
Epoch 6 , Loss = 223.5078
Epoch 7 , Loss = 152.2613
Epoch 8 , Loss = 101.2270
Epoch 9 , Loss = 71.0614
Epoch 10 , Loss = 52.3581
Epoch 11 , Loss = 38.4352
Epoch 12 , Loss = 28.8640
Epoch 13 , Loss = 22.2353
Epoch 14 , Loss = 17.4280
Epoch 15 , Loss = 14.1414
Epoch 16 , Loss = 11.1428
Epoch 17 , Loss = 9.2544
Epoch 18 , Loss = 7.7078
Epoch 19 , Loss = 6.4851
Epoch 20 , Loss = 5.4230
Epoch 21 , Loss = 4.6361
Epoch 22 , Loss = 3.9347
Epoch 23 , Loss = 3.3888
Epoch 24 , Loss = 2.9101
Epoch 25 , Loss = 2.4815
Epoch 26 , Loss = 2.1470
Epoch 27 , Loss = 1.8537
Epoch 28 , Loss = 1.6106
Epoch 29 , Loss = 1.3913
Epoch 30 , Loss = 1.2090
Epoch 31 , Loss = 1.0458
Epoch 32 , Loss = 0.9107
Epoch 33 , Loss = 0.7935
Epoch 34 , Loss = 0.6921
Epoch 35 , Loss = 0.6012
Epoch 36 , Loss = 0.5229
Epoch 37 , Loss = 0.4571
Epoch 38 , Loss = 0.3981
Epoch 39 , Loss = 0.3478
Epoch 40

In [826]:
def search_emoji(model,question,threshold=0.3):
    """Print the emoji the model predicts for ``question``.

    Prints "I Don't Know!" instead when the question has no tokens, when the
    top softmax probability is below ``threshold``, or when the top class is
    the ``<UNK>`` placeholder.

    Args:
        model: module mapping a (1, seq_len) tensor of token indices to
            (1, num_classes) logits.
        question: free-text question.
        threshold: minimum top-class probability required to answer.

    Returns:
        None. The result is printed.
    """
    numerical_que=text_to_indices(question,input_vocab)

    # An empty token list would become a float tensor that nn.Embedding
    # cannot index, so there is nothing to run the model on.
    if not numerical_que:
        print("I Don't Know!")
        return

    que_tensor=torch.tensor(numerical_que,dtype=torch.long).unsqueeze(0)

    # Inference only: switch to eval mode and skip autograd bookkeeping, then
    # restore whatever mode the caller had the model in.
    was_training=model.training
    model.eval()
    try:
        with torch.no_grad():
            output=model(que_tensor)
    finally:
        model.train(was_training)

    emoji_probs=torch.nn.functional.softmax(output,dim=1)

    value,index=torch.max(emoji_probs,dim=1)

    # Map class index -> emoji by value rather than by dict position.
    index_to_emoji={class_idx:emoji for emoji,class_idx in output_vocab.items()}
    predicted=index_to_emoji.get(index.item(),"<UNK>")

    if value.item()<threshold or predicted=="<UNK>":
        print("I Don't Know!")
    else:
        print(predicted)

In [827]:
# Query containing words that may not be in the training vocabulary.
search_emoji(model, question="Which emoji is used for sigma male")

🗿


In [829]:
# Paraphrased query about blushing.
search_emoji(model, question="Which emoji is used for strong blushing")

🙈


In [831]:
# Query combining two emotions in one question.
search_emoji(model, question="Which emoji indicates flustered or shocked emotion")

😵
