In [102]:
import torch
from torch.utils.data import Dataset, DataLoader
import json
import pdb, os
import random
from transformers import BertTokenizer
    
class goemotion_loader(Dataset):
    """GoEmotions dataset that exposes fine-grained, Ekman and sentiment labels per utterance.

    Every line of ``data_path`` is split on a tab into an utterance and a
    comma-separated list of fine-grained label ids. The fine-grained label names
    are read from ``./dataset/emotions.txt``; the Ekman and sentiment groupings
    are read from the JSON mapping files next to it.

    Attributes:
        datalist: raw lines of the data file.
        class_type: value passed by the caller; stored but not read by this class.
        tokenizer: ``BertTokenizer`` loaded from the local model directory.
        fineList / ekmanList / sentiList: ordered class names of each label scheme.
        ekman_reverse_data / senti_reverse_data: fine-grained name -> coarse name.
    """

    def __init__(self, data_path, model_type, class_type):
        with open(data_path, 'r') as f:
            self.datalist = f.readlines()
        self.class_type = class_type

        model_path = os.path.join('/data/project/rw/rung/model', model_type) # bert-base-cased
        self.tokenizer = BertTokenizer.from_pretrained(model_path)

        with open('./dataset/emotions.txt', 'r') as f:
            self.fineList = [x.strip() for x in f.readlines()]

        self.ekman_mapping_data, self.ekman_reverse_data = self._load_mapping("./dataset/ekman_mapping.json")
        self.senti_mapping_data, self.senti_reverse_data = self._load_mapping("./dataset/sentiment_mapping.json")

        # Class order follows the key order of the mapping, with 'neutral' last
        # whenever the JSON file does not already contain it.
        self.ekmanList = list(self.ekman_mapping_data.keys())
        self.sentiList = list(self.senti_mapping_data.keys())

    @staticmethod
    def _load_mapping(json_path):
        """Load a coarse->fine mapping, add 'neutral', and return (mapping, fine->coarse)."""
        with open(json_path, 'r') as f:
            mapping = json.load(f)
        mapping['neutral'] = ['neutral']
        reverse = {small: big for big, small_list in mapping.items() for small in small_list}
        return mapping, reverse

    @staticmethod
    def _soft_label(indices, num_classes):
        """Spread a total mass of 1 uniformly over ``indices`` (repeats accumulate)."""
        distribution = [0.0] * num_classes
        for index in indices:
            distribution[index] += 1/len(indices)
        return distribution

    def __len__(self):
        return len(self.datalist)

    def __getitem__(self, idx):
        """Return a dict with the utterance and its label ids in all three schemes.

        ``ekman_labels`` and ``senti_labels`` hold one entry per fine-grained
        label, so they may contain repeated indices.
        """
        utt, labels = self.datalist[idx].strip().split('\t')
        label_list = labels.split(',')

        # fine-grained
        fine_label_list = [int(x) for x in label_list]

        # ekman & sentiment
        ekman_label_list, senti_label_list = [], []
        for fine_index in fine_label_list:
            emotion = self.fineList[fine_index]
            ekman_label_list.append(self.ekmanList.index(self.ekman_reverse_data[emotion]))
            senti_label_list.append(self.sentiList.index(self.senti_reverse_data[emotion]))

        return {
            'utt': utt,
            'fine_labels': fine_label_list,
            'ekman_labels': ekman_label_list,
            'senti_labels': senti_label_list,
        }

    def encode_truncated(self, text):
        """Encode ``text`` without special tokens, keeping only the last ``model_max_length`` ids."""
        max_length = self.tokenizer.model_max_length
        tokenized_tokens = self.tokenizer.encode(text, add_special_tokens=False)
        return tokenized_tokens[-max_length:]

    def padding(self, ids_list):
        """Right-pad every id list with the tokenizer's pad id to the longest length; return a tensor."""
        max_len = max((len(ids) for ids in ids_list), default=0)
        pad_id = self.tokenizer.pad_token_id
        return torch.tensor([ids + [pad_id] * (max_len - len(ids)) for ids in ids_list])

    def collate_fn(self, data):
        """Batch samples from ``__getitem__`` into model inputs and soft label distributions.

        Returns:
            (input_ids, attention_mask, fine_labels, ekman_labels, senti_labels).
            ``input_ids`` and ``attention_mask`` have one extra leading column for
            the prepended [CLS] token; each label tensor has one row per sample
            that sums to 1.
        """
        batch = len(data)
        batch_utts = [session['utt'] for session in data]

        fine_batch_labels = [self._soft_label(session['fine_labels'], len(self.fineList)) for session in data]
        ekman_batch_labels = [self._soft_label(session['ekman_labels'], len(self.ekmanList)) for session in data]
        senti_batch_labels = [self._soft_label(session['senti_labels'], len(self.sentiList)) for session in data]

        result = self.tokenizer.batch_encode_plus(batch_utts, add_special_tokens=False, padding=True, return_tensors='pt')
        batch_input_ids = result['input_ids']
        batch_attention_mask = result['attention_mask']

        # Prepend [CLS] to the ids and a matching 1 to the mask. The mask must be
        # extended from the tokenizer's attention mask, not from the token ids.
        cls_column = torch.full((batch, 1), self.tokenizer.cls_token_id, dtype=batch_input_ids.dtype)
        ones_column = torch.ones((batch, 1), dtype=batch_attention_mask.dtype)
        final_input_ids = torch.cat([cls_column, batch_input_ids], 1)
        final_attention_mask = torch.cat([ones_column, batch_attention_mask], 1)

        return final_input_ids, final_attention_mask, torch.tensor(fine_batch_labels), torch.tensor(ekman_batch_labels), torch.tensor(senti_batch_labels)

In [103]:
# Build the training dataset used by the cells below.
# `class_type` is stored on the dataset object but is not read by any method of
# goemotion_loader shown in this notebook.
data_path = './dataset/train.txt'
model_type = 'bert-base-cased'
class_type = 'sentiment'#'fine_grained'
dataset = goemotion_loader(data_path, model_type, class_type)

In [104]:
dataset[11]

{'utt': "Aww... she'll probably come around eventually, I'm sure she was just jealous of [NAME]... I mean, what woman wouldn't be! lol ",
 'fine_labels': [1, 4],
 'ekman_labels': [3, 3],
 'senti_labels': [0, 0]}

In [108]:
# Manually map fine-grained label id 0 to its Ekman and sentiment class indices,
# using the same lookups goemotion_loader.__getitem__ performs per label.
label = 0
emotion = dataset.fineList[int(label)]

# fine-grained name -> Ekman name -> position in the Ekman class list
ekman_emotion = dataset.ekman_reverse_data[emotion]
ind = dataset.ekmanList.index(ekman_emotion)
print(ind)

# fine-grained name -> sentiment name -> position in the sentiment class list
senti_emotion = dataset.senti_reverse_data[emotion]
ind = dataset.sentiList.index(senti_emotion)
print(ind)

3
0


In [113]:
# NOTE(review): `ekman_emotion` is an Ekman class name, but senti_reverse_data is
# keyed by fine-grained emotion names. This lookup only succeeds when the Ekman
# name is also a fine-grained name - confirm that is intended.
senti_emotion = dataset.senti_reverse_data[ekman_emotion]
ind = dataset.sentiList.index(senti_emotion)
print(ind)

0


In [114]:
ekman_emotion

'joy'

In [96]:
utt1 = dataset[0]['utt']
utt2 = dataset[1]['utt']
utt1,utt2

("My favourite food is anything I didn't have to cook myself.",
 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead')

In [97]:
dataset_loader = DataLoader(dataset, batch_size=3, shuffle=True, num_workers=4, collate_fn=dataset.collate_fn)

In [98]:
# Pull a single batch from the loader and unpack the five tensors returned by
# dataset.collate_fn (ids, mask, then fine / Ekman / sentiment label distributions).
for data in dataset_loader:
    batch_input_ids, batch_attention_mask, fine_batch_labels, ekman_batch_labels, senti_batch_labels = data
    break

In [26]:
batch_input_ids.shape

torch.Size([3, 17])

In [27]:
dataset.tokenizer.batch_decode(batch_input_ids)

['[CLS] This is completely false. [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] I ’ m ready to see yet another person fall tonight. [PAD] [PAD] [PAD] [PAD]',
 "[CLS] How can they expect you to write if you can't read? / s"]

In [99]:
fine_batch_labels

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000]])

In [100]:
ekman_batch_labels

tensor([[0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0.]])

In [101]:
senti_batch_labels

tensor([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.]])

In [50]:
mapping_data

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity'],
 'neutral': 'neutral'}

In [31]:
reverse_data

{'anger': 'anger',
 'annoyance': 'anger',
 'disapproval': 'anger',
 'disgust': 'disgust',
 'fear': 'fear',
 'nervousness': 'fear',
 'joy': 'joy',
 'amusement': 'joy',
 'approval': 'joy',
 'excitement': 'joy',
 'gratitude': 'joy',
 'love': 'joy',
 'optimism': 'joy',
 'relief': 'joy',
 'pride': 'joy',
 'admiration': 'joy',
 'desire': 'joy',
 'caring': 'joy',
 'sadness': 'sadness',
 'disappointment': 'sadness',
 'embarrassment': 'sadness',
 'grief': 'sadness',
 'remorse': 'sadness',
 'surprise': 'surprise',
 'realization': 'surprise',
 'confusion': 'surprise',
 'curiosity': 'surprise'}

In [14]:
list(mapping_data.keys())

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [29]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os, sys
import pdb

class EmoModel(nn.Module):
    """BERT encoder followed by a linear classifier over the first-token representation.

    The number of output classes equals the number of lines in
    ``./dataset/emotions.txt``.
    """

    def __init__(self, model_type):
        """Load the tokenizer and encoder from the local model directory for ``model_type``."""
        super().__init__()

        model_path = os.path.join('/data/project/rw/rung/model', model_type)
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertModel.from_pretrained(model_path)

        # One emotion name per line; stripped so the names carry no trailing newline.
        with open('./dataset/emotions.txt', 'r') as f:
            self.emoList = [line.strip() for line in f.readlines()]
        self.clsNum = len(self.emoList)

        self.Wc = nn.Linear(self.model.config.hidden_size, self.clsNum) # for classification

    def forward(self, batch_input_ids, batch_attention_mask):
        """Return classification logits for each sequence in the batch.

        Args:
            batch_input_ids: (batch, len) token ids.
            batch_attention_mask: (batch, len) mask passed to the encoder as ``attention_mask``.

        Returns:
            (batch, clsNum) logits computed from the hidden state at position 0.
        """
        hidden_outs = self.model(batch_input_ids, attention_mask=batch_attention_mask)['last_hidden_state'] # (B, L, H)
        # Select position 0 first so the linear head runs on (B, H) instead of (B, L, H).
        first_token_hidden = hidden_outs[:, 0, :]
        return self.Wc(first_token_hidden) # (B, C)

In [30]:
model = EmoModel(model_type)

In [36]:
pred_logits = model(batch_input_ids, batch_attention_mask)

In [37]:
from torch.nn.functional import softmax
pred_distribution = softmax(pred_logits, 1)

In [38]:
pred_distribution.shape, batch_labels.shape

(torch.Size([3, 28]), torch.Size([3, 28]))

In [18]:
def pdloss(batch_pred_distribution, batch_label_distribution, eps=1e-12):
    """Mean cross-entropy between predicted and target probability distributions.

    Args:
        batch_pred_distribution: (batch, clsNum) predicted probabilities.
        batch_label_distribution: (batch, clsNum) target probabilities.
        eps: lower bound applied to the predictions before the logarithm, so a
            predicted probability of exactly 0 cannot yield ``0 * -inf = nan``.

    Returns:
        Scalar tensor: the sum over all entries of ``-label * log(pred)``
        divided by the batch size.
    """
    batch_log_pred_distribution = torch.log(batch_pred_distribution.clamp_min(eps))
    per_sample_loss = -(batch_label_distribution * batch_log_pred_distribution).sum(dim=1)
    return per_sample_loss.sum() / len(batch_pred_distribution)

In [39]:
pdloss(pred_distribution, batch_labels)

tensor(3.6069, grad_fn=<DivBackward0>)

In [34]:
batch_labels

tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [52]:
# Draw one single-sample batch. dataset.collate_fn returns five tensors; only the
# fine-grained label distribution is kept as `batch_labels`, because its width
# (number of lines in emotions.txt) matches the output width of EmoModel.
dataset_loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4, collate_fn=dataset.collate_fn)
for data in dataset_loader:
    batch_input_ids, batch_attention_mask, batch_labels, _ekman_batch_labels, _senti_batch_labels = data
    break

In [53]:
pred_logits = model(batch_input_ids, batch_attention_mask)
from torch.nn.functional import softmax
pred_distribution = softmax(pred_logits, 1)

In [55]:
# Rank the classes by predicted probability, highest first. The batch holds a
# single sample, so row 0 of the sorted indices is the full ranking.
pred_sort = pred_distribution.sort(descending=True)
indices = pred_sort.indices.tolist()[0]

# Top-1 predicted class index.
pred_label = indices[0]

In [56]:
pred_label

9

In [57]:
pred_distribution

tensor([[0.0404, 0.0450, 0.0329, 0.0485, 0.0454, 0.0178, 0.0199, 0.0377, 0.0362,
         0.0579, 0.0449, 0.0357, 0.0462, 0.0288, 0.0341, 0.0326, 0.0328, 0.0323,
         0.0225, 0.0298, 0.0489, 0.0316, 0.0451, 0.0173, 0.0156, 0.0262, 0.0400,
         0.0540]], grad_fn=<SoftmaxBackward>)

In [74]:
# Print the index of every class that carries non-zero label mass for the
# single sample in the batch (squeeze(0) drops the batch dimension).
for ind, label in enumerate(batch_labels.squeeze(0)):
    if label > 0:
        print(ind)

15


In [1]:
import json
with open('./dataset/ekman_mapping.json', 'r') as f:
    json_data = json.load(f)

In [3]:
json_data.keys()

dict_keys(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])

In [115]:
pred_indices = [0, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1]
batch_labels = torch.tensor([[1., 0., 0., 0.]])

In [116]:
pred_label = pred_indices[0] # pred_logits.argmax(1).item()
# Indices of the classes with non-zero label mass for the single sample.
true_labels = [ind for ind, label in enumerate(batch_labels.squeeze(0)) if label > 0]


def _precision_recall_at_k(ranked_indices, gold_labels, k):
    """Return (precision@k, recall@k) for one ranked prediction list.

    Precision is the fraction of the first ``k`` predictions found in
    ``gold_labels``. Recall is the fraction of ``gold_labels`` that appear
    among the first ``k`` predictions; each gold label is counted once, so a
    ranking that repeats the same index cannot push recall above 1.
    """
    top_k = ranked_indices[:k]
    precision = sum(1 for pred_ind in top_k if pred_ind in gold_labels) / k
    if not gold_labels:
        return precision, 0
    recall = len(set(top_k) & set(gold_labels)) / len(gold_labels)
    return precision, recall


# Calculation precision@k and recall@k
(p1, r1), (p2, r2), (p3, r3) = (
    _precision_recall_at_k(pred_indices, true_labels, k) for k in (1, 2, 3)
)

In [121]:
true_labels

[0]