In [None]:
import json
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import torch
import pickle

In [None]:
# Source: https://github.com/LCS2-IIITD/Emotion-Flip-Reasoning/blob/main/Dataloaders/nlp_utils.py
import string
import nltk
import re

# Digit character -> English word. Each word's position in the tuple is the
# digit it names, so the key is simply that position rendered as a string.
numbers = {
    str(digit): word
    for digit, word in enumerate(
        (
            "zero", "one", "two", "three", "four",
            "five", "six", "seven", "eight", "nine",
        )
    )
}

def remove_puntuations(txt):
    """Strip punctuation from ``txt``.

    The separators ``. ! ? : ;`` are each replaced by a single space so the
    words on either side stay apart; every other character found in
    ``string.punctuation`` is deleted outright. Whitespace is not collapsed.
    """
    # Separators become spaces first, so they never reach the deletion pass.
    for separator in (".", "!", "?", ":", ";"):
        txt = txt.replace(separator, " ")

    unwanted = frozenset(string.punctuation)
    kept = [char for char in txt if char not in unwanted]
    return "".join(kept)

def number_to_words(txt):
    """Spell out every digit in ``txt``.

    Each occurrence of a key of the module-level ``numbers`` table is replaced
    by the mapped word followed by one space, e.g. ``"42"`` becomes
    ``"four two "``.
    """
    for digit, word in numbers.items():
        txt = txt.replace(digit, word + " ")
    return txt

def preprocess_text(text):
    """Normalise a raw utterance into lowercase ASCII words.

    Steps, in order: lowercase, turn underscores into spaces, spell out digits
    (``number_to_words``), strip punctuation (``remove_puntuations``), drop
    every non-ASCII character, and collapse each run of whitespace into a
    single space with no leading or trailing whitespace.
    """
    lowered = re.sub(r'_', ' ', text.lower())
    # Digits are spelled out before the punctuation pass runs.
    cleaned = remove_puntuations(number_to_words(lowered))
    ascii_only = ''.join(char for char in cleaned if ord(char) < 128)
    return ' '.join(ascii_only.split())

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Echo the chosen device as the cell's output.
device

In [None]:
# Load both dataset splits. The context managers close each file handle as
# soon as it has been parsed instead of leaving that to the garbage collector.
# The files are read as UTF-8 explicitly so the parsed text does not depend on
# the platform's default encoding.
with open('../Subtask_1_train.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)
with open('../Subtask_1_test.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Both splits are combined so every conversation can be processed in one pass.
all_data = train_data + test_data

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder, then move the
# encoder onto the selected device (Module.to returns the module itself).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

In [None]:
# Embedding table filled in below: preprocessed utterance text -> vector.
bert_utterance2vec = dict()

In [None]:
# Embed every utterance: run it through BERT, average the final hidden states
# over the token axis, and store the vector under the preprocessed text.
with torch.no_grad():  # inference only, so no autograd graph is needed
    for conversation in tqdm(all_data):
        for utterance in conversation['conversation']:
            text = preprocess_text(utterance['text'])
            # The table is keyed by text, so re-encoding a repeated utterance
            # would only overwrite its own entry; skip the extra forward pass.
            if text in bert_utterance2vec:
                continue
            # truncation=True caps the input at the tokenizer's maximum model
            # length; without it an over-long utterance makes the forward
            # pass fail instead of producing an embedding.
            encode = tokenizer(text, return_tensors='pt', truncation=True)
            encode = {k: v.to(device) for k, v in encode.items()}
            output = model(**encode)
            # One text per call, so the batch axis has size 1: mean over the
            # token axis (dim=1), then drop the batch axis explicitly.
            utterance_vector = (
                output.last_hidden_state.mean(dim=1).squeeze(0).detach().cpu().numpy()
            )
            bert_utterance2vec[text] = utterance_vector

In [None]:
# Persist the embedding table. The context manager ensures the file is flushed
# and closed once the dump finishes, even if pickling raises.
with open('bert_utterance2vec.pkl', 'wb') as pickle_file:
    pickle.dump(bert_utterance2vec, pickle_file)