In [None]:
from sentence_transformers import SentenceTransformer
import json
import pickle
import torch
from tqdm import tqdm

In [None]:
# Source: https://github.com/LCS2-IIITD/Emotion-Flip-Reasoning/blob/main/Dataloaders/nlp_utils.py
import string
import nltk
import re

# Lookup table from a single digit character to its English word.
# number_to_words() iterates over it to spell out digits in text.
numbers = dict([
    ("0", "zero"),
    ("1", "one"),
    ("2", "two"),
    ("3", "three"),
    ("4", "four"),
    ("5", "five"),
    ("6", "six"),
    ("7", "seven"),
    ("8", "eight"),
    ("9", "nine"),
])

def remove_puntuations(txt):
    """Strip punctuation from ``txt`` and return the result.

    The separators ``.``, ``!``, ``?``, ``:`` and ``;`` are each replaced by a
    single space so the words on either side stay apart. Every other character
    in ``string.punctuation`` is deleted outright. Whitespace is not collapsed
    or trimmed here, so runs of spaces may remain in the output.
    """
    # Start with "delete" for every punctuation character ...
    translation = {ord(symbol): None for symbol in string.punctuation}
    # ... then override the sentence-level separators so they become spaces.
    for separator in (".", "!", "?", ":", ";"):
        translation[ord(separator)] = " "
    return txt.translate(translation)

def number_to_words(txt):
    """Spell out every digit character in ``txt``.

    Each key of the module-level ``numbers`` table found in ``txt`` is replaced
    by its word followed by one space (e.g. ``"42"`` becomes ``"four two "``).
    Digits are handled one character at a time; multi-digit numbers are not
    read as a whole. The extra spaces are left for the caller to tidy up.
    """
    for digit, word in numbers.items():
        # The trailing space keeps consecutive digits from running together.
        txt = txt.replace(digit, word + " ")
    return txt

def preprocess_text(text):
    """Normalise a raw utterance string.

    Steps, applied in this order:
      1. lowercase the text;
      2. turn underscores into spaces;
      3. spell out digits via ``number_to_words``;
      4. strip punctuation via ``remove_puntuations``;
      5. drop every character whose code point is 128 or above;
      6. collapse whitespace runs to single spaces and trim both ends.
    """
    lowered = re.sub(r'_', ' ', text.lower())
    without_punctuation = remove_puntuations(number_to_words(lowered))
    ascii_only = ''.join(ch for ch in without_punctuation if ord(ch) < 128)
    # split() with no argument discards leading/trailing and repeated whitespace.
    return ' '.join(ascii_only.split())

In [None]:
# Load both dataset splits. The `with` blocks close the file handles as soon as
# parsing finishes (the bare `json.load(open(...))` form never closed them), and
# the explicit UTF-8 encoding avoids depending on the platform's locale default.
with open('../Original_Dataset/Subtask_1_train.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)
with open('../Original_Dataset/Subtask_1_test.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Concatenate the splits so later cells can iterate over every conversation once.
# NOTE(review): assumes both JSON files have a list at the top level - confirm.
all_data = train_data + test_data

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook displays the selected device.
device

In [None]:
# Embed every utterance with the 'all-mpnet-base-v2' model and store the result
# as {preprocessed utterance text: embedding returned by model.encode}.
model = SentenceTransformer('all-mpnet-base-v2', device=device)
sentence_transformer_utterance2vec = {}
with torch.no_grad():
    for conversation in tqdm(all_data):
        for utterance in conversation['conversation']:
            text = preprocess_text(utterance['text'])
            # The dict is keyed by the preprocessed text, so a repeated utterance
            # would only overwrite its own entry; encode each distinct text once.
            if text in sentence_transformer_utterance2vec:
                continue
            sentence_transformer_utterance2vec[text] = model.encode(text, show_progress_bar=False)

# Write through a context manager so the pickle file is flushed and closed
# right away instead of whenever the anonymous file object is collected.
with open('sentence_transformer_utterance2vec_768.pkl', 'wb') as embeddings_file:
    pickle.dump(sentence_transformer_utterance2vec, embeddings_file)

In [None]:
# Embed every utterance with the 'all-MiniLM-L6-v2' model and store the result
# as {preprocessed utterance text: embedding returned by model.encode}.
# Rebinds `model` and `sentence_transformer_utterance2vec` to fresh objects.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
sentence_transformer_utterance2vec = {}
with torch.no_grad():
    for conversation in tqdm(all_data):
        for utterance in conversation['conversation']:
            text = preprocess_text(utterance['text'])
            # The dict is keyed by the preprocessed text, so a repeated utterance
            # would only overwrite its own entry; encode each distinct text once.
            if text in sentence_transformer_utterance2vec:
                continue
            sentence_transformer_utterance2vec[text] = model.encode(text, show_progress_bar=False)

# Write through a context manager so the pickle file is flushed and closed
# right away instead of whenever the anonymous file object is collected.
with open('sentence_transformer_utterance2vec_384.pkl', 'wb') as embeddings_file:
    pickle.dump(sentence_transformer_utterance2vec, embeddings_file)