In [2]:
import torch
import json
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
import pickle
import sys
sys.path.append('../')
from utils import *

In [3]:
# Select the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [4]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left for the garbage collector.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform locale.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Load the three dataset splits from the repository's dataset directory.
training_data = _load_json("../dataset/train.json")
validation_data = _load_json("../dataset/val.json")
testing_data = _load_json("../dataset/test.json")

In [5]:
def _flatten_conversations(conversations, with_labels=True):
    """Flatten a list of conversations into parallel per-utterance lists.

    Each element of ``conversations`` is indexed with ``'conversation'`` to get
    its utterances; each utterance is indexed with ``'text'``, ``'speaker'``
    and, when ``with_labels`` is true, ``'emotion'``.

    Returns:
        (texts, labels, speakers) -- ``texts`` holds ``preprocess_text`` applied
        to every utterance text; ``labels`` is empty when ``with_labels`` is
        false.
    """
    texts, labels, speakers = [], [], []
    for conversation in conversations:
        for utterance in conversation['conversation']:
            texts.append(preprocess_text(utterance['text']))
            if with_labels:
                labels.append(utterance['emotion'])
            speakers.append(utterance['speaker'])
    return texts, labels, speakers


training_text, training_labels, training_speaker = _flatten_conversations(training_data)
validation_text, validation_labels, validation_speaker = _flatten_conversations(validation_data)
# The test split is only read for text and speaker here, so labels are skipped.
testing_text, _unused_labels, testing_speaker = _flatten_conversations(testing_data, with_labels=False)

# Sort the de-duplicated values so that their order -- and therefore any integer
# ids assigned by position -- is the same on every run. Iteration order of a set
# of strings is not stable across interpreter restarts.
# NOTE(review): sorting assumes the values within each list are mutually
# comparable (e.g. all strings) -- confirm against the dataset.
unique_utterance = sorted(set(training_text + validation_text + testing_text))
unique_emotion = sorted(set(training_labels + validation_labels))
unique_speaker = sorted(set(training_speaker + validation_speaker + testing_speaker))
len(unique_utterance), len(unique_emotion), len(unique_speaker)

(16895, 7, 404)

In [31]:
# Bidirectional lookup tables: each item of a unique_* list is paired with its
# position in that list (int2*), and the reverse mapping is derived from it (*2int).
int2utterance = dict(enumerate(unique_utterance))
utterance2int = {text: idx for idx, text in int2utterance.items()}

int2emotion = dict(enumerate(unique_emotion))
emotion2int = {label: idx for idx, label in int2emotion.items()}

int2speaker = dict(enumerate(unique_speaker))
speaker2int = {name: idx for idx, name in int2speaker.items()}

In [32]:
# Load the pretrained tokenizer for the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [33]:
# Load the pretrained "roberta-base" encoder and move it to the selected device.
# The load-time warning about newly initialized pooler weights does not affect
# the embeddings computed later in this notebook, which read only
# `last_hidden_state` and never the pooler output.
model = RobertaModel.from_pretrained('roberta-base').to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Build one fixed-size vector per unique utterance by averaging the encoder's
# final-layer token states. Row i of the matrix corresponds to int2utterance[i].

# Make inference mode explicit so dropout cannot perturb the embeddings
# (a no-op if the model is already in eval mode).
model.eval()

# Take the embedding width from the model config instead of hard-coding 768,
# so the matrix stays correctly sized if a different checkpoint is loaded.
embedding_matrix = np.zeros((len(int2utterance), model.config.hidden_size))
for i, utterance in tqdm(int2utterance.items()):
    # One utterance per forward pass with padding disabled, so no pad tokens
    # are included in the mean below. Inputs longer than 512 tokens are truncated.
    inputs = tokenizer(utterance, return_tensors="pt", padding=False, truncation=True, max_length=512)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # Average over the token axis (dim=1), then drop the batch axis of size 1.
        emb = outputs.last_hidden_state.mean(dim=1).squeeze(0)

    embedding_matrix[i] = emb.cpu().numpy()

100%|██████████| 16895/16895 [02:45<00:00, 102.02it/s]


In [35]:
# Persist the embedding matrix and every lookup table, one pickle file each.
artifacts = [
    ("/kaggle/working/sentence_embedding_matrix.pkl", embedding_matrix),
    ("/kaggle/working/utterance2int.pkl", utterance2int),
    ("/kaggle/working/int2utterance.pkl", int2utterance),
    ("/kaggle/working/emotion2int.pkl", emotion2int),
    ("/kaggle/working/int2emotion.pkl", int2emotion),
    ("/kaggle/working/speaker2int.pkl", speaker2int),
    ("/kaggle/working/int2speaker.pkl", int2speaker),
]

for out_path, obj in artifacts:
    with open(out_path, "wb") as f:
        pickle.dump(obj, f)