In [17]:
import torch
import transformers
import json
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import pickle

In [18]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
device

device(type='cuda')

In [19]:
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes (a bare `json.load(open(...))` leaves it open until the
    garbage collector gets to it), and it is read as UTF-8 explicitly rather
    than with the platform's default encoding.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Train / validation / test splits of the dataset.
training_data = _load_json("/kaggle/input/semeval3-task-3-dataset/train.json")
validation_data = _load_json("/kaggle/input/semeval3-task-3-dataset/val.json")
testing_data = _load_json("/kaggle/input/semeval3-task-3-dataset/test.json")

In [20]:
def _flatten_utterances(conversations):
    """Return every utterance dict from every conversation, in file order."""
    return [
        utterance
        for conversation in conversations
        for utterance in conversation['conversation']
    ]


_train_utts = _flatten_utterances(training_data)
_val_utts = _flatten_utterances(validation_data)
_test_utts = _flatten_utterances(testing_data)

# Per-utterance text, emotion label and speaker for each split.
# No emotion labels are collected for the test split.
training_text = [u['text'] for u in _train_utts]
training_labels = [u['emotion'] for u in _train_utts]
training_speaker = [u['speaker'] for u in _train_utts]

validation_text = [u['text'] for u in _val_utts]
validation_labels = [u['emotion'] for u in _val_utts]
validation_speaker = [u['speaker'] for u in _val_utts]

testing_text = [u['text'] for u in _test_utts]
testing_speaker = [u['speaker'] for u in _test_utts]

# Sort the de-duplicated values: iterating a set of strings yields an order that
# changes from one kernel run to the next (string hashing is randomized per
# process), which would make the integer ids derived from these lists differ
# between runs. Sorting makes the vocabularies, and every id built from them,
# reproducible.
# NOTE(review): sorting assumes all values are strings — confirm no utterance
# has a null/non-string text, emotion or speaker.
unique_utterance = sorted(set(training_text + validation_text + testing_text))
unique_emotion = sorted(set(training_labels + validation_labels))
unique_speaker = sorted(set(training_speaker + validation_speaker + testing_speaker))
len(unique_utterance), len(unique_emotion), len(unique_speaker)

(17414, 7, 404)

In [21]:
# Integer-id lookup tables for utterances, emotions and speakers.
# int2* maps an index to its value (the value's position in the matching
# unique_* list); *2int is the inverse mapping from value back to index.
int2utterance = dict(enumerate(unique_utterance))
utterance2int = {text: idx for idx, text in int2utterance.items()}

int2emotion = dict(enumerate(unique_emotion))
emotion2int = {label: idx for idx, label in int2emotion.items()}

int2speaker = dict(enumerate(unique_speaker))
speaker2int = {name: idx for idx, name in int2speaker.items()}

In [22]:
# Pretrained uncased BERT-base: the encoder is moved onto `device`, and the
# matching tokenizer is loaded from the same checkpoint name.
model = BertModel.from_pretrained("bert-base-uncased").to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Encode every unique utterance with BERT, one utterance per forward pass, and
# store its pooled output in the row indexed by the utterance's integer id.
model.eval()  # inference only: make sure dropout is disabled

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if a different checkpoint is loaded.
hidden_size = model.config.hidden_size
embedding_matrix = np.zeros((len(int2utterance), hidden_size))

for i, utterance in tqdm(int2utterance.items()):
    # One sentence per call, so no padding is requested; anything longer than
    # 512 tokens is truncated.
    inputs = tokenizer(
        utterance,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # pooler_output carries a leading batch dimension of size 1; drop it
    # explicitly so the row assignment does not rely on implicit broadcasting.
    emb = outputs.pooler_output.squeeze(0)

    embedding_matrix[i] = emb.cpu().numpy()

100%|██████████| 17414/17414 [03:05<00:00, 94.03it/s] 


In [24]:
# Persist the embedding matrix and all lookup tables as pickle files in the
# Kaggle working directory. Each entry is (destination path, object to dump);
# the files are written in the order listed.
_artifacts = [
    ("/kaggle/working/sentence_embedding_matrix.pkl", embedding_matrix),
    ("/kaggle/working/utterance2int.pkl", utterance2int),
    ("/kaggle/working/int2utterance.pkl", int2utterance),
    ("/kaggle/working/emotion2int.pkl", emotion2int),
    ("/kaggle/working/int2emotion.pkl", int2emotion),
    ("/kaggle/working/speaker2int.pkl", speaker2int),
    ("/kaggle/working/int2speaker.pkl", int2speaker),
]

for _out_path, _obj in _artifacts:
    with open(_out_path, "wb") as _out_file:
        pickle.dump(_obj, _out_file)