In [1]:
import json

In [2]:
# Source: https://github.com/LCS2-IIITD/Emotion-Flip-Reasoning/blob/main/Dataloaders/nlp_utils.py
import string
import nltk
import re

# Lookup table from each decimal digit character to its English word,
# listed in ascending digit order ("0" first, "9" last).
numbers = dict(zip(
    "0123456789",
    ("zero", "one", "two", "three", "four",
     "five", "six", "seven", "eight", "nine"),
))

def remove_puntuations(txt):
    """Strip ASCII punctuation from ``txt``.

    The sentence-level marks ``.``, ``!``, ``?``, ``:`` and ``;`` are each
    replaced by a single space so the words around them stay separated;
    every other character in ``string.punctuation`` is deleted outright.
    Whitespace is not normalised here, so runs of spaces may remain.

    Args:
        txt: The string to clean.

    Returns:
        A new string with the punctuation handled as described above.
    """
    # Start with "delete every punctuation character" ...
    table = {ord(mark): None for mark in string.punctuation}
    # ... then override the sentence-level marks so they become a space.
    table.update({ord(mark): " " for mark in ".!?:;"})
    return txt.translate(table)

def number_to_words(txt):
    """Spell out every digit in ``txt`` using the ``numbers`` table.

    Each digit character is replaced by its English word followed by one
    space, so multi-digit numbers are spelled digit by digit
    (``"12"`` becomes ``"one two "``).

    Args:
        txt: The string to convert.

    Returns:
        A new string with all digits replaced by words.
    """
    # None of the replacement words contains a digit, so substituting all
    # digits in one pass gives the same result as replacing them one by one.
    digit_table = {ord(digit): word + " " for digit, word in numbers.items()}
    return txt.translate(digit_table)

def preprocess_text(text):
    """Normalise an utterance into lowercase, ASCII-only, space-separated words.

    Steps, in order: lowercase the text, turn underscores into spaces, spell
    out digits (``number_to_words``), strip punctuation
    (``remove_puntuations``), drop every character whose code point is 128
    or above, and collapse all whitespace runs into single spaces while
    trimming both ends.

    Args:
        text: The raw utterance.

    Returns:
        The normalised string.
    """
    lowered = text.lower().replace('_', ' ')
    without_punct = remove_puntuations(number_to_words(lowered))
    ascii_only = ''.join(ch for ch in without_punct if ord(ch) < 128)
    # split() with no argument discards leading/trailing whitespace and
    # treats any run of whitespace as a single separator.
    return ' '.join(ascii_only.split())

In [3]:
def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the result.

    The file is opened in binary mode so that ``json.load`` detects the
    encoding (UTF-8, UTF-16 or UTF-32) itself instead of depending on the
    platform's default text encoding, and the ``with`` block guarantees the
    handle is closed once parsing finishes.
    """
    with open(path, 'rb') as handle:
        return json.load(handle)


# Forward slashes are accepted by Windows as well as POSIX systems, whereas
# backslash separators only resolve on Windows.
train_data = _load_json('../ERC_conversational_level/train_conversation_level.json')
test_data = _load_json('../ERC_conversational_level/test_conversation_level.json')
val_data = _load_json('../ERC_conversational_level/val_conversation_level.json')

In [4]:
# Build question-answering records for the training split and write them to
# train_simple_transformers.json.
#
# For every non-neutral target utterance, one record is created per evidence
# utterance at or before the target. A record's context is the cleaned
# conversation up to and including the target. Records start as unanswerable
# ("is_impossible": True) and become answerable when an annotated
# emotion-cause pair supplies a cause span for that (target, evidence) pair.
train = []
train_dropped = 0  # annotated records discarded because no span could be located

for conv_no, record in enumerate(train_data, start=1):
    turns = record['conversation']
    # Clean every utterance once; the same text serves as target, evidence
    # and context, so there is no need to redo it for each pairing.
    utterances = [preprocess_text(turn['text']) for turn in turns]

    examples = {}          # record id -> record, in creation order
    utterance_starts = []  # character offset of each utterance inside the context
    context = ""
    for target_no, target in enumerate(utterances, start=1):
        utterance_starts.append(len(context))
        context += target + " "
        emotion = turns[target_no - 1]['emotion']
        if emotion == "neutral":
            continue
        for evidence_no, evidence in enumerate(utterances[:target_no], start=1):
            example_id = f"{conv_no}_{target_no}_{evidence_no}"
            examples[example_id] = {
                "context": context,
                "qas":[{
                    "id": example_id,
                    "is_impossible": True,
                    "question": f"If the target utterance is <{target}> and evidence utterance is <{evidence}> then what is the causal span from context that is relevant to the target utterance's emotion <{emotion}> ?",
                    "answers": []
                }]
            }

    # Each pair is read as ["<target no>_<emotion>", "<cause no>_<cause span>"].
    # NOTE(review): the id scheme assumes these numbers are 1-based positions
    # in the conversation -- confirm against the dataset description.
    unresolved_ids = set()
    for pair in record['emotion-cause_pairs']:
        target_no = int(pair[0].partition("_")[0])
        cause_no_text, _sep, cause_text = pair[1].partition("_")
        cause_no = int(cause_no_text)
        cause = preprocess_text(cause_text)

        example_id = f"{conv_no}_{target_no}_{cause_no}"
        example = examples.get(example_id)
        if example is None:
            # No record exists for this pair (neutral target, or a cause
            # utterance that comes after the target).
            continue

        answer_start = -1
        if cause:
            # Look inside the cause utterance first: the same words may also
            # occur earlier in the conversation, and a whole-context search
            # would then point at the wrong utterance.
            span_begin = utterance_starts[cause_no - 1]
            span_end = span_begin + len(utterances[cause_no - 1])
            answer_start = example['context'].find(cause, span_begin, span_end)
            if answer_start < 0:
                answer_start = example['context'].find(cause)
        if answer_start < 0:
            # An empty or unlocatable span cannot be expressed as an answer.
            unresolved_ids.add(example_id)
            continue

        qa = example['qas'][0]
        qa['is_impossible'] = False
        qa['answers'].append({
            "text": cause,
            "answer_start": answer_start
        })

    for example_id, example in examples.items():
        if example_id in unresolved_ids and not example['qas'][0]['answers']:
            # The pair is annotated as causal but none of its spans could be
            # located. Keeping it would either label a true cause as
            # unanswerable or emit an invalid offset, so leave it out.
            train_dropped += 1
            continue
        train.append(example)

if train_dropped:
    print(f"Dropped {train_dropped} training records whose annotated cause span could not be located in the context")

with open('train_simple_transformers.json', 'w') as out_file:
    json.dump(train, out_file, indent=4)

In [5]:
# Build question-answering records for the validation split and write them
# to val_simple_transformers.json.
#
# For every non-neutral target utterance, one record is created per evidence
# utterance at or before the target. A record's context is the cleaned
# conversation up to and including the target. Records start as unanswerable
# ("is_impossible": True) and become answerable when an annotated
# emotion-cause pair supplies a cause span for that (target, evidence) pair.
val = []
val_dropped = 0  # annotated records discarded because no span could be located

for conv_no, record in enumerate(val_data, start=1):
    turns = record['conversation']
    # Clean every utterance once; the same text serves as target, evidence
    # and context, so there is no need to redo it for each pairing.
    utterances = [preprocess_text(turn['text']) for turn in turns]

    examples = {}          # record id -> record, in creation order
    utterance_starts = []  # character offset of each utterance inside the context
    context = ""
    for target_no, target in enumerate(utterances, start=1):
        utterance_starts.append(len(context))
        context += target + " "
        emotion = turns[target_no - 1]['emotion']
        if emotion == "neutral":
            continue
        for evidence_no, evidence in enumerate(utterances[:target_no], start=1):
            example_id = f"{conv_no}_{target_no}_{evidence_no}"
            examples[example_id] = {
                "context": context,
                "qas":[{
                    "id": example_id,
                    "is_impossible": True,
                    "question": f"If the target utterance is <{target}> and evidence utterance is <{evidence}> then what is the causal span from context that is relevant to the target utterance's emotion <{emotion}> ?",
                    "answers": []
                }]
            }

    # Each pair is read as ["<target no>_<emotion>", "<cause no>_<cause span>"].
    # NOTE(review): the id scheme assumes these numbers are 1-based positions
    # in the conversation -- confirm against the dataset description.
    unresolved_ids = set()
    for pair in record['emotion-cause_pairs']:
        target_no = int(pair[0].partition("_")[0])
        cause_no_text, _sep, cause_text = pair[1].partition("_")
        cause_no = int(cause_no_text)
        cause = preprocess_text(cause_text)

        example_id = f"{conv_no}_{target_no}_{cause_no}"
        example = examples.get(example_id)
        if example is None:
            # No record exists for this pair (neutral target, or a cause
            # utterance that comes after the target).
            continue

        answer_start = -1
        if cause:
            # Look inside the cause utterance first: the same words may also
            # occur earlier in the conversation, and a whole-context search
            # would then point at the wrong utterance.
            span_begin = utterance_starts[cause_no - 1]
            span_end = span_begin + len(utterances[cause_no - 1])
            answer_start = example['context'].find(cause, span_begin, span_end)
            if answer_start < 0:
                answer_start = example['context'].find(cause)
        if answer_start < 0:
            # An empty or unlocatable span cannot be expressed as an answer.
            unresolved_ids.add(example_id)
            continue

        qa = example['qas'][0]
        qa['is_impossible'] = False
        qa['answers'].append({
            "text": cause,
            "answer_start": answer_start
        })

    for example_id, example in examples.items():
        if example_id in unresolved_ids and not example['qas'][0]['answers']:
            # The pair is annotated as causal but none of its spans could be
            # located. Keeping it would either label a true cause as
            # unanswerable or emit an invalid offset, so leave it out.
            val_dropped += 1
            continue
        val.append(example)

if val_dropped:
    print(f"Dropped {val_dropped} validation records whose annotated cause span could not be located in the context")

with open('val_simple_transformers.json', 'w') as out_file:
    json.dump(val, out_file, indent=4)