This notebook covers all the dataset processing procedures for both emotion and empathy, including splitting the data into trainable splits,
as well as any up/downsampling of data and augmentation.

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write their
# files under the relative path drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture 
import numpy as np
import pandas as pd
import re
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

Importing the EmpatheticPersonas12 (with added love, insecurity, disgust, disappointment, shame, guilt, envy and jealousy)

In [None]:
# Main dataset that all later processing in this notebook operates on.
# NOTE(review): read as cp1252 - presumably the encoding the CSV was exported with; confirm.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='cp1252')

In [None]:
# The dictionary of all captured emotions in Empathetic Personas 2.0.
# Keys are the emotion adjectives that (capitalised) prefix the dataset's column
# names; values are the class labels written into the generated datasets.
# Note that 'jealous' maps to itself rather than to a noun form; later cells
# filter on the label 'jealous', so it must stay in sync with them.
emotions_dict = {
    'sad': 'sadness',
    'angry': 'anger',
    'anxious': 'fear',
    'happy': 'joy',
    'loving': 'love',
    'insecure': 'instability',
    'disgusted': 'disgust',
    'disappointed': 'disappointment',
    'ashamed': 'shame',
    'guilty': 'guilt',
    'envious': 'envy',
    'jealous': 'jealous'
}

Add rewritten prompts that were not collected for protocol 11 (by taking the protocol 6 question and replacing instances of 6 with 11)

In [None]:
# Emotions for which no protocol 11 prompt was collected; their protocol 6
# prompt is reused with every mention of "6" rewritten to "11".
new_emotions = ['insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Ordered (old, new) substitutions covering the spelled-out and numeric forms.
protocol_6_to_11 = (("six", "eleven"), ("Six", "Eleven"), ("6", "11"))

for emotion in new_emotions:
    e_literal = emotion.capitalize()
    protocol_6_column = f'{e_literal} - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?'
    protocol_11_column = f'{e_literal} - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?'
    protocol_11_list = []

    for row in empPersonas[protocol_6_column]:
        protocol_11 = row
        # Missing answers are NaN (float), not str: pass them through untouched.
        if isinstance(protocol_11, str):
            for old, new in protocol_6_to_11:
                protocol_11 = protocol_11.replace(old, new)

        protocol_11_list.append(protocol_11)

    empPersonas[protocol_11_column] = protocol_11_list

# Overwrite the source CSV so it round-trips with the read cell above:
# - index=False avoids adding an extra "Unnamed: 0" column on every re-run;
# - encoding='cp1252' matches the encoding the file is read with.
empPersonas.to_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', index=False, encoding='cp1252')

In [None]:
# Extracting a DataFrame with all emotions and their prompts as a text - emotion(label) combination
emotion_dfs = []
for emotion, label in emotions_dict.items():
    e_literal = emotion.capitalize()
    response_columns = [f'{e_literal} - Patient response {i}' for i in (1, 2, 3)]

    # Stack the three response columns into one long column and drop the
    # prompts that were left unanswered (NaN).
    emotion_df = pd.concat([empPersonas[column] for column in response_columns]).to_frame()
    emotion_df = emotion_df.dropna()

    emotion_df.insert(1, 'class', label)
    emotion_df.columns = ['text', 'class']
    emotion_dfs.append(emotion_df)

emotion_data = pd.concat(emotion_dfs)
# Shuffle with a fixed seed: every later split uses random_state=21, but it is
# only reproducible if the row order it starts from is reproducible too.
emotion_data = emotion_data.sample(frac=1, random_state=21).reset_index(drop=True)

# distribution
print(emotion_data.groupby('class').size())

emotion_data = emotion_data.rename(columns={'class': 'emotions'})

print(len(emotion_data)) # A total of 2174 crowd-sourced samples

class
anger             297
disappointment    126
disgust           122
envy              124
fear              284
guilt             121
instability       132
jealous           121
joy               300
love              124
sadness           300
shame             123
dtype: int64
2174


Splitting Emotion Data into train, val and testing subsets



In [None]:
# Normalise the text: strip punctuation (every character that is neither a word
# character nor whitespace) and lower-case it.
# The pattern is a raw string passed with regex=True: Series.str.replace treats
# the pattern as a literal string by default in pandas >= 2.0, which would
# silently leave all punctuation in place.
emotion_data['text'] = emotion_data['text'].str.replace(r'[^\w\s]', '', regex=True)
emotion_data['text'] = emotion_data['text'].str.lower()

train_path = "drive/MyDrive/Individual Project/Data/Emotion/plain_train.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/plain_test.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/plain_val.txt"

# 80 % train; the held-out 20 % is halved into validation and test below.
input_train, input_val, target_train, target_val = train_test_split(emotion_data.text.to_numpy(),
                                                                    emotion_data.emotions.to_numpy(),
                                                                    test_size=0.2, random_state=21)

input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.5, random_state=21)

train_dataset = pd.DataFrame(data={"text": input_train, "class": target_train})
val_dataset = pd.DataFrame(data={"text": input_val, "class": target_val})
test_dataset = pd.DataFrame(data={"text": input_test, "class": target_test})
final_dataset = {"train": train_dataset, "val": val_dataset , "test": test_dataset}

# Files are written as header-less "text;class" rows.
train_dataset.to_csv(train_path, sep=";",header=False, index=False)
val_dataset.to_csv(val_path, sep=";",header=False, index=False)
test_dataset.to_csv(test_path, sep=";",header=False, index=False)

train_dataset.count()

  """Entry point for launching an IPython kernel.


text     1739
class    1739
dtype: int64

Data Augmentation

As seen above, emotion_data is very imbalanced, due to the prevalence of more responses for some emotions and a double recruitment in the 4 prime emotions
(anger, fear, sadness, happiness)

Below are the efforts related to balancing the data with downsampling, upsampling and augmentation.

Downsampling

Not very effective as we end up with much less data, cutting quality responses
from the 4 prime emotions

In [None]:
# Size of the rarest class; every class is cut down to this many rows.
minority_class = emotion_data.groupby('emotions').size().min()

# Draw the same seeded sample from each class, then restore the original row
# order by sorting on the (preserved) index.
sampled_per_class = [
    emotion_data[emotion_data['emotions'] == label].sample(minority_class, random_state=21)
    for label in emotions_dict.values()
]
downsampled_emotion = pd.concat(sampled_per_class).sort_index().dropna()

print(downsampled_emotion.groupby('emotions').size())

emotions
anger             121
disappointment    121
disgust           121
envy              121
fear              121
guilt             121
instability       121
jealous           121
joy               121
love              121
sadness           121
shame             121
dtype: int64


Splitting the downsampled emotion into train, val and test subsets.

In [None]:
# Normalise the text: strip punctuation and lower-case it.
# regex=True (with a raw-string pattern) is required because Series.str.replace
# treats the pattern as a literal string by default in pandas >= 2.0, which
# would silently leave all punctuation in place.
downsampled_emotion['text'] = downsampled_emotion['text'].str.replace(r'[^\w\s]', '', regex=True)
downsampled_emotion['text'] = downsampled_emotion['text'].str.lower()

train_path = "drive/MyDrive/Individual Project/Data/Emotion/downsampled_train.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/downsampled_test.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/downsampled_val.txt"

# 80 % train; the held-out 20 % is halved into validation and test below.
input_train, input_val, target_train, target_val = train_test_split(downsampled_emotion.text.to_numpy(),
                                                                    downsampled_emotion.emotions.to_numpy(),
                                                                    test_size=0.2, random_state=21)

input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.5, random_state=21)

train_dataset = pd.DataFrame(data={"text": input_train, "class": target_train})
val_dataset = pd.DataFrame(data={"text": input_val, "class": target_val})
test_dataset = pd.DataFrame(data={"text": input_test, "class": target_test})
final_dataset = {"train": train_dataset, "val": val_dataset , "test": test_dataset}

# Files are written as header-less "text;class" rows.
train_dataset.to_csv(train_path, sep=";",header=False, index=False)
val_dataset.to_csv(val_path, sep=";",header=False, index=False)
test_dataset.to_csv(test_path, sep=";",header=False, index=False)

train_dataset.count()

  """Entry point for launching an IPython kernel.


text     1161
class    1161
dtype: int64

Upsampling the data using augmentation techniques

1. Backtranslation
2. Wordnet Synonyms

In [None]:
# Setting up for the backtranslation augmentation

# Install nlpaug together with sacremoses and transformers. The installs are
# unpinned, so the resolved versions depend on when the cell is run.
!pip install sacremoses
!pip install transformers
!pip install nlpaug
import nlpaug.augmenter.word as naw

# Back-translation augmenter: going by the model names, text is translated
# English -> German and then German -> English with the facebook/wmt19 models.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 5.0 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=b90960638cffb4db2ba0d432c7d93f1f13f22d158d7be4cc642975785f488852
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

In [None]:
# Start from the original samples and append one back-translated paraphrase for
# every sample of the under-represented emotions.
bt_upsample_emotion = emotion_data.copy(deep=True)

for emotion in ['love', 'instability', 'disgust', 'disappointment', 'shame', 'guilt', 'envy', 'jealous']:
    class_texts = bt_upsample_emotion.loc[bt_upsample_emotion['emotions'] == emotion, 'text']

    # Keep only the augmentations that returned something non-empty.
    bt_list = [augmented
               for augmented in map(back_translation_aug.augment, class_texts)
               if augmented]

    bt_df = pd.DataFrame(bt_list)
    bt_df.insert(1, 'emotions', emotion)
    bt_df.columns = ['text', 'emotions']
    bt_upsample_emotion = pd.concat([bt_upsample_emotion, bt_df], ignore_index=True)

print(bt_upsample_emotion.groupby('emotions').size())

emotions
anger             297
disappointment    252
disgust           244
envy              248
fear              284
guilt             242
instability       263
jealous           242
joy               300
love              242
sadness           300
shame             246
dtype: int64


Splitting backtranslation augmented data into train, val, test subsets.

In [None]:
# Normalise the text (including the freshly back-translated samples): strip
# punctuation and lower-case it.
# regex=True (with a raw-string pattern) is required because Series.str.replace
# treats the pattern as a literal string by default in pandas >= 2.0, which
# would silently leave all punctuation in place.
bt_upsample_emotion['text'] = bt_upsample_emotion['text'].str.replace(r'[^\w\s]', '', regex=True)
bt_upsample_emotion['text'] = bt_upsample_emotion['text'].str.lower()

train_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_train.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_test.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_val.txt"

# 80 % train; the held-out 20 % is halved into validation and test below.
input_train, input_val, target_train, target_val = train_test_split(bt_upsample_emotion.text.to_numpy(),
                                                                    bt_upsample_emotion.emotions.to_numpy(),
                                                                    test_size=0.2, random_state=21)

input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.5, random_state=21)

train_dataset = pd.DataFrame(data={"text": input_train, "class": target_train})
val_dataset = pd.DataFrame(data={"text": input_val, "class": target_val})
test_dataset = pd.DataFrame(data={"text": input_test, "class": target_test})
final_dataset = {"train": train_dataset, "val": val_dataset , "test": test_dataset}

# Files are written as header-less "text;class" rows.
train_dataset.to_csv(train_path, sep=";",header=False, index=False)
val_dataset.to_csv(val_path, sep=";",header=False, index=False)
test_dataset.to_csv(test_path, sep=";",header=False, index=False)

train_dataset.count()

  """Entry point for launching an IPython kernel.


text     2528
class    2528
dtype: int64

In [None]:
# Setting up the wordnet synonym augmentation

import nltk
# Download the NLTK resources fetched for the synonym augmenter: the WordNet
# corpus and the averaged-perceptron POS tagger.
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

# WordNet-backed synonym augmenter.
# NOTE(review): aug_max=1 presumably limits each text to a single substituted
# word - confirm against the nlpaug documentation.
syn_aug = naw.SynonymAug(aug_src='wordnet',aug_max=1)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Start from the original samples and append one synonym-substituted variant for
# every sample of the under-represented emotions.
syn_upsample_emotion = emotion_data.copy(deep=True)

for emotion in ['love', 'instability', 'disgust', 'disappointment', 'shame', 'guilt', 'envy', 'jealous']:
    class_texts = syn_upsample_emotion.loc[syn_upsample_emotion['emotions'] == emotion, 'text']

    # Keep only the augmentations that returned something non-empty.
    syn_list = [augmented
                for augmented in map(syn_aug.augment, class_texts)
                if augmented]

    syn_df = pd.DataFrame(syn_list)
    syn_df.insert(1, 'emotions', emotion)
    syn_df.columns = ['text', 'emotions']
    syn_upsample_emotion = pd.concat([syn_upsample_emotion, syn_df], ignore_index=True)

print(syn_upsample_emotion.groupby('emotions').size())

emotions
anger             297
disappointment    252
disgust           244
envy              248
fear              284
guilt             242
instability       264
jealous           242
joy               300
love              248
sadness           300
shame             246
dtype: int64


Splitting the wordnet synonym augmented data into train, val and test subsets

In [None]:
# Normalise the text (including the synonym-augmented samples): strip
# punctuation and lower-case it.
# regex=True (with a raw-string pattern) is required because Series.str.replace
# treats the pattern as a literal string by default in pandas >= 2.0, which
# would silently leave all punctuation in place.
syn_upsample_emotion['text'] = syn_upsample_emotion['text'].str.replace(r'[^\w\s]', '', regex=True)
syn_upsample_emotion['text'] = syn_upsample_emotion['text'].str.lower()

train_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_train.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_test.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_val.txt"

# 80 % train; the held-out 20 % is halved into validation and test below.
input_train, input_val, target_train, target_val = train_test_split(syn_upsample_emotion.text.to_numpy(),
                                                                    syn_upsample_emotion.emotions.to_numpy(),
                                                                    test_size=0.2, random_state=21)

input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.5, random_state=21)

train_dataset = pd.DataFrame(data={"text": input_train, "class": target_train})
val_dataset = pd.DataFrame(data={"text": input_val, "class": target_val})
test_dataset = pd.DataFrame(data={"text": input_test, "class": target_test})
final_dataset = {"train": train_dataset, "val": val_dataset , "test": test_dataset}

# Files are written as header-less "text;class" rows.
train_dataset.to_csv(train_path, sep=";",header=False, index=False)
val_dataset.to_csv(val_path, sep=";",header=False, index=False)
test_dataset.to_csv(test_path, sep=";",header=False, index=False)

train_dataset.count()

NameError: ignored

Balancing both upsampled datasets by downsampling every class to the size of the minority class (242 datapoints)

In [None]:
# Target size per class: the rarest class of the synonym-augmented set.
minority_class = syn_upsample_emotion.groupby('emotions').size().min() # this is the same as bt_upsample_emotion.groupby('emotions').size()

# For each augmented dataset, draw the same seeded sample from every class and
# restore the original row order by sorting on the (preserved) index.
bt_equal_emotion = pd.concat([
    bt_upsample_emotion[bt_upsample_emotion['emotions'] == label].sample(minority_class, random_state=21)
    for label in emotions_dict.values()
]).sort_index().dropna()

syn_equal_emotion = pd.concat([
    syn_upsample_emotion[syn_upsample_emotion['emotions'] == label].sample(minority_class, random_state=21)
    for label in emotions_dict.values()
]).sort_index().dropna()

print(bt_equal_emotion.groupby('emotions').size())
print(syn_equal_emotion.groupby('emotions').size())

NameError: ignored

Save the equally balanced augmented datasets as train, val and test subsets.

In [None]:
train_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_train_equal.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_test_equal.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_val_equal.txt"

# First cut: 80 % train / 20 % held out.
input_train, input_val, target_train, target_val = train_test_split(
    bt_equal_emotion['text'].to_numpy(),
    bt_equal_emotion['emotions'].to_numpy(),
    test_size=0.2,
    random_state=21,
)

# Second cut: the held-out part is halved into validation and test.
input_val, input_test, target_val, target_test = train_test_split(
    input_val, target_val, test_size=0.5, random_state=21
)

final_dataset = {
    split_name: pd.DataFrame(data={"text": texts, "class": labels})
    for split_name, texts, labels in (
        ("train", input_train, target_train),
        ("val", input_val, target_val),
        ("test", input_test, target_test),
    )
}
train_dataset = final_dataset["train"]
val_dataset = final_dataset["val"]
test_dataset = final_dataset["test"]

# Header-less "text;class" rows, one file per split.
for split_frame, split_path in ((train_dataset, train_path),
                                (val_dataset, val_path),
                                (test_dataset, test_path)):
    split_frame.to_csv(split_path, sep=";", header=False, index=False)

train_dataset.count()

text     2323
class    2323
dtype: int64

Remove the jealous label from the bt_equal_emotion dataset and then split it into train, test and val subsets.

In [None]:
# Reload the balanced back-translation splits from disk.
# The files were written with header=False, so they must be read with
# header=None; otherwise pandas consumes the first sample of every file as the
# header row and that sample is silently lost. `names` supplies the columns.
bt_equal_emotion_no_jealous_train = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_train_equal.txt", sep=';', header=None, names=['text', 'emotions'])
bt_equal_emotion_no_jealous_test = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_test_equal.txt", sep=';', header=None, names=['text', 'emotions'])
bt_equal_emotion_no_jealous_val = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_val_equal.txt", sep=';', header=None, names=['text', 'emotions'])

# Drop every sample labelled 'jealous' from each split and show what remains.
bt_equal_emotion_no_jealous_train = bt_equal_emotion_no_jealous_train[bt_equal_emotion_no_jealous_train["emotions"] != 'jealous']
print(bt_equal_emotion_no_jealous_train.groupby('emotions').size())
bt_equal_emotion_no_jealous_test = bt_equal_emotion_no_jealous_test[bt_equal_emotion_no_jealous_test["emotions"] != 'jealous']
print(bt_equal_emotion_no_jealous_test.groupby('emotions').size())
bt_equal_emotion_no_jealous_val = bt_equal_emotion_no_jealous_val[bt_equal_emotion_no_jealous_val["emotions"] != 'jealous']
print(bt_equal_emotion_no_jealous_val.groupby('emotions').size())

emotions
anger             192
disappointment    192
disgust           194
envy              191
fear              187
guilt             197
instability       194
joy               199
love              192
sadness           198
shame             197
dtype: int64
emotions
anger             29
disappointment    18
disgust           24
envy              23
fear              29
guilt             26
instability       22
joy               22
love              25
sadness           22
shame             23
dtype: int64
emotions
anger             21
disappointment    31
disgust           24
envy              27
fear              26
guilt             18
instability       26
joy               21
love              25
sadness           22
shame             22
dtype: int64


In [None]:
train_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_train_equal_no_jealous.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_test_equal_no_jealous.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_val_equal_no_jealous.txt"

# Write each subset to its own file as header-less "text;emotions" rows.
# The test frame goes to test_path and the val frame to val_path; these two
# were previously crossed, which stored the test data under the val file name
# and vice versa.
bt_equal_emotion_no_jealous_train.to_csv(train_path, sep=";",header=False, index=False)
bt_equal_emotion_no_jealous_test.to_csv(test_path, sep=";",header=False, index=False)
bt_equal_emotion_no_jealous_val.to_csv(val_path, sep=";",header=False, index=False)

# print() so that all three counts are shown, not only the last expression.
print(bt_equal_emotion_no_jealous_train.count())
print(bt_equal_emotion_no_jealous_test.count())
print(bt_equal_emotion_no_jealous_val.count())

NameError: ignored

Remove jealousy label from bt_emotion dataset

In [None]:
# Reload the (unbalanced) back-translation splits from disk.
# The files were written with header=False, so they must be read with
# header=None; otherwise pandas consumes the first sample of every file as the
# header row and that sample is silently lost. `names` supplies the columns.
bt_emotion_no_jealous_train = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_train.txt", sep=';', header=None, names=['text', 'emotions'])
bt_emotion_no_jealous_test = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_test.txt", sep=';', header=None, names=['text', 'emotions'])
bt_emotion_no_jealous_val = pd.read_csv("drive/MyDrive/Individual Project/Data/Emotion/bt_val.txt", sep=';', header=None, names=['text', 'emotions'])

# Drop every sample labelled 'jealous' from each split and show what remains.
bt_emotion_no_jealous_train = bt_emotion_no_jealous_train[bt_emotion_no_jealous_train["emotions"] != 'jealous']
print(bt_emotion_no_jealous_train.groupby('emotions').size())
bt_emotion_no_jealous_test = bt_emotion_no_jealous_test[bt_emotion_no_jealous_test["emotions"] != 'jealous']
print(bt_emotion_no_jealous_test.groupby('emotions').size())
bt_emotion_no_jealous_val = bt_emotion_no_jealous_val[bt_emotion_no_jealous_val["emotions"] != 'jealous']
print(bt_emotion_no_jealous_val.groupby('emotions').size())

emotions
anger             243
disappointment    213
disgust           199
envy              199
fear              224
guilt             189
instability       224
joy               236
love              179
sadness           229
shame             202
dtype: int64
emotions
anger             27
disappointment    20
disgust           25
envy              28
fear              28
guilt             22
instability       20
joy               32
love              31
sadness           38
shame             16
dtype: int64
emotions
anger             27
disappointment    19
disgust           20
envy              21
fear              31
guilt             31
instability       19
joy               32
love              31
sadness           32
shame             28
dtype: int64


In [None]:
train_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_train_no_jealous.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_test_no_jealous.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/bt_val_no_jealous.txt"

# Write each subset to its own file as header-less "text;emotions" rows.
# The test frame goes to test_path and the val frame to val_path; these two
# were previously crossed, which stored the test data under the val file name
# and vice versa.
bt_emotion_no_jealous_train.to_csv(train_path, sep=";",header=False, index=False)
bt_emotion_no_jealous_test.to_csv(test_path, sep=";",header=False, index=False)
bt_emotion_no_jealous_val.to_csv(val_path, sep=";",header=False, index=False)

print(bt_emotion_no_jealous_train.count())
print(bt_emotion_no_jealous_test.count())
print(bt_emotion_no_jealous_val.count())

text        2337
emotions    2337
dtype: int64
text        287
emotions    287
dtype: int64
text        291
emotions    291
dtype: int64


In [None]:
train_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_train_equal.txt"
test_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_test_equal.txt"
val_path = "drive/MyDrive/Individual Project/Data/Emotion/syn_val_equal.txt"

# First cut: 80 % train / 20 % held out.
input_train, input_val, target_train, target_val = train_test_split(
    syn_equal_emotion['text'].to_numpy(),
    syn_equal_emotion['emotions'].to_numpy(),
    test_size=0.2,
    random_state=21,
)

# Second cut: the held-out part is halved into validation and test.
input_val, input_test, target_val, target_test = train_test_split(
    input_val, target_val, test_size=0.5, random_state=21
)

final_dataset = {
    split_name: pd.DataFrame(data={"text": texts, "class": labels})
    for split_name, texts, labels in (
        ("train", input_train, target_train),
        ("val", input_val, target_val),
        ("test", input_test, target_test),
    )
}
train_dataset = final_dataset["train"]
val_dataset = final_dataset["val"]
test_dataset = final_dataset["test"]

# Header-less "text;class" rows, one file per split.
for split_frame, split_path in ((train_dataset, train_path),
                                (val_dataset, val_path),
                                (test_dataset, test_path)):
    split_frame.to_csv(split_path, sep=";", header=False, index=False)

train_dataset.count()

text     2323
class    2323
dtype: int64

EmpatheticDialogues

This section will transform the EmpatheticDialogues into a format that is readily accessed by our models for easier fine-tuning.

In [None]:
# BibTeX citation for the EmpatheticDialogues dataset processed in this section.
# It is kept as a bare string literal, so the cell only echoes it as its output.
'''
@inproceedings{rashkin2019towards,
  title = {Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset},
  author = {Hannah Rashkin and Eric Michael Smith and Margaret Li and Y-Lan Boureau},
  booktitle = {ACL},
  year = {2019},
}
'''

'\n@inproceedings{rashkin2019towards,\n  title = {Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset},\n  author = {Hannah Rashkin and Eric Michael Smith and Margaret Li and Y-Lan Boureau},\n  booktitle = {ACL},\n  year = {2019},\n}\n'

In [None]:
# Load the three EmpatheticDialogues splits; rows pandas cannot parse are
# skipped instead of aborting the read.
empDialoguesTrain, empDialoguesTest, empDialoguesValid = (
    pd.read_csv(csv_path, on_bad_lines='skip')
    for csv_path in (
        'drive/MyDrive/Individual Project/Data/empatheticdialogues/train.csv',
        'drive/MyDrive/Individual Project/Data/empatheticdialogues/test.csv',
        'drive/MyDrive/Individual Project/Data/empatheticdialogues/valid.csv',
    )
)

In [None]:
def _to_prompt_and_emotion(dialogues):
    """Reduce a raw EmpatheticDialogues split to one (emotions, text) row per conversation.

    Drops the columns that are not needed, keeps only the rows whose
    utterance_idx is 1, and renames 'context' to 'emotions' and 'prompt' to 'text'.
    """
    trimmed = dialogues.drop(columns=['conv_id', 'speaker_idx', 'utterance', 'selfeval', 'tags'])
    first_rows = trimmed[trimmed['utterance_idx'] == 1]
    return (
        first_rows
        .drop(columns='utterance_idx')
        .rename(columns={'context': 'emotions', 'prompt': 'text'})
    )


empDialoguesTrain = _to_prompt_and_emotion(empDialoguesTrain)
empDialoguesTest = _to_prompt_and_emotion(empDialoguesTest)
empDialoguesValid = _to_prompt_and_emotion(empDialoguesValid)

print(empDialoguesTest)

          emotions                                               text
0           guilty  I felt guilty when I was driving home one nigh...
3           caring  My mother stopped by my house one day and said...
5           lonely  I just broke up with my girlfriend_comma_ we w...
7          excited          I received concert tickets for Christmas.
9              sad  i've read an article about a little newborn ba...
...            ...                                                ...
5691     impressed  I was totally surprised when I saw my friend's...
5693  disappointed  Had to cancel our family vacation coming up ne...
5695      grateful             I'm glad that life is being good to me
5697     disgusted  I saw a huge cockroach outside my house today....
5699       anxious  I have a big test on Monday. I am so nervous_c...

[2541 rows x 2 columns]


In [None]:
# Put 'text' first and 'emotions' second so the frames follow the same column
# layout as our own dataset.
columns_titles = ['text', 'emotions']

empDialoguesTrain, empDialoguesTest, empDialoguesValid = (
    split_frame.reindex(columns=columns_titles)
    for split_frame in (empDialoguesTrain, empDialoguesTest, empDialoguesValid)
)

# Class distribution of each split, in train / test / valid order.
for split_frame in (empDialoguesTrain, empDialoguesTest, empDialoguesValid):
    print(split_frame.groupby('emotions').size())

emotions
afraid          584
angry           637
annoyed         612
anticipating    561
anxious         568
apprehensive    419
ashamed         446
caring          476
confident       567
content         515
devastated      514
disappointed    550
disgusted       568
embarrassed     514
excited         684
faithful        324
furious         552
grateful        585
guilty          573
hopeful         563
impressed       564
jealous         536
joyful          556
lonely          584
nostalgic       541
prepared        536
proud           637
sad             608
sentimental     473
surprised       922
terrified       575
trusting        453
dtype: int64
emotions
afraid           73
angry            84
annoyed          91
anticipating     73
anxious          78
apprehensive     71
ashamed          64
caring           80
confident        75
content          76
devastated       66
disappointed     81
disgusted        86
embarrassed      81
excited          91
faithful         50
furious  

In [None]:
# Persist the reduced EmpatheticDialogues splits (train, val and test) as
# header-less, semicolon-separated files.
empDialoguesTrainPath = 'drive/MyDrive/Individual Project/Data/Emotion/emp_dia_train.txt'
empDialoguesTestPath = 'drive/MyDrive/Individual Project/Data/Emotion/emp_dia_test.txt'
empDialoguesValidPath = 'drive/MyDrive/Individual Project/Data/Emotion/emp_dia_val.txt'

for split_frame, split_path in (
    (empDialoguesTrain, empDialoguesTrainPath),
    (empDialoguesTest, empDialoguesTestPath),
    (empDialoguesValid, empDialoguesValidPath),
):
    split_frame.to_csv(split_path, sep=";", header=False, index=False)


As the EmpatheticDialogues corpus contains a lot of emotions that are different nuances and intensities of the same emotions, we could combine them for coarser classification as suggested by the paper authors themselves. 

In [None]:
empDialoguesSmallTrainPath = 'drive/MyDrive/Individual Project/Data/Emotion/small_dia_train.txt'
empDialoguesSmallTestPath = 'drive/MyDrive/Individual Project/Data/Emotion/small_dia_test.txt'
empDialoguesSmallValidPath = 'drive/MyDrive/Individual Project/Data/Emotion/small_dia_val.txt'

empDialoguesTrain[empDialoguesTrain['emotions'] == 'furious'] = empDialoguesTrain[empDialoguesTrain['emotions'] == 'furious'].replace('furious','angry')
# Mapping from the original EmpatheticDialogues labels to the merged label set.
# Chained renames (e.g. furious -> angry -> anger) are resolved to their final
# target, so no key is also a value and the entry order does not matter.
# Labels that are not listed (e.g. 'anxious', 'annoyed') are left unchanged.
EMP_DIALOGUES_LABEL_MAP = {
    'furious': 'anger',
    'angry': 'anger',
    'excited': 'joy',
    'anticipating': 'joy',
    'joyful': 'joy',
    'devastated': 'sadness',
    'sad': 'sadness',
    'terrified': 'fear',
    'afraid': 'fear',
    'embarrassed': 'shame',
    'ashamed': 'shame',
    'faithful': 'trusting',
    'nostalgic': 'sentimental',
    'caring': 'love',
    'apprehensive': 'instability',
    'disgusted': 'disgust',
    'disappointed': 'disappointment',
    'guilty': 'guilt',
    'jealous': 'envy',
}


def merge_emotion_labels(dialogues, label_map=EMP_DIALOGUES_LABEL_MAP):
    """Rename the values of the 'emotions' column of ``dialogues`` in place.

    Only the 'emotions' column is touched; the previous whole-row masked
    assignment would also have rewritten any other cell of a matching row
    that happened to equal the old label exactly.

    Args:
        dialogues: DataFrame with an 'emotions' column.
        label_map: dict of old label -> new label.

    Returns:
        The same DataFrame object, for convenience.
    """
    dialogues['emotions'] = dialogues['emotions'].replace(label_map)
    return dialogues


# Apply the same mapping to every split and report the resulting class sizes
# (same order of printed output as before: train, test, valid).
for split_frame in (empDialoguesTrain, empDialoguesTest, empDialoguesValid):
    merge_emotion_labels(split_frame)
    print(split_frame.groupby('emotions').size())
    print(split_frame['emotions'].nunique())

# Write the relabelled (24-emotion) splits to disk as ';'-separated files
# without a header row or index column.
small_split_targets = (
    (empDialoguesTrain, empDialoguesSmallTrainPath),
    (empDialoguesTest, empDialoguesSmallTestPath),
    (empDialoguesValid, empDialoguesSmallValidPath),
)
for split_frame, split_path in small_split_targets:
    split_frame.to_csv(split_path, sep=";", header=False, index=False)

emotions
anger             1189
annoyed            612
anxious            568
confident          567
content            515
disappointment     550
disgust            568
envy               536
fear              1159
grateful           585
guilt              573
hopeful            563
impressed          564
instability        419
joy               1801
lonely             584
love               476
prepared           536
proud              637
sadness           1122
sentimental       1014
shame              960
surprised          922
trusting           777
dtype: int64
24
emotions
anger             151
annoyed            91
anxious            78
confident          75
content            76
disappointment     81
disgust            86
envy               81
fear              144
grateful           95
guilt              66
hopeful            79
impressed          81
instability        71
joy               247
lonely             78
love               80
prepared           77
proud             

In the next section we drop the emotions that are not of interest and have no analogue among the crowd-sourced ones, so that the resulting dataset can be used to fine-tune our model.

In [None]:
# the anxious emotion we will avoid (we will use the afraid one to map to anxious/fearful)
# Labels retained in every split; rows carrying any other label are dropped.
kept_emotions = ['fear', 'anger', 'love', 'disappointment', 'disgust', 'guilt',
                 'joy', 'sadness', 'envy', 'instability', 'shame']

empDialoguesTrain = empDialoguesTrain[empDialoguesTrain['emotions'].isin(kept_emotions)]
empDialoguesTest = empDialoguesTest[empDialoguesTest['emotions'].isin(kept_emotions)]
empDialoguesValid = empDialoguesValid[empDialoguesValid['emotions'].isin(kept_emotions)]

# Report class sizes and the number of distinct labels per split (train, test, valid).
for split_frame in (empDialoguesTrain, empDialoguesTest, empDialoguesValid):
    print(split_frame.groupby('emotions').size())
    print(split_frame['emotions'].nunique())


emotions
anger             1189
disappointment     550
disgust            568
envy               536
fear              1159
guilt              573
instability        419
joy               1801
love               476
sadness           1122
shame              960
dtype: int64
11
emotions
anger             151
disappointment     81
disgust            86
envy               81
fear              144
guilt              66
instability        71
joy               247
love               80
sadness           154
shame             145
dtype: int64
11
emotions
anger             161
disappointment     91
disgust            82
envy               89
fear              171
guilt              71
instability        74
joy               257
love               75
sadness           176
shame             165
dtype: int64
11


In [None]:
# Destination files for our filtered version of EmpatheticDialogues (used for finetuning)
myEmpDialoguesTrainPath = 'drive/MyDrive/Individual Project/Data/Emotion/my_dia_train.txt'
myEmpDialoguesTestPath = 'drive/MyDrive/Individual Project/Data/Emotion/my_dia_test.txt'
myEmpDialoguesValidPath = 'drive/MyDrive/Individual Project/Data/Emotion/my_dia_val.txt'

# Each split is written as a ';'-separated file without header row or index column.
my_split_targets = (
    (empDialoguesTrain, myEmpDialoguesTrainPath),
    (empDialoguesTest, myEmpDialoguesTestPath),
    (empDialoguesValid, myEmpDialoguesValidPath),
)
for split_frame, split_path in my_split_targets:
    split_frame.to_csv(split_path, sep=";", header=False, index=False)

In [None]:
# The classes in the dataset above are imbalanced (especially after several labels
# were merged together), so the over-represented classes are now downsampled to
# the size of the smallest class of each split.


def downsample_to_class_size(frame, class_size, label_col='emotions', seed=21):
    """Return a new frame with ``class_size`` sampled rows per ``label_col`` value.

    Each class is drawn with ``DataFrame.sample(class_size, random_state=seed)``
    and the selected rows are returned in their original order. Rows that
    contain a missing value are dropped from the result.

    Sampling is done on a positional index, so the result does not depend on
    the frame having unique index labels, and only the labels actually present
    in ``frame`` are sampled (no hard-coded label list).

    Args:
        frame: DataFrame holding a ``label_col`` column.
        class_size: number of rows to draw from every class.
        label_col: name of the class column.
        seed: ``random_state`` passed to every per-class ``sample`` call.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Raises:
        ValueError: from ``DataFrame.sample`` if a class has fewer than
            ``class_size`` rows.
    """
    positional = frame.reset_index(drop=True)
    picked = []
    for _, class_rows in positional.groupby(label_col):
        picked.extend(class_rows.sample(class_size, random_state=seed).index)
    # sorted() restores the original row order of the selected positions
    return frame.iloc[sorted(picked)].dropna()


minority_class_train = min(empDialoguesTrain.groupby('emotions').size())
minority_class_test = min(empDialoguesTest.groupby('emotions').size())
minority_class_val = min(empDialoguesValid.groupby('emotions').size())

downsampled_emp_dia_train = downsample_to_class_size(empDialoguesTrain, minority_class_train)
print(downsampled_emp_dia_train.groupby('emotions').size())

downsampled_emp_dia_test = downsample_to_class_size(empDialoguesTest, minority_class_test)
print(downsampled_emp_dia_test.groupby('emotions').size())

downsampled_emp_dia_val = downsample_to_class_size(empDialoguesValid, minority_class_val)
print(downsampled_emp_dia_val.groupby('emotions').size())

emotions
anger             419
disappointment    419
disgust           419
envy              419
fear              419
guilt             419
instability       419
joy               419
love              419
sadness           419
shame             419
dtype: int64
emotions
anger             66
disappointment    66
disgust           66
envy              66
fear              66
guilt             66
instability       66
joy               66
love              66
sadness           66
shame             66
dtype: int64
emotions
anger             71
disappointment    71
disgust           71
envy              71
fear              71
guilt             71
instability       71
joy               71
love              71
sadness           71
shame             71
dtype: int64


In [None]:
# Destination files for the class-balanced (downsampled) train, test and val splits
downsampled_emp_dia_train_path = 'drive/MyDrive/Individual Project/Data/Emotion/my_emp_dia_train_equal.txt'
downsampled_emp_dia_test_path = 'drive/MyDrive/Individual Project/Data/Emotion/my_emp_dia_test_equal.txt'
downsampled_emp_dia_val_path = 'drive/MyDrive/Individual Project/Data/Emotion/my_emp_dia_val_equal.txt'

# Each split is written as a ';'-separated file without header row or index column.
downsampled_split_targets = (
    (downsampled_emp_dia_train, downsampled_emp_dia_train_path),
    (downsampled_emp_dia_test, downsampled_emp_dia_test_path),
    (downsampled_emp_dia_val, downsampled_emp_dia_val_path),
)
for split_frame, split_path in downsampled_split_targets:
    split_frame.to_csv(split_path, sep=";", header=False, index=False)

Empathetic Data



Splitting labeled empathy into train, val and test datasets

In [None]:
labeled_data = pd.read_csv('drive/MyDrive/Individual Project/empathy_labelled_ds.csv')#put data in Drive root folder or change path
print(len(labeled_data))

# Only the aggregated empathy_score is kept; the per-annotator scores are dropped.
labeled_data = labeled_data.drop(['annotator1_score', 'annotator2_score', 'annotator3_score'], axis=1)

# Turn the numeric scores into class names. The result is assigned back to the
# column: calling replace(..., inplace=True) on labeled_data["empathy_score"] is a
# chained in-place operation, which does not update the DataFrame when pandas
# Copy-on-Write is active.
labeled_data["empathy_score"] = labeled_data["empathy_score"].replace({0: "no", 1: "weak", 2: "strong"})


train_path = "drive/MyDrive/Individual Project/Data/Empathy/train.txt"
test_path = "drive/MyDrive/Individual Project/Data/Empathy/test.txt"
val_path = "drive/MyDrive/Individual Project/Data/Empathy/val.txt"


from sklearn.model_selection import train_test_split
import numpy as np

# First split: 85% of the rows go to train, 15% are held out.
input_train, input_val, target_train, target_val = train_test_split(labeled_data.response.to_numpy(), 
                                                                    labeled_data.empathy_score.to_numpy(), 
                                                                    test_size=0.15,
                                                                    random_state=21)

# Second split: 67% of the held-out rows become the test set, the rest the validation set.
input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.67, random_state=21)


train_dataset = pd.DataFrame(data={"text": input_train, "class": target_train})
val_dataset = pd.DataFrame(data={"text": input_val, "class": target_val})
test_dataset = pd.DataFrame(data={"text": input_test, "class": target_test})
final_dataset = {"train": train_dataset, "val": val_dataset , "test": test_dataset }

# Each split is written as a ';'-separated "text;class" file without header or index.
train_dataset.to_csv(train_path, sep=";",header=False, index=False)
val_dataset.to_csv(val_path, sep=";",header=False, index=False)
test_dataset.to_csv(test_path, sep=";",header=False, index=False)

Creating a dataset containing all empathetic rewritings from EmpatheticPersonas 2.0

In [None]:
# Build one (text, emotion label) row per non-missing rewriting in empPersonas,
# for every emotion in emotions_dict.
empathy_dfs = []
for emotion, label in emotions_dict.items():
    e_literal = emotion.capitalize()
    if emotion in ('happy', 'loving'):
        empathy_features = [f'{e_literal} - That\'s Good! Let me recommend a protocol you can attempt.']
    else:
        empathy_features = [f'{e_literal} - Was this caused by a specific event/s?',
                            f'{e_literal} - Was this caused by a recent or distant event (or events)?',
                            f'{e_literal} - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Thank you. Now I will ask some questions to understand your situation.',
                            f'{e_literal} - Have you strongly felt or expressed any of the following emotions towards someone:',
                            f'{e_literal} - Do you believe that you should be the saviour of someone else?',
                            f'{e_literal} - Do you see yourself as the victim, blaming someone else for how negative you feel?',
                            f'{e_literal} - Do you feel that you are trying to control someone?',
                            f'{e_literal} - Are you always blaming and accusing yourself for when something goes wrong?',
                            f'{e_literal} - In previous conversations, have you considered other viewpoints presented?',
                            f'{e_literal} - Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?']

    # Stack all prompt columns of this emotion into a single text column in one
    # concat (instead of growing a DataFrame inside the loop) and drop the gaps.
    emotion_responses = pd.concat([empPersonas[feature] for feature in empathy_features]).dropna()
    empathy_df = emotion_responses.to_frame(name='text')
    empathy_df['class'] = label

    empathy_dfs.append(empathy_df)

empathy_data = pd.concat(empathy_dfs)
empathy_data = empathy_data.dropna(how='any')
# Seeded shuffle (same seed as the other random operations in this notebook) so
# that the saved CSV is reproducible across runs.
empathy_data = empathy_data.sample(frac=1, random_state=21).reset_index(drop=True)

empathy_data = empathy_data.rename(columns={'class': 'emotions'})

print(empathy_data)
print(empathy_data.groupby('emotions').size())

empathy_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/empathy_responses.csv')

                                                   text        emotions
0     Are these feelings caused by someone else that...  disappointment
1     Do you feel like you are losing friends or lov...     instability
2                Which negative feelings have you felt?         disgust
3     Thank you for sharing this with me. I will ask...  disappointment
4     In all honesty, do you think that you are alwa...           anger
...                                                 ...             ...
5161  Have you felt strong emotions that you could n...     instability
5162  Thank you for sharing your feelings with me. N...         disgust
5163  I know that protocol 6 may cause you to relive...           shame
5164  Thank you. Do you, by any chance, think that y...            fear
5165  This is certainly not an ideal situation to be...           anger

[5166 rows x 2 columns]
emotions
anger             520
disappointment    530
disgust           538
envy              457
fear          

Extracting the new (new 8 emotions) prompt rewritings in the format of Lisa's empathy_labelled_ds.csv and putting them into an unlabeled dataset

Additionally, we will cut the annotator_score_x columns from the original
empathy_labelled_ds.csv as those are unused for the feature training in empathy (only keeping empathy_score)

In [None]:
# Rewritings labelled anger, joy, fear or sadness are excluded; all remaining
# rows are kept, the emotion label is removed and an empty 'empathy_score'
# column is added as the second column.
excluded_emotions = ['anger', 'joy', 'fear', 'sadness']
is_excluded = empathy_data['emotions'].isin(excluded_emotions)

unlabeled_empathy = empathy_data[~is_excluded].drop(columns='emotions')
unlabeled_empathy.insert(1, 'empathy_score', None)

unlabeled_empathy.to_csv('drive/MyDrive/Individual Project/Data/Empathy/empathy_unlabeled.csv', index=False)

In [None]:
# Copy of the labelled empathy dataset without the three per-annotator score columns.
annotator_columns = ['annotator1_score', 'annotator2_score', 'annotator3_score']

labeled_empathy = pd.read_csv('drive/MyDrive/Individual Project/empathy_labelled_ds.csv')
labeled_empathy = labeled_empathy.drop(columns=annotator_columns)

labeled_empathy.to_csv('drive/MyDrive/Individual Project/Data/Empathy/empathy_labeled.csv', index=False)

Combining the Empathy data into a single dataset of ground-truth labels and pseudo-labels (the pseudo-labels come from RoBERTa and T5 acting as the teachers)
NOTE: In order to retain an unseen validation and test dataset, we will only
combine the pseudo-labels with the train dataset for empathy. That will become
our new train data, while the val and test data will remain the same ground-truth labeled data.

In [None]:
roberta_train_path = "drive/MyDrive/Individual Project/Data/Empathy/roberta_train.txt"
t5_train_path = "drive/MyDrive/Individual Project/Data/Empathy/t5_train.txt" 

roberta_labeled_empathy = pd.read_csv('drive/MyDrive/Individual Project/Data/Empathy/RoBERTa_labeled_empathy.csv')

t5_labeled_empathy = pd.read_csv('drive/MyDrive/Individual Project/Data/Empathy/T5_labeled_empathy.csv')

# Turn the numeric scores into class names. The mapped column is assigned back:
# replace(..., inplace=True) on a column selection is a chained in-place operation,
# which does not update the DataFrame when pandas Copy-on-Write is active.
score_names = {0: "no", 1: "weak", 2: "strong"}
roberta_labeled_empathy["empathy_score"] = roberta_labeled_empathy["empathy_score"].replace(score_names)
t5_labeled_empathy["empathy_score"] = t5_labeled_empathy["empathy_score"].replace(score_names)

# Written as ';'-separated files without header row or index column.
roberta_labeled_empathy.to_csv(roberta_train_path, sep=";",header=False, index=False)
t5_labeled_empathy.to_csv(t5_train_path, sep=";",header=False, index=False)

In [None]:
import fileinput

train_path = "drive/MyDrive/Individual Project/Data/Empathy/train.txt"
roberta_train_path = "drive/MyDrive/Individual Project/Data/Empathy/roberta_train.txt"
t5_train_path = "drive/MyDrive/Individual Project/Data/Empathy/t5_train.txt" 

# Each combined file is the ground-truth train file followed by one teacher's file.
roberta_train_list = [train_path, roberta_train_path]
t5_train_list = [train_path, t5_train_path]


def _concatenate_files(source_paths, target_path):
    """Write the lines of every file in ``source_paths``, in order, to ``target_path``.

    The fileinput stream is used as a context manager so that the source files
    are closed when the copy finishes (or fails), instead of being left open.
    """
    with open(target_path, 'w') as merged, fileinput.input(files=source_paths) as source_lines:
        merged.writelines(source_lines)


_concatenate_files(roberta_train_list, 'drive/MyDrive/Individual Project/Data/Empathy/roberta_train_empathy.txt')
_concatenate_files(t5_train_list, 'drive/MyDrive/Individual Project/Data/Empathy/t5_train_empathy.txt')

Adding the Crowd-Sourced Empathetic responses to the datasets of each Empathetic Persona (Kai, Arman, Gabrielle, Olivia, Robert)

Let's start with Kai and push all of the empathetic rewritings we have to them (as Kai is a combination of all ages and genders)

In [None]:
# NOTE(review): the same CSV is read with encoding='cp1252' at the top of the
# notebook — confirm which encoding is the correct one.
# NOTE(review): re-reading the file discards any columns that were only added to
# empPersonas in memory; presumably the protocol 11 columns exist in the CSV — verify.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='utf-8')
kai_data = pd.read_csv('drive/MyDrive/Individual Project/kai.csv')
kai_data = kai_data.dropna()
new_emotions = ['loving', 'insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Kai receives the rewritings of every respondent (no Sex/Age filter). The new
# columns are collected first and attached with a single concat instead of
# re-concatenating the growing frame once per column.
kai_columns = [kai_data]
for emotion in new_emotions:
    e_literal = emotion.capitalize()
    if emotion == 'loving':
        empathy_features = [f'{e_literal} - That\'s Good! Let me recommend a protocol you can attempt.']
    else:
        empathy_features = [f'{e_literal} - Was this caused by a specific event/s?',
                            f'{e_literal} - Was this caused by a recent or distant event (or events)?',
                            f'{e_literal} - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Thank you. Now I will ask some questions to understand your situation.',
                            f'{e_literal} - Have you strongly felt or expressed any of the following emotions towards someone:',
                            f'{e_literal} - Do you believe that you should be the saviour of someone else?',
                            f'{e_literal} - Do you see yourself as the victim, blaming someone else for how negative you feel?',
                            f'{e_literal} - Do you feel that you are trying to control someone?',
                            f'{e_literal} - Are you always blaming and accusing yourself for when something goes wrong?',
                            f'{e_literal} - In previous conversations, have you considered other viewpoints presented?',
                            f'{e_literal} - Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?']

    for feature in empathy_features:
        # Missing answers are removed and the column is re-indexed from 0 so that
        # each column is a gap-free list of responses.
        empathy_response = empPersonas[feature].dropna().reset_index(drop=True)
        kai_columns.append(empathy_response)

kai_data = pd.concat(kai_columns, axis=1)

kai_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/kai.csv')


Gathering responses to build Robert's database (male 40-69 year old responses)


In [None]:
# NOTE(review): the same CSV is read with encoding='cp1252' at the top of the
# notebook — confirm which encoding is the correct one.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='utf-8')
robert_data = pd.read_csv('drive/MyDrive/Individual Project/robert.csv')
robert_data = robert_data.dropna()
new_emotions = ['loving', 'insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Respondent filter for Robert: Sex == 'Male' and Age in one of the 40-69 bands.
# It does not depend on the emotion or the prompt, so it is computed once here
# rather than once per prompt column.
empPersonasMale = empPersonas[empPersonas['Sex'] == 'Male']
empPersonasRobert = empPersonasMale[empPersonasMale['Age'].isin(['40-49', '50-59', '60-69'])]

# The new columns are collected first and attached with a single concat instead
# of re-concatenating the growing frame once per column.
robert_columns = [robert_data]
for emotion in new_emotions:
    e_literal = emotion.capitalize()
    if emotion == 'loving':
        empathy_features = [f'{e_literal} - That\'s Good! Let me recommend a protocol you can attempt.']
    else:
        empathy_features = [f'{e_literal} - Was this caused by a specific event/s?',
                            f'{e_literal} - Was this caused by a recent or distant event (or events)?',
                            f'{e_literal} - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Thank you. Now I will ask some questions to understand your situation.',
                            f'{e_literal} - Have you strongly felt or expressed any of the following emotions towards someone:',
                            f'{e_literal} - Do you believe that you should be the saviour of someone else?',
                            f'{e_literal} - Do you see yourself as the victim, blaming someone else for how negative you feel?',
                            f'{e_literal} - Do you feel that you are trying to control someone?',
                            f'{e_literal} - Are you always blaming and accusing yourself for when something goes wrong?',
                            f'{e_literal} - In previous conversations, have you considered other viewpoints presented?',
                            f'{e_literal} - Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?']

    for feature in empathy_features:
        # Missing answers are removed and the column is re-indexed from 0 so that
        # each column is a gap-free list of responses.
        empathy_response = empPersonasRobert[feature].dropna().reset_index(drop=True)
        robert_columns.append(empathy_response)

robert_data = pd.concat(robert_columns, axis=1)

robert_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/robert.csv')

Gathering responses to build Gabrielle's database (female 40-69 year old responses)

In [None]:
# NOTE(review): the same CSV is read with encoding='cp1252' at the top of the
# notebook — confirm which encoding is the correct one.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='utf-8')
gabrielle_data = pd.read_csv('drive/MyDrive/Individual Project/gabrielle.csv')
gabrielle_data = gabrielle_data.dropna()
new_emotions = ['loving', 'insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Respondent filter for Gabrielle: Sex == 'Female' and Age in one of the 40-69
# bands. It does not depend on the emotion or the prompt, so it is computed once
# here rather than once per prompt column.
empPersonasFemale = empPersonas[empPersonas['Sex'] == 'Female']
empPersonasGabrielle = empPersonasFemale[empPersonasFemale['Age'].isin(['40-49', '50-59', '60-69'])]

# The new columns are collected first and attached with a single concat instead
# of re-concatenating the growing frame once per column.
gabrielle_columns = [gabrielle_data]
for emotion in new_emotions:
    e_literal = emotion.capitalize()
    if emotion == 'loving':
        empathy_features = [f'{e_literal} - That\'s Good! Let me recommend a protocol you can attempt.']
    else:
        empathy_features = [f'{e_literal} - Was this caused by a specific event/s?',
                            f'{e_literal} - Was this caused by a recent or distant event (or events)?',
                            f'{e_literal} - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
                            f'{e_literal} - Thank you. Now I will ask some questions to understand your situation.',
                            f'{e_literal} - Have you strongly felt or expressed any of the following emotions towards someone:',
                            f'{e_literal} - Do you believe that you should be the saviour of someone else?',
                            f'{e_literal} - Do you see yourself as the victim, blaming someone else for how negative you feel?',
                            f'{e_literal} - Do you feel that you are trying to control someone?',
                            f'{e_literal} - Are you always blaming and accusing yourself for when something goes wrong?',
                            f'{e_literal} - In previous conversations, have you considered other viewpoints presented?',
                            f'{e_literal} - Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?']

    for feature in empathy_features:
        # Missing answers are removed and the column is re-indexed from 0 so that
        # each column is a gap-free list of responses.
        empathy_response = empPersonasGabrielle[feature].dropna().reset_index(drop=True)
        gabrielle_columns.append(empathy_response)

gabrielle_data = pd.concat(gabrielle_columns, axis=1)

gabrielle_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/gabrielle.csv')

Gathering responses to build Arman's database (male 18-39 year old responses)

In [None]:
# Extend Arman's response table with the responses given for the newly added
# emotions by male respondents in the 18-24, 25-29 and 30-39 age bands.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='utf-8')
arman_data = pd.read_csv('drive/MyDrive/Individual Project/arman.csv')
arman_data = arman_data.dropna()
new_emotions = ['loving', 'insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Column names in the source CSV have the form '<Emotion> - <question>'.
# 'loving' has a single prompt; every other new emotion shares the same twelve.
loving_questions = ["That's Good! Let me recommend a protocol you can attempt."]
negative_questions = [
    'Was this caused by a specific event/s?',
    'Was this caused by a recent or distant event (or events)?',
    'Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
    'Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
    'Thank you. Now I will ask some questions to understand your situation.',
    'Have you strongly felt or expressed any of the following emotions towards someone:',
    'Do you believe that you should be the saviour of someone else?',
    'Do you see yourself as the victim, blaming someone else for how negative you feel?',
    'Do you feel that you are trying to control someone?',
    'Are you always blaming and accusing yourself for when something goes wrong?',
    'In previous conversations, have you considered other viewpoints presented?',
    'Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?',
]

# Arman is the male persona, so select on 'Male'. The previous filter compared
# against 'Female', which filled Arman's file with female respondents' answers.
# The filter is independent of the emotion/question, so it is computed once.
is_male = empPersonas['Sex'] == 'Male'
in_age_range = empPersonas['Age'].isin(['18-24', '25-29', '30-39'])
empPersonasArman = empPersonas[is_male & in_age_range]

for emotion in new_emotions:
    e_literal = emotion.capitalize()
    questions = loving_questions if emotion == 'loving' else negative_questions
    empathy_features = [f'{e_literal} - {question}' for question in questions]

    for feature in empathy_features:
        # Keep only the non-empty responses and renumber them from 0; concat with
        # axis=1 aligns on the index, so each new column is packed from the top.
        empathy_response = empPersonasArman[feature].dropna().reset_index(drop=True)
        arman_data = pd.concat([arman_data, empathy_response], axis=1)

arman_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/arman.csv')

Gathering responses to build Olivia's database (female 18-39 year old responses)

In [None]:
# Extend Olivia's response table with the responses given for the newly added
# emotions by female respondents in the 18-24, 25-29 and 30-39 age bands.
empPersonas = pd.read_csv('drive/MyDrive/Individual Project/empatheticPersonas12.csv', encoding='utf-8')
olivia_data = pd.read_csv('drive/MyDrive/Individual Project/olivia.csv')
olivia_data = olivia_data.dropna()
new_emotions = ['loving', 'insecure', 'disgusted', 'disappointed', 'ashamed', 'guilty', 'envious', 'jealous']

# Column names in the source CSV have the form '<Emotion> - <question>'.
# 'loving' has a single prompt; every other new emotion shares the same twelve.
loving_questions = ["That's Good! Let me recommend a protocol you can attempt."]
negative_questions = [
    'Was this caused by a specific event/s?',
    'Was this caused by a recent or distant event (or events)?',
    'Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?',
    'Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?',
    'Thank you. Now I will ask some questions to understand your situation.',
    'Have you strongly felt or expressed any of the following emotions towards someone:',
    'Do you believe that you should be the saviour of someone else?',
    'Do you see yourself as the victim, blaming someone else for how negative you feel?',
    'Do you feel that you are trying to control someone?',
    'Are you always blaming and accusing yourself for when something goes wrong?',
    'In previous conversations, have you considered other viewpoints presented?',
    'Are you undergoing a personal crisis (experiencing difficulties with loved ones e.g. falling out with friends)?',
]

# The demographic filter does not depend on the emotion or the question, so it is
# computed once here instead of once per feature column.
is_female = empPersonas['Sex'] == 'Female'
in_age_range = empPersonas['Age'].isin(['18-24', '25-29', '30-39'])
empPersonasOlivia = empPersonas[is_female & in_age_range]

for emotion in new_emotions:
    e_literal = emotion.capitalize()
    questions = loving_questions if emotion == 'loving' else negative_questions
    empathy_features = [f'{e_literal} - {question}' for question in questions]

    for feature in empathy_features:
        # Keep only the non-empty responses and renumber them from 0; concat with
        # axis=1 aligns on the index, so each new column is packed from the top.
        empathy_response = empPersonasOlivia[feature].dropna().reset_index(drop=True)
        olivia_data = pd.concat([olivia_data, empathy_response], axis=1)

olivia_data.to_csv('drive/MyDrive/Individual Project/Data/Empathy/olivia.csv')

In [None]:
# Install Hugging Face `transformers`, which provides the GPT2Tokenizer and
# GPT2LMHeadModel classes imported by the fluency-scoring cell.
# NOTE(review): the version is not pinned; the recorded run resolved to transformers 4.20.1.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
#computing the highest influence scored sentence

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import torch
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# Helpers for the repetition penalty: a Porter stemmer and a tokenizer that
# splits text into runs of word characters.
stemmer = nltk.stem.PorterStemmer()
regextokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# GPT-2 tokenizer paired with the language model loaded below.
gpttokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the pre-trained GPT-2 language model weights and switch the model to
# evaluation mode (eval() returns the module itself, so the call can be chained).
with torch.no_grad():
    gptmodel = GPT2LMHeadModel.from_pretrained('gpt2').eval()


def repetition_penalty(sentence):
  '''
  Computes a penalty for repeated content words in an utterance.

  The sentence is lower-cased, split into word tokens, stripped of English
  stop words and stemmed. Every occurrence of a stem after its first one
  adds 0.005 to the penalty.

  Args:
    sentence: the utterance to score (a string).

  Returns:
    The accumulated penalty; 0 when no stem occurs more than once.
  '''
  # Build the stop-word set once per call. Previously the stop-word list was
  # re-fetched and scanned linearly for every single token of the sentence.
  stop_words = set(stopwords.words('english'))
  stems = [stemmer.stem(word)
           for word in regextokenizer.tokenize(sentence.lower())
           if word not in stop_words]

  penalty = 0
  seen = set()
  for stem in stems:
    if stem in seen:
      # Accumulate step by step (not count * 0.005) so the floating-point
      # result is bit-for-bit what the list-based version produced.
      penalty += 0.005
    else:
      seen.add(stem)
  return penalty

def perplexity(sentence):
  '''
  Returns the perplexity of an utterance under the GPT2 language model.

  The sentence's token ids are fed to the model as both input and labels,
  and the exponential of the first model output (used as the loss) is returned.
  '''
  token_ids = gpttokenizer.encode(sentence)
  # Wrapping the id list in another list gives the tensor a leading dimension of 1.
  batch = torch.tensor([token_ids])
  with torch.no_grad():
      outputs = gptmodel(batch, labels=batch)
  lm_loss = outputs[0]
  return np.exp(lm_loss.detach().numpy())


def fluency_score(sentence):
  '''
  Returns the fluency score of an utterance: the reciprocal of its
  perplexity, reduced by the penalty charged for repeated tokens.
  '''
  inverse_ppl = 1 / perplexity(sentence)
  return inverse_ppl - repetition_penalty(sentence)

def replace_with_score(cell):
    """
    Maps a table cell to the string form of its fluency score.

    Only text can be scored. Any other cell (integer index values, NaN floats
    left by empty CSV cells, None) yields None instead of raising an
    AttributeError from a string method being called on a non-string.

    Args:
        cell: a single value taken from the responses table.

    Returns:
        str(fluency_score(cell)) when `cell` is a string, otherwise None.
    """
    if not isinstance(cell, str):
        return None

    # Equivalent to the former cell.replace(cell, <score>), which always
    # substituted the whole string.
    return str(fluency_score(cell))

data = pd.read_csv('drive/MyDrive/Individual Project/Data/Empathy/kai.csv')


def _numeric_fluency(cell):
    """Return the fluency score of a text cell as a float, or NaN for any non-text cell."""
    if isinstance(cell, str):
        return float(fluency_score(cell))
    return np.nan


# Score every cell of the table. Non-text cells (the saved integer index column
# and NaN padding from empty CSV cells) become NaN rather than raising an
# AttributeError. Scores are kept as floats so that max() compares them
# numerically; as strings they would be compared lexicographically.
data = data.apply(lambda column: column.map(_numeric_fluency))

# Highest fluency score per column (NaN entries are skipped by max()).
print(data.max())




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: ignored

In [None]:
print(data)

     Unnamed: 0       Sad - Was this caused by a specific event/s?  \
0             0  In order to try to make you feel better, I nee...   
1             1  In order to try to make you feel better, I nee...   
2             2  In order to try to make you feel better, I nee...   
3             3  In order to try to make you feel better, I nee...   
4             4  In order to try to make you feel better, I nee...   
..          ...                                                ...   
331         331  Thank you for letting me know, I am terribly s...   
332         332  Thank you for letting me know, I am terribly s...   
333         333  Thank you for letting me know, I am terribly s...   
334         334  Thank you for letting me know, I am terribly s...   
335         335  Thank you for letting me know, I am terribly s...   

    Sad - Was this caused by a recent or distant event (or events)?  \
0    It is important to understand how long you hav...                
1    It is import