# Load EmpatheticIntents dataset into a dataframe

## Clone EmpatheticIntents Repo as it contains the dataset files

In [None]:
!git clone https://github.com/anuradha1992/EmpatheticIntents.git

fatal: destination path 'EmpatheticIntents' already exists and is not an empty directory.


In [None]:
import pandas as pd
import glob

In [None]:
path = r'/content/EmpatheticIntents/datasets/empatheticdialogues_annotated'
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order (and the integer index) of df_annotated reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))

# Fail with an actionable message instead of pd.concat's
# "No objects to concatenate" when the dataset files are not where expected.
if not all_files:
    raise FileNotFoundError(f"No annotated CSV files found under {path}")

# load the contents of the dialogues in each of the emotion files
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all per-file frames into one frame with a fresh 0..n-1 index.
df_annotated = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the concatenated annotated frame
df_annotated.shape

(132103, 5)

In [None]:
# head must be called: the bare attribute only echoes the bound-method repr
# instead of previewing the first five rows.
df_annotated.head()

<bound method NDFrame.head of                   Dialog_ID       Type     Actor  \
0          hit:335_conv:670  situation      none   
1          hit:335_conv:670  utterance   speaker   
2          hit:335_conv:670  utterance  listener   
3          hit:335_conv:670  utterance   speaker   
4          hit:335_conv:670  utterance  listener   
...                     ...        ...       ...   
132098  hit:5478_conv:10957  situation      none   
132099  hit:5478_conv:10957  utterance   speaker   
132100  hit:5478_conv:10957  utterance  listener   
132101  hit:5478_conv:10957  utterance   speaker   
132102  hit:5478_conv:10957  utterance  listener   

                                                     Text          Label  
0          I am embarrassed to be a Cleveland Browns fan.        ashamed  
1       I am embarrassed to be a fan of the Cleveland ...        ashamed  
2       I would be as well if I were you. Are you from...    questioning  
3                             Yes. I grew up 

In [None]:
# Labels of the listener turns only, selected with a boolean mask.
df_intents = df_annotated.loc[df_annotated['Actor'] == 'listener', 'Label']


In [None]:
# Distinct labels that occur on listener turns
df_intents.unique()

array(['questioning', 'suggesting', 'sympathizing', 'neutral', 'agreeing',
       'consoling', 'acknowledging', 'faithful', 'trusting', 'disgusted',
       'anxious', 'content', 'sad', 'hopeful', 'surprised', 'encouraging',
       'prepared', 'impressed', 'confident', 'ashamed', 'sentimental',
       'jealous', 'joyful', 'annoyed', 'grateful', 'nostalgic', 'proud',
       'disappointed', 'guilty', 'wishing', 'embarrassed', 'caring',
       'angry', 'apprehensive', 'devastated', 'anticipating', 'lonely',
       'furious', 'afraid', 'terrified', 'excited'], dtype=object)

## Generate set of all dialog ids

In [None]:
# Every distinct Dialog_ID in the annotated frame, collected into a set.
annotated_conv_ids_set = {dialog_id for dialog_id in df_annotated['Dialog_ID'].unique()}

In [None]:
# Number of distinct dialog ids in the annotated data
len(annotated_conv_ids_set)

24856

# Load the EmpatheticDialogues Dataset

In [None]:
!wget https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
!tar -xvzf empatheticdialogues.tar.gz

--2022-04-23 05:17:47--  https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28022709 (27M) [application/gzip]
Saving to: ‘empatheticdialogues.tar.gz.1’


2022-04-23 05:17:49 (21.9 MB/s) - ‘empatheticdialogues.tar.gz.1’ saved [28022709/28022709]

empatheticdialogues/
empatheticdialogues/test.csv
empatheticdialogues/train.csv
empatheticdialogues/valid.csv


In [None]:
def read_csv_file_custom(file_path):
    """Read a comma-separated file by splitting every line on ','.

    pd.read_csv was throwing errors on these files, so each line is split
    manually and only the first ``len(header)`` fields of a row are kept.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame whose column names come from the first line of the file.
        Field values are kept as the raw strings produced by the split; no
        type inference is applied.

    Raises:
        IndexError: If the file has no lines at all (no header to read).
    """
    with open(file_path) as file_buf:
        # Drop the line terminator up front; otherwise the last header name
        # and the last field of every row keep a trailing "\n".
        lines = [line.rstrip("\r\n") for line in file_buf]

    header = lines[0].split(',')
    # take first len(header) columns
    # Note from Prasoon: 9th and 10th columns seem to have utter garbage on manual inspection of raw csv files
    # Blank lines carry no record, so they are skipped rather than turned into rows.
    contents = [line.split(',')[:len(header)] for line in lines[1:] if line]

    return pd.DataFrame(contents, columns=header)

# Parse the three EmpatheticDialogues splits.
train_df = read_csv_file_custom("empatheticdialogues/train.csv")
val_df = read_csv_file_custom("empatheticdialogues/valid.csv")
test_df = read_csv_file_custom("empatheticdialogues/test.csv")

# Report the number of distinct conv_id values per split.
print(f"Num conversations in ED train set {len(set(train_df['conv_id']))}")
print(f"Num conversations in ED validation set {len(set(val_df['conv_id']))}")
print(f"Num conversations in ED test set {len(set(test_df['conv_id']))}")

Num conversations in ED train set 19533
Num conversations in ED validation set 2770
Num conversations in ED test set 2547


## Get set of all dialog ids

In [None]:
# Distinct conv_id values of each split, as sets for fast membership and set algebra.
train_dialog_ids_set = set(train_df['conv_id'])
val_dialog_ids_set = set(val_df['conv_id'])
test_dialog_ids_set = set(test_df['conv_id'])

In [None]:
# All conversation ids that appear in any of the three splits.
empathetic_dialogues_set = train_dialog_ids_set.union(val_dialog_ids_set).union(test_dialog_ids_set)

In [None]:
# Number of distinct conversation ids across train, validation and test
len(empathetic_dialogues_set)

24850

## Get the missing ids

In [None]:
# Ids that occur in either dataset.
union_ids = annotated_conv_ids_set.union(empathetic_dialogues_set)

In [None]:
# Size of the union; larger than either set alone when ids are missing on one side
len(union_ids)

24859

In [None]:
# Ids present in the annotated data but absent from the EmpatheticDialogues splits.
print(annotated_conv_ids_set.difference(empathetic_dialogues_set))

{'hit:1675_conv:3350 (1)', 'hit:9040_conv:18080 (1)', 'hit:9022_conv:18044 (1)', 'hit:8745_conv:17490 (1)', 'hit:3518_conv:7037 (1)', 'hit:3789_conv:7578 (1)', 'hit:3654_conv:7308 (1)', 'hit:11672_conv:23344 (1)', 'hit:2658_conv:5316 (1)'}


In [None]:
# Ids present in the EmpatheticDialogues splits but absent from the annotated data.
print(empathetic_dialogues_set.difference(annotated_conv_ids_set))

{'hit:12423_conv:24847', 'hit:12424_conv:24849', 'hit:12392_conv:24785'}


In [None]:
# Rows of the dialog stored under the id without the " (1)" suffix.
print(df_annotated[df_annotated['Dialog_ID'] == "hit:2658_conv:5316"])

                Dialog_ID       Type     Actor  \
75185  hit:2658_conv:5316  situation      none   
75186  hit:2658_conv:5316  utterance   speaker   
75187  hit:2658_conv:5316  utterance  listener   
75188  hit:2658_conv:5316  utterance   speaker   
75189  hit:2658_conv:5316  utterance  listener   
75190  hit:2658_conv:5316  utterance   speaker   

                                                    Text        Label  
75185  I can't face my wife, I had a child out of my ...  embarrassed  
75186  I can't face my wife, I had a child out of my ...  embarrassed  
75187               Oh my goodness! Why did you do that?  questioning  
75188                 I was so stupid, now I know better  embarrassed  
75189     You should tell your wife to clear your guilt.   suggesting  
75190  I am too embarrassed maybe I will talk to my p...  embarrassed  


In [None]:
# Rows stored under the same id carrying the " (1)" suffix, for comparison.
print(df_annotated[df_annotated['Dialog_ID'] == "hit:2658_conv:5316 (1)"])

                    Dialog_ID       Type     Actor  \
73638  hit:2658_conv:5316 (1)  situation      none   
73639  hit:2658_conv:5316 (1)  utterance   speaker   
73640  hit:2658_conv:5316 (1)  utterance  listener   
73641  hit:2658_conv:5316 (1)  utterance   speaker   
73642  hit:2658_conv:5316 (1)  utterance  listener   
73643  hit:2658_conv:5316 (1)  utterance   speaker   

                                                    Text        Label  
73638  I can't face my wife, I had a child out of my ...  embarrassed  
73639  I can't face my wife, I had a child out of my ...  embarrassed  
73640               Oh my goodness! Why did you do that?  questioning  
73641                 I was so stupid, now I know better  embarrassed  
73642     You should tell your wife to clear your guilt.   suggesting  
73643  I am too embarrassed maybe I will talk to my p...  embarrassed  


In [None]:
# Ids that occur in both datasets.
intersection_ids = annotated_conv_ids_set.intersection(empathetic_dialogues_set)

In [None]:
# Number of dialog ids shared by both datasets
len(intersection_ids)

24847

# Preprocessing the empathetic intents df

## Use only the common Dialog Ids

In [None]:
# Keep only the rows whose Dialog_ID is present in both datasets.
has_common_id = df_annotated['Dialog_ID'].isin(intersection_ids)
filtered_intent_df = df_annotated[has_common_id]

In [None]:
# (rows, columns) after restricting to the shared dialog ids
filtered_intent_df.shape

(132055, 5)

### Total number of conversations

In [None]:
# unique() already yields distinct ids, so its length is the conversation count.
print(len(filtered_intent_df['Dialog_ID'].unique()))

24847


## Remove the situation utterance rows

In [None]:
# .copy() makes the filtered result an independent frame, so adding a column
# below no longer triggers pandas' SettingWithCopyWarning.
filtered_intent_df = filtered_intent_df[filtered_intent_df['Type'] != 'situation'].copy()
# 1-based position of each remaining row within its dialog, in row order.
filtered_intent_df['Utterance_ID'] = filtered_intent_df.groupby(['Dialog_ID']).cumcount() + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Dialog_ID,Type,Actor,Text,Label,Utterance_ID
1,hit:335_conv:670,utterance,speaker,I am embarrassed to be a fan of the Cleveland ...,ashamed,1
2,hit:335_conv:670,utterance,listener,I would be as well if I were you. Are you from...,questioning,2
3,hit:335_conv:670,utterance,speaker,Yes. I grew up in the city.,lonely,3
4,hit:335_conv:670,utterance,listener,wasn't Johnny Manziel susposed to save that team?,questioning,4
6,hit:11202_conv:22405,utterance,speaker,"When I met my friend for coffee this morning, ...",embarrassed,1
...,...,...,...,...,...,...
132097,hit:8635_conv:17271,utterance,listener,"Oh, that had to be super gross!",acknowledging,4
132099,hit:5478_conv:10957,utterance,speaker,"I don't think I am a judgmental person, but we...",disgusted,1
132100,hit:5478_conv:10957,utterance,listener,oh that little unfair,angry,2
132101,hit:5478_conv:10957,utterance,speaker,"Trust me, this place was out of control. They ...",disgusted,3


In [None]:
# (rows, columns) once the situation rows are gone and Utterance_ID is added
filtered_intent_df.shape

(107208, 6)

In [None]:
# Preview the first rows, including the new Utterance_ID column
filtered_intent_df.head()

Unnamed: 0,Dialog_ID,Type,Actor,Text,Label,Utterance_ID
1,hit:335_conv:670,utterance,speaker,I am embarrassed to be a fan of the Cleveland ...,ashamed,1
2,hit:335_conv:670,utterance,listener,I would be as well if I were you. Are you from...,questioning,2
3,hit:335_conv:670,utterance,speaker,Yes. I grew up in the city.,lonely,3
4,hit:335_conv:670,utterance,listener,wasn't Johnny Manziel susposed to save that team?,questioning,4
6,hit:11202_conv:22405,utterance,speaker,"When I met my friend for coffee this morning, ...",embarrassed,1


### Confirm that we still have the same number of conversations and that only the situation rows were deleted

In [None]:
# Distinct dialog ids remaining after the situation rows were removed.
print(len(filtered_intent_df['Dialog_ID'].unique()))

24847


## Split the filtered dataset into train, test and val datasets

### Use the same splits as those in the EmpatheticDialogues dataset

In [None]:
# Route every row to the split that its dialog id belongs to in EmpatheticDialogues.
train_intent_df = filtered_intent_df[filtered_intent_df['Dialog_ID'].isin(train_dialog_ids_set)]
val_intent_df = filtered_intent_df[filtered_intent_df['Dialog_ID'].isin(val_dialog_ids_set)]
test_intent_df = filtered_intent_df[filtered_intent_df['Dialog_ID'].isin(test_dialog_ids_set)]

In [None]:
# Distinct dialog ids in the train split.
print(len(train_intent_df['Dialog_ID'].unique()))

19532


In [None]:
# Distinct dialog ids in the validation split.
print(len(val_intent_df['Dialog_ID'].unique()))

2769


In [None]:
# Distinct dialog ids in the test split.
print(len(test_intent_df['Dialog_ID'].unique()))

2546


In [None]:
# Preview the first rows of the test split
test_intent_df.head()

Unnamed: 0,Dialog_ID,Type,Actor,Text,Label,Utterance_ID
3067,hit:11191_conv:22383,utterance,speaker,I couldn't sleep last night. I ended up stayin...,ashamed,1
3068,hit:11191_conv:22383,utterance,listener,Why couldn't you sleep?,questioning,2
3069,hit:11191_conv:22383,utterance,speaker,Combination of stress and new meds I think. Ma...,suggesting,3
3070,hit:11191_conv:22383,utterance,listener,I hope so! Why are you stressed?,encouraging,4
3072,hit:10220_conv:20441,utterance,speaker,My cat puked all over my roommates shoes. I fe...,guilty,1


In [None]:
# Preview the first rows of the train split
train_intent_df.head()

Unnamed: 0,Dialog_ID,Type,Actor,Text,Label,Utterance_ID
1,hit:335_conv:670,utterance,speaker,I am embarrassed to be a fan of the Cleveland ...,ashamed,1
2,hit:335_conv:670,utterance,listener,I would be as well if I were you. Are you from...,questioning,2
3,hit:335_conv:670,utterance,speaker,Yes. I grew up in the city.,lonely,3
4,hit:335_conv:670,utterance,listener,wasn't Johnny Manziel susposed to save that team?,questioning,4
6,hit:11202_conv:22405,utterance,speaker,"When I met my friend for coffee this morning, ...",embarrassed,1


In [None]:
def transform_df(df, include_speaker_idx=False):
    """Rename the annotated-intent columns and keep only the renamed ones.

    Args:
        df: Frame with the columns ``Dialog_ID``, ``Utterance_ID``, ``Text``
            and ``Label`` (plus ``Actor`` when ``include_speaker_idx`` is
            True). The frame passed in is left unmodified.
        include_speaker_idx: When True, the result also carries a
            ``speaker_idx`` column: 0 where ``Actor == 'speaker'``, 1
            otherwise. Defaults to False, which returns exactly the four
            columns listed below.

    Returns:
        A new DataFrame with the columns ``conv_id``, ``utterance_idx``,
        ``utterance`` and ``label`` (followed by ``speaker_idx`` if requested).
    """
    # columns= is required: rename(mapping) on its own targets the row index,
    # leaves the column names untouched and makes the selection below raise KeyError.
    renamed = df.rename(columns={
        "Dialog_ID": "conv_id",
        "Utterance_ID": "utterance_idx",
        "Text": "utterance",
        "Label": "label",
    })
    selected = ["conv_id", "utterance_idx", "utterance", "label"]
    if include_speaker_idx:
        # Written to the renamed frame, never to the caller's df.
        renamed["speaker_idx"] = (renamed["Actor"] != 'speaker').astype(int)
        selected.append("speaker_idx")
    return renamed[selected]

# Run the same column transformation over each split, in train/val/test order.
train_intent_df, val_intent_df, test_intent_df = map(
    transform_df,
    (train_intent_df, val_intent_df, test_intent_df),
)

10969

# Perspective API evaluation

## The initialization

In [None]:
from googleapiclient import discovery
import getpass
import json
import os

# Never store the key in the notebook: read it from the environment and fall
# back to an interactive prompt when the variable is not set.
# NOTE(review): the key that used to be hard-coded on this line has been exposed
# to everyone with access to the notebook and should be revoked.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY") or getpass.getpass("Perspective API key: ")

# Build the Comment Analyzer client from its discovery document.
client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  # static_discovery=False,
)

## Invoking the Perspective API

In [None]:
# Placeholder input; substitute the text to be scored.
text = 'The text whose toxicity needs to be evaluated'
# Request body: the comment text plus the attributes to score (only TOXICITY here).
analyze_request = {
  'comment': { 'text': text },
  'requestedAttributes': {'TOXICITY': {}}
}

# Send the request through the client built above and keep the response for inspection.
response = client.comments().analyze(body=analyze_request).execute()

In [None]:
# Summary TOXICITY score value taken from the response
response['attributeScores']['TOXICITY']['summaryScore']['value']


0.17723809