In [None]:
import json
import re
from typing import List

import numpy as np
import pandas as pd

from datasets import load_dataset, concatenate_datasets
from lexical_diversity import lex_div as ld
from tqdm import tqdm

# Preprocess Datasets

In [None]:
daily_dialog = load_dataset("daily_dialog")
persona_chat = load_dataset("bavard/personachat_truecased")
wizard_of_wikipedia = load_dataset("chujiezheng/wizard_of_wikipedia")
empathetic_dialogues = load_dataset("empathetic_dialogues")
blended_skill_talk = load_dataset("blended_skill_talk")
prosocial_dialog = load_dataset("allenai/prosocial-dialog")
soda = load_dataset("allenai/soda")

In [None]:
dialogues = {
    "daily_dialog": [],
    "persona_chat": [],
    "wizard_of_wikipedia": [],
    "empathetic_dialogues": [],
    "blended_skill_talk": [],
    "prosocial_dialog": [],
    "soda": [],
    "time_delta": [],
}

## Daily Dialog

In [None]:
all_daily_dialog = concatenate_datasets(
    [
        daily_dialog["train"],
        daily_dialog["validation"],
        daily_dialog["test"]
    ]
)

In [None]:
all_daily_dialog[0]

In [None]:
for dialog in all_daily_dialog:
    dialogues["daily_dialog"].append(dialog["dialog"])

## PersonaChat

In [None]:
df_personachat_train = pd.DataFrame(persona_chat["train"])
df_personachat_valid = pd.DataFrame(persona_chat["validation"])

In [None]:
df_personachat_train.sort_values(by=["conv_id", "utterance_idx"], inplace=True)
df_personachat_valid.sort_values(by=["conv_id", "utterance_idx"], inplace=True)

In [None]:
last_personachat_train = df_personachat_train.groupby("conv_id").last()
last_personachat_valid = df_personachat_valid.groupby("conv_id").last()

In [None]:
last_personachat = pd.concat([last_personachat_train, last_personachat_valid])

In [None]:
print(len(last_personachat))
last_personachat.head()

In [None]:
for dialog in last_personachat.itertuples():
    dialogues["persona_chat"].append(dialog.history)

## Wizard of Wikipedia

In [None]:
all_wizard_of_wikipedia = concatenate_datasets(
    [
        wizard_of_wikipedia["train"],
        wizard_of_wikipedia["validation"],
        wizard_of_wikipedia["test"]
    ]
)

In [None]:
all_wizard_of_wikipedia[0]

In [92]:
for dialog in all_wizard_of_wikipedia:
    dialogues["wizard_of_wikipedia"].append(dialog["response"])

## Empathetic Dialogues

In [93]:
df_empathetic_dialogues_train = pd.DataFrame(empathetic_dialogues["train"])
df_empathetic_dialogues_valid = pd.DataFrame(empathetic_dialogues["validation"])
df_empathetic_dialogues_test = pd.DataFrame(empathetic_dialogues["test"])

In [94]:
grouped_empathetic_dialogues_train = df_empathetic_dialogues_train.groupby("conv_id").agg(list).reset_index()
grouped_empathetic_dialogues_valid = df_empathetic_dialogues_valid.groupby("conv_id").agg(list).reset_index()
grouped_empathetic_dialogues_test = df_empathetic_dialogues_test.groupby("conv_id").agg(list).reset_index()

In [95]:
grouped_empathetic_dialogues = pd.concat(
    [
        grouped_empathetic_dialogues_train,
        grouped_empathetic_dialogues_valid,
        grouped_empathetic_dialogues_test
    ]
)

In [96]:
print(len(grouped_empathetic_dialogues))
grouped_empathetic_dialogues.head()

23149


Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,"[1, 2, 3, 4, 5, 6]","[sentimental, sentimental, sentimental, sentim...",[I remember going to the fireworks with my bes...,"[1, 0, 1, 0, 1, 0]",[I remember going to see the fireworks with my...,"[5|5|5_2|2|5, 5|5|5_2|2|5, 5|5|5_2|2|5, 5|5|5_...","[, , , , , ]"
1,hit:10000_conv:20000,"[1, 2, 3, 4]","[surprised, surprised, surprised, surprised]",[My girlfriend got me a toad today! I was so s...,"[209, 4, 209, 4]","[My girlfriend got me a pet toad today!, Do yo...","[5|5|5_5|5|5, 5|5|5_5|5|5, 5|5|5_5|5|5, 5|5|5_...","[, , , ]"
2,hit:10000_conv:20001,"[1, 2, 3, 4]","[impressed, impressed, impressed, impressed]","[I really like the new paint job on my house.,...","[4, 209, 4, 209]","[I really like the new paint job on my house.,...","[5|5|5_5|5|5, 5|5|5_5|5|5, 5|5|5_5|5|5, 5|5|5_...","[, , , ]"
3,hit:10001_conv:20002,"[1, 2, 3, 4]","[lonely, lonely, lonely, lonely]",[I went to the skating rink all by myself toda...,"[209, 513, 209, 513]",[I went to the skating rink all by myself toda...,"[5|5|5_3|3|5, 5|5|5_3|3|5, 5|5|5_3|3|5, 5|5|5_...","[, , , ]"
4,hit:10002_conv:20004,"[1, 2, 3, 4]","[ashamed, ashamed, ashamed, ashamed]",[I was walking on the road. I saw beggar and i...,"[43, 516, 43, 516]",[I was walking on the road. I saw a beggar and...,"[5|5|5_4|4|5, 5|5|5_4|4|5, 5|5|5_4|4|5, 5|5|5_...","[, , , ]"


In [97]:
for dialog in grouped_empathetic_dialogues.itertuples():
    dialogues["empathetic_dialogues"].append(dialog.utterance)

## Blended Skill Talk

In [98]:
all_blended_skill_talk = concatenate_datasets(
    [
        blended_skill_talk["train"],
        blended_skill_talk["validation"],
        blended_skill_talk["test"]
    ]
)

In [99]:
all_blended_skill_talk[0]

{'personas': ["i've 2 kids.", 'i love flowers.'],
 'additional_context': '',
 'previous_utterance': ["I love live music, that's why I try to go to concerts",
  'I do too. Wat do you like?'],
 'context': 'empathetic_dialogues',
 'free_messages': ['I like acting, I hope to be an actor, what about you?',
  'No, but someday.',
  'After I am done with school I plan to have a family.',
  'I hope so, how old are your kids?',
  'I would imagine. I am sure they a great kids.',
  'I wish I had more time to do stuff like that. Medical school is exhausting. '],
 'guided_messages': ['that is ok.  have any kids?',
  'that is good. I have 2',
  'that is great! you will be ready',
  '5 & 7.  they take up a lot of my time',
  'luckily, they love flowers just as much as I do.  we spend a lot of time in the garden',
  'sounds like it. have you gotten any acting jobs, though?'],
 'suggestions': {'convai2': ["i love acting ! i'll be famous someday . what do you do ?",
   'no no kids , might get some though

In [100]:
for dialog in all_blended_skill_talk:
    all_utterances = dialog["previous_utterance"] + dialog["free_messages"] + dialog["guided_messages"]
    dialogues["blended_skill_talk"].append(all_utterances)

## Prosocial Dialog

In [101]:
df_prosocial_dialog_train = pd.DataFrame(prosocial_dialog["train"])
df_prosocial_dialog_valid = pd.DataFrame(prosocial_dialog["validation"])
df_prosocial_dialog_test = pd.DataFrame(prosocial_dialog["test"])

In [102]:
grouped_prosocial_dialog_train = df_prosocial_dialog_train.groupby("dialogue_id").agg(list).reset_index()
grouped_prosocial_dialog_valid = df_prosocial_dialog_valid.groupby("dialogue_id").agg(list).reset_index()
grouped_prosocial_dialog_test = df_prosocial_dialog_test.groupby("dialogue_id").agg(list).reset_index()

In [103]:
grouped_prosocial_dialog = pd.concat(
    [
        grouped_prosocial_dialog_train,
        grouped_prosocial_dialog_valid,
        grouped_prosocial_dialog_test
    ]
)

In [104]:
print(len(grouped_prosocial_dialog))
grouped_prosocial_dialog.head()

58137


Unnamed: 0,dialogue_id,context,response,rots,safety_label,safety_annotations,safety_annotation_reasons,source,etc,response_id,episode_done
0,0,[I tried a bit of shadowboxing today. Got one ...,"[This isn't a joke about black people, right? ...",[[It's not right to think black people deserve...,"[__needs_caution__, __casual__]","[[needs caution, needs caution, needs caution]...",[[It's rude to make jokes about the color of p...,"[sbic, sbic]","[, ]","[0, 1]","[False, True]"
1,1,[Did I tell you how I used to poke holes in co...,"[That's a pretty low down thing to do., You sh...","[[It's immoral to poke holes in condoms.], [It...","[__needs_intervention__, __needs_intervention_...","[[needs caution, needs intervention, needs int...",[[Speaker admits to committing a seriously wro...,"[socialchemistry, socialchemistry, socialchemi...","[reddit/confessions/bsajnv, reddit/confessions...","[0, 1, 2]","[False, False, True]"
2,2,[I'm going to demand that my girlfriends gymna...,"[Do you have a reason for this?, I get your po...","[[It's wrong to try to control others., People...","[__needs_caution__, __needs_caution__, __needs...","[[needs caution, needs caution, needs caution]...",[[It would be rude to demand something like th...,"[socialchemistry, socialchemistry, socialchemi...","[reddit/amitheasshole/b4qz08, reddit/amitheass...","[0, 1, 2]","[False, False, True]"
3,3,[I don't get Sandy Hook jokes. I guess they ar...,"[What are your thoughts on mass shootings?, Th...","[[You shouldn't trivialize harm to victims], [...","[__needs_caution__, __needs_caution__]","[[needs caution, needs caution, needs caution]...",[[It's offensive to make jokes about a school ...,"[sbic, sbic]","[, ]","[0, 1]","[False, True]"
4,4,[Diversity Visa. Chuck Shummer. JEW.JEW. JEW. ...,[Do you see the Jewish community in a very neg...,"[[It's not right to think Jews are evil, It's ...","[__needs_caution__, __needs_caution__]","[[needs caution, needs caution, needs caution]...","[[It's wrong to be anti-Semitic., It's offensi...","[sbic, sbic]","[, ]","[0, 1]","[False, True]"


In [105]:
for dialog in grouped_prosocial_dialog.itertuples():
    all_utterances = dialog.context + dialog.response
    dialogues["prosocial_dialog"].append(all_utterances)

## SODA

In [106]:
all_soda = concatenate_datasets(
    [
        soda["train"],
        soda["validation"],
        soda["test"]
    ]
)

In [107]:
all_soda[0]

{'head': 'PersonX thought about going to church',
 'relation': 'xNeed',
 'tail': 'to be interested in going to church',
 'literal': 'Veda was interested in going to church. Veda thought about going to church.',
 'narrative': 'Veda thought about going to church because she was interested in the religion. She had never been to church before, but she had heard good things about it. She decided to go to a nearby church and see what it was like.',
 'dialogue': ["Hi, Father. I'm Veda. I'm new to the area and was curious about your church. Could you tell me a little bit about it?",
  "Of course, Veda. Our church is based on the teachings of Jesus Christ. We believe in loving our neighbor and treating others as we would want to be treated. We strive to live according to Christ's example and teachings.",
  'That sounds like a really great way to live. I can see why so many people are drawn to this religion. What do you think makes Christianity different from other religions?',
  'Well, there ar

In [108]:
for dialog in all_soda:
    dialogues["soda"].append(dialog["dialogue"])

## Time Delta

In [109]:
with open("../results/gpt-3.5-turbo-0613_mc_taco.jsonl", "r") as f:
    time_delta = [json.loads(line) for line in f]
    for idx, instance in enumerate(time_delta):
        output = instance["generated"]
        
        pattern = r"\[with time elapsed\](.*?)\[without time elapsed\]"
        matches = re.findall(pattern, output, re.DOTALL)

        if matches:
            dialog_text = matches[0].strip()
        else:
            print(f"No match found for index {idx + 1}.")
            continue

        dialog = []
        for utt in dialog_text.split("\n"):
            if utt.startswith("[") or len(utt) < 4:
                continue
            elif ": " in utt:
                dialog.append(utt.split(": ")[1])
            else:
                print(f"Unexpected utterance: {utt}")
    
        dialogues["time_delta"].append(dialog)

Unexpected utterance: How long did they know the plane had been hijacked?


# Compute Stats

In [110]:
stats = {
    "daily_dialog": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "persona_chat": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "wizard_of_wikipedia": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "empathetic_dialogues": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "blended_skill_talk": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "prosocial_dialog": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "soda": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
    "time_delta": {
        "num_dialog": 0,
        "avg_num_turns": 0,
        "avg_utt_length": 0,
        "mtld": 0,
    },
}

In [111]:
def compute_mtld(dialogues: List[List[str]]) -> float:
    return round(np.mean([ld.mtld(ld.flemmatize(' '.join(dialog))) for dialog in dialogues]), 2)

def utterance_length(dialogues: List[List[str]]) -> float:
    utterances = np.hstack(dialogues).tolist()
    return np.mean([len(utterance.split()) for utterance in utterances])

def turn_number(dialogues: List[List[str]]) -> float:
    return np.mean([len(dialogue) for dialogue in dialogues])

In [112]:
dataset_names = [
    "daily_dialog",
    "persona_chat",
    "wizard_of_wikipedia",
    "empathetic_dialogues",
    "blended_skill_talk",
    "prosocial_dialog",
    "soda",
    "time_delta"
]

In [113]:
for dataset_name in tqdm(dataset_names):
    stats[dataset_name]["num_dialog"] = len(dialogues[dataset_name])
    stats[dataset_name]["avg_num_turns"] = turn_number(dialogues[dataset_name])
    stats[dataset_name]["avg_utt_length"] = utterance_length(dialogues[dataset_name])
    stats[dataset_name]["mtld"] = compute_mtld(dialogues[dataset_name])

100%|██████████| 8/8 [06:46<00:00, 50.82s/it] 


In [114]:
print("Data Statistics")
print()
print("Dataset\tDialog\tTurn\tUtt\tMTLD")
print("-------\t-----\t----\t----\t----")
for k, v in stats.items():
    print(f"{k[:5]}\t{v['num_dialog']}\t{v['avg_num_turns']:.1f}\t{v['avg_utt_length']:.1f}\t{v['mtld']:.1f}")

Data Statistics

Dataset	Dialog	Turn	Utt	MTLD
-------	-----	----	----	----
daily	13118	7.9	13.6	63.5
perso	18878	13.8	9.7	59.3
wizar	22311	4.0	18.3	69.1
empat	23149	4.3	16.6	66.8
blend	6808	13.2	13.5	70.3
proso	58137	5.7	20.0	59.5
soda	1486896	7.6	16.1	67.9
time_	324	5.5	10.5	71.2
