In [1]:
# Import libraries and set size of plots
import pandas as pd
from matplotlib import pyplot as plt

from src.utils.analysis import get_tfidf, get_type_token_ratio, sunburst_ngrams
from src.utils.helpers import merge_data
from src.utils.reporting import report_stats_original_datasets, report_stats_overlap_datasets, report_stats_dataset

# Default figure size (width, height) applied to every matplotlib figure created below
plt.rc('figure', figsize=[20, 5])

In [2]:
# Load the two source corpora from disk.
# Dialogue rows are ordered by dialogue first, then by turn within each dialogue.
dialogue_df = pd.read_csv("./../data/original_datasets/DIALOCONAN.csv")
dialogue_df = dialogue_df.sort_values(by=["dialogue_id", "turn_id"])

knowledge_df = pd.read_csv("./../data/original_datasets/multitarget_KN_grounded_CN.csv")

# Print summary statistics for both raw datasets
report_stats_original_datasets(dialogue_df, knowledge_df)

Utterances in DIALOCONAN: 16625
Dialogues in DIALOCONAN: 3059
HS-CN pairs in k-CONAN: 195

HS in DIALOCONAN: 8314
CN in DIALOCONAN: 8311
Unique HS in DIALOCONAN: 4771
Unique CN in DIALOCONAN: 6887

Unique HS in k-CONAN: 104
Unique CN in k-CONAN: 195
Unique knowledge in k-CONAN: 156

Average utterances per dialogue in DIALOCONAN: 16625 / 3059 = 5.434782608695652


In [3]:
# Overlap between the datasets: keep only rows where a dialogue utterance's `text`
# is equal to a `hate_speech` entry of the knowledge dataset (inner join, sorted by key).
overlap_df = pd.merge(
    dialogue_df,
    knowledge_df,
    how="inner",
    left_on="text",
    right_on="hate_speech",
    sort=True,
)
report_stats_overlap_datasets(overlap_df, dialogue_df)

Matched utterances in k-DIALOCONAN: 100

Unique dialogues with knowledge matches: 98
Unique utterances with knowledge matches: 36

Unique HS with relevant knowledge: 36
Unique CN: 34
Unique knowledge sentences: 67
Unique k-CN: 77



In [1]:
# See how many knowledge items there are per dialogue:
# count the distinct knowledge sentences and counter-narratives within each dialogue_id.
# NOTE: requires `overlap_df` from the overlap cell above; run the notebook top to bottom.
per_dialogue_df = overlap_df.groupby(["dialogue_id"])[["knowledge_sentence", "counter_narrative"]].nunique()

# Partition the dialogues by whether they have more, fewer, or equally many
# distinct knowledge sentences compared to distinct counter-narratives.
k_counts = per_dialogue_df["knowledge_sentence"]
cn_counts = per_dialogue_df["counter_narrative"]
more_k = per_dialogue_df[k_counts > cn_counts]
less_k = per_dialogue_df[k_counts < cn_counts]
equal_k = per_dialogue_df[k_counts == cn_counts]

print(f"Comparing num of knowledge items vs counter-narratives:\n"
      f"\tmore knowledge: {len(more_k)}\n"
      f"\tless knowledge: {len(less_k)}\n"
      f"\tequal knowledge: {len(equal_k)}")

# Visualize distribution.
# Rebind the sorted frame instead of sorting in place, so the cell does not mutate
# an object that other names (or earlier displays) may still refer to.
per_dialogue_df = per_dialogue_df.sort_values(by=["knowledge_sentence", "counter_narrative"])

# Explicit figure/axes so the labels and the saved file refer to this exact plot
fig, ax = plt.subplots()
per_dialogue_df.plot.bar(ax=ax)
ax.set(
    title="Unique knowledge sentences vs. counter-narratives per dialogue",
    xlabel="dialogue_id",
    ylabel="Number of unique items",
)
fig.savefig("./../plots/dialogue_knowledge_distribution.png")

NameError: name 'overlap_df' is not defined

In [5]:
# Column-wise mean of the per-dialogue unique counts
# (one entry for knowledge sentences, one for counter-narratives).
avg_k_sentences = per_dialogue_df.mean()

# Report both averages, one per line
print(
    f"Average knowledge items per dialogue: {avg_k_sentences['knowledge_sentence']}",
    f"Average knowledge based counter-narratives per dialogue: {avg_k_sentences['counter_narrative']}",
    sep="\n",
)

Average knowledge items per dialogue: 1.989795918367347
Average knowledge based counter-narratives per dialogue: 2.2142857142857144


In [6]:
# Merge dialogues and knowledge into one dataset using the 'DSTC' format,
# asking merge_data to also save the result as JSON.
logs = merge_data(
    dialogue_df,
    knowledge_df,
    format='DSTC',
    save_as_json=True,
)

# report_stats_dataset returns four values; presumably the HS, CN, knowledge and
# knowledge-based CN collections of the merged dataset -- TODO confirm in src.utils.reporting
hs, cn, k, kcn = report_stats_dataset(logs, 'k-DIALOCONAN')

Wrote dataset to file: ./../data/KDIALOCONAN_gold.json
Dialogues in k-DIALOCONAN: 3059
Utterances in k-DIALOCONAN: 16629
Utterances (HS) with knowledge in k-DIALOCONAN: 100

HS in k-DIALOCONAN: 8316
CN in k-DIALOCONAN: 8313
Knowledge items in k-DIALOCONAN: 195
k-CN in k-DIALOCONAN: 217

Unique HS in k-DIALOCONAN: 4770
Unique CN in k-DIALOCONAN: 6885
Unique knowledge items in k-DIALOCONAN: 67
Unique k-CN in k-DIALOCONAN: 77

Average utterances per dialogue k-DIALOCONAN: 5.43609022556391
Average utterances with knowledge per dialogue k-DIALOCONAN: 0.032690421706440015
Average knowledge items per dialogue k-DIALOCONAN: 0.06374632232755803
Average k-CN per dialogue k-DIALOCONAN: 0.07093821510297482

Average knowledge items per utterance in k-DIALOCONAN: 0.011726501894281075
Average k-CN per utterance in k-DIALOCONAN: 0.01304949185158458

Average knowledge items per HS in k-DIALOCONAN: 0.023448773448773448
Average k-CN per HS in k-DIALOCONAN: 0.026094276094276093



In [7]:
# Same merge as for the full dataset, but with the 'DSTC_filtered' format
# (presumably keeps only dialogues that have knowledge matches -- TODO confirm in merge_data).
logs_grounded = merge_data(
    dialogue_df,
    knowledge_df,
    format='DSTC_filtered',
    save_as_json=True,
)

# The four returned collections feed the corpus analyses in the cells below
hs_g, cn_g, k_g, kcn_g = report_stats_dataset(logs_grounded, 'k-DIALOCONAN (filtered)')

Wrote dataset to file: ./../data/KDIALOCONAN_grounded_gold.json
Dialogues in k-DIALOCONAN (filtered): 98
Utterances in k-DIALOCONAN (filtered): 546
Utterances (HS) with knowledge in k-DIALOCONAN (filtered): 100

HS in k-DIALOCONAN (filtered): 273
CN in k-DIALOCONAN (filtered): 273
Knowledge items in k-DIALOCONAN (filtered): 195
k-CN in k-DIALOCONAN (filtered): 217

Unique HS in k-DIALOCONAN (filtered): 156
Unique CN in k-DIALOCONAN (filtered): 257
Unique knowledge items in k-DIALOCONAN (filtered): 67
Unique k-CN in k-DIALOCONAN (filtered): 77

Average utterances per dialogue k-DIALOCONAN (filtered): 5.571428571428571
Average utterances with knowledge per dialogue k-DIALOCONAN (filtered): 1.0204081632653061
Average knowledge items per dialogue k-DIALOCONAN (filtered): 1.989795918367347
Average k-CN per dialogue k-DIALOCONAN (filtered): 2.2142857142857144

Average knowledge items per utterance in k-DIALOCONAN (filtered): 0.35714285714285715
Average k-CN per utterance in k-DIALOCONAN (fil

In [8]:
# TF-IDF features for each corpus of the filtered dataset.
# zip() yields a single-use iterator, so the (corpus, label) pairs are built fresh here.
tfidf_corpora = [hs_g, cn_g, k_g, kcn_g]
tfidf_labels = ['HS', 'CN', 'K', 'KCN']
corpus_columns = zip(tfidf_corpora, tfidf_labels)

tfidf_df = get_tfidf(corpus_columns)
tfidf_df

Unnamed: 0,HS,CN,K,KCN
0,children,don,islamic,don
1,don,jews,jews,gays
2,europe,like,muslim,jews
3,gays,muslims,muslims,like
4,hate,people,orientation,muslims
5,jews,religion,rights,s
6,muslims,s,s,sexual
7,s,society,said,society
8,t,t,sexual,t
9,want,world,women,world


In [9]:
# Type-token ratio for each corpus of the filtered dataset.
# The zip iterator from any earlier cell is already exhausted, so pair corpora and labels again.
ttr_corpora = [hs_g, cn_g, k_g, kcn_g]
ttr_labels = ['HS', 'CN', 'K', 'KCN']
corpus_columns = zip(ttr_corpora, ttr_labels)

type_token_df = get_type_token_ratio(corpus_columns)
type_token_df

Unnamed: 0,HS,CN,K,KCN
0,16.026073,17.268206,9.338456,11.50358


In [10]:
# Most frequent trigrams per corpus, rendered through sunburst_ngrams.
# A new zip is required because the iterator can only be consumed once.
ngram_corpora = [hs_g, cn_g, k_g, kcn_g]
ngram_labels = ['HS', 'CN', 'K', 'KCN']
corpus_columns = zip(ngram_corpora, ngram_labels)

sunburst_ngrams(corpus_columns)

Wrote plot to file: ./../plots/sunburst_distribution_HS.html
Wrote plot to file: ./../plots/sunburst_distribution_CN.html
Wrote plot to file: ./../plots/sunburst_distribution_K.html
Wrote plot to file: ./../plots/sunburst_distribution_KCN.html
