Commit

clean streamlit code

Guangsen Wang committed Nov 28, 2022
1 parent 75ec79e commit 89937dd
Showing 13 changed files with 312 additions and 285 deletions.
38 changes: 22 additions & 16 deletions botsim/modules/remediator/dashboard/dashboard_utils.py
@@ -5,7 +5,13 @@

import os, json
import numpy as np
from botsim.botsim_utils.utils import read_s3_json, dump_s3_file, file_exists, read_s3_data, convert_list_to_dict
from botsim.botsim_utils.utils import (
read_s3_json,
dump_s3_file,
file_exists,
read_s3_data,
convert_list_to_dict,
S3_BUCKET_NAME)
from sentence_transformers import SentenceTransformer
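The 384-wide arrays handled throughout this file match the output dimension of MiniLM-style sentence-transformer models. As a hedged, standalone sketch of how such dev-set embeddings are produced (the exact checkpoint is not visible in this diff; all-MiniLM-L6-v2 is an assumption):

```python
# Illustrative only: encode utterances into the 384-dim space the cache below expects.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")      # assumed checkpoint; emits 384-dim vectors
utterances = ["I want to check my order status", "please reset my password"]
embeddings = model.encode(utterances)                # numpy array of shape (2, 384)

dev_embedding = np.empty((0, 384))
dev_embedding = np.vstack([dev_embedding, embeddings])  # mirrors the per-intent accumulation below
print(dev_embedding.shape)                               # (2, 384)
```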


@@ -30,10 +36,11 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
dev_embedding, dev_labels = np.empty((0, 384)), {"label": []}

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if file_exists("botsim", goals_dir + "/dev_embedding.npy") and \
file_exists("botsim", goals_dir + "/dev_embedding_label.npy"):
dev_embedding = np.frombuffer(read_s3_data("botsim", goals_dir + "/dev_embedding.npy")).reshape(-1, 384)
dev_labels = read_s3_json("botsim", goals_dir + "/dev_embedding_label.npy")["label"]
if file_exists(S3_BUCKET_NAME, goals_dir + "/dev_embedding.npy") and \
file_exists(S3_BUCKET_NAME, goals_dir + "/dev_embedding_label.npy"):
dev_embedding = np.frombuffer(read_s3_data(S3_BUCKET_NAME, goals_dir + "/dev_embedding.npy")).reshape(-1,
384)
dev_labels = read_s3_json(S3_BUCKET_NAME, goals_dir + "/dev_embedding_label.npy")["label"]
return dev_embedding, dev_labels
else:
if os.path.exists(goals_dir + "/dev_embedding.npy"):
@@ -47,13 +54,13 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
for i, intent in enumerate(intents):
file_name = goals_dir + "/" + intent + "_" + para_setting + ".paraphrases.json"
if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if not file_exists("botsim", file_name):
if not file_exists(S3_BUCKET_NAME, file_name):
paraphrase = False
file_name = goals_dir + "/" + intent + ".json"
utterances = read_s3_json("botsim", file_name)[intent]
utterances = read_s3_json(S3_BUCKET_NAME, file_name)[intent]
else:
print("processing", intent)
paras = read_s3_json("botsim", file_name)
paras = read_s3_json(S3_BUCKET_NAME, file_name)
utterances = []
for p in paras:
utterances.append(p["source"])
@@ -65,7 +72,6 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
file_name = goals_dir + "/" + intent + ".json"
utterances = json.load(open(file_name))[intent]
else:
print("processing", intent)
paras = json.load(open(file_name))
utterances = []
for p in paras:
@@ -81,7 +87,7 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
dump_s3_file(goals_dir + "/dev_embedding.npy", dev_embedding.tobytes())
dump_s3_file(goals_dir + "/dev_embedding_label.npy", bytes(json.dumps(dev_labels, indent=2).encode("UTF-8")))
else:
with open(goals_dir + "/dev_embedding.npy", "wb") as f:
with open(goals_dir + "/dev_embedding.npy", "wb") as f:
np.save(f, dev_embedding, allow_pickle=False)
with open(goals_dir + "/dev_embedding_label.npy", "wb") as f:
np.save(f, dev_labels, allow_pickle=True)
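The S3 branch above caches the embedding matrix as raw bytes (ndarray.tobytes() via dump_s3_file) and restores it with np.frombuffer(...).reshape(-1, 384). A minimal local sketch of that round trip; the only assumption is that the dtype used when reading back matches the one that was written (float64 here, np.empty's default):

```python
# Byte-level round trip mirroring the S3 embedding cache above (no S3 involved).
import numpy as np

emb = np.empty((0, 384))                          # float64 by default, as in the diff
emb = np.vstack([emb, np.random.rand(3, 384)])    # stand-in for encoded utterances

blob = emb.tobytes()                              # what dump_s3_file would persist
restored = np.frombuffer(blob, dtype=emb.dtype).reshape(-1, 384)

assert np.array_equal(emb, restored)              # a dtype mismatch here would silently corrupt values
```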
@@ -110,9 +116,9 @@ def get_bot_health_reports(database, test_id):
report_path = "data/bots/{}/{}/aggregated_report.json".format(config["type"], test_id)

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if not file_exists("botsim", report_path):
if not file_exists(S3_BUCKET_NAME, report_path):
return None, None, None
report = read_s3_json("botsim", report_path)
report = read_s3_json(S3_BUCKET_NAME, report_path)
else:
if not os.path.exists(report_path):
return None, None, None
@@ -130,8 +136,8 @@ def get_entities(database, test_id):
entities = None

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if file_exists("botsim", entity_path):
entities = read_s3_json("botsim", entity_path)
if file_exists(S3_BUCKET_NAME, entity_path):
entities = read_s3_json(S3_BUCKET_NAME, entity_path)
else:
if os.path.exists(entity_path):
entities = json.load(open(entity_path, "r"))
@@ -153,8 +159,8 @@ def parse_confusion_matrix(database, test_id, mode):
config = dict(database.get_one_bot_test_instance(test_id))
cm_report_path = "data/bots/{}/{}/remediation/cm_{}_report.json".format(config["type"], test_id, mode)

if file_exists("botsim", cm_report_path):
report = read_s3_json("botsim", cm_report_path)
if file_exists(S3_BUCKET_NAME, cm_report_path):
report = read_s3_json(S3_BUCKET_NAME, cm_report_path)
else:
return None, None, None, None, None, None, None
rows = report["cm_table"]["body_row"]
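The recurring change in dashboard_utils.py is replacing the hard-coded "botsim" bucket literal with the shared S3_BUCKET_NAME constant while keeping the S3-vs-local branching. A condensed sketch of that access pattern, assuming the (bucket, key) signatures of file_exists and read_s3_json implied by the diff:

```python
# Sketch of the storage-dispatch pattern used above; helper signatures are inferred from the diff.
import os, json
from botsim.botsim_utils.utils import file_exists, read_s3_json, S3_BUCKET_NAME

def load_json_report(report_path):
    """Return a parsed JSON report from S3 or the local filesystem, or None if missing."""
    if os.environ.get("STORAGE") == "S3":
        if not file_exists(S3_BUCKET_NAME, report_path):
            return None
        return read_s3_json(S3_BUCKET_NAME, report_path)
    if not os.path.exists(report_path):
        return None
    with open(report_path) as f:
        return json.load(f)
```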
123 changes: 55 additions & 68 deletions botsim/modules/remediator/dashboard/layout.py
@@ -8,12 +8,40 @@
import botsim.modules.remediator.dashboard.plot as dashboard_plot
from streamlit_chat import message


def plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode):
row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))

intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1_scores[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1_scores[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1_scores[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")


def render_summary_reports(database, mode, test, dataset_info, overall_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
with row1_1:
st.header("Bot health reports 📊")
st.markdown("The bot health reports consist of a summary report across all intents and "
"per-intent reports to show both the task-completion and NLU performance.")
st.markdown("The bot health reports comprises 1) a summary report of a simulation session "
"across all intents and 2) "
"intent/dialog-specific reports to show both the task-completion and NLU performance.")
row2_spacer1, row2_1, row2_spacer2 = st.columns((.2, 7.1, .4))
with row2_1:
st.subheader("Performance summary for selected test (test_id={}):".format(test))
@@ -46,36 +74,15 @@ def render_summary_reports(database, mode, test, dataset_info, overall_performan
st.plotly_chart(dashboard_plot.plot_test_performance(intent_to_errors), use_container_width=True)


def render_dialog_report(mode, selected_intent, F1s, overall_performance, detailed_performance):
def render_dialog_report(mode, selected_intent, F1_scores, overall_performance, detailed_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.3, 7.1, .4))
if not F1s:
F1s = {selected_intent: 1.0}
if not F1_scores:
F1_scores = {selected_intent: 1.0}
with row1_1:
st.markdown("---")
st.subheader("Performance report for selected dialog \"" + selected_intent + "\"")
row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))

intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1s[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1s[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1s[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")
plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode)

st.plotly_chart(
dashboard_plot.plot_intent_performance(
@@ -84,10 +91,10 @@ def render_dialog_report(mode, selected_intent, F1s, overall_performance, detail
use_container_width=True)


def render_remediation(mode, selected_intent, F1s, overall_performance, detailed_performance):
def render_remediation(mode, selected_intent, F1_scores, overall_performance, detailed_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
if not F1s:
F1s = {selected_intent: 1.0}
if not F1_scores:
F1_scores = {selected_intent: 1.0}
with row1_1:
st.markdown("---")
st.header("Remediation Suggestions for {} 🛠️".format(selected_intent))
@@ -96,28 +103,7 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
"They can also be extended by BotSIM users to include domain expertise or bot-specific "
"information. ")

row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))
intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1s[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1s[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1s[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")
plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode)

row3_spacer1, row3_1, row3_spacer2 = st.columns((.2, 7.1, .2))
with row3_1:
@@ -147,13 +133,13 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
if len(droplist_labels) > 0:
row4_spacer1, row4_1, row4_spacer2, row4_2, row4_spacer3 = st.columns((.4, 8.3, .4, .4, .2))
with row4_1:
st.markdown("For intent classification models, we show the wrongly predicted paraphrases intent queries "
st.markdown("For intent models, we show the wrongly predicted paraphrases intent queries "
"grouped by their corresponding original"
" training utterances (**sorted in descending order by number of errors**). "
"Detailed analysis can be found on the right hand side expander.")
row5_spacer1, row5_1, row5_spacer2, row5_2, row5_spacer3 = st.columns((.4, 4.3, .4, 4.3, .2))
with row5_1:
utt_selected = st.selectbox("Which utterance do you want to analyze? "
utt_selected = st.selectbox("Which utterance do you want to investigate? "
"(" + str(len(droplist_labels)) + " in total)",
list(droplist_labels), key="utt")
with row5_2:
@@ -213,15 +199,16 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
st.json(ner_errors)


def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_to_clusters, intent_to_supports,
def render_analytics(database, test, cm_plot, recalls, precisions, F1_scores, intent_to_clusters, intent_to_supports,
all_intents):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
with row1_1:
st.markdown("---")
st.header("Conversation Analytics ⚙️")
st.markdown("BotSIM also offers analytical tools for helping users gain more insights into their systems. "
st.markdown("Analytical tools for helping users gain insights into their bots for "
"troubleshooting and improvement. "
"These tools include confusion matrix analysis, intent utterance tSNE clustering and "
"bootstrap-based confidence analysis ")
"many more can be added in the layout.")

row2_spacer1, row2_1, row2_spacer2 = st.columns((.4, 7.1, .4))
with row2_1:
@@ -244,24 +231,24 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t

sorted_recall = dict(sorted(recalls.items(), key=lambda item: -item[1]))
sorted_precision = dict(sorted(precisions.items(), key=lambda item: -item[1]))
sorted_F1 = dict(sorted(F1s.items(), key=lambda item: -item[1]))
sorted_F1 = dict(sorted(F1_scores.items(), key=lambda item: -item[1]))
table = []

if sorted_by == "Sorted by Recall":
for intent in sorted_recall:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])
elif sorted_by == "Sorted by Precision":
for intent in sorted_precision:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])
else:
for intent in sorted_F1:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])

row4_spacer1, row4_1, row4_2, row4_3, row4_4, row4_5, row4_6, row4_spacer2 = st.columns(
(2.3, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 0.5))
@@ -290,10 +277,10 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t
with row5_1:
st.markdown("---")
st.subheader("tSNE visualisation of intent training utterances")
st.markdown("To gauge the quality of the intent training utterances and identify intent overlaps, "
"tSNE clustering is performed based on the sentence transformer embeddings of the intent training "
st.markdown("To gauge the intent training data quality, "
"tSNE clustering is performed on the sentence transformer embeddings of the intent training "
"utterances. "
"By examining the clusters, not only can users find intents with significant overlap in training "
"Not only can the clustering identify intents with significant overlap in training "
"data semantic space, "
"they can also potentially discover novel intents from production logs to aid dialog re-design.")
"it can also potentially discover novel intents from production logs to aid dialog re-design.")
st.plotly_chart(dashboard_plot.plot_tSNE(all_intents, database, test), use_container_width=True)
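For readers who want to reproduce the clustering outside the dashboard, a hedged sketch of tSNE over sentence-transformer embeddings of intent utterances; it is not the plot_tSNE implementation, and the toy data and checkpoint are assumptions:

```python
# Illustrative tSNE clustering of intent utterances; not the BotSIM plot_tSNE implementation.
import plotly.express as px
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

utterances = {"order_status": ["where is my order", "track my package"],
              "reset_password": ["I forgot my password", "reset my login"]}
texts = [u for utts in utterances.values() for u in utts]
labels = [intent for intent, utts in utterances.items() for _ in utts]

model = SentenceTransformer("all-MiniLM-L6-v2")            # assumed checkpoint, 384-dim output
embeddings = model.encode(texts)

coords = TSNE(n_components=2, perplexity=2, init="random").fit_transform(embeddings)
fig = px.scatter(x=coords[:, 0], y=coords[:, 1], color=labels, hover_name=texts)
fig.show()
```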
11 changes: 4 additions & 7 deletions botsim/modules/remediator/dashboard/plot.py
@@ -165,24 +165,21 @@ def plot_intent_performance(intent, mode, overall_performance, detailed_performa
ner_errors = detailed_performance[mode][intent.replace("_eval", "")]["ner_errors"]
intent_predictions = overall_performance[mode][intent.replace("_eval", "")]["intent_predictions"]

import plotly.graph_objects as go
from plotly.subplots import make_subplots

prediction_labels, prediction_counts = [], []
for p in intent_predictions:
prediction_labels.append(p)
prediction_counts.append(intent_predictions[p])

entity_labels, entity_counts = [], []
for ent in ner_errors:
type = ner_errors[ent]["extraction_type"]
extraction_type = ner_errors[ent]["extraction_type"]
if "pattern" in ner_errors[ent]:
pattern = ner_errors[ent]["pattern"]
if type == "UNK": continue
if type == "regex":
if extraction_type == "UNK": continue
if extraction_type == "regex":
entity_labels.append(ner_errors[ent]["entity_name"] + " (" + pattern + ")")
else:
entity_labels.append(ner_errors[ent]["entity_name"] + " extracted via " + type)
entity_labels.append(ner_errors[ent]["entity_name"] + " extracted via " + extraction_type)
count = 0
if "missed" in ner_errors[ent]:
count += len(ner_errors[ent]["missed"])
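The plot.py hunk renames the loop variable type to extraction_type. Beyond readability, this stops the loop from shadowing Python's type builtin; a tiny standalone illustration (not BotSIM code):

```python
# Why the rename matters: assigning to `type` shadows the builtin for the rest of that scope.
ner_errors = {"case_number": {"extraction_type": "regex", "entity_name": "CaseNumber"}}

def shadowed():
    for ent in ner_errors:
        type = ner_errors[ent]["extraction_type"]   # shadows builtins.type inside this function
        # type(ent)                                 # would raise TypeError: 'str' object is not callable
        print(type)                                 # prints the string "regex", not the builtin

def renamed():
    for ent in ner_errors:
        extraction_type = ner_errors[ent]["extraction_type"]  # the diff's fix: no shadowing
        print(type(extraction_type))                            # builtin still works: <class 'str'>

shadowed()
renamed()
```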