Commit

clean streamlit code

Guangsen Wang committed Nov 28, 2022
1 parent 75ec79e commit 89937dd
Showing 13 changed files with 312 additions and 285 deletions.
38 changes: 22 additions & 16 deletions botsim/modules/remediator/dashboard/dashboard_utils.py
@@ -5,7 +5,13 @@

import os, json
import numpy as np
from botsim.botsim_utils.utils import read_s3_json, dump_s3_file, file_exists, read_s3_data, convert_list_to_dict
from botsim.botsim_utils.utils import (
read_s3_json,
dump_s3_file,
file_exists,
read_s3_data,
convert_list_to_dict,
S3_BUCKET_NAME)
from sentence_transformers import SentenceTransformer
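The 384-wide arrays handled throughout this file match the output dimension of MiniLM-style sentence-transformer models. As a hedged, standalone sketch of how such dev-set embeddings are produced (the exact checkpoint is not visible in this diff; all-MiniLM-L6-v2 is an assumption):

```python
# Illustrative only: encode utterances into the 384-dim space the cache below expects.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")      # assumed checkpoint; emits 384-dim vectors
utterances = ["I want to check my order status", "please reset my password"]
embeddings = model.encode(utterances)                # numpy array of shape (2, 384)

dev_embedding = np.empty((0, 384))
dev_embedding = np.vstack([dev_embedding, embeddings])  # mirrors the per-intent accumulation below
print(dev_embedding.shape)                               # (2, 384)
```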


@@ -30,10 +36,11 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
dev_embedding, dev_labels = np.empty((0, 384)), {"label": []}

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if file_exists("botsim", goals_dir + "/dev_embedding.npy") and \
file_exists("botsim", goals_dir + "/dev_embedding_label.npy"):
dev_embedding = np.frombuffer(read_s3_data("botsim", goals_dir + "/dev_embedding.npy")).reshape(-1, 384)
dev_labels = read_s3_json("botsim", goals_dir + "/dev_embedding_label.npy")["label"]
if file_exists(S3_BUCKET_NAME, goals_dir + "/dev_embedding.npy") and \
file_exists(S3_BUCKET_NAME, goals_dir + "/dev_embedding_label.npy"):
dev_embedding = np.frombuffer(read_s3_data(S3_BUCKET_NAME, goals_dir + "/dev_embedding.npy")).reshape(-1,
384)
dev_labels = read_s3_json(S3_BUCKET_NAME, goals_dir + "/dev_embedding_label.npy")["label"]
return dev_embedding, dev_labels
else:
if os.path.exists(goals_dir + "/dev_embedding.npy"):
@@ -47,13 +54,13 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
for i, intent in enumerate(intents):
file_name = goals_dir + "/" + intent + "_" + para_setting + ".paraphrases.json"
if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if not file_exists("botsim", file_name):
if not file_exists(S3_BUCKET_NAME, file_name):
paraphrase = False
file_name = goals_dir + "/" + intent + ".json"
utterances = read_s3_json("botsim", file_name)[intent]
utterances = read_s3_json(S3_BUCKET_NAME, file_name)[intent]
else:
print("processing", intent)
paras = read_s3_json("botsim", file_name)
paras = read_s3_json(S3_BUCKET_NAME, file_name)
utterances = []
for p in paras:
utterances.append(p["source"])
@@ -65,7 +72,6 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
file_name = goals_dir + "/" + intent + ".json"
utterances = json.load(open(file_name))[intent]
else:
print("processing", intent)
paras = json.load(open(file_name))
utterances = []
for p in paras:
@@ -81,7 +87,7 @@ def get_embedding(intents, database, test_id="169", paraphrase=False, para_setti
dump_s3_file(goals_dir + "/dev_embedding.npy", dev_embedding.tobytes())
dump_s3_file(goals_dir + "/dev_embedding_label.npy", bytes(json.dumps(dev_labels, indent=2).encode("UTF-8")))
else:
with open(goals_dir + "/dev_embedding.npy", "wb") as f:
with open(goals_dir + "/dev_embedding.npy", "wb") as f:
np.save(f, dev_embedding, allow_pickle=False)
with open(goals_dir + "/dev_embedding_label.npy", "wb") as f:
np.save(f, dev_labels, allow_pickle=True)
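The S3 branch above caches the embedding matrix as raw bytes (ndarray.tobytes() via dump_s3_file) and restores it with np.frombuffer(...).reshape(-1, 384). A minimal local sketch of that round trip; the only assumption is that the dtype used when reading back matches the one that was written (float64 here, np.empty's default):

```python
# Byte-level round trip mirroring the S3 embedding cache above (no S3 involved).
import numpy as np

emb = np.empty((0, 384))                          # float64 by default, as in the diff
emb = np.vstack([emb, np.random.rand(3, 384)])    # stand-in for encoded utterances

blob = emb.tobytes()                              # what dump_s3_file would persist
restored = np.frombuffer(blob, dtype=emb.dtype).reshape(-1, 384)

assert np.array_equal(emb, restored)              # a dtype mismatch here would silently corrupt values
```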
@@ -110,9 +116,9 @@ def get_bot_health_reports(database, test_id):
report_path = "data/bots/{}/{}/aggregated_report.json".format(config["type"], test_id)

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if not file_exists("botsim", report_path):
if not file_exists(S3_BUCKET_NAME, report_path):
return None, None, None
report = read_s3_json("botsim", report_path)
report = read_s3_json(S3_BUCKET_NAME, report_path)
else:
if not os.path.exists(report_path):
return None, None, None
@@ -130,8 +136,8 @@ def get_entities(database, test_id):
entities = None

if "STORAGE" in os.environ and os.environ["STORAGE"] == "S3":
if file_exists("botsim", entity_path):
entities = read_s3_json("botsim", entity_path)
if file_exists(S3_BUCKET_NAME, entity_path):
entities = read_s3_json(S3_BUCKET_NAME, entity_path)
else:
if os.path.exists(entity_path):
entities = json.load(open(entity_path, "r"))
@@ -153,8 +159,8 @@ def parse_confusion_matrix(database, test_id, mode):
config = dict(database.get_one_bot_test_instance(test_id))
cm_report_path = "data/bots/{}/{}/remediation/cm_{}_report.json".format(config["type"], test_id, mode)

if file_exists("botsim", cm_report_path):
report = read_s3_json("botsim", cm_report_path)
if file_exists(S3_BUCKET_NAME, cm_report_path):
report = read_s3_json(S3_BUCKET_NAME, cm_report_path)
else:
return None, None, None, None, None, None, None
rows = report["cm_table"]["body_row"]
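The recurring change in dashboard_utils.py is replacing the hard-coded "botsim" bucket literal with the shared S3_BUCKET_NAME constant while keeping the S3-vs-local branching. A condensed sketch of that access pattern, assuming the (bucket, key) signatures of file_exists and read_s3_json implied by the diff:

```python
# Sketch of the storage-dispatch pattern used above; helper signatures are inferred from the diff.
import os, json
from botsim.botsim_utils.utils import file_exists, read_s3_json, S3_BUCKET_NAME

def load_json_report(report_path):
    """Return a parsed JSON report from S3 or the local filesystem, or None if missing."""
    if os.environ.get("STORAGE") == "S3":
        if not file_exists(S3_BUCKET_NAME, report_path):
            return None
        return read_s3_json(S3_BUCKET_NAME, report_path)
    if not os.path.exists(report_path):
        return None
    with open(report_path) as f:
        return json.load(f)
```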
123 changes: 55 additions & 68 deletions botsim/modules/remediator/dashboard/layout.py
@@ -8,12 +8,40 @@
import botsim.modules.remediator.dashboard.plot as dashboard_plot
from streamlit_chat import message


def plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode):
row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))

intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1_scores[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1_scores[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1_scores[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")


def render_summary_reports(database, mode, test, dataset_info, overall_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
with row1_1:
st.header("Bot health reports 📊")
st.markdown("The bot health reports consist of a summary report across all intents and "
"per-intent reports to show both the task-completion and NLU performance.")
st.markdown("The bot health reports comprises 1) a summary report of a simulation session "
"across all intents and 2) "
"intent/dialog-specific reports to show both the task-completion and NLU performance.")
row2_spacer1, row2_1, row2_spacer2 = st.columns((.2, 7.1, .4))
with row2_1:
st.subheader("Performance summary for selected test (test_id={}):".format(test))
@@ -46,36 +74,15 @@ def render_summary_reports(database, mode, test, dataset_info, overall_performan
st.plotly_chart(dashboard_plot.plot_test_performance(intent_to_errors), use_container_width=True)


def render_dialog_report(mode, selected_intent, F1s, overall_performance, detailed_performance):
def render_dialog_report(mode, selected_intent, F1_scores, overall_performance, detailed_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.3, 7.1, .4))
if not F1s:
F1s = {selected_intent: 1.0}
if not F1_scores:
F1_scores = {selected_intent: 1.0}
with row1_1:
st.markdown("---")
st.subheader("Performance report for selected dialog \"" + selected_intent + "\"")
row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))

intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1s[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1s[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1s[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")
plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode)

st.plotly_chart(
dashboard_plot.plot_intent_performance(
@@ -84,10 +91,10 @@ def render_dialog_report(mode, selected_intent, F1s, overall_performance, detail
use_container_width=True)


def render_remediation(mode, selected_intent, F1s, overall_performance, detailed_performance):
def render_remediation(mode, selected_intent, F1_scores, overall_performance, detailed_performance):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
if not F1s:
F1s = {selected_intent: 1.0}
if not F1_scores:
F1_scores = {selected_intent: 1.0}
with row1_1:
st.markdown("---")
st.header("Remediation Suggestions for {} 🛠️".format(selected_intent))
@@ -96,28 +103,7 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
"They can also be extended by BotSIM users to include domain expertise or bot-specific "
"information. ")

row2_spacer1, row2_1, row2_spacer2, row2_2, row2_spacer3, row2_3, \
row2_spacer4, row2_4, row2_spacer5, row2_5, row2_spacer6, row2_6 = st.columns(
(.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))
intent_performance = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
row2_1.metric("#Sessions", str(sum(list(intent_performance["overall_performance"].values()))), "")
if F1s[selected_intent] < 0.9:
row2_2.metric("F1 score", str(F1s[selected_intent]), "", "inverse")
else:
row2_2.metric("F1 score", str(F1s[selected_intent]), "Good")
if intent_performance["success_rate"] < 0.7:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "", "inverse")
else:
row2_3.metric("Goal-completion Rate", str(intent_performance["success_rate"]), "")
if intent_performance["intent_error_rate"] > 0.5:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "", "inverse")
else:
row2_4.metric("Intent Error Rate", str(intent_performance["intent_error_rate"]), "")
if intent_performance["NER_error_rate"] > 0.5:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "", "inverse")
else:
row2_5.metric("NER Error Rate", str(intent_performance["NER_error_rate"]), "")
row2_6.metric("Other Error Rate", str(intent_performance["other_error_rate"]), "")
plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode)

row3_spacer1, row3_1, row3_spacer2 = st.columns((.2, 7.1, .2))
with row3_1:
@@ -147,13 +133,13 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
if len(droplist_labels) > 0:
row4_spacer1, row4_1, row4_spacer2, row4_2, row4_spacer3 = st.columns((.4, 8.3, .4, .4, .2))
with row4_1:
st.markdown("For intent classification models, we show the wrongly predicted paraphrases intent queries "
st.markdown("For intent models, we show the wrongly predicted paraphrases intent queries "
"grouped by their corresponding original"
" training utterances (**sorted in descending order by number of errors**). "
"Detailed analysis can be found on the right hand side expander.")
row5_spacer1, row5_1, row5_spacer2, row5_2, row5_spacer3 = st.columns((.4, 4.3, .4, 4.3, .2))
with row5_1:
utt_selected = st.selectbox("Which utterance do you want to analyze? "
utt_selected = st.selectbox("Which utterance do you want to investigate? "
"(" + str(len(droplist_labels)) + " in total)",
list(droplist_labels), key="utt")
with row5_2:
@@ -213,15 +199,16 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
st.json(ner_errors)


def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_to_clusters, intent_to_supports,
def render_analytics(database, test, cm_plot, recalls, precisions, F1_scores, intent_to_clusters, intent_to_supports,
all_intents):
row1_spacer1, row1_1, row1_spacer2 = st.columns((.2, 7.1, .2))
with row1_1:
st.markdown("---")
st.header("Conversation Analytics ⚙️")
st.markdown("BotSIM also offers analytical tools for helping users gain more insights into their systems. "
st.markdown("Analytical tools for helping users gain insights into their bots for "
"troubleshooting and improvement. "
"These tools include confusion matrix analysis, intent utterance tSNE clustering and "
"bootstrap-based confidence analysis ")
"many more can be added in the layout.")

row2_spacer1, row2_1, row2_spacer2 = st.columns((.4, 7.1, .4))
with row2_1:
@@ -244,24 +231,24 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t

sorted_recall = dict(sorted(recalls.items(), key=lambda item: -item[1]))
sorted_precision = dict(sorted(precisions.items(), key=lambda item: -item[1]))
sorted_F1 = dict(sorted(F1s.items(), key=lambda item: -item[1]))
sorted_F1 = dict(sorted(F1_scores.items(), key=lambda item: -item[1]))
table = []

if sorted_by == "Sorted by Recall":
for intent in sorted_recall:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])
elif sorted_by == "Sorted by Precision":
for intent in sorted_precision:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])
else:
for intent in sorted_F1:
precision, recall, F1 = sorted_precision[intent], recalls[intent], F1s[intent]
precision, recall, F1_score = sorted_precision[intent], recalls[intent], F1_scores[intent]
table.append(
[intent, precision, recall, F1, intent_to_supports[intent], intent_to_clusters[intent]])
[intent, precision, recall, F1_score, intent_to_supports[intent], intent_to_clusters[intent]])

row4_spacer1, row4_1, row4_2, row4_3, row4_4, row4_5, row4_6, row4_spacer2 = st.columns(
(2.3, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 0.5))
@@ -290,10 +277,10 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t
with row5_1:
st.markdown("---")
st.subheader("tSNE visualisation of intent training utterances")
st.markdown("To gauge the quality of the intent training utterances and identify intent overlaps, "
"tSNE clustering is performed based on the sentence transformer embeddings of the intent training "
st.markdown("To gauge the intent training data quality, "
"tSNE clustering is performed on the sentence transformer embeddings of the intent training "
"utterances. "
"By examining the clusters, not only can users find intents with significant overlap in training "
"Not only can the clustering identify intents with significant overlap in training "
"data semantic space, "
"they can also potentially discover novel intents from production logs to aid dialog re-design.")
"it can also potentially discover novel intents from production logs to aid dialog re-design.")
st.plotly_chart(dashboard_plot.plot_tSNE(all_intents, database, test), use_container_width=True)
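For readers who want to reproduce the clustering outside the dashboard, a hedged sketch of tSNE over sentence-transformer embeddings of intent utterances; it is not the plot_tSNE implementation, and the toy data and checkpoint are assumptions:

```python
# Illustrative tSNE clustering of intent utterances; not the BotSIM plot_tSNE implementation.
import plotly.express as px
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

utterances = {"order_status": ["where is my order", "track my package"],
              "reset_password": ["I forgot my password", "reset my login"]}
texts = [u for utts in utterances.values() for u in utts]
labels = [intent for intent, utts in utterances.items() for _ in utts]

model = SentenceTransformer("all-MiniLM-L6-v2")            # assumed checkpoint, 384-dim output
embeddings = model.encode(texts)

coords = TSNE(n_components=2, perplexity=2, init="random").fit_transform(embeddings)
fig = px.scatter(x=coords[:, 0], y=coords[:, 1], color=labels, hover_name=texts)
fig.show()
```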
11 changes: 4 additions & 7 deletions botsim/modules/remediator/dashboard/plot.py
@@ -165,24 +165,21 @@ def plot_intent_performance(intent, mode, overall_performance, detailed_performa
ner_errors = detailed_performance[mode][intent.replace("_eval", "")]["ner_errors"]
intent_predictions = overall_performance[mode][intent.replace("_eval", "")]["intent_predictions"]

import plotly.graph_objects as go
from plotly.subplots import make_subplots

prediction_labels, prediction_counts = [], []
for p in intent_predictions:
prediction_labels.append(p)
prediction_counts.append(intent_predictions[p])

entity_labels, entity_counts = [], []
for ent in ner_errors:
type = ner_errors[ent]["extraction_type"]
extraction_type = ner_errors[ent]["extraction_type"]
if "pattern" in ner_errors[ent]:
pattern = ner_errors[ent]["pattern"]
if type == "UNK": continue
if type == "regex":
if extraction_type == "UNK": continue
if extraction_type == "regex":
entity_labels.append(ner_errors[ent]["entity_name"] + " (" + pattern + ")")
else:
entity_labels.append(ner_errors[ent]["entity_name"] + " extracted via " + type)
entity_labels.append(ner_errors[ent]["entity_name"] + " extracted via " + extraction_type)
count = 0
if "missed" in ner_errors[ent]:
count += len(ner_errors[ent]["missed"])
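The plot.py hunk renames the loop variable type to extraction_type. Beyond readability, this stops the loop from shadowing Python's type builtin; a tiny standalone illustration (not BotSIM code):

```python
# Why the rename matters: assigning to `type` shadows the builtin for the rest of that scope.
ner_errors = {"case_number": {"extraction_type": "regex", "entity_name": "CaseNumber"}}

def shadowed():
    for ent in ner_errors:
        type = ner_errors[ent]["extraction_type"]   # shadows builtins.type inside this function
        # type(ent)                                 # would raise TypeError: 'str' object is not callable
        print(type)                                 # prints the string "regex", not the builtin

def renamed():
    for ent in ner_errors:
        extraction_type = ner_errors[ent]["extraction_type"]  # the diff's fix: no shadowing
        print(type(extraction_type))                            # builtin still works: <class 'str'>

shadowed()
renamed()
```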