In [None]:
!pip install sentence-transformers bertopic umap-learn hdbscan numpy python_dotenv azure-storage-blob panda pytz ipywidgets python-dotenv sendgrid

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from azure.storage.blob import BlobServiceClient
import json, os, pytz, html
from azure.core.exceptions import ResourceExistsError
import json
from datetime import datetime, timedelta, UTC
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail
from dotenv import load_dotenv
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API); the .env file that
# this notebook loads is read from a path under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
def send_email(content):
    """Send the weekly summary email through SendGrid.

    The sender, recipient and API key are read from the ``MAIL_FROM``,
    ``MAIL_TO`` and ``SENDGRID_API_KEY`` environment variables.

    Args:
        content: HTML string used as the body of the message.

    Returns:
        None. On success the response status code, body and headers are
        printed. Delivery is best-effort: any failure is printed and
        swallowed rather than re-raised.
    """
    message = Mail(
        from_email=os.getenv("MAIL_FROM"),
        to_emails=os.getenv("MAIL_TO"),
        subject="Weekly Chatbot Summary",
        html_content=content,
    )

    try:
        sg = SendGridAPIClient(os.environ.get("SENDGRID_API_KEY"))
        response = sg.send(message)
        print(response.status_code)
        print(response.body)
        print(response.headers)
    except Exception as e:
        # Not every exception carries a `.body` attribute (e.g. network or
        # configuration errors). Reading it unconditionally would raise an
        # AttributeError from inside the handler and hide the real problem,
        # so fall back to printing the exception itself.
        print(getattr(e, "body", None) or e)

In [None]:
# Load secrets from the .env file on the mounted Drive into the environment.
load_dotenv('/content/drive/MyDrive/Colab Notebooks/LCI Chatbot/.env')

# Azure Storage connection string (passed to
# BlobServiceClient.from_connection_string below). It is a credential, so it
# is deliberately NOT printed: notebook outputs are saved with the file and
# would leak it to anyone the notebook is shared with.
AZURE_CONN_STR = os.environ["AZURE_CSV_CONTAINER"]

# Blob holding the chat logs that `discovery` reads as JSON lines.
DATA_CONTAINER = "chat-logs"
DATA_BLOB_NAME = "logs"

# Append blob that accumulates the plain-text weekly summaries.
SUMMARY_CONTAINER = "weekly-summaries"
SUMMARY_BLOB_NAME = "topic_discovery.txt"

# Zipped topic model that is downloaded and extracted before inference.
MODEL_CONTAINER = "models"
MODEL_BLOB_NAME = "topic_inference.zip"

# Local directory the model archive is extracted into.
MODEL_DIR = "./topic_model_dir"

In [None]:
def compute_week_window(tz_name="America/Chicago", which="last_full_iso_week", offset_weeks=0, now_utc=None):
    """Return the bounds of the most recent complete Monday-to-Monday week.

    The window is half-open, ``[start, end)``: ``end`` is local midnight on
    the Monday of the current week and ``start`` is exactly one calendar week
    earlier, both expressed in ``tz_name``.

    Args:
        tz_name: IANA time zone name used to decide where weeks begin.
        which: Accepted for interface compatibility; the function does not
            currently read it and always returns the last full week.
        offset_weeks: Number of additional whole weeks to move the window
            back in time (0 = the last full week).
        now_utc: Reference instant. Defaults to the current time in UTC. A
            naive datetime is interpreted as UTC.

    Returns:
        Tuple ``(start, end)`` of timezone-aware datetimes in ``tz_name``.
    """
    tz = pytz.timezone(tz_name)
    if now_utc is None:
        now_utc = datetime.now(UTC)
    elif now_utc.tzinfo is None:
        # A naive value would otherwise be interpreted by astimezone() as
        # system-local time, contradicting the parameter's name.
        now_utc = now_utc.replace(tzinfo=UTC)

    now_local = now_utc.astimezone(tz)

    # Do the calendar arithmetic on naive wall-clock values and attach the
    # zone afterwards. Subtracting a timedelta from a pytz-aware datetime
    # keeps the *original* UTC offset, so a window boundary on the other side
    # of a DST transition would come out one hour off.
    days_since_monday = now_local.isoweekday() - 1
    this_monday = (now_local.replace(tzinfo=None) - timedelta(days=days_since_monday)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    end_naive = this_monday
    if offset_weeks:
        end_naive -= timedelta(weeks=offset_weeks)
    start_naive = end_naive - timedelta(weeks=1)

    # localize() picks the UTC offset that is actually in force on each date.
    return tz.localize(start_naive), tz.localize(end_naive)

def iso_week_id_from_start(start_dt):
    """Format the ISO calendar week of ``start_dt`` as ``YYYY-Www``.

    Args:
        start_dt: A ``date`` or ``datetime`` (anything with ``isocalendar()``).

    Returns:
        String such as ``"2024-W01"``: ISO year, then the zero-padded ISO week.
    """
    iso_year, iso_week, _weekday = start_dt.isocalendar()
    return "{}-W{:02d}".format(iso_year, iso_week)

def append_summary_to_azure(conn_str, container, blob_name, text_block):
    """Append one summary entry to an append blob in Azure Blob Storage.

    The container and the append blob are created on first use. Each entry is
    written as the text (trailing whitespace stripped), a blank line, a rule
    of 60 dashes, and another blank line, encoded as UTF-8.

    Args:
        conn_str: Azure Storage connection string.
        container: Name of the target container.
        blob_name: Name of the append blob inside the container.
        text_block: Text of the entry to append.
    """
    service = BlobServiceClient.from_connection_string(conn_str)

    try:
        service.create_container(container)
    except ResourceExistsError:
        # The container is already there, which is the normal case.
        pass

    blob = service.get_blob_client(container, blob_name)
    if not blob.exists():
        blob.create_append_blob()

    divider = "-" * 60
    entry = text_block.rstrip() + "\n\n" + divider + "\n\n"
    blob.append_block(entry.encode("utf-8"))
    print(f"Appended text to {container}/{blob_name}")


In [None]:
def download_summary(conn_str, container, blob_name, local_path="./summary.txt"):
    """Download a blob from Azure Blob Storage and save it to a local file.

    Args:
        conn_str: Azure Storage connection string.
        container: Name of the container holding the blob.
        blob_name: Name of the blob to download.
        local_path: Destination path; parent directories are created if
            they do not exist.
    """
    service_client = BlobServiceClient.from_connection_string(conn_str)
    client = service_client.get_blob_client(container, blob_name)

    # Fetch the content before touching the local file: opening the target
    # in "wb" mode first would truncate an existing copy (or leave an empty
    # file behind) whenever the download fails.
    data = client.download_blob().readall()

    os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
    with open(local_path, "wb") as f:
        f.write(data)

    print(f"Text file saved at {local_path}")

def download_and_unzip_model(conn_str, container, blob_name, out_dir):
    """Fetch a zipped model from Azure Blob Storage and extract it locally.

    Args:
        conn_str: Azure Storage connection string.
        container: Name of the container holding the archive.
        blob_name: Name of the zip blob.
        out_dir: Directory the archive is extracted into.

    Returns:
        True if the blob existed and was extracted, False if it was not found.
    """
    import io
    import zipfile

    blob = BlobServiceClient.from_connection_string(conn_str).get_blob_client(container, blob_name)

    if blob.exists():
        # The whole archive is read into memory and unpacked from there.
        archive_bytes = blob.download_blob().readall()
        with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
            archive.extractall(out_dir)
        print(f"Model extracted to {out_dir}")
        return True

    print(f"No remote model found at {container}/{blob_name}")
    return False

In [None]:
def _summary_payload(iso_week, start, end, n_prompts, top_topics, notes):
    """Build the JSON-serialisable run record used by both the empty and non-empty paths."""
    return {
        "run_id": iso_week,
        "week_start": start.date().isoformat(),
        "week_end": end.date().isoformat(),
        "n_prompts": n_prompts,
        "top_topics": top_topics,
        "method": {"algo": "BERTopic", "embedding": "all-MiniLM-L6-v2", "probabilities": False},
        "notes": notes,
    }


def discovery(container, in_blob, start=None, end=None, tz_name="America/Chicago"):
    """Assign topics to one week of chat questions and summarise their frequency.

    Steps: load the chat log (downloading it from Azure when no local copy
    named ``in_blob`` exists), clean the question text, keep the rows whose
    local timestamp falls in ``[start, end)``, run the stored BERTopic model
    over them, and count messages per topic.

    Args:
        container: Azure container holding the chat log.
        in_blob: Blob name of the log; also used as the local cache file name.
            The file is read as JSON lines with ``time`` and ``question`` fields.
        start: Inclusive, timezone-aware start of the window. When either
            bound is missing, the last full week is used.
        end: Exclusive, timezone-aware end of the window.
        tz_name: IANA time zone used for the window and for local timestamps.

    Returns:
        Tuple ``(summary, iso_week, summary_dict, partial_df, start, end)``:
        ``summary`` is a DataFrame with one row per topic (``topic``,
        ``count`` and, when there is data, ``topic_name`` and ``share``);
        ``iso_week`` is the ``YYYY-Www`` id of ``start``; ``summary_dict`` is
        a JSON-serialisable record of the run; ``partial_df`` is the shuffled
        window slice with a ``topic_id`` column.

    Raises:
        RuntimeError: If the topic model cannot be downloaded. The function
            previously returned ``0`` here, which made the caller's tuple
            unpacking fail with an unrelated TypeError.
    """
    if not os.path.exists(in_blob):
        print("Data not found locally, downloading data...")
        client = BlobServiceClient.from_connection_string(AZURE_CONN_STR)
        blob = client.get_blob_client(container, in_blob)

        # Start the download before creating the local file, and remove a
        # partially written file on failure; otherwise a truncated file would
        # be mistaken for a valid cached copy on the next run.
        downloader = blob.download_blob()
        try:
            with open(in_blob, "wb") as f:
                for chunk in downloader.chunks():
                    f.write(chunk)
        except Exception:
            if os.path.exists(in_blob):
                os.remove(in_blob)
            raise

    df = pd.read_json(in_blob, lines=True)
    df["time"] = pd.to_datetime(df["time"], errors="coerce", utc=True)
    df = df.dropna(subset=["time", "question"])
    # Normalise the text: lower-case, strip URLs, collapse whitespace.
    df["question"] = (
        df["question"]
        .str.lower()
        .str.replace(r"https?://\S+", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    tz = pytz.timezone(tz_name)
    df["time_local"] = df["time"].dt.tz_convert(tz)
    if start is None or end is None:
        start, end = compute_week_window(tz_name=tz_name, which="last_full_iso_week")
    print(f"Summarizing window: {start.isoformat()} to {end.isoformat()} [{tz_name}]")

    week_mask = (df["time_local"] >= start) & (df["time_local"] < end)
    week_df = df.loc[week_mask].copy()
    partial_df = week_df.sample(frac=1, random_state=42)  # deterministic shuffle

    iso_week = iso_week_id_from_start(start)

    if partial_df.empty:
        # Nothing to classify: return the empty result before downloading the
        # model, so a quiet week neither depends on the model being available
        # nor feeds an empty batch to the topic model.
        partial_df["topic_id"] = pd.Series(dtype="int64")
        summary = pd.DataFrame(
            {"topic": pd.Series(dtype="int64"), "count": pd.Series(dtype="int64")}
        )
        summary_dict = _summary_payload(iso_week, start, end, 0, [], "No data in window")
        return summary, iso_week, summary_dict, partial_df, start, end

    if not download_and_unzip_model(AZURE_CONN_STR, MODEL_CONTAINER, MODEL_BLOB_NAME, MODEL_DIR):
        print("Failed to download model")
        raise RuntimeError(
            f"Topic model not available at {MODEL_CONTAINER}/{MODEL_BLOB_NAME}"
        )

    print("Loading existing models...")
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic.load(f"{MODEL_DIR}/topic_model", embedding_model=embed_model)
    topics, _ = topic_model.transform(partial_df["question"].tolist())

    partial_df["topic_id"] = topics
    topic_dict = topic_model.get_topics()

    def auto_title(tid, topn=3):
        """Label a topic with its first ``topn`` keywords; -1 is the outlier topic."""
        if tid == -1:
            return "Other / Noise"
        # .get() so an id missing from the model's topic table yields
        # "Unknown" instead of a KeyError.
        words = [w for w, _ in (topic_dict.get(tid) or [])[:topn]]
        return " / ".join([w.capitalize() for w in words]) if words else "Unknown"

    summary = (
        partial_df["topic_id"]
        .value_counts()
        .rename_axis("topic")
        .reset_index(name="count")
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

    summary["topic_name"] = summary["topic"].apply(auto_title)
    summary["share"] = (summary["count"] / max(len(partial_df), 1)).round(3)

    top_topics = [
        {
            "topic_id": int(row["topic"]),
            "topic_name": str(row["topic_name"]),
            "count": int(row["count"]),
            "share": float(row["share"]),
        }
        for _, row in summary.iterrows()
    ]

    summary_dict = _summary_payload(
        iso_week, start, end, int(len(partial_df)), top_topics, None
    )

    print("Summary JSON preview:", json.dumps(summary_dict)[:300] + "...")
    return summary, iso_week, summary_dict, partial_df, start, end


In [None]:
def build_plain_text_summary(iso_week, start, end, window_df, summary_df, top_k=10):
    """Render the weekly topic summary as plain text.

    Args:
        iso_week: Week identifier shown in the title (e.g. ``"2024-W01"``).
        start: Start of the window; only its date is shown.
        end: End of the window; only its date is shown.
        window_df: DataFrame with one row per message in the window. An
            optional ``flagged`` column is summed to count flagged messages.
        summary_df: DataFrame with ``topic_name``, ``share`` and ``count``
            columns, one row per topic, already ordered. May be None or empty.
        top_k: Maximum number of topics to list.

    Returns:
        The summary as a single newline-joined string.
    """
    total_msgs = int(len(window_df))
    flagged = int(window_df.get("flagged", pd.Series(dtype=bool)).sum()) if total_msgs else 0

    lines = [
        f"Weekly Topic Summary — {iso_week}",
        f"Window: {start.date().isoformat()} to {end.date().isoformat()}",
        f"Total messages: {total_msgs}",
        f"Flagged messages: {flagged}",
        "",
    ]

    # `summary_df is None` is tolerated here as in build_html_summary.
    if total_msgs == 0 or summary_df is None or summary_df.empty:
        lines.append("No messages this week.")
        return "\n".join(lines)

    lines.append("Top topics this week:")

    for _, row in summary_df.head(top_k).iterrows():
        pct = f"{row['share']*100:.1f}%"
        lines.append(f"  • {row['topic_name']} — {pct} ({int(row['count'])} messages)")

    # Return only after every topic has been appended; returning from inside
    # the loop would cut the list off after the first topic.
    return "\n".join(lines)

def build_html_summary(iso_week, start, end, window_df, summary_df, top_k=10):
    """Render the weekly topic summary as a standalone HTML document.

    Args:
        iso_week: Week identifier shown in the heading; HTML-escaped.
        start: Start of the window; only its date is shown.
        end: End of the window; only its date is shown.
        window_df: DataFrame with one row per message in the window. An
            optional ``flagged`` column is summed to count flagged messages.
        summary_df: DataFrame with ``topic_name``, ``share`` and ``count``
            columns, one row per topic, already ordered. May be None or empty.
        top_k: Maximum number of topics to list.

    Returns:
        A complete ``<!doctype html>`` document as a string.
    """
    total_msgs = int(len(window_df))
    flagged = int(window_df.get("flagged", pd.Series(dtype=bool)).sum()) if total_msgs else 0

    header = (
        f"<h2 style='margin:0 0 8px;'>Weekly Topic Summary — {html.escape(str(iso_week))}</h2>"
        f"<p style='margin:0 0 12px; line-height:1.5;'>"
        f"<strong>Window:</strong> {start.date().isoformat()} to {end.date().isoformat()}<br>"
        f"<strong>Total messages:</strong> {total_msgs}<br>"
        f"<strong>Flagged messages:</strong> {flagged}"
        f"</p>"
    )

    def wrap(body):
        # Single document wrapper so the "no messages" email gets the same
        # font and colour styling as the regular one.
        return (
            "<!doctype html><html>"
            "<body style='font-family:Arial,Helvetica,sans-serif; font-size:14px; color:#111;'>"
            f"{header}{body}</body></html>"
        )

    if total_msgs == 0 or summary_df is None or summary_df.empty:
        return wrap("<p>No messages this week.</p>")

    items = []
    for _, row in summary_df.head(top_k).iterrows():
        # Topic names are derived from user text, so escape them.
        topic = html.escape(str(row["topic_name"]))
        pct = f"{row['share']*100:.1f}%"
        cnt = int(row["count"])
        items.append(f"<li>{topic} — {pct} ({cnt} messages)</li>")

    body = (
        "<h3 style='margin:16px 0 8px;'>Top topics this week:</h3>"
        f"<ul style='margin:8px 0 0 20px; padding:0;'>{''.join(items)}</ul>"
    )
    return wrap(body)

In [None]:
# Make sure the directory the model archive is extracted into exists.
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# Run topic inference for the default window.
summary_df, iso_week, summary_dict, window_df, start, end = discovery(DATA_CONTAINER, DATA_BLOB_NAME)

# Both renderers take the same inputs, so build the arguments once.
report_kwargs = dict(
    iso_week=iso_week,
    start=start,
    end=end,
    window_df=window_df,
    summary_df=summary_df,
    top_k=10,
)

# Plain-text version: appended to the running log in Azure.
readable_text = build_plain_text_summary(**report_kwargs)
append_summary_to_azure(AZURE_CONN_STR, SUMMARY_CONTAINER, SUMMARY_BLOB_NAME, readable_text)

# HTML version: sent by email. (The variable holds HTML despite its name.)
json_text = build_html_summary(**report_kwargs)
send_email(json_text)