# Dashboard: ”Early Warning – Mognad i Sveriges öppna data‑diskussioner”

* [#1489](https://github.com/salgo60/DIGG-skuggbacklog/issues/1489)
* Denna Notebook [1489 DIGG forum](https://github.com/salgo60/DIGG-skuggbacklog/blob/master/Notebook/1489_DIGG_forum.ipynb)
   * se även Notebook [1478 DIGG forum](https://github.com/salgo60/DIGG-skuggbacklog/blob/master/Notebook/1478%20DIGG%20forum.ipynb)



In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-12-05 05:30:27


In [2]:
# 1. läs in
df = pd.read_json("forum_data.json")

# Kolla snabbt
df.head(), df.info()

NameError: name 'pd' is not defined

In [None]:
# Normaliserad text 
from bs4 import BeautifulSoup

df["content_text"] = df["content_html"].apply(
    lambda x: BeautifulSoup(str(x), "html.parser").get_text(" ", strip=True)
)
df["content_text_norm"] = df["content_text"].astype(str).str.lower()

# B) Säker post_time och post_month 
df["post_time"] = pd.to_datetime(df["post_timestamp"], errors="coerce")
df["post_month"] = df["post_time"].dt.strftime("%Y-%m")


In [None]:
# dashboard_generator.py  -- klistra in i en Notebook cell eller kör som .py
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np
import re
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

# ---------- Utdata
OUT = Path("figures")
OUT.mkdir(exist_ok=True)

PDF_OUT = Path("dashboard_report.pdf")

# ---------- Läs in data
#df = pd.read_json("forum_enriched.json", lines=True)
df = pd.read_json("forum_data.json")

# Säkerställ tid, månadssträng och text
df["post_time"] = pd.to_datetime(df["post_timestamp"], errors="coerce")
df = df.dropna(subset=["post_time"]).copy()
df["post_month"] = df["post_time"].dt.strftime("%Y-%m")
df["username_norm"] = df["username"].astype(str).str.strip().str.lower()

# content text från HTML (skapa om inte redan finns)
if "content_text" not in df.columns:
    def html_to_text(x):
        if pd.isna(x): return ""
        return BeautifulSoup(str(x), "html.parser").get_text(" ", strip=True)
    df["content_text"] = df["content_html"].apply(html_to_text)

df["content_text_norm"] = df["content_text"].astype(str).str.lower().fillna("")

# DIGG-flag (justera listan om ni vill)
digg_users = [
    "digg_admin","josefinlassi","mkfsk","digitalist-ops","mikkeschiren",
    "fabian-von-tiedemann","fnordlander","tony","jenniferskoglund",
    "maria_dalhage","nina_berlin","kristine_","sven-erik"
]
digg_users_norm = {u.lower() for u in digg_users}
df["is_digg"] = df["username_norm"].isin(digg_users_norm)

# Helper: spara figur och return fig
def save_fig(fname):
    path = OUT / fname
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close()
    return str(path)

saved_files = []

# ---------- 1: Inlägg över tid
posts_per_month = df.groupby("post_month")["pid"].count().sort_index()
plt.figure(figsize=(10,4))
plt.plot(posts_per_month.index, posts_per_month.values, marker="o")
plt.title("Antal inlägg per månad – hela forumet")
plt.xlabel("Månad"); plt.ylabel("Antal inlägg")
plt.xticks(rotation=45)
saved_files.append(save_fig("01_inlagg_over_tid.png"))

# ---------- 2: Topp 15 användare
top_users = df.groupby("username_norm")["pid"].count().sort_values(ascending=False).head(15)
plt.figure(figsize=(10,4))
top_users.plot(kind="bar")
plt.title("Topp 15 mest aktiva användare")
plt.ylabel("Antal inlägg")
plt.xticks(rotation=60, ha="right")
saved_files.append(save_fig("02_topp_15_anvandare.png"))

# ---------- 3: DIGG vs övriga volym
plt.figure(figsize=(6,4))
counts = df.groupby("is_digg")["pid"].count().rename(index={False:"Övriga", True:"DIGG"})
counts.plot(kind="bar")
plt.title("Antal inlägg: DIGG vs övriga")
plt.ylabel("Antal inlägg")
saved_files.append(save_fig("03_digg_vs_ovriga.png"))

# ---------- 4: DIGG över tid (med/utan Maria)
df_digg = df[df["is_digg"]]
df_digg_no_maria = df_digg[df_digg["username_norm"] != "maria_dalhage"]
digg_all = df_digg.groupby("post_month")["pid"].count().sort_index()
digg_no_maria = df_digg_no_maria.groupby("post_month")["pid"].count().sort_index()
plt.figure(figsize=(10,4))
plt.plot(digg_all.index, digg_all.values, label="DIGG - alla", marker="o")
plt.plot(digg_no_maria.index, digg_no_maria.values, label="DIGG - utan Maria", marker="o", linestyle="--")
plt.title("DIGG:s aktivitet över tid")
plt.xlabel("Månad"); plt.ylabel("Antal inlägg")
plt.xticks(rotation=45); plt.legend()
saved_files.append(save_fig("04_digg_over_tid_med_utan_maria.png"))

# ---------- 5: Trådstarter vs svar
topics_created = df[df["post_index"] == 0]
replies = df[df["post_index"] > 0]
plt.figure(figsize=(6,4))
pd.Series({"Trådstart": len(topics_created), "Svar": len(replies)}).plot(kind="bar")
plt.title("Trådstarter vs svar – hela forumet")
plt.ylabel("Antal inlägg")
saved_files.append(save_fig("05_tradstarter_vs_svar.png"))

# ---------- 6: DIGG roll (startar vs svar)
digg_starters = df_digg[df_digg["post_index"] == 0]
digg_replies = df_digg[df_digg["post_index"] > 0]
plt.figure(figsize=(6,4))
pd.Series({"DIGG - startade trådar": len(digg_starters), "DIGG - svar": len(digg_replies)}).plot(kind="bar")
plt.title("DIGG:s roll i diskussionerna")
plt.ylabel("Antal inlägg")
saved_files.append(save_fig("06_digg_startar_vs_svarar.png"))

# ---------- 7: Enkel ämnesklassificering (om kolumn category finns, annars skip)
if "category" in df.columns:
    plt.figure(figsize=(10,4))
    df["category"].value_counts().head(10).plot(kind="bar")
    plt.title("Vad diskuteras i forumet? (kategori)")
    plt.ylabel("Antal inlägg")
    plt.xticks(rotation=45, ha="right")
    saved_files.append(save_fig("07_amnesfordelning.png"))

# ---------- 8: Lorenz-kurva + Gini
posts_per_user = df.groupby("username_norm")["pid"].count().sort_values()
cum = posts_per_user.cumsum() / posts_per_user.sum()
x = np.arange(len(cum))
plt.figure(figsize=(8,6))
plt.plot(x, cum.values, label="Ackumulerad andel inlägg")
plt.plot(x, x / x[-1], linestyle="--", label="Perfekt jämlikhet")
plt.title("Koncentration av aktivitet (Lorenz-kurva)")
plt.xlabel("Användare (sorterade)")
plt.ylabel("Ackumulerad andel inlägg")
plt.legend()
saved_files.append(save_fig("08_lorenz_kurva.png"))

def gini(x):
    x = np.array(x, dtype=float)
    if x.sum() == 0: return 0.0
    x = np.sort(x)
    n = len(x)
    cumx = np.cumsum(x)
    return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n

gini_value = gini(posts_per_user.values)
print(f"Gini (inlägg per användare): {gini_value:.3f}")

# ---------- 9: Median svar per tråd över tid
replies_per_topic = df.groupby(["post_month","tid"])["pid"].count().reset_index(name="posts")
replies_per_topic["replies"] = replies_per_topic["posts"] - 1
median_replies = replies_per_topic.groupby("post_month")["replies"].median().sort_index()
plt.figure(figsize=(10,4))
plt.plot(median_replies.index, median_replies.values, marker="o")
plt.title("Median antal svar per tråd och månad")
plt.ylabel("Antal svar"); plt.xlabel("Månad")
plt.xticks(rotation=45)
saved_files.append(save_fig("09_median_svar_per_trad.png"))

# ---------- 10: DIGG andel av total aktivitet (%)
total_pm = df.groupby("post_month")["pid"].count().sort_index()
digg_pm = df[df["is_digg"]].groupby("post_month")["pid"].count().reindex(total_pm.index, fill_value=0)
share = (digg_pm / total_pm * 100).fillna(0)
plt.figure(figsize=(10,4))
plt.plot(share.index, share.values, marker="o")
plt.title("DIGG:s andel av inlägg över tid (%)")
plt.ylabel("Andel (%)"); plt.xlabel("Månad")
plt.xticks(rotation=45)
saved_files.append(save_fig("10_digg_andel_over_tid.png"))

# ---------- 11: Burst-index (kortvarig hög aktivitet) för topp 20 aktiva (med minst 5 inlägg)
user_month = df.groupby(["username_norm","post_month"])["pid"].count().reset_index(name="n_posts")
burst = user_month.groupby("username_norm").agg(total_posts=("n_posts","sum"), max_month_posts=("n_posts","max"))
burst["burst_index"] = burst["max_month_posts"]/burst["total_posts"]
burst_filt = burst[burst["total_posts"]>=5]
top20 = burst_filt.sort_values("total_posts", ascending=False).head(20)
plt.figure(figsize=(8,6))
top20.sort_values("burst_index")["burst_index"].plot(kind="barh")
plt.title("Kortvarig hög aktivitet per användare (Burst-index) - topp20")
plt.xlabel("Andel inlägg i mest aktiva månaden")
saved_files.append(save_fig("11_burst_index_top20.png"))

# ---------- 12: Aktiv livslängd vs volym (scatter)
user_stats = df.groupby("username_norm").agg(total_posts=("pid","count"), first_post=("post_time","min"), last_post=("post_time","max"))
user_stats["active_days"] = (user_stats["last_post"] - user_stats["first_post"]).dt.days
user_stats_f = user_stats[user_stats["total_posts"]>=5]
plt.figure(figsize=(8,6))
plt.scatter(user_stats_f["active_days"]+1, user_stats_f["total_posts"], alpha=0.6)
plt.xscale("log")
plt.xlabel("Aktiv livslängd (dagar, logskala)")
plt.ylabel("Antal inlägg")
plt.title("Användarmönster: kortvarig vs långsiktig aktivitet")
saved_files.append(save_fig("12_livslangd_vs_volym.png"))

# ---------- 13: Vem svarar vem - topp20 filter
user_activity = df.groupby("username_norm")["pid"].count().sort_values(ascending=False)
top_users = user_activity[user_activity>=3].head(20).index
df_filt = df[df["username_norm"].isin(top_users)]
thread_starters = df_filt[df_filt["post_index"]==0].set_index("tid")["username_norm"]
replies = df_filt[df_filt["post_index"]>0].copy()
replies["replied_to"] = replies["tid"].map(thread_starters)
edges = replies[(replies["username_norm"]!=replies["replied_to"]) & replies["replied_to"].notna()][["username_norm","replied_to"]]
edge_counts = edges.groupby(["username_norm","replied_to"]).size().reset_index(name="weight")
G = nx.DiGraph()
for _,row in edge_counts.iterrows():
    G.add_edge(row["username_norm"], row["replied_to"], weight=row["weight"])
plt.figure(figsize=(10,10))
pos = nx.spring_layout(G, k=0.7, seed=42)
node_sizes = [df_filt[df_filt["username_norm"]==n].shape[0]*20 for n in G.nodes()]
edge_widths = [G[u][v]["weight"]*0.4 for u,v in G.edges()]
nx.draw_networkx_nodes(G,pos,node_size=node_sizes,node_color="lightblue",alpha=0.9)
nx.draw_networkx_edges(G,pos,width=edge_widths,arrowsize=12,alpha=0.6)
nx.draw_networkx_labels(G,pos,font_size=8)
plt.title("Vem svarar vem - topp 20 mest aktiva")
plt.axis("off")
saved_files.append(save_fig("13_vem_svarar_vem_top20.png"))

# ---------- 14: DCAT-fält över tid och concrete_share
FIELD_KEYWORDS = {
    "dcterms:title": [r"\btitle\b", r"\btitel\b"],
    "dcterms:publisher": [r"\bpublisher\b", r"\bpublicerande\b", r"\butgivare\b"],
    "dcat:keyword": [r"\bkeyword\b", r"\bnyckelord\b", r"\bdcat:keyword\b", r"\bsubject\b"],
    "dcat:hadRole": [r"\bhadrole\b", r"\brole\b", r"\bcurator\b", r"\bproducer\b"],
    "dcat:distribution": [r"\bdistribution\b", r"\baccessurl\b", r"\bdownload\b"],
    "dcterms:issued": [r"\bissued\b", r"\bpublished\b"],
    "dcterms:modified": [r"\bmodified\b", r"\bupdated\b"],
    "dcterms:identifier": [r"\bidentifier\b", r"\bpersistent id\b", r"\bpid\b", r"\bdoi\b"],
    "dcat:contactPoint": [r"\bcontact\b", r"\bkontakt\b"],
    "dcat:theme": [r"\btheme\b", r"\btema\b"]
}
VOCAB_TERMS = ["esco","skos","ssot","single source of truth","kontrollerad vokabulär","persistenta identifierare","rdf","sparql"]
rows=[]
for month, group in df.groupby("post_month"):
    text = " ".join(group["content_text_norm"].tolist())
    row = {"post_month":month}
    for field, patterns in FIELD_KEYWORDS.items():
        cnt = 0
        for p in patterns:
            cnt += len(re.findall(p, text))
        row[field] = cnt
    for v in VOCAB_TERMS:
        row["vocab__"+v] = len(re.findall(r"\b"+re.escape(v)+r"\b", text))
    rows.append(row)
field_df = pd.DataFrame(rows).sort_values("post_month").set_index("post_month")
plot_fields = list(FIELD_KEYWORDS.keys())
if field_df[plot_fields].sum().sum() > 0:
    plt.figure(figsize=(12,4))
    field_df[plot_fields].plot.area(figsize=(12,4))
    plt.title("Nämnanden per DCAT-fält per månad")
    plt.ylabel("Antal omnämnanden"); plt.xlabel("Månad"); plt.xticks(rotation=45)
    plt.tight_layout()
    saved_files.append(save_fig("14_dcat_fields_over_time.png"))

    concrete = ["dcterms:title","dcterms:publisher","dcterms:identifier","dcat:distribution"]
    field_df["concrete_sum"] = field_df[concrete].sum(axis=1)
    field_df["all_sum"] = field_df[plot_fields].sum(axis=1).replace(0,1)
    field_df["concrete_share"] = field_df["concrete_sum"]/field_df["all_sum"]
    plt.figure(figsize=(10,3))
    field_df["concrete_share"].plot(marker="o")
    plt.title("Andel omnämnanden som rör konkreta metadatafält")
    plt.ylabel("Andel (0-1)"); plt.xticks(rotation=45)
    saved_files.append(save_fig("15_dcat_concrete_share.png"))

    # vocab terms
    vocab_cols=[c for c in field_df.columns if c.startswith("vocab__")]
    if len(vocab_cols)>0:
        plt.figure(figsize=(10,4))
        field_df[vocab_cols].plot(kind="bar")
        plt.title("Förekomst av kontrollerade vokabulär över tid")
        plt.ylabel("Antal omnämnanden"); plt.xlabel("Vokabulär")
        saved_files.append(save_fig("16_vocab_terms_over_time.png"))

# ---------- 15: Trendande ord (TF-IDF) - ökar/sjunker
monthly_text = df.groupby("post_month")["content_text_norm"].apply(lambda x: " ".join(x)).reset_index()
vectorizer = TfidfVectorizer(min_df=3, max_df=0.7)
if monthly_text.shape[0] >= 2:
    X = vectorizer.fit_transform(monthly_text["content_text_norm"])
    terms = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(X.toarray(), index=monthly_text["post_month"], columns=terms)
    early = tfidf_df.iloc[:max(1, min(3, tfidf_df.shape[0]//2))].mean()
    late = tfidf_df.iloc[-max(1, min(3, tfidf_df.shape[0]//2)):].mean()
    inc = (late - early).sort_values(ascending=False)
    dec = (early - late).sort_values(ascending=False)
    top_inc = inc.head(6).index.tolist()
    top_dec = dec.head(6).index.tolist()
    if top_inc:
        plt.figure(figsize=(10,4))
        tfidf_df[top_inc].plot(marker="o")
        plt.title("Ord som ökar över tid")
        plt.xticks(rotation=45)
        saved_files.append(save_fig("17_words_increasing.png"))
    if top_dec:
        plt.figure(figsize=(10,4))
        tfidf_df[top_dec].plot(marker="o")
        plt.title("Ord som minskar över tid")
        plt.xticks(rotation=45)
        saved_files.append(save_fig("18_words_decreasing.png"))

# ---------- 16: Spara samlad PDF med PdfPages (alla sparade figurer blir sidor)
with PdfPages(PDF_OUT) as pdf:
    # Titel-sida
    fig = plt.figure(figsize=(11.7,8.3))
    fig.clf()
    fig.text(0.5,0.6,"Dashboard: Early Warning — Mognad i diskussioner om öppna data", ha="center", fontsize=16)
    fig.text(0.5,0.5,"Genererad automatisk från forumdata", ha="center", fontsize=10)
    pdf.savefig(fig); plt.close()

    # lägg till varje PNG som en sida (i den ordning de genererats)
    for fname in sorted(Path("figures").glob("*.png")):
        img = plt.imread(fname)
        fig = plt.figure(figsize=(11.7,8.3))
        plt.imshow(img)
        plt.axis("off")
        pdf.savefig(fig)
        plt.close()

print("✅ Alla figurer sparade i:", OUT.resolve())
print("✅ Samlad PDF skapad:", PDF_OUT.resolve())


In [3]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager för den här etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))


Date: 2025-12-05 05:35:22
Total time elapsed: 04 minutes 55.69 seconds
