# Framing Insights
This notebook extracts key insights from our framing data.

## Imports
Add necessary imports.

In [None]:
import ast
import json
import re
from typing import Literal, Optional

import numpy as np
import pandas as pd

## Load Data

In [None]:
topwords_df = pd.read_parquet("data/topwords_by_topic.parquet")
avg_sentiment = pd.read_csv("data/avg_sentiment_by_source_topic.csv")

## Exploratory Insights
Here, we look at cross-source contrasts for podcasts vs. news. We compute a number of similarity metrics, comparing both sentiment and the words used. Exemplars and topic-level framing insights are displayed as output.

In [None]:
def _json_loads_deep(x, max_depth=2):
    """Decode a JSON string that may itself wrap another JSON-encoded string.

    ``json.loads`` is applied repeatedly, at most ``max_depth`` times, for as
    long as the current value is still a ``str``.  Decoding stops at the first
    non-string value or the first string that fails to parse, and whatever
    has been decoded so far is returned.  Non-string input is returned as is.
    """
    decoded = x
    remaining = max_depth
    while remaining > 0 and isinstance(decoded, str):
        try:
            decoded = json.loads(decoded)
        except Exception:
            # Not (or no longer) valid JSON: keep the last good value.
            return decoded
        remaining -= 1
    return decoded

# helper function to parse objects into python lists for analysis
def _parse_list_like(x):
    """Coerce *x* into a Python list, returning ``[]`` when that is not possible.

    Handled inputs:
      * ``None`` / float NaN            -> ``[]``
      * ``list``                        -> returned unchanged (same object)
      * ``np.ndarray`` / ``pd.Series``  -> ``tolist()``, falling back to ``list(x)``
      * ``dict`` / ``tuple`` / ``set``  -> ``list(x)`` (for a dict: its keys)
      * ``str``                         -> parsed as (possibly double-encoded) JSON,
                                           then as a Python literal; only a list
                                           result is accepted
    Anything else yields ``[]``.
    """
    if x is None:
        return []
    if isinstance(x, float) and np.isnan(x):
        return []
    if isinstance(x, list):
        return x

    if isinstance(x, (np.ndarray, pd.Series)):
        # Try the native conversion first, then plain iteration.
        for convert in (lambda arr: list(arr.tolist()), list):
            try:
                return convert(x)
            except Exception:
                continue
        return []

    if isinstance(x, (dict, tuple, set)):
        try:
            return list(x)
        except Exception:
            return []

    if not isinstance(x, str):
        return []

    decoded = _json_loads_deep(x, max_depth=2)
    if isinstance(decoded, list):
        return decoded
    try:
        literal = ast.literal_eval(x)
    except Exception:
        return []
    return literal if isinstance(literal, list) else []

def words_to_series_weighted(
    top_words,
    key_text=("text","token","word","term"),
    key_val=("value","score","weight")
):
    """Build a ``word -> weight`` float Series from a list of word records.

    ``top_words`` is first normalised with ``_parse_list_like``.  Every element
    that is a dict contributes one entry:

      * the word is taken from the first key in ``key_text`` that is present
        and holds a string (whitespace-stripped);
      * the weight is taken from the first key in ``key_val`` that is present
        (later keys are not tried, even if the conversion to float fails).

    Entries with an empty word, or a weight that is missing, non-finite or not
    strictly positive, are skipped.  When a word repeats, the last weight wins.
    """
    weights = {}
    for record in _parse_list_like(top_words):
        if not isinstance(record, dict):
            continue

        word = next(
            (record[k].strip() for k in key_text if k in record and isinstance(record[k], str)),
            None,
        )

        value_key = next((k for k in key_val if k in record), None)
        weight = None
        if value_key is not None:
            try:
                weight = float(record[value_key])
            except Exception:
                weight = None

        if not word or weight is None:
            continue
        if np.isfinite(weight) and weight > 0:
            weights[word] = weight
    return pd.Series(weights, dtype=float)

# turns words into a series in case of a string object
def words_to_series_plain(top_words_plain):
    """Build an unweighted ``token -> 1.0`` Series from a plain token listing.

    If ``_parse_list_like`` yields a non-empty list, its ``str``/``int``/``float``
    elements are stringified and stripped.  Otherwise, if the input is a
    string, it is split on commas when it contains one and on whitespace
    when it does not.  Empty tokens are dropped; duplicates collapse to one.
    """
    parsed = _parse_list_like(top_words_plain)
    if parsed:
        raw_tokens = [
            str(item).strip()
            for item in parsed
            if isinstance(item, (str, int, float))
        ]
    elif isinstance(top_words_plain, str):
        text = top_words_plain.strip()
        if "," in text:
            raw_tokens = [piece.strip() for piece in text.split(",")]
        else:
            raw_tokens = text.split()
    else:
        raw_tokens = []
    return pd.Series({tok: 1.0 for tok in raw_tokens if tok}, dtype=float)

# various measures of jaccard similarity
def jaccard(a_set, b_set):
    """Return ``|A & B| / |A | B|``, or NaN when both sets are empty."""
    shared = len(a_set & b_set)
    total = len(a_set | b_set)
    if not total:
        return np.nan
    return shared / total

def weighted_jaccard(a: pd.Series, b: pd.Series):
    """Weighted Jaccard similarity: ``sum(min(a, b)) / sum(max(a, b))``.

    Both Series are aligned on the union of their indexes, with labels missing
    from one side counted as 0.0.  Returns NaN when the denominator is not
    strictly positive (e.g. both Series are empty).
    """
    vocab = a.index.union(b.index)
    left = a.reindex(vocab, fill_value=0.0)
    right = b.reindex(vocab, fill_value=0.0)

    numerator = float(np.minimum(left, right).sum())
    denominator = float(np.maximum(left, right).sum())
    if not denominator > 0:
        return np.nan
    return float(numerator / denominator)

def _ensure_unique(df, by, agg_map=None):
    """Collapse rows that share the same ``by`` key into a single row.

    When no key is duplicated, ``df`` is returned untouched (same object).
    Otherwise the frame is grouped on ``by`` and aggregated with ``agg_map``;
    if no map is supplied, numeric columns are averaged and every other
    column keeps its first value.
    """
    if not df.duplicated(by).any():
        return df

    if agg_map is None:
        agg_map = {
            col: ("mean" if pd.api.types.is_numeric_dtype(df[col]) else "first")
            for col in df.columns
            if col not in by
        }
    return df.groupby(by, as_index=False).agg(agg_map)

def build_pair_contrasts(
    avg_sentiment: pd.DataFrame,
    top_words_df: pd.DataFrame,
    sourceA: str,
    sourceB: str,
    id_col: Literal["source_name","source_type"] = "source_name",
    topic_col: str = "topic",
    sentiment_col: str = "avg_sentiment_score",
    q_col: Optional[str] = "quantile_sentiment_scaled",
    label_col: Optional[str] = "sentiment_label",
    min_top_words: int = 3,
    big_split_abs_delta: float = 0.10,
    low_overlap_jaccard: float = 0.20,
    high_overlap_jaccard: float = 0.60,
    verbose: bool = False,
    same_sent_abs_delta: float = 0.05,
):
    """Compare two sources topic by topic on sentiment and top-word overlap.

    Args:
        avg_sentiment: One row per (source, topic) with ``sentiment_col`` and,
            optionally, ``q_col`` / ``label_col``.
        top_words_df: One row per (source, topic) with a ``top_words`` and/or
            ``top_words_plain`` column.
        sourceA, sourceB: Values of ``id_col`` selecting the two sides.
        id_col: Column identifying the source.
        topic_col: Column identifying the topic (stringified and stripped
            on both frames before joining).
        sentiment_col: Column holding the sentiment score.
        q_col, label_col: Optional extra sentiment columns carried through as
            ``qA``/``qB`` and ``labelA``/``labelB``.  When ``None`` or absent
            from ``avg_sentiment`` the output columns are filled with NaN.
        min_top_words: Minimum number of words required on *both* sides for
            the overlap metrics to be computed; otherwise they are NaN.
        big_split_abs_delta: ``|delta|`` at or above this is a "big split".
        low_overlap_jaccard, high_overlap_jaccard: Jaccard thresholds for
            "different frames" / "same frames".
        verbose: Print topic counts for each side and their overlap.
        same_sent_abs_delta: ``|delta|`` at or below this counts as "same
            sentiment" for the ``same_sent_diff_frames`` bucket.

    Returns:
        DataFrame sorted by ``|delta|`` descending, with columns
        ``topic, sentA, qA, labelA, sentB, qB, labelB, delta, jaccard,
        w_jaccard, cosine, n_topA, n_topB, twA, twB, bucket``, where
        ``delta = sentA - sentB``.  Only topics present for both sources in
        ``avg_sentiment`` are included.  An empty frame with the same columns
        is returned when there is no shared topic.
    """
    out_cols = ["topic","sentA","qA","labelA","sentB","qB","labelB","delta",
                "jaccard","w_jaccard","cosine","n_topA","n_topB","twA","twB","bucket"]

    S = avg_sentiment.copy()
    T = top_words_df.copy()
    for frame in (S, T):
        frame[topic_col] = frame[topic_col].astype(str).str.strip()

    id_topic = [id_col, topic_col]
    S = _ensure_unique(S, by=id_topic)
    T = _ensure_unique(T, by=id_topic)

    # Optional sentiment columns are only carried when they actually exist;
    # building the rename map from this list avoids ``None`` keys.
    optional = [
        (col, stem)
        for col, stem in ((q_col, "q"), (label_col, "label"))
        if col is not None and col in S.columns
    ]
    keep_sent = [topic_col, sentiment_col] + [col for col, _ in optional]

    def _sentiment_side(source, suffix):
        names = {sentiment_col: "sent" + suffix}
        names.update({col: stem + suffix for col, stem in optional})
        return S.loc[S[id_col] == source, keep_sent].rename(columns=names)

    SA = _sentiment_side(sourceA, "A")
    SB = _sentiment_side(sourceB, "B")

    base = SA.merge(SB, on=topic_col, how="inner")
    if verbose:
        print(f"Topics with sentiment A={SA[topic_col].nunique()}, B={SB[topic_col].nunique()}, overlap={base[topic_col].nunique()}")

    if base.empty:
        return pd.DataFrame(columns=out_cols)

    # Tolerate a top-words table that carries only one of the two encodings.
    tw_cols = [c for c in ("top_words", "top_words_plain") if c in T.columns]

    def _topwords_side(source, suffix):
        names = {"top_words": "tw" + suffix, "top_words_plain": "tw" + suffix + "_plain"}
        return T.loc[T[id_col] == source, [topic_col] + tw_cols].rename(columns=names)

    df = (base.merge(_topwords_side(sourceA, "A"), on=topic_col, how="left")
              .merge(_topwords_side(sourceB, "B"), on=topic_col, how="left"))

    deltas, jacs, wjacs, coss, nA_list, nB_list = [], [], [], [], [], []
    for _, r in df.iterrows():
        deltas.append(float(r["sentA"]) - float(r["sentB"]))

        # Prefer the weighted encoding; fall back to the plain token list.
        serA = words_to_series_weighted(r.get("twA"))
        serB = words_to_series_weighted(r.get("twB"))
        if serA.size == 0:
            serA = words_to_series_plain(r.get("twA_plain"))
        if serB.size == 0:
            serB = words_to_series_plain(r.get("twB_plain"))

        nA, nB = int(serA.size), int(serB.size)
        nA_list.append(nA)
        nB_list.append(nB)

        if (nA >= min_top_words) and (nB >= min_top_words):
            j = jaccard(set(serA.index), set(serB.index))
            wj = weighted_jaccard(serA, serB)
            # NOTE(review): cosine_sim is not defined in this section of the
            # notebook; it is expected to be provided elsewhere.
            cs = cosine_sim(serA, serB)
        else:
            j = wj = cs = np.nan

        jacs.append(j)
        wjacs.append(wj)
        coss.append(cs)

    df["delta"] = deltas
    df["jaccard"] = jacs
    df["w_jaccard"] = wjacs
    df["cosine"] = coss
    df["n_topA"] = nA_list
    df["n_topB"] = nB_list

    abs_delta = df["delta"].abs()
    no_tw = df["jaccard"].isna()
    big_split = abs_delta >= big_split_abs_delta
    low_overlap = (~no_tw) & (df["jaccard"] <= low_overlap_jaccard)
    high_overlap = (~no_tw) & (df["jaccard"] >= high_overlap_jaccard)
    # Conditions are evaluated in order; the first match wins.
    df["bucket"] = np.select(
        [
            big_split & low_overlap,
            (abs_delta <= same_sent_abs_delta) & low_overlap,
            big_split & high_overlap,
            big_split & no_tw,
        ],
        [
            "big_split_diff_frames", "same_sent_diff_frames", "big_split_same_frames", "big_split_no_topwords",
        ],
        default="other"
    )

    # Downstream helpers address the topic as "topic" regardless of the input
    # column name, and expect every output column to exist.
    if topic_col != "topic":
        df = df.rename(columns={topic_col: "topic"})
    df = df.reindex(columns=out_cols)
    return df.sort_values(by="delta", key=lambda s: s.abs(), ascending=False).reset_index(drop=True)

def parse_words_any(x, x_plain):
    """Parse top words into a Series of weights normalised to sum to 1.

    The weighted encoding ``x`` is tried first; if it yields nothing, the
    plain token listing ``x_plain`` is used instead.  A Series that is empty
    or whose weights do not sum to a positive value is returned unnormalised.
    """
    weights = words_to_series_weighted(x)
    if weights.size == 0:
        weights = words_to_series_plain(x_plain)

    if weights.size == 0:
        return weights
    total = weights.sum()
    if total > 0:
        return weights / total
    return weights

def macro_summary(pair: pd.DataFrame, min_top_words=3):
    """Summarise a pair-contrast table into a dict of headline statistics.

    Returns ``{}`` for an empty table.  Otherwise reports the topic count, how
    many topics have at least ``min_top_words`` words on both sides, the
    median/mean absolute sentiment delta, the share of topics where each side
    is more positive, the median Jaccard among topics with enough words on
    both sides (NaN if there are none), and the per-bucket topic counts.
    """
    if pair.empty:
        return {}

    enough_a = pair["n_topA"] >= min_top_words
    enough_b = pair["n_topB"] >= min_top_words
    has_both = enough_a & enough_b
    covered = pair.loc[has_both]

    delta = pair["delta"]
    abs_delta = delta.abs()
    if covered.empty:
        median_jaccard = np.nan
    else:
        median_jaccard = float(covered["jaccard"].median())

    return {
        "n_topics": int(len(pair)),
        "n_with_topwords_both": int(has_both.sum()),
        "median_abs_delta": float(abs_delta.median()),
        "mean_abs_delta": float(abs_delta.mean()),
        "share_A_more_positive": float((delta > 0).mean()),
        "share_B_more_positive": float((delta < 0).mean()),
        "median_jaccard_with_tw": median_jaccard,
        "bucket_counts": pair["bucket"].value_counts().to_dict(),
    }

def contrast_terms(row, top=5):
    """Describe how two sides' top words for one topic overlap and differ.

    Args:
        row: A mapping/Series with ``twA`` and ``twB`` (and optionally
            ``twA_plain`` / ``twB_plain`` as fallbacks).
        top: Maximum number of words reported per field.

    Returns:
        Dict of comma-separated word lists:
          * ``overlap_words``: words on both sides, ranked by the smaller of
            the two normalised weights;
          * ``distinct_A`` / ``distinct_B``: words on one side only, ranked
            by that side's weight;
          * ``tilt_A`` / ``tilt_B``: words ranked by the weight difference
            (A minus B, respectively B minus A) over the union of words.
        Ties are broken alphabetically so the output is reproducible.
    """
    sA = parse_words_any(row["twA"], row.get("twA_plain", None))
    sB = parse_words_any(row["twB"], row.get("twB_plain", None))

    def _ranked(weights, limit):
        # Highest weight first, then alphabetical for a deterministic order.
        ordered = sorted(weights.items(), key=lambda item: (-item[1], str(item[0])))
        return [word for word, _ in ordered[:limit]]

    in_both = sA.index.isin(sB.index)
    overlap = pd.Series(
        {w: min(sA[w], sB[w]) for w in sA.index[in_both]},
        dtype=float,
    )
    # Filter first and rank afterwards: selecting labels from an already
    # sorted Series with ``.loc[list]`` would return them in the list's
    # order and discard the ranking.
    only_a = sA[~in_both]
    only_b = sB[~sB.index.isin(sA.index)]

    union = sA.index.union(sB.index)
    diff = pd.Series({w: sA.get(w, 0) - sB.get(w, 0) for w in union}, dtype=float)

    return {
        "overlap_words": ", ".join(_ranked(overlap, top)),
        "distinct_A": ", ".join(_ranked(only_a, top)),
        "distinct_B": ", ".join(_ranked(only_b, top)),
        "tilt_A": ", ".join(_ranked(diff, top)),
        "tilt_B": ", ".join(_ranked(-diff, top)),
    }

def exemplars(pair_df: pd.DataFrame, bucket: str, k=5, min_top_words=3):
    """Pick the ``k`` topics of one bucket with the largest ``|delta|``.

    Only rows with at least ``min_top_words`` words on both sides are
    considered.  Each selected topic is returned with its rounded delta and
    sentiments plus the word contrasts from ``contrast_terms`` (4 words per
    field).  An empty frame with the expected columns is returned when no
    row qualifies.
    """
    in_bucket = pair_df["bucket"] == bucket
    enough_a = pair_df["n_topA"] >= min_top_words
    enough_b = pair_df["n_topB"] >= min_top_words
    candidates = pair_df.loc[in_bucket & enough_a & enough_b].copy()

    if candidates.empty:
        return pd.DataFrame(columns=["topic","delta","sentA","sentB","overlap_words","distinct_A","distinct_B","tilt_A","tilt_B"])

    def _record(r):
        terms = contrast_terms(r, top=4)
        record = {"topic": r["topic"]}
        for field in ("delta", "sentA", "sentB"):
            record[field] = round(float(r[field]), 3)
        record.update(terms)
        return record

    ranked = candidates.sort_values(by="delta", key=lambda s: s.abs(), ascending=False)
    return pd.DataFrame([_record(r) for _, r in ranked.head(k).iterrows()])

def exemplar_text_rows(df: pd.DataFrame, sideA_name="A", sideB_name="B"):
    """Render each exemplar row as a one-line, human-readable summary.

    Every row must provide ``topic``, ``delta``, ``sentA``, ``sentB``,
    ``overlap_words``, ``distinct_A`` and ``distinct_B``.  Returns one string
    per row, in row order.
    """
    def _render(r):
        header = f"{r['topic']} (Δ={r['delta']:+.3f}; {sideA_name}={r['sentA']:+.3f}, {sideB_name}={r['sentB']:+.3f}) "
        overlap = f"Overlap: [{r['overlap_words']}]. "
        only_a = f"{sideA_name}-distinct: [{r['distinct_A']}]; "
        only_b = f"{sideB_name}-distinct: [{r['distinct_B']}]."
        return header + overlap + only_a + only_b

    return [_render(r) for _, r in df.iterrows()]

def find_topics(pair_df, query="trump"):
    """Return the sorted unique topics whose name contains *query*, ignoring case."""
    needle = query.lower()
    matches = []
    for name in pair_df["topic"].unique():
        if needle in name.lower():
            matches.append(name)
    matches.sort()
    return matches

def topic_view(pair_df, topic, sideA="Podcasts", sideB="News", top=6):
    """Summarise one topic: sentiment on each side plus shared and tilted words.

    Args:
        pair_df: Pair-contrast table with ``topic``, ``delta``, ``sentA``,
            ``sentB``, ``twA`` and ``twB`` (``jaccard_clean`` is optional).
        topic: Exact topic name; the first matching row is used.
        sideA, sideB: Display names used as prefixes of the result keys.
        top: Maximum number of words per word list.

    Returns:
        Dict with the topic, its delta and per-side sentiment, the words
        shared by both sides (ranked by the smaller normalised weight), and
        the words each side weights more heavily than the other.

    Raises:
        IndexError: If ``topic`` is not present in ``pair_df``.
    """
    row = pair_df.loc[pair_df["topic"] == topic].iloc[0]

    # NOTE(review): ``stoplist`` is read from module scope; it is not defined
    # in this section and is presumably set elsewhere in the notebook.
    a = clean_token_series(words_to_series_weighted(row.get("twA")), stoplist)
    b = clean_token_series(words_to_series_weighted(row.get("twB")), stoplist)
    a = a / a.sum() if a.sum() > 0 else a
    b = b / b.sum() if b.sum() > 0 else b

    common = list(set(a.index) & set(b.index))
    common = sorted(common, key=lambda w: min(a.get(w, 0), b.get(w, 0)), reverse=True)[:top]

    tiltA = (a - b.reindex(a.index, fill_value=0)).sort_values(ascending=False).head(top).index.tolist()
    # Align ``a`` on B's vocabulary: reindexing on ``a.index`` is a no-op and
    # turned every B-only word into NaN, hiding it from the B tilt.
    tiltB = (b - a.reindex(b.index, fill_value=0)).sort_values(ascending=False).head(top).index.tolist()

    return {
        "topic": topic,
        "delta": float(row["delta"]),
        sideA+"_sent": float(row["sentA"]),
        sideB+"_sent": float(row["sentB"]),
        "overlap": common,
        sideA+"_tilt": tiltA,
        sideB+"_tilt": tiltB,
        "jaccard_clean": float(row.get("jaccard_clean", np.nan)),
    }

def npr_similarity(avg_sentiment, topwords_df, podcasts, baseline="NPR", min_top_words=5):
    """Rank podcasts by how closely they track the baseline source.

    For each podcast the baseline is contrasted with it via
    ``build_pair_contrasts`` (baseline is side A), word overlap is recomputed
    on stoplist-cleaned tokens, and the sentiment correlation across shared
    topics is measured.  Podcasts with no shared topic are skipped.

    Returns:
        DataFrame with columns ``podcast, n_topics, median_abs_delta,
        sent_corr, median_jaccard_clean``, sorted by correlation (desc),
        cleaned Jaccard (desc) and median absolute delta (asc).  The frame is
        empty, but still has these columns, when no podcast qualifies.
    """
    columns = ["podcast", "n_topics", "median_abs_delta", "sent_corr", "median_jaccard_clean"]
    stoplist_names = None
    rows = []
    for p in podcasts:
        pair = build_pair_contrasts(avg_sentiment, topwords_df, baseline, p, id_col="source_name", verbose=False)
        if pair.empty:
            continue
        if stoplist_names is None:
            # The stoplist depends only on ``topwords_df``, so build it once,
            # and only if at least one podcast needs it.
            # NOTE(review): assumes global_token_stats (defined outside this
            # section) is a pure function of its arguments.
            stat = global_token_stats(topwords_df, id_col="source_name")
            stoplist_names = make_stoplist(stat, min_total_topics=5, min_podcast_share=0.85)
        pairC = add_cleaned_overlap(pair, stoplist_names, min_top_words=min_top_words)
        # Correlation is undefined when either side has no variance.
        has_spread = pairC["sentA"].std() > 0 and pairC["sentB"].std() > 0
        corr = np.corrcoef(pairC["sentA"], pairC["sentB"])[0, 1] if has_spread else np.nan
        rows.append({
            "podcast": p,
            "n_topics": int(len(pairC)),
            "median_abs_delta": float(pairC["delta"].abs().median()),
            "sent_corr": float(corr),
            "median_jaccard_clean": float(pairC["jaccard_clean"].median(skipna=True)),
        })
    # Passing the column list keeps the sort below valid when ``rows`` is empty.
    out = pd.DataFrame(rows, columns=columns)
    return out.sort_values(by=["sent_corr","median_jaccard_clean","median_abs_delta"], ascending=[False, False, True])

def compare_against(avg_sentiment, topwords_df, base_source, others, id_col="source_name", min_top_words=3):
    """Contrast ``base_source`` with each source in ``others``.

    Each comparison runs ``build_pair_contrasts`` (base is side A) and
    ``macro_summary``; sources sharing no topic with the base are skipped.

    Returns:
        DataFrame with columns ``A, B, n_topics, n_with_topwords_both,
        median_abs_delta, share_A_more_positive,
        share_big_split_diff_frames``, sorted by ``median_abs_delta``
        descending.  The frame is empty, but still has these columns, when
        no comparison produced a row.
    """
    columns = ["A", "B", "n_topics", "n_with_topwords_both", "median_abs_delta",
               "share_A_more_positive", "share_big_split_diff_frames"]
    rows = []
    for other in others:
        pair = build_pair_contrasts(avg_sentiment, topwords_df, base_source, other, id_col=id_col, verbose=False)
        if pair.empty:
            continue
        summ = macro_summary(pair, min_top_words=min_top_words)
        rows.append({
            "A": base_source, "B": other,
            "n_topics": summ.get("n_topics", 0),
            "n_with_topwords_both": summ.get("n_with_topwords_both", 0),
            "median_abs_delta": summ.get("median_abs_delta", np.nan),
            "share_A_more_positive": summ.get("share_A_more_positive", np.nan),
            "share_big_split_diff_frames": (pair["bucket"] == "big_split_diff_frames").mean(),
        })
    # Passing the column list keeps the sort below valid when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns).sort_values("median_abs_delta", ascending=False)

def aggregate_distinct_tokens(pair_df, subset_mask, stoplist, side="A", top=30):
    """Average one side's cleaned, normalised word weights over selected rows.

    Args:
        pair_df: Pair-contrast table with ``twA`` / ``twB`` (the ``*_plain``
            fallbacks are used only if present).
        subset_mask: Boolean mask selecting the rows to aggregate.
        stoplist: Tokens removed by ``clean_token_series``.
        side: ``"A"`` reads ``twA``; any other value reads ``twB``.
        top: Number of highest-weighted tokens to return.

    Returns:
        Series of the ``top`` tokens by mean weight across the selected rows
        (a token absent from a row counts as 0 there); empty float Series
        when no row is selected.
    """
    weighted_key = "twA" if side == "A" else "twB"
    plain_key = weighted_key + "_plain"

    pool = []
    for _, r in pair_df.loc[subset_mask].iterrows():
        s = words_to_series_weighted(r[weighted_key])
        if s.size == 0:
            # ``.get``: tables produced by build_pair_contrasts do not carry
            # the plain columns, so a hard lookup would raise KeyError.
            s = words_to_series_plain(r.get(plain_key))
        s = clean_token_series(s, stoplist)
        if not s.index.is_unique:
            # Column-wise concat below requires unique labels per Series.
            s = s.groupby(level=0, sort=False).sum()
        total = s.sum()
        pool.append(s / total if total > 0 else s)

    if not pool:
        return pd.Series([], dtype=float)
    agg = pd.concat(pool, axis=1).fillna(0).mean(axis=1)
    return agg.sort_values(ascending=False).head(top)

def make_stoplist(stat, min_total_topics=10, min_podcast_share=0.9, max_len=3, extra_regex=r"^\d|\.com$"):
    """Build the set of tokens to ignore when comparing top words.

    A token from ``stat["tokens"]`` is added when it is at most ``max_len``
    characters long, when it matches ``extra_regex``, or when it appears in
    at least ``min_total_topics`` topics with a ``podcast_share_topics`` of
    at least ``min_podcast_share``.  A fixed list of sponsor/boilerplate
    terms is always included.
    """
    stop = {"squarespace","bluechew","quince","promo","code","sponsor","midas","com","dot","youtube","subscribe"}
    for _, record in stat.iterrows():
        token = record["tokens"]
        short_or_noisy = (len(token) <= max_len) or re.search(extra_regex, token)
        if short_or_noisy or (
            record["total_topics"] >= min_total_topics
            and record["podcast_share_topics"] >= min_podcast_share
        ):
            stop.add(token)
    return stop

def clean_token_series(s: pd.Series, stoplist: set):
    """Lower-case token labels and drop noise tokens from a weight Series.

    A token is kept only if, after lower-casing, it is not in ``stoplist``,
    is at least 3 characters long, contains no digit, and consists solely of
    ``a-z`` and ``-``.

    Lower-casing can make distinct labels collide (``"Trump"`` / ``"trump"``).
    Colliding entries are merged by summing their weights so that the result
    always has a unique index; duplicate labels would otherwise make later
    ``reindex`` / column-wise ``concat`` calls raise.  First-appearance order
    is preserved.  The input Series is not modified.
    """
    def keep(tok):
        if tok in stoplist or len(tok) < 3:
            return False
        if any(ch.isdigit() for ch in tok):
            return False
        return re.fullmatch(r"[a-z\-]+", tok) is not None

    s2 = s.copy()
    s2.index = [w.lower() for w in s2.index]
    mask = np.fromiter((keep(tok) for tok in s2.index), dtype=bool, count=len(s2))
    s2 = s2[mask]
    if not s2.index.is_unique:
        s2 = s2.groupby(level=0, sort=False).sum()
    return s2

def add_cleaned_overlap(pair_df: pd.DataFrame, stoplist: set, min_top_words=3):
    """Return a copy of ``pair_df`` with overlap metrics on cleaned tokens.

    For every row, each side's top words are parsed (weighted encoding
    first, plain listing as fallback), filtered with ``clean_token_series``
    and normalised to sum to 1.  Added columns: ``jaccard_clean``,
    ``w_jaccard_clean``, ``cosine_clean`` (NaN unless both sides keep at
    least ``min_top_words`` tokens) and ``n_topA_clean`` / ``n_topB_clean``.
    """
    def _clean_shares(record, key):
        weights = words_to_series_weighted(record.get(key))
        if weights.size == 0:
            weights = words_to_series_plain(record.get(key + "_plain", None))
        weights = clean_token_series(weights, stoplist)
        return weights / weights.sum() if weights.sum() > 0 else weights

    metrics = {
        "jaccard_clean": [],
        "w_jaccard_clean": [],
        "cosine_clean": [],
        "n_topA_clean": [],
        "n_topB_clean": [],
    }
    for _, record in pair_df.iterrows():
        a = _clean_shares(record, "twA")
        b = _clean_shares(record, "twB")
        metrics["n_topA_clean"].append(int(a.size))
        metrics["n_topB_clean"].append(int(b.size))

        if a.size < min_top_words or b.size < min_top_words:
            scores = (np.nan, np.nan, np.nan)
        else:
            scores = (
                jaccard(set(a.index), set(b.index)),
                weighted_jaccard(a, b),
                cosine_sim(a, b),
            )
        metrics["jaccard_clean"].append(scores[0])
        metrics["w_jaccard_clean"].append(scores[1])
        metrics["cosine_clean"].append(scores[2])

    out = pair_df.copy()
    for column, values in metrics.items():
        out[column] = values
    return out

def extremes_by_source(S, id_col="source_name", topic_col="topic", sentiment_col="avg_sentiment_score", k=10):
    """List each source's ``k`` most positive and most negative topics.

    Rows with a missing topic or sentiment are ignored.  Returns
    ``{source: {"most_positive": [[topic, score], ...],
    "most_negative": [[topic, score], ...]}}``.
    """
    def _ranked(frame, ascending):
        return frame.sort_values(sentiment_col, ascending=ascending).head(k).values.tolist()

    result = {}
    for source, group in S.groupby(id_col):
        scored = group[[topic_col, sentiment_col]].dropna()
        result[source] = {
            "most_positive": _ranked(scored, False),
            "most_negative": _ranked(scored, True),
        }
    return result

def find_topics(pair_df, query="trump"):
    """Return the sorted unique topics whose name contains *query*, ignoring case."""
    needle = query.lower()
    matches = []
    for name in pair_df["topic"].unique():
        if needle in name.lower():
            matches.append(name)
    matches.sort()
    return matches

def topic_view(pair_df, topic, sideA="Podcasts", sideB="News", top=6):
    """Summarise one topic: sentiment on each side plus shared and tilted words.

    Args:
        pair_df: Pair-contrast table with ``topic``, ``delta``, ``sentA``,
            ``sentB``, ``twA`` and ``twB`` (``jaccard_clean`` is optional).
        topic: Exact topic name; the first matching row is used.
        sideA, sideB: Display names used as prefixes of the result keys.
        top: Maximum number of words per word list.

    Returns:
        Dict with the topic, its delta and per-side sentiment, the words
        shared by both sides (ranked by the smaller normalised weight), and
        the words each side weights more heavily than the other.

    Raises:
        IndexError: If ``topic`` is not present in ``pair_df``.
    """
    row = pair_df.loc[pair_df["topic"] == topic].iloc[0]

    # NOTE(review): ``stoplist`` is read from module scope; it is not defined
    # in this section and is presumably set elsewhere in the notebook.
    a = clean_token_series(words_to_series_weighted(row.get("twA")), stoplist)
    b = clean_token_series(words_to_series_weighted(row.get("twB")), stoplist)
    a = a / a.sum() if a.sum() > 0 else a
    b = b / b.sum() if b.sum() > 0 else b

    common = list(set(a.index) & set(b.index))
    common = sorted(common, key=lambda w: min(a.get(w, 0), b.get(w, 0)), reverse=True)[:top]

    tiltA = (a - b.reindex(a.index, fill_value=0)).sort_values(ascending=False).head(top).index.tolist()
    # Align ``a`` on B's vocabulary: reindexing on ``a.index`` is a no-op and
    # turned every B-only word into NaN, hiding it from the B tilt.
    tiltB = (b - a.reindex(b.index, fill_value=0)).sort_values(ascending=False).head(top).index.tolist()

    return {
        "topic": topic,
        "delta": float(row["delta"]),
        sideA+"_sent": float(row["sentA"]),
        sideB+"_sent": float(row["sentB"]),
        "overlap": common,
        sideA+"_tilt": tiltA,
        sideB+"_tilt": tiltB,
        "jaccard_clean": float(row.get("jaccard_clean", np.nan)),
    }

def npr_similarity(avg_sentiment, topwords_df, podcasts, baseline="NPR", min_top_words=5):
    """Rank podcasts by how closely they track the baseline source.

    For each podcast the baseline is contrasted with it via
    ``build_pair_contrasts`` (baseline is side A), word overlap is recomputed
    on stoplist-cleaned tokens, and the sentiment correlation across shared
    topics is measured.  Podcasts with no shared topic are skipped.

    Returns:
        DataFrame with columns ``podcast, n_topics, median_abs_delta,
        sent_corr, median_jaccard_clean``, sorted by correlation (desc),
        cleaned Jaccard (desc) and median absolute delta (asc).  The frame is
        empty, but still has these columns, when no podcast qualifies.
    """
    columns = ["podcast", "n_topics", "median_abs_delta", "sent_corr", "median_jaccard_clean"]
    stoplist_names = None
    rows = []
    for p in podcasts:
        pair = build_pair_contrasts(avg_sentiment, topwords_df, baseline, p, id_col="source_name")
        if pair.empty:
            continue
        if stoplist_names is None:
            # The stoplist depends only on ``topwords_df``, so build it once,
            # and only if at least one podcast needs it.
            # NOTE(review): assumes global_token_stats (defined outside this
            # section) is a pure function of its arguments.
            stat = global_token_stats(topwords_df, id_col="source_name")
            stoplist_names = make_stoplist(stat, min_total_topics=5, min_podcast_share=0.85)
        pairC = add_cleaned_overlap(pair, stoplist_names, min_top_words=min_top_words)
        # Correlation is undefined when either side has no variance.
        has_spread = pairC["sentA"].std() > 0 and pairC["sentB"].std() > 0
        corr = np.corrcoef(pairC["sentA"], pairC["sentB"])[0, 1] if has_spread else np.nan
        rows.append({
            "podcast": p,
            "n_topics": int(len(pairC)),
            "median_abs_delta": float(pairC["delta"].abs().median()),
            "sent_corr": float(corr),
            "median_jaccard_clean": float(pairC["jaccard_clean"].median(skipna=True)),
        })
    # Passing the column list keeps the sort below valid when ``rows`` is empty.
    out = pd.DataFrame(rows, columns=columns)
    return out.sort_values(by=["sent_corr","median_jaccard_clean","median_abs_delta"], ascending=[False, False, True])

def consensus_gap(avg_sentiment, topwords_df, podcasts, baseline="NPR"):
    """Per topic, measure how consistently podcasts are more positive than the baseline.

    Each podcast is contrasted with ``baseline`` via ``build_pair_contrasts``
    with the baseline as side A, so that function's ``delta`` equals
    *baseline minus podcast*.  The sign is flipped here so that the reported
    gap is *podcast minus baseline*, matching the output column names.

    Returns:
        DataFrame indexed by topic with
          * ``share_podcasts_more_pos``: podcasts whose sentiment exceeds
            the baseline's, divided by the number of podcasts that produced
            any contrast (a podcast lacking the topic counts as "not more
            positive");
          * ``mean_delta_vs_NPR``: mean of (podcast - baseline) over the
            podcasts covering the topic.
        Sorted by ``share_podcasts_more_pos`` descending.  Empty, with the
        same columns, when no podcast yields a contrast.
    """
    out_cols = ["share_podcasts_more_pos", "mean_delta_vs_NPR"]

    frames = []
    # dict.fromkeys de-duplicates while keeping order; a repeated podcast
    # would make the pivot below fail on duplicate (topic, podcast) entries.
    for p in dict.fromkeys(podcasts):
        pair = build_pair_contrasts(avg_sentiment, topwords_df, baseline, p, id_col="source_name")
        if pair.empty:
            continue
        frames.append(pair[["topic"]].assign(delta=-pair["delta"], podcast=p))

    if not frames:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=out_cols, index=pd.Index([], name="topic"))

    allp = pd.concat(frames)
    pivot = allp.pivot(index="topic", columns="podcast", values="delta")
    sign_share = (pivot.gt(0).sum(axis=1) / pivot.shape[1]).rename("share_podcasts_more_pos")
    mean_gap = pivot.mean(axis=1).rename("mean_delta_vs_NPR")
    return pd.concat([sign_share, mean_gap], axis=1).sort_values("share_podcasts_more_pos", ascending=False)

def map_category(topic):
    """Assign a coarse category to a topic name by keyword lookup.

    Categories are tried in the order listed below and the first one with a
    matching keyword wins; ``"other"`` is returned when nothing matches.
    Matching is a case-insensitive *substring* test, so short keywords also
    hit inside longer words (e.g. ``"war"`` matches ``"award"``).
    """
    rules = (
        ("politics", ("election", "congress", "politic", "trump", "biden")),
        ("health", ("health", "care", "disease", "covid")),
        ("justice", ("crime", "police", "court")),
        ("economy", ("econom", "market", "business", "income")),
        ("world", ("foreign", "war", "israel", "ukraine", "china")),
    )
    lowered = topic.lower()
    for category, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "other"

def topic_view(pair_df, topic, sideA="Podcasts", sideB="News", top=6):
    """Summarise one topic: sentiment on each side plus shared and tilted words.

    The first row of ``pair_df`` whose ``topic`` equals *topic* is used
    (``IndexError`` if there is none).  Each side's weighted top words are
    cleaned against the module-level ``stoplist`` and normalised to sum to 1.
    The result reports the words common to both sides (ranked by the smaller
    of the two weights) and, per side, the words it weights more heavily
    than the other side, each limited to ``top`` entries.
    """
    record = pair_df.loc[pair_df["topic"] == topic].iloc[0]

    def _shares(raw):
        # NOTE(review): ``stoplist`` is not defined in this section; it is
        # presumably a global set elsewhere in the notebook.
        weights = clean_token_series(words_to_series_weighted(raw), stoplist)
        return weights / weights.sum() if weights.sum() > 0 else weights

    a = _shares(record.get("twA"))
    b = _shares(record.get("twB"))

    shared = set(a.index) & set(b.index)
    overlap = sorted(shared, key=lambda w: min(a.get(w, 0), b.get(w, 0)), reverse=True)[:top]

    excess_a = a - b.reindex(a.index, fill_value=0)
    excess_b = b - a.reindex(b.index, fill_value=0)
    tilt_a = excess_a.sort_values(ascending=False).head(top).index.tolist()
    tilt_b = excess_b.sort_values(ascending=False).head(top).index.tolist()

    view = {"topic": topic, "delta": float(record["delta"])}
    view[sideA+"_sent"] = float(record["sentA"])
    view[sideB+"_sent"] = float(record["sentB"])
    view["overlap"] = overlap
    view[sideA+"_tilt"] = tilt_a
    view[sideB+"_tilt"] = tilt_b
    view["jaccard_clean"] = float(record.get("jaccard_clean", np.nan))
    return view

def map_topic_to_label(topic):
    """Return the display label for *topic*; currently the topic itself, unchanged."""
    label = topic
    return label
def consensus_label(topic_row):
    """Return the consensus label for *topic_row*; currently the row itself, unchanged."""
    label = topic_row
    return label

def find_topics(pair_df, query="trump"):
    """Return the sorted unique topics whose name contains *query*, ignoring case."""
    needle = query.lower()
    matches = []
    for name in pair_df["topic"].unique():
        if needle in name.lower():
            matches.append(name)
    matches.sort()
    return matches

def topics_overview(pair_df, query="trump"):
    """Return a ``topic_view`` summary for every topic matching *query*.

    Topics are selected with ``find_topics`` and summarised in that order,
    using ``topic_view``'s default side names and word limit.
    """
    views = []
    for name in find_topics(pair_df, query=query):
        views.append(topic_view(pair_df, name))
    return views

def extremes_by_source_S(S, id_col="source_name", topic_col="topic", sentiment_col="avg_sentiment_score", k=10):
    """List each source's ``k`` most positive and most negative topics.

    Rows with a missing topic or sentiment are ignored.  Returns
    ``{source: {"most_positive": [[topic, score], ...],
    "most_negative": [[topic, score], ...]}}``.
    """
    def _ranked(frame, ascending):
        return frame.sort_values(sentiment_col, ascending=ascending).head(k).values.tolist()

    result = {}
    for source, group in S.groupby(id_col):
        scored = group[[topic_col, sentiment_col]].dropna()
        result[source] = {
            "most_positive": _ranked(scored, False),
            "most_negative": _ranked(scored, True),
        }
    return result

def topics_by_source(S, id_col="source_name"):
    """Count the rows of ``S`` per ``id_col`` value, largest count first."""
    rows_per_source = S.groupby(id_col).size()
    ranked = rows_per_source.sort_values(ascending=False)
    return ranked