In [1]:
# Core
import os, re, hashlib, time, json, datetime as dt
from collections import defaultdict

# HTTP / parsing / data
import requests, feedparser, pandas as pd

# NLP
import spacy
nlp = spacy.load("en_core_web_sm")

from transformers import pipeline
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)

# X (Twitter) via snscrape *as a library* (no subprocess)
from snscrape.modules import twitter as sntwitter

Device set to use mps:0


AttributeError: 'FileFinder' object has no attribute 'find_module'

In [None]:
KEYWORDS = [
  "data center","ai campus","hyperscale","substation","megawatt",
  "moratorium","rezoning","pud","special use","rate rider","economic development rate"
]

# Minimal gazetteer: expand as you learn hotspots
COUNTY_CITIES = {
  "IN:Johnson County": {"cities": ["Greenwood","Franklin","Whiteland"]},
  "MN:Scott County":   {"cities": ["Shakopee","Prior Lake"]},
  # add more counties/cities in the MISO footprint as you discover signals
}

def has_kw(text: str) -> bool:
    low = (text or "").lower()
    return any(k in low for k in KEYWORDS)

def sha(u: str) -> str:
    return hashlib.sha256((u or "").encode()).hexdigest()

In [None]:
def gdelt_search(query: str, maxrecords: int = 100):
    """Return list of {title,url,seendate} from GDELT or Google News RSS fallback."""
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    params = {
        "query": query,
        "mode": "ArtList",
        "maxrecords": maxrecords,
        "format": "JSON",
        "sort": "DateDesc",
    }
    try:
        r = requests.get(url, params=params, timeout=30)
        # Some failures return HTML, not JSON
        if "application/json" not in (r.headers.get("content-type","")):
            raise ValueError("Non-JSON from GDELT")
        data = r.json()
        arts = data.get("articles", [])
        return [
            {"title": a.get("title",""), "url": a.get("url",""), "seendate": a.get("seendate","")}
            for a in arts
            if a.get("title") and a.get("url")
        ]
    except Exception:
        # Fallback to Google News RSS (robust and fast)
        rss = feedparser.parse(
            f"https://news.google.com/rss/search?q={requests.utils.quote(query)}&hl=en-US&gl=US&ceid=US:en"
        )
        out = []
        for e in rss.entries[:maxrecords]:
            out.append({
                "title": e.title,
                "url": e.link,
                "seendate": getattr(e, "published", "")
            })
        return out

In [None]:
def normalize_gdelt(items):
    rows = []
    for a in items:
        t = a.get("title",""); u = a.get("url",""); sd = a.get("seendate","")
        if not t or not u:
            continue
        if not has_kw(t):
            continue
        rows.append({
            "source": "news",
            "title": t,
            "text": f"{sd} {t}",
            "url": u,
            "ts": sd
        })
    return rows

In [None]:
def scrape_x(keyword: str, limit: int = 200):
    tweets = []
    for i, tw in enumerate(sntwitter.TwitterSearchScraper(keyword).get_items()):
        if i >= limit: break
        tweets.append({
            "content": getattr(tw, "rawContent", None) or getattr(tw, "content", "") or "",
            "id": tw.id,
            "date": tw.date.isoformat() if getattr(tw, "date", None) else "",
        })
    return tweets

In [None]:
def normalize_x(tweets):
    rows = []
    for tw in tweets:
        t = tw.get("content","")
        if not has_kw(t):
            continue
        u = f"https://x.com/i/web/status/{tw.get('id')}"
        rows.append({
            "source": "x",
            "title": (t[:120] + "…") if len(t) > 120 else t,
            "text": t,
            "url": u,
            "ts": tw.get("date","")
        })
    return rows

In [None]:
def enrich(row):
    # NER
    doc = nlp(row["text"])
    row["entities"] = [e.text for e in doc.ents if e.label_ in ("ORG","GPE","LOC")]

    # County heuristic
    row["county"] = None
    lower_text = row["text"].lower()
    for county, bag in COUNTY_CITIES.items():
        if any(city.lower() in lower_text for city in bag["cities"]):
            row["county"] = county
            break

    # Sentiment (explicit model chosen above)
    s = sentiment(row["text"][:512])[0]  # {'label': 'POSITIVE'|'NEGATIVE', 'score':...}
    row["polarity"] = (s["label"] or "").lower()
    row["polarity_score"] = s["score"]

    # Stance (cheap rules → upgrade later to a classifier)
    if any(w in lower_text for w in ["moratorium","ban","halt","suspend"]):
        stance = "oppose"
    elif any(w in lower_text for w in ["approved","incentive","tax abatement","wins vote","passes"]):
        stance = "support"
    else:
        stance = "neutral"
    row["stance"] = stance

    # Urgency flag
    row["urgency"] = any(w in lower_text for w in [
        "agenda", "ordinance", "hearing", "vote", "1st reading", "first reading", "second reading", "rezoning"
    ])

    return row

In [None]:
def run():
    # 1) Collect
    q_news = '("data center" OR hyperscale) sourcecountry:US'
    news_items = gdelt_search(q_news, maxrecords=150)
    news_rows = normalize_gdelt(news_items)

    q_x = '("data center" OR hyperscale) (moratorium OR rezoning OR substation) lang:en'
    x_items = scrape_x(q_x, limit=300)
    x_rows = normalize_x(x_items)

    all_rows = news_rows + x_rows
    if not all_rows:
        return pd.DataFrame(), pd.Series(dtype=float)

    # 2) Deduplicate & enrich
    df = pd.DataFrame(all_rows).drop_duplicates(subset="url").reset_index(drop=True)
    if df.empty:
        return df, pd.Series(dtype=float)

    df = df.apply(enrich, axis=1, result_type="expand")

    # 3) Score each row
    def score(r):
        w = 0.0
        w += 1.0 if r["polarity"]=="negative" else 0.3 if r["polarity"]=="positive" else 0.1
        w += 1.2 if r["stance"]=="oppose"  else 0.6 if r["stance"]=="support"  else 0.0
        w += 1.5 if r["urgency"] else 0.0
        w += 0.5 if r["county"] else 0.0
        return w

    df["emi_component"] = df.apply(score, axis=1)

    # 4) Aggregate by county (drop NAs so you see mapped ones)
    agg = (df.groupby("county", dropna=True)["emi_component"]
             .sum()
             .sort_values(ascending=False))

    return df, agg

In [None]:
print("Top counties (EMI):")
print(agg.head(5))

for county in agg.head(3).index:
    print(f"\n== {county} ==")
    print(
        df[df["county"]==county][["source","title","url","stance","urgency"]]
          .head(3)
          .to_markdown(index=False)
    )