In [None]:
#@markdown # Imports and helper functions
#!uv pip install xmltodict html-to-markdown[lxml,html5lib]

import json
import os
import pickle
import time
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from urllib.parse import urljoin, urlencode

import pandas as pd
import requests
import xmltodict
from bs4 import BeautifulSoup
from html_to_markdown import convert_to_markdown
from tqdm.cli import tqdm

os.chdir('/projects/moties')

tqdm.pandas()

BASE = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/" #@param {"type": "string"}

def progress_wrap(iterable, *, initial_total=64, grow_factor=2, desc="Working", unit="it", mininterval=0.1):
    """
    Stream items from an iterable of unknown length through a tqdm bar.

    The bar starts with a provisional total and, whenever the number of items
    seen would exceed it, the total is multiplied by `grow_factor` (never
    dropping below the number of items already seen). Once the iterable is
    exhausted the total is set to the exact item count.

    Parameters
    ----------
    iterable : iterable
        Source of items; its length does not need to be known.
    initial_total : int, optional
        Provisional total shown at the start (at least 1). Default: 64.
    grow_factor : int/float, optional
        Multiplier applied to the total on overflow. Default: 2.
    desc, unit, mininterval :
        Passed straight through to tqdm.

    Yields
    ------
    The items of `iterable`, unchanged and in order.
    """
    seen = 0
    starting_total = max(1, int(initial_total))
    with tqdm(total=starting_total, desc=desc, unit=unit, mininterval=mininterval) as bar:
        for seen, item in enumerate(iterable, start=1):
            # About to overflow the displayed total: enlarge it first.
            if seen > bar.total:
                bar.total = max(int(bar.total * grow_factor), seen)
                bar.refresh()  # make the enlarged total visible immediately

            bar.update(1)
            yield item

        # Exhausted: replace the provisional total by the real count.
        if bar.total != seen:
            bar.total = seen
            bar.refresh()


def odata_get(url, params=None, sleep=0.4, max_retries=4, timeout=60):
    """
    GET an OData endpoint and return the decoded JSON body, retrying with
    exponential backoff on transient failures.

    Parameters
    ----------
    url : str
        Endpoint URL (without query string when `params` is given).
    params : dict, optional
        Query options; encoded with `$=(),';` and space left unescaped.
    sleep : float
        Base delay in seconds; attempt `i` waits `sleep * 2**i` before retrying.
    max_retries : int
        Total number of attempts.
    timeout : float
        Per-request timeout in seconds passed to `requests.get`.

    Returns
    -------
    The parsed JSON of the first HTTP 200 response.

    Raises
    ------
    requests.HTTPError
        For non-retryable 4xx statuses (via `raise_for_status`).
    requests.ConnectionError, requests.Timeout
        When the last attempt fails at the network level.
    RuntimeError
        When all attempts were used up without a 200 response.
    """
    if params:
        encoded_params = urlencode(params, safe="$=(),'; ")
        url = f"{url}?{encoded_params}"

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            r = requests.get(url, timeout=timeout, headers={"Accept": "application/json"})
        except (requests.ConnectionError, requests.Timeout):
            # Network-level hiccups are as transient as a 5xx: retry them too,
            # but surface the original exception once attempts are exhausted.
            if is_last_attempt:
                raise
            time.sleep(sleep * (2 ** attempt))
            continue

        if r.status_code == 200:
            return r.json()
        # backoff on transient issues (server errors and rate limiting)
        if r.status_code >= 500 or r.status_code == 429:
            # No point in sleeping when there is no further attempt to wait for.
            if not is_last_attempt:
                time.sleep(sleep * (2 ** attempt))
            continue
        r.raise_for_status()
    raise RuntimeError(f"Failed after retries: {url}")


def odata_paged(url, params=None, page_size=250, timeout=60):
    """
    Iterate all rows of an OData collection using client-driven `$skip` paging.

    Tweede Kamer API caps responses at 250; it doesn't emit @odata.nextLink.

    Parameters
    ----------
    url : str
        Collection endpoint.
    params : dict, optional
        Query options. `$top` is interpreted as the requested page size and
        `$skip` as the starting offset; the caller's dict is not modified.
    page_size : int
        Upper bound for the page size (should not exceed the server's cap).
    timeout : float
        Per-request timeout forwarded to `odata_get`.

    Yields
    ------
    One row (element of the response's "value" list) at a time.
    """
    params = dict(params or {})
    # If caller set $top smaller than the page size, keep it; else use page_size.
    # Capping matters: with a $top above the server cap, a full (capped) page
    # would look like a short last page and paging would stop after one request.
    eff_top = min(int(params.get("$top", page_size)), int(page_size))
    params["$top"] = str(eff_top)

    skip = int(params.get("$skip", 0))
    while True:
        params["$skip"] = str(skip)
        data = odata_get(url, params=params, timeout=timeout)
        batch = data.get("value", [])
        if not batch:
            break
        yield from batch
        # stop when we got fewer than requested, meaning last page
        if len(batch) < eff_top:
            break
        skip += eff_top

In [None]:
#@markdown # Fetch Besluiten

rerun = False #@param {"type": "boolean"}

if rerun or not os.path.exists('besluiten.p3'):
  # Columns requested from the Besluit entity.
  fields = [
      "Id", "Agendapunt_Id", "StemmingsSoort", "BesluitSoort", "BesluitTekst",
      "Opmerking", "Status", "AgendapuntZaakBesluitVolgorde", "GewijzigdOp",
      "ApiGewijzigdOp", "Verwijderd",
  ]

  url = urljoin(BASE, "Besluit")

  date_from = '2023-12-06T00:00:00Z'

  # Non-deleted decisions changed since `date_from` that carry a vote type,
  # excluding three "Stemmen - ..." decision kinds.
  params = {
    "$select": ",".join(fields),
    "$filter": " and ".join([
        "Verwijderd eq false",
        f"ApiGewijzigdOp ge {date_from}",
        "StemmingsSoort ne null",
        "BesluitSoort ne 'Stemmen - gestaakt'",
        "BesluitSoort ne 'Stemmen - aangehouden'",
        "BesluitSoort ne 'Stemmen - uitstellen'",
    ]),
    "$orderby": "ApiGewijzigdOp asc",
    "$top": "250",
  }

  # Same query with zero rows and $count enabled, to size the progress bar.
  count_params = {**params, '$top': 0, '$count': 'true'}
  total = int(odata_get(url, params=count_params)['@odata.count'])

  besluiten = list(progress_wrap(odata_paged(url, params=params), initial_total=total))

  with open('besluiten.p3', 'wb') as f:
    pickle.dump(besluiten, f)
else:
  # Reuse the rows pickled by an earlier run.
  with open('besluiten.p3', 'rb') as f:
    besluiten = pickle.load(f)

besluiten_ = pd.DataFrame(besluiten)

In [None]:
#@markdown # Fetch Stemmingen

rerun = False #@param {"type": "boolean"}

if rerun or not os.path.exists('stemmingen.p3'):
  # Columns requested from the Stemming entity.
  fields = [
      "Id", "Besluit_Id", "Soort", "FractieGrootte", "ActorNaam", "ActorFractie",
      "Vergissing", "SidActorLid", "SidActorFractie", "Persoon_Id", "Fractie_Id",
      "GewijzigdOp", "ApiGewijzigdOp", "Verwijderd",
  ]

  def get(id):
    """Fetch the non-deleted Stemming rows that belong to one Besluit id."""
    endpoint = urljoin(BASE, f"Besluit({id})/Stemming")
    query = {"$select": ",".join(fields), "$filter": "Verwijderd eq false"}
    return odata_get(endpoint, params=query)

  # One request per Besluit, spread over 20 worker processes; results are
  # collected in submission order.
  with Pool(20) as pool:
    promises = [pool.apply_async(get, args=(id,)) for id in besluiten_['Id']]
    stemmingen = [p.get() for p in tqdm(promises, position=0)]

  with open('stemmingen.p3', 'wb') as f:
    pickle.dump(stemmingen, f)
else:
  # Reuse the responses pickled by an earlier run.
  with open('stemmingen.p3', 'rb') as f:
    stemmingen = pickle.load(f)

stemmingen_ = pd.DataFrame(stemmingen)
# Keep responses with at least one row, then turn every row dict of the
# "value" lists into its own DataFrame row.
stemmingen__ = (
  stemmingen_
  .query("value.str.len() > 0")['value']
  .explode()
  .apply(pd.Series)
  .query("~Verwijderd")
)

# Every vote must refer to a Besluit that was fetched.
assert set(stemmingen__['Besluit_Id']) <= set(besluiten_['Id'])
stemmingen__

Unnamed: 0,Id,Besluit_Id,Soort,FractieGrootte,ActorNaam,ActorFractie,Vergissing,SidActorLid,SidActorFractie,Persoon_Id,Fractie_Id,GewijzigdOp,ApiGewijzigdOp,Verwijderd
0,c8ab49ea-ecfc-4f5b-b21f-1aa4c97f5c74,66afa68a-07d9-44d6-9b83-b9a308d7a221,Voor,24,VVD,VVD,False,,S-1-365867521-2120874753-2712795160-1237583058...,,7476e97a-3243-4122-9df6-ba7d82a5279b,2023-12-07T13:58:44.703+01:00,2023-12-07T12:59:15.8857866Z,False
0,5f5a63c9-bd88-478e-a603-23a962d7f1b2,66afa68a-07d9-44d6-9b83-b9a308d7a221,Voor,3,FVD,FVD,False,,S-1-365867521-2120874753-318435223-1117601640-...,,e0b7b638-de3c-47cc-85bd-341dd65ea33d,2023-12-07T13:58:44.703+01:00,2023-12-07T12:59:16.4639243Z,False
0,c11bc2f3-a86c-4286-8a52-39f236f568b4,66afa68a-07d9-44d6-9b83-b9a308d7a221,Voor,3,SGP,SGP,False,,S-1-365867521-2120874753-1520352372-1191064225...,,77f9b6f1-b1a9-4d1b-a05e-9936e79d8fa5,2023-12-07T13:58:44.703+01:00,2023-12-07T12:59:15.7598285Z,False
0,ce821406-c067-4508-a860-3f5d00a4e0e4,66afa68a-07d9-44d6-9b83-b9a308d7a221,Voor,7,BBB,BBB,False,,S-1-365867521-2120874753-3989062109-1173306659...,,626555ac-e836-44e3-9978-a6a7f0abc3ce,2023-12-07T13:58:44.7+01:00,2023-12-07T12:59:15.1973217Z,False
0,291cfa57-3136-436f-86de-42f1bdaee1b9,66afa68a-07d9-44d6-9b83-b9a308d7a221,Tegen,2,Volt,Volt,False,,S-1-365867521-2120874753-4235502623-1217496042...,,ae48391e-ce4d-47e0-86e3-ee310282f66f,2023-12-07T13:58:44.703+01:00,2023-12-07T12:59:16.2451799Z,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8019,145c9613-1d15-408a-80a6-f768ef50cc32,f6d53649-c2e1-46c4-a768-8b57941356d1,Tegen,37,"Roon de, R.",PVV,False,S-1-365867521-2120874753-2992288398-1183154141...,S-1-365867521-2120874753-203711267-1259409971-...,c60c655f-40ba-47b0-8e93-a2c8d1ff4743,65129918-f256-4975-9da4-488da34d6695,2025-10-03T10:25:38.89+02:00,2025-10-03T08:25:49.1778076Z,False
8019,9c0cbb6b-438f-4fb9-9f69-f851795112b3,f6d53649-c2e1-46c4-a768-8b57941356d1,Voor,3,"Flach, A.J.",SGP,False,S-1-365867521-2120874753-3980532432-1332259211...,S-1-365867521-2120874753-1520352372-1191064225...,2bea63db-715f-401f-b710-e91c7650df0c,77f9b6f1-b1a9-4d1b-a05e-9936e79d8fa5,2025-10-03T10:25:38.897+02:00,2025-10-03T08:25:51.526669Z,False
8019,9f9b6e0c-1fdd-42e2-bf5c-fc58d29b6c0b,f6d53649-c2e1-46c4-a768-8b57941356d1,Voor,3,"Bikker, M.H.",ChristenUnie,False,S-1-365867521-2120874753-1186413428-1115078385...,S-1-365867521-2120874753-1996180723-1207380569...,b53f74bf-e57c-4e01-8d65-de2f319f44db,d720f5af-0516-408a-b830-0b6ffb8a581c,2025-10-03T10:25:38.877+02:00,2025-10-03T08:25:45.9749083Z,False
8019,ae76946c-fcff-400b-bf9e-fd01c4d996f0,f6d53649-c2e1-46c4-a768-8b57941356d1,Voor,25,"Klaver, J.F.",GroenLinks-PvdA,False,S-1-365867521-2120874753-1862862533-1326983883...,S-1-365867521-2120874753-4073584695-1095381795...,0d036fcf-a456-4798-9b76-3f44eb74a451,0208097d-ef04-438a-8c29-eebb84956204,2025-10-03T10:25:38.88+02:00,2025-10-03T08:25:46.1717566Z,False


In [None]:
#@markdown # Group stemmingen to besluiten and re-fetch stemmingen with less than 150 zetels seperately

# NOTE(review): `fields` and `params` are not used by any statement in this
# cell (no re-fetch happens here); they are kept so the names stay defined.
fields = ["Id", "Besluit_Id", "Soort", "FractieGrootte", "ActorNaam",
        "ActorFractie", "Vergissing", "SidActorLid", "SidActorFractie",
        "Persoon_Id", "Fractie_Id", "GewijzigdOp", "ApiGewijzigdOp",
        "Verwijderd"]

params = {"$select": ",".join(fields), "$top": "250", "$filter": f"Verwijderd eq false", }
# Assign a number of zetels (weight) to each stemming, since fractions consist
# of personal votes (1 zetel) and fraction votes (the rest)
stemmingen__['Weight'] = None
stemmingen__.loc[~stemmingen__['Persoon_Id'].isna(), 'Weight'] = 1
assert stemmingen__[stemmingen__['Weight'].isna()].groupby(['Besluit_Id', 'ActorFractie'])[[]].apply(len).max() <= 1, "After all personal votes, there are fractions left with more than 1 'fractie-vote', which is super weird because how can you distribute the left over zetels?"

# Per (Besluit, fractie): number of personal votes, and the seats that remain
# for the single fraction-level vote (FractieGrootte minus personal votes).
personal_votes = stemmingen__.groupby(['Besluit_Id', 'ActorFractie'])['Weight'].apply(lambda x: x.notna().sum())
fraction_votes = stemmingen__.groupby(['Besluit_Id', 'ActorFractie'])['FractieGrootte'].first() - personal_votes
left = stemmingen__['Weight'].isna()
stemmingen__.loc[left, 'Weight'] = stemmingen__.loc[left].apply(lambda x: fraction_votes.loc[x['Besluit_Id'], x['ActorFractie']], axis=1)

# `Weight` was seeded with None, so it is an object column; make it numeric
# before aggregating so the pivot holds numbers and `.fillna(0)` does not have
# to downcast an object-dtype frame (which pandas warns about).
stemmingen__['Weight'] = pd.to_numeric(stemmingen__['Weight'])

# One row per Besluit, one column per (fractie, vote kind), cells = seats.
stemmen = stemmingen__.groupby(['Besluit_Id', 'ActorFractie', 'Soort'])['Weight'].sum().reset_index().pivot(index='Besluit_Id', columns=['ActorFractie', 'Soort'], values='Weight').fillna(0).astype(int)

# Distribution of total seats accounted for per Besluit.
stemmen.sum(1).value_counts()

  stemmen = stemmingen__.groupby(['Besluit_Id', 'ActorFractie', 'Soort'])['Weight'].sum().reset_index().pivot(index='Besluit_Id', columns=['ActorFractie', 'Soort'], values='Weight').fillna(0).astype(int)


150    7199
149     586
147     182
148      53
Name: count, dtype: int64

In [None]:
#@markdown # Fetch Zaken

rerun = True #@param {"type": "boolean"}

if os.path.exists('zaken2.p3') and not rerun:
  # The ids are cached together with the responses so both stay in the same order.
  with open('zaken2.p3', 'rb') as f:
    zaken, besluiten_ids = pickle.load(f)
else:
  fields = [
      "Id", "Nummer", "Soort", "Titel", "Citeertitel", "Alias", "Onderwerp",
      "GestartOp", "Organisatie", "Grondslagvoorhang", "Termijn", "Vergaderjaar",
      "Volgnummer", "HuidigeBehandelstatus", "Afgedaan", "GrootProject",
      "GewijzigdOp", "ApiGewijzigdOp", "Verwijderd"]

  def get(id):
    """Fetch the Zaak rows linked to one Besluit id."""
    return odata_get(urljoin(BASE, f"Besluit({id})/Zaak"),
                    params={"$select": ",".join(fields)})
  # sorted(): plain set() iteration order of strings differs between Python
  # processes, which would make the fetch order (and everything cached in that
  # order) irreproducible.
  besluiten_ids = pd.Series(sorted(set(stemmingen__['Besluit_Id']))).rename('Besluit_Id')

  with Pool(10) as pool:
    promises = [
      pool.apply_async(get, args=(id,))
      for id in besluiten_ids
    ]
    zaken = [p.get() for p in tqdm(promises, position=0)]

  with open('zaken2.p3', 'wb') as f:
    pickle.dump([zaken, besluiten_ids], f)

# One row per (Besluit, Zaak); the Besluit id comes back as column 'Besluit_Id'.
zaken_ = pd.DataFrame(zaken).set_index(besluiten_ids)['value'].explode().apply(pd.Series).reset_index()

assert (zaken_['Id'].value_counts() == 1).all()
assert zaken_['Besluit_Id'].is_unique, "a Besluit maps to more than one Zaak; cannot attach a single Zaak_Id"

# Attach Zaak_Id by looking it up per Besluit_Id. `set_index(<Series>)` would
# use the values positionally, silently pairing votes with the wrong Zaak
# whenever `stemmen` and `zaken_` are ordered differently.
if 'Zaak_Id' in stemmen.index.names:
  # Cell was run before: drop the old level instead of stacking another one.
  stemmen = stemmen.droplevel('Zaak_Id')
zaak_per_besluit = zaken_.set_index('Besluit_Id')['Id']
zaak_ids = stemmen.index.map(zaak_per_besluit).rename('Zaak_Id')
assert not zaak_ids.isna().any(), "some Besluit in `stemmen` has no Zaak"
stemmen = stemmen.set_index(zaak_ids, append=True)

100%|██████████| 8020/8020 [00:43<00:00, 184.23it/s]


In [None]:
#@markdown # Sanity checks

# Votes and decisions cover exactly the same set of Besluit ids.
assert set(besluiten_['Id']) == set(stemmingen__['Besluit_Id'])
# No id occurs more than once in either table.
assert 1 >= besluiten_['Id'].value_counts().max()
assert 1 >= stemmingen__['Id'].value_counts().max()

pd.set_option("display.max_columns", 70)

# How many besluiten have 10, 15, ... vote rows, shown as a single wide row.
display(
  stemmingen__['Besluit_Id']
  .value_counts()
  .value_counts()
  .sort_index()
  .rename('besluiten with count stemmingen')
  .reset_index([])
  .T
)

count,10,15,16,20,147,149,150
besluiten with count stemmingen,15,7911,2,2,2,5,83


In [None]:
#@markdown # Fetch Documenten

rerun = False #@param {"type": "boolean"}

if os.path.exists('documenten.p3') and not rerun:
  # NOTE(review): the cache stores only the responses, in the order of
  # `zaken_['Id']` at fetch time; it is paired with the current `zaken_`
  # by position below, so it must be refreshed whenever `zaken_` changes.
  with open('documenten.p3', 'rb') as f:
    documenten = pickle.load(f)
else:
  fields = [
    "Id", "Soort", "DocumentNummer", "Titel", "Onderwerp", "Datum",
    "Vergaderjaar", "Kamer", "Volgnummer", "Citeertitel", "Alias",
    "DatumRegistratie", "DatumOntvangst", "Aanhangselnummer", "KenmerkAfzender",
    "Organisatie", "ContentType", "ContentLength", "GewijzigdOp",
    "ApiGewijzigdOp", "Verwijderd"]

  def get(id):
    """Fetch the Document rows linked to one Zaak id."""
    return odata_get(urljoin(BASE, f"Zaak({id})/Document"),
                    params={"$select": ",".join(fields)})

  with Pool(10) as pool:
    promises = [
      pool.apply_async(get, args=(id,))
      for id in zaken_['Id']
    ]
    documenten = [p.get() for p in tqdm(promises, position=0)]

  with open('documenten.p3', 'wb') as f:
    pickle.dump(documenten, f)

documenten_ = pd.DataFrame(documenten)
assert len(documenten_) == len(zaken_), (
  "documenten does not line up with zaken_ (stale documenten.p3?); set rerun = True"
)
documenten__ = (
  documenten_
  .set_index(zaken_['Id'].rename('Zaak_Id'))['value']
  .explode()
  .apply(pd.Series)
  .reset_index()
  # Column 0 only appears when some Zaak has an empty document list (its NaN
  # placeholder expands to a column named 0), so its absence is not an error.
  .drop(columns=0, errors='ignore')
)

assert set(zaken_['Id']) == set(documenten__['Zaak_Id'])

# How many zaken have 1, 2, ... documents, shown as a single wide row.
display(documenten__['Zaak_Id'].value_counts().value_counts().sort_index().rename('zaken with count documenten').reset_index([]).T)

count,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,24
zaken with count documenten,7804,4,1,9,12,16,33,33,21,35,24,10,7,2,4,3,1,1


In [None]:
#@markdown # Fetch Versies

rerun = False #@param {"type": "boolean"}

# Load the DocumentVersie responses from the pickle cache unless a re-run is
# requested or no cache exists yet.
if os.path.exists('versies.p3') and not rerun:
  with open('versies.p3', 'rb') as f:
    versies = pickle.load(f)
else:
  # Columns requested from the DocumentVersie entity.
  fields = [
      "Id", "Status", "Versienummer", "Bestandsgrootte", "Extensie", "Datum",
      "GewijzigdOp", "ApiGewijzigdOp", "Verwijderd", "Document_Id",
      "ExterneIdentifier"]

  def get(id):
    """Fetch the DocumentVersie rows of one Document id.

    Returns None for a missing (NaN) id, and also (implicitly) when the
    request raises: the exception is printed and swallowed so that one
    failing document does not abort the whole batch.
    """
    try:
      if pd.isna(id):
        return None
      return odata_get(urljoin(BASE, f"Document({id})/DocumentVersie"),
                      params={"$select": ",".join(fields)})
    except Exception as e:
      print(e)
      pass

  # One request per document, spread over 20 worker processes; results are
  # collected in submission order, i.e. the order of documenten__['Id'].
  with Pool(20) as pool:
    promises = [
      pool.apply_async(get, args=(id,))
      for id in documenten__['Id']
    ]
    versies = [p.get() for p in tqdm(promises)]

  with open('versies.p3', 'wb') as f:
    pickle.dump(versies, f)

# Expand each response into columns, take its "value" list and turn every
# version dict into its own row.
versies_ = pd.Series(versies).apply(pd.Series)['value'].explode().apply(pd.Series)

# Latest version: per Document_Id keep the row with the highest Versienummer.
# NOTE(review): the captured output below this cell echoes the
# `.groupby('Document_Id').apply(` line, which looks like a pandas warning
# about this call — confirm against the installed pandas version.
versies__ = versies_.dropna(
  subset=['Versienummer']
).groupby('Document_Id').apply(
  lambda x: x.sort_values('Versienummer').iloc[-1]
)

# Add Identifier to documents (None where a document has no identifier).
identifiers = versies__['ExterneIdentifier'].dropna().to_dict()
documenten__['ExterneIdentifier'] = documenten__['Id'].apply(identifiers.get)

# Data-specific expectation: exactly one document row has no Id.
assert documenten__['Id'].isna().sum() == 1 # one ugly sheep :(

# Every document with an Id has a latest version, and vice versa.
assert set(documenten__['Id'].dropna()) == set(versies__['Document_Id'])

# Should all be 1, because we picked the latest version.
display(versies__['Document_Id'].value_counts().value_counts().sort_index().rename('documents with count versies').reset_index([]).T)

  ).groupby('Document_Id').apply(


count,1
documents with count versies,9690


In [None]:
#@markdown # Collect HTML of documents

rerun = True #@param {"type": "boolean"}

# One URL per document identifier; the Series is indexed by the identifier so
# results can later be looked up by ExterneIdentifier.
urls = documenten__['ExterneIdentifier'].dropna()
urls.index = urls
urls = 'https://zoek.officielebekendmakingen.nl/' + urls + '.html'

def try_get(p):
  """Return the result of async result `p`, or None (after printing the error) if it failed.

  KeyboardInterrupt is not an `Exception` subclass, so it propagates on its own.
  """
  try:
    return p.get()
  except Exception as e:
    print(e)
    return None

def _try_request(url):
  """GET `url`; return None (after printing the error) on a request-level failure."""
  try:
    return requests.get(url)
  except requests.RequestException as e:
    print(e)
    return None

def _status_code(response):
  """HTTP status of `response`, or None when the request itself failed."""
  return None if response is None else response.status_code

if os.path.exists('responses.p3') and not rerun:
  with open('responses.p3', 'rb') as f:
    # Must land in `responses` (not `versies`): later cells read `responses`
    # and `versies` still holds the DocumentVersie data.
    responses = pickle.load(f)
else:
  with Pool(40) as pool:
    promises = urls.apply(lambda url: pool.apply_async(requests.get, args=(url,)))

    responses = promises.progress_apply(try_get)

  for _ in range(3): # retry non-200's (and failed requests) max 3 times
    status_codes = responses.apply(_status_code)
    failures = status_codes != 200
    if failures.sum() == 0:
      break
    # Positional assignment: `urls[failures]` and `responses[failures]` select
    # the same rows in the same order, and this avoids index alignment.
    responses[failures] = urls[failures].progress_apply(_try_request).to_numpy()

  with open('responses.p3', 'wb') as f:
    pickle.dump(responses, f)

# Whatever is still not a 200 after the retries, on either code path.
status_codes = responses.apply(_status_code)
missed_responses = responses[status_codes != 200]
status_codes[status_codes != 200].value_counts(dropna=False)

100%|██████████| 9217/9217 [04:52<00:00, 31.46it/s]
100%|██████████| 1/1 [00:00<00:00, 12.97it/s]
100%|██████████| 1/1 [00:00<00:00, 11.50it/s]
100%|██████████| 1/1 [00:00<00:00, 13.42it/s]


ExterneIdentifier
404    1
Name: count, dtype: int64

In [None]:
#@markdown # Extract markdown representation, e.g. this is friendly for LLM
def to_markdown(html):
  """Convert every `article #broodtekst` element of an HTML page to markdown.

  The converted fragments are joined by a blank line; a page without such an
  element yields an empty string.
  """
  body_nodes = BeautifulSoup(html).select('article #broodtekst')
  return '\n\n'.join([convert_to_markdown(str(node)) for node in body_nodes])


# Convert all fetched pages in parallel; `markdowns` keeps the index of
# `responses`.
with Pool(40) as pool:
    promises = pd.Series(responses).apply(lambda x: pool.apply_async(to_markdown, (x.text, )))
    markdowns = promises.progress_apply(lambda x: x.get())

# Document metadata indexed by Zaak, with the markdown looked up through the
# document's ExterneIdentifier (None when there is no match).
documenten___ = (
  documenten__
  .set_index('Zaak_Id')[[
    'Soort', 'DocumentNummer', 'Titel', 'Onderwerp', 'Datum', 'Vergaderjaar',
    'Volgnummer', 'Citeertitel', 'Alias', 'DatumRegistratie', 'DatumOntvangst',
    'Aanhangselnummer', 'KenmerkAfzender', 'Organisatie', 'ExterneIdentifier',
  ]]
  .assign(Markdown=lambda df: df['ExterneIdentifier'].apply(markdowns.to_dict().get))
)

100%|██████████| 9217/9217 [00:24<00:00, 368.77it/s]


In [None]:
# Bring the three tables into a deterministic order. Note that this rebinds
# `zaken_` and `besluiten_`, so cells above that pair data with `zaken_` by
# position see a different row order if they are re-run afterwards.
zaken_ = zaken_.sort_values('Besluit_Id')
besluiten_ = besluiten_.sort_values('Id')
stemmen = stemmen.sort_index()

# One text per Zaak: the markdown of all its documents that have one,
# joined by blank lines.
texts = documenten___.query('~Markdown.isnull()').groupby('Zaak_Id')['Markdown'].apply('\n\n'.join)

# NOTE(review): presumably meant to keep the rows of `stemmen` whose Zaak_Id
# has a text (all Besluit_Id, Zaak_Id in texts.index, all columns) — verify
# that pandas accepts this three-part .loc key on a 2-D frame. `stemmen_` is
# not referenced anywhere else in the visible notebook.
stemmen_ = stemmen.loc[:,texts.index, :]

In [None]:
# Persist everything gathered so far as one pickle; the keys name the tables.
with open('decision_data.p3', 'wb') as f:
  pickle.dump(
    dict(
      besluiten=besluiten_,
      zaken=zaken_,
      documenten=documenten___,
      versies=versies__,
      stemmen=stemmen,
      texts=texts,
    ),
    f,
  )

In [None]:
#@markdown # Set-up LLM Guided Generation / Structured output into dataframe

# pip install --upgrade openai pydantic
from google.colab import userdata
from typing import Literal, List, Optional
from pydantic import BaseModel, Field
from openai import OpenAI
import hashlib
from pathlib import Path


# Only ask the Colab secret store when no `api_key` was set earlier in the session.
try:
  api_key
except NameError:
  api_key = userdata.get('RUGLLM')
base_url = "https://llm.hpc.rug.nl/v1"

check_cache = True #@param {"type": "boolean"}

# OpenAI-compatible client pointed at the endpoint above.
client = OpenAI(base_url=base_url, api_key=api_key)

model = 'mistralai/Mistral-Small-3.2-24B-Instruct-2506'

# Directory holding one JSON file per cached model response.
CACHE_DIR = Path(".cache_responses")
CACHE_DIR.mkdir(exist_ok=True)


def _hash_messages(messages, model: str) -> str:
  h = hashlib.sha256(f"{model}:{json.dumps(messages)}".encode("utf-8")).hexdigest()
  return h[:16]  # shorter filename


# Structured annotation the model must return for one decision text.
# (Documented with comments rather than a class docstring on purpose: a
# docstring would end up as "description" in the JSON schema that is embedded
# in the system prompt.)
#
# `topic` is required; every other field has a default. The *_in_favor /
# *_against pairs describe the effect of voting for resp. against the motion.
class VoteAnnotation(BaseModel):
    # Fix: a comma was missing after the housing topic, so Python concatenated
    # it with the climate topic into one bogus option. Changing the options
    # changes the schema, and with it the cache keys of structured responses.
    # NOTE(review): the labels 'Animal wellfare', 'Nitrogen ... agriculture '
    # (trailing space) and '-Groningen' are left untouched because they are
    # output values; confirm before normalising them.
    topic: Literal[
        'Immigration / asylum policy',
        'Nitrogen ("stikstof") / agriculture ',
        'Environmental regulation',
        'Animal wellfare',
        'Housing and real estate / housing shortage',
        'Climate & energy policy / sustainability / decarbonisation',
        'Healthcare and long-term care funding / capacity',
        'Digitalization, software sovereignty, cybersecurity',
        'Government stability, coalition collapse, confidence motions',
        'European Council on Refugees and Exiles',
        'Parliamentary inquiries / oversight - COVID-19 response',
        'Parliamentary inquiries / oversight -Groningen gas extraction',
        'Public finances and budgeting / taxation',
        'International / foreign policy - Ukraine / Russia',
        'International / foreign policy - Palestina / Israel',
        'International / foreign policy - Middle East',
        'International / foreign policy - Other',
        'Other',
    ]
    summary_of_decision: str = ""
    beneficiaries_of_vote_in_favor: List[str] = []
    beneficiaries_of_vote_against: List[str] = []

    economic_cost_impact_of_vote_in_favor: Literal["lower","neutral","higher","unclear", "n/a"] = "n/a"
    economic_cost_impact_of_vote_against: Literal["lower","neutral","higher","unclear", "n/a"] = "n/a"

    environment_impact_of_vote_in_favor: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"
    environment_impact_of_vote_against: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"

    security_impact_of_vote_in_favor: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"
    security_impact_of_vote_against: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"

    social_security_impact_of_vote_in_favor: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"
    social_security_impact_of_vote_against: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"

    healthcare_impact_of_vote_in_favor: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"
    healthcare_impact_of_vote_against: Literal["improves","neutral","worsens","unclear", "n/a"] = "n/a"

    rights_impact_of_vote_against: Literal["expands","neutral","restricts","unclear", "n/a"] = "n/a"
    rights_impact_of_vote_in_favor: Literal["expands","neutral","restricts","unclear", "n/a"] = "n/a"

    fiscal_tag_of_vote_against: Literal["saves","costs","budget-neutral","unclear", "n/a"] = "n/a"
    fiscal_tag_of_vote_in_favor: Literal["saves","costs","budget-neutral","unclear", "n/a"] = "n/a"
    # Free-text reasoning behind the tags.
    notes: Optional[str] = None


# System prompt for the structured annotation call. The JSON schema of
# VoteAnnotation is interpolated at the end (as the string form of the dict
# returned by `model_json_schema()`), so any change to the model or to this
# text also changes the cache key computed from the messages.
# NOTE(review): the prompt refers to `fiscal_tag_pro_vote`,
# `fiscal_tag_con_vote` and `cost_impact_*`, while VoteAnnotation declares
# `fiscal_tag_of_vote_in_favor`, `fiscal_tag_of_vote_against` and
# `economic_cost_impact_of_vote_*` — consider aligning the names.
SYSTEM_INSTRUCTIONS = f"""
You are a parliamentary voting annotator.
Return ONLY a JSON object that matches the provided JSON Schema exactly.
If information is missing, infer conservatively and set *_impact fields to 'unclear'.

Evaluate fiscal and cost tags from the perspective of government expenditure only.
If the motion shifts cost to individuals, that means the state saves; if it shifts cost to the state, that means the state spends.

Always interpret “Pro vote” as voting in favor of the motion text, and “Con vote” as voting against it.

Base all assessments on the intended effects the motion requests from the government, not on hypothetical outcomes of other policies.

Use “unclear” rather than guessing when impacts are ambiguous.

Fiscal and cost impacts
 * Fiscal tags (fiscal_tag_pro_vote / fiscal_tag_con_vote) refer to government budget effects only — not private costs or prices.
 * "saves" → government spends less or collects more.
 * "costs" → government spends more or collects less.
 * "budget-neutral" → roughly no fiscal change.
 * "unclear" → not enough detail to tell.

Cost impacts (cost_impact_*) describe overall economic costs or financial burden on society, not just on government.
 * "lower" means overall cost burden decreases.
 * "higher" means overall cost burden increases.

Avoid automatic mirroring; evaluate pro and con sides independently.

Rights impacts
 * "expands" → extends, protects, or strengthens individual rights or access (e.g., privacy, asylum, healthcare).
 * "restricts" → limits or removes such rights.
 * "neutral" → no relevant rights dimension.

If the motion changes enforcement, data use, or oversight, treat it as affecting rights.

Environment impacts
Assess only if the motion directly or foreseeably affects environmental regulation, agriculture, nitrogen, energy, or sustainability.
Otherwise use "n/a".

Security impacts
 * "improves" → enhances safety, counter-terrorism, or law enforcement capability.
 * "worsens" → reduces or undermines security capacity.
 * "neutral" → not relevant to safety or security.
 * "unclear" → possible mixed effects.

Claimed beneficiaries
Identify who the motion explicitly aims to help or protect (“claimed beneficiaries pro vote”).
Identify who benefits from rejecting the motion or maintaining the status quo (“claimed beneficiaries con vote”).
Use groups (e.g., refugees, taxpayers, farmers, healthcare workers) rather than individuals.

Notes
 * Use "notes" for concise reasoning behind your tagging (2–4 sentences).
 * Mention key trade-offs (e.g., rights vs. security, fiscal savings vs. social cost).
 * Example micro-rule summary (can be pasted in the system prompt)

“Evaluate from the government perspective.
‘Saves’ = reduces state spending; ‘Costs’ = increases it.
Do not assume symmetry between pro and con tags.
Distinguish private costs from fiscal costs.
Mark impacts as ‘unclear’ or ‘n/a’ if not directly affected.”

Schema:
{VoteAnnotation.model_json_schema()}
"""

# System prompt for the plain-text summarisation call.
# Fixed two typos in the instruction sent to the model ("Sumarize" and
# "the use provides"); the text has no placeholders, so it is a plain string.
SUMMARIZE_SYSTEM_INSTRUCTIONS = """
You are a parliamentary decision making expert.

Summarize the text the user provides such that the essence of the decision this document is about stays clear.

Summarize to approximately 5 short paragraphs at most, or less.
"""

# Pick one document text at random (no seed is set, so the choice differs
# between runs).
# NOTE(review): `prompt` is not referenced anywhere else in the visible
# notebook — looks like a leftover for manual experiments; confirm.
prompt = texts.sample(1).iloc[0]


def _chat_messages(system_prompt, text):
  """Build a two-message chat: the given system prompt followed by the user text."""
  system_message = {"role": "system", "content": system_prompt}
  user_message = {"role": "user", "content": text}
  return [system_message, user_message]

def messages_for(text):
  """Chat messages for the structured annotation of `text`."""
  return _chat_messages(SYSTEM_INSTRUCTIONS, text)

def sumarize_messages_for(text):
  """Chat messages for summarising `text`."""
  return _chat_messages(SUMMARIZE_SYSTEM_INSTRUCTIONS, text)


def _cached_completion(messages, request):
  """Return the model's text answer for `messages`, using the on-disk cache.

  Parameters
  ----------
  messages : list of dict
      Chat messages; together with `model` they determine the cache file.
  request : callable
      Called with `messages` on a cache miss; must return a chat completion
      whose first choice carries the answer in `message.content`.

  Returns
  -------
  str
      The stripped answer text, either read from the cache or freshly
      requested (and then written to the cache).
  """
  cache_file = CACHE_DIR / f"{_hash_messages(messages, model)}.json"

  # `check_cache` is the form toggle of the setup cell: when switched off the
  # cache is not read, but fresh answers are still written.
  if check_cache and cache_file.exists():
    try:
      with open(cache_file, "r", encoding="utf-8") as f:
        return json.load(f)["response"]
    except (json.JSONDecodeError, KeyError):
      # Truncated or malformed entry (e.g. a worker died while writing):
      # treat it as a miss and regenerate instead of failing forever.
      pass

  completion = request(messages)
  response = completion.choices[0].message.content.strip()

  with open(cache_file, "w", encoding="utf-8") as f:
    json.dump({"messages": messages, "response": response}, f, ensure_ascii=False, indent=2)

  return response


def structure(text):
  """Annotate `text` with the VoteAnnotation schema.

  Returns the model's answer as a JSON string (not a parsed object).
  """
  return _cached_completion(
    messages_for(text),
    lambda messages: client.chat.completions.parse(
      model=model, messages=messages, response_format=VoteAnnotation, ),
  )

def summarize(text):
  """Summarise `text`; returns the summary as plain text."""
  return _cached_completion(
    sumarize_messages_for(text),
    lambda messages: client.chat.completions.create(
      model=model, messages=messages, max_tokens=128000),
  )

In [None]:
# Summarise the mid-sized texts that are not in the response cache yet.
# Fixes: the workers now run `summarize` (the cache check below is for the
# summarise prompt, but `structure` was submitted), `summaries` is created
# here instead of being assumed to exist, and a failure prints the text that
# actually failed rather than the last one of the submit loop.
summaries = dict()

# Threads instead of processes: the work is waiting on HTTP calls, and a
# process pool has to pickle worker exceptions, which fails for OpenAI's
# APIStatusError and leaves `promise.get()` hanging.
with ThreadPool(10) as pool:
  promises = []
  for zaak_id, text in texts.items():
    # NOTE(review): only texts of 3001..10000 characters are summarised —
    # confirm these bounds are intended.
    if text == ""  or len(text) <= 3000 or len(text) > 10000:
      continue

    messages=sumarize_messages_for(text)
    cache_file = CACHE_DIR / f"{_hash_messages(messages, model)}.json"
    if cache_file.exists():
      continue

    promises.append([zaak_id, text, pool.apply_async(summarize, (text,))])


  for zaak_id, text, promise in tqdm(promises):
    try:
      summaries[zaak_id] = promise.get()
    except KeyboardInterrupt:
      raise
    except Exception as e:
      # Best effort: report and carry on with the remaining texts.
      print(text)
      print(e)


In [None]:
#@markdown # Run LLM Guided Generation / Structured output into dataframe

# Request structured annotations for every text that is not cached yet;
# `structure` writes each answer to the cache as a side effect.
# NOTE: this rebinds `responses`, which an earlier cell uses for the fetched
# HTTP responses.
responses = dict()

# Threads instead of processes: the calls are I/O bound, and with a process
# pool a worker's OpenAI APIStatusError cannot be unpickled in the parent,
# which kills the pool's result handler and makes `promise.get()` hang.
with ThreadPool(10) as pool:
  promises = []
  # Submitted back to front ...
  for text in texts.iloc[::-1]:
    if text == "":
      continue

    messages=messages_for(text)
    cache_file = CACHE_DIR / f"{_hash_messages(messages, model)}.json"
    if cache_file.exists():
      continue

    promises.append([text, pool.apply_async(structure, (text,))])

  # ... and collected front to back.
  for text, promise in tqdm(promises[::-1]):
    try:
      responses[text] = promise.get()
    except KeyboardInterrupt:
      raise
    except Exception as e:
      # Best effort: report the failing text and carry on with the rest.
      print(text)
      print(e)


  0%|          | 0/229 [00:00<?, ?it/s]Exception in thread Thread-117 (_handle_results):
Traceback (most recent call last):
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/threading.py", line 1052, in _bootstrap_inner
    self.run()
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/threading.py", line 989, in run
    self._target(*self._args, **self._kwargs)
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/multiprocessing/pool.py", line 579, in _handle_results
    task = get()
           ^^^^^
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/multiprocessing/connection.py", line 250, in recv
    return _ForkingPickler.loads(buf.getbuffer())
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
  0%|          | 0/229 [1:2

AssertionError: Cannot have cache with result_handler not alive

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/herbert/.local/share/uv/python/cpython-3.12.0-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()


In [None]:
# Annotate every text (answers come from the cache where available).
structures = texts.progress_apply(structure)
# `structure` returns the answer as a JSON string, so it is parsed directly;
# each top-level key of the annotation becomes a column.
structured = structures.apply(lambda x: pd.Series(json.loads(x)))