# DR Municipal Election Candidate Scraper - Enhanced Version
This notebook scrapes comprehensive candidate information including:
- Basic info (name, party, municipality)
- Policy priorities (m√¶rkesager)
- **19 candidate test answers (svars)**
- **Personal info (Om section): Uddannelse, Bop√¶l, social media**

## Installation

In [1]:
%pip install selenium beautifulsoup4 pandas webdriver-manager requests lxml

Note: you may need to restart the kernel to use updated packages.


## Import Libraries

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import json
from typing import Dict, List, Any

## Initialize WebDriver

In [3]:
def setup_driver(headless=True):
    """Set up Chrome WebDriver with options"""
    options = Options()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36')
    
    # Auto-install ChromeDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

# Initialize driver
driver = setup_driver(headless=True)
print("‚úÖ WebDriver initialized successfully")

‚úÖ WebDriver initialized successfully


## Helper Functions for Data Extraction

In [4]:
def extract_basic_info(soup: BeautifulSoup, url: str) -> Dict[str, str]:
    """
    Extract basic candidate information from page
    """
    # Extract candidate ID from URL
    url_parts = url.split('/')[-1]
    candidate_id = url_parts.split('-')[0] if '-' in url_parts else ''
    
    # Get title (Name (Party) Municipality | KV25 | DR)
    page_title = soup.find('title')
    title_text = page_title.text if page_title else ''
    
    name = ''
    party = ''
    municipality = ''
    
    if title_text:
        parts = title_text.split('|')[0].strip()
        if '(' in parts and ')' in parts:
            name = parts.split('(')[0].strip()
            party = parts.split('(')[1].split(')')[0].strip()
            municipality = parts.split(')')[1].strip() if len(parts.split(')')) > 1 else ''
    
    return {
        'candidate_id': candidate_id,
        'name': name,
        'party': party,
        'municipality': municipality,
        'url': url
    }


def extract_om_section(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Extract 'Om' (About) section: Uddannelse, Bop√¶l, etc.
    """
    om_data = {
        'uddannelse': '',
        'bopael': '',
        'alder': '',
        'erhverv': '',
        'sociale_medier': []
    }
    
    # Look for definition list (dl) elements which contain the Om information
    dls = soup.find_all('dl')
    
    for dl in dls:
        dts = dl.find_all('dt')
        dds = dl.find_all('dd')
        
        for dt, dd in zip(dts, dds):
            key = dt.get_text(strip=True).lower()
            value = dd.get_text(strip=True)
            
            if 'uddannelse' in key:
                om_data['uddannelse'] = value
            elif 'bop√¶l' in key or 'bopael' in key:
                om_data['bopael'] = value
            elif 'alder' in key:
                om_data['alder'] = value
            elif 'erhverv' in key or 'job' in key or 'besk√¶ftigelse' in key:
                om_data['erhverv'] = value
    
    # Extract social media links
    social_media_section = soup.find(string=re.compile('Sociale medier', re.I))
    if social_media_section:
        parent = social_media_section.find_parent()
        if parent:
            links = parent.find_all('a', href=True)
            om_data['sociale_medier'] = [link['href'] for link in links]
    
    return om_data


def extract_priorities(soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """
    Extract policy priorities (m√¶rkesager)
    """
    priorities = []
    text_content = soup.get_text(separator='\n', strip=True)
    lines = text_content.split('\n')
    
    current_priority = {}
    for line in lines:
        line = line.strip()
        if line.isdigit() and int(line) <= 10:
            if current_priority:
                priorities.append(current_priority)
            current_priority = {'number': int(line), 'text': ''}
        elif current_priority and line and not line.isdigit():
            if current_priority['text']:
                current_priority['text'] += ' '
            current_priority['text'] += line
    
    if current_priority:
        priorities.append(current_priority)
    
    return priorities


def extract_candidate_test_answers(driver, soup: BeautifulSoup) -> Dict[int, str]:
    """
    Extract all 19 answers from the candidate test (svars)
    These are typically loaded dynamically, so we need to interact with the page
    """
    answers = {}
    
    try:
        # Wait for page to fully load
        time.sleep(2)
        
        # Try to find answer elements - these might be in various formats
        # Common patterns: buttons, divs with specific classes, etc.
        
        # Method 1: Look for button elements with answer text
        answer_buttons = driver.find_elements(By.CSS_SELECTOR, 'button[data-answer], .answer-button, .kandidat-svar')
        
        # Method 2: Look for structured answer data in the page
        # Check if there's a JSON data structure embedded
        scripts = soup.find_all('script')
        for script in scripts:
            script_text = script.string
            if script_text and 'svar' in script_text.lower():
                # Try to extract JSON data
                try:
                    # Look for patterns like {"question": 1, "answer": "..."}
                    json_match = re.search(r'\{[^}]*"svar"[^}]*\}', script_text, re.IGNORECASE)
                    if json_match:
                        # This would need to be parsed based on actual structure
                        pass
                except:
                    pass
        
        # Method 3: Look for specific DOM elements that contain answers
        # Update selectors based on actual page structure
        for i in range(1, 20):  # Try to find 19 questions
            try:
                # Try different selectors
                selectors = [
                    f'[data-question="{i}"]',
                    f'#question-{i}',
                    f'.question-{i}',
                    f'[data-question-id="{i}"]'
                ]
                
                for selector in selectors:
                    try:
                        element = driver.find_element(By.CSS_SELECTOR, selector)
                        answer_text = element.text.strip()
                        if answer_text:
                            answers[i] = answer_text
                            break
                    except NoSuchElementException:
                        continue
            except:
                continue
        
        # Method 4: Look for all elements that might contain Q&A
        # Pattern: Look for numbered questions or specific text patterns
        text_content = soup.get_text()
        
        # Look for patterns like "Sp√∏rgsm√•l 1:", "Question 1:", etc.
        question_pattern = re.compile(r'(?:Sp√∏rgsm√•l|Question)\s*(\d+)[:\s]+(.*?)(?=(?:Sp√∏rgsm√•l|Question)\s*\d+|$)', re.DOTALL | re.IGNORECASE)
        matches = question_pattern.findall(text_content)
        
        for match in matches:
            q_num = int(match[0])
            q_text = match[1].strip()[:500]  # Limit length
            if q_num <= 19 and q_text:
                answers[q_num] = q_text
        
    except Exception as e:
        print(f"Warning: Could not extract candidate test answers: {e}")
    
    return answers

## Main Scraping Function - Enhanced

In [5]:
def scrape_candidate_enhanced(candidate_url: str, driver, wait_time=10) -> Dict[str, Any]:
    """
    Comprehensive scraping of candidate page including:
    - Basic info
    - Om section (Uddannelse, Bop√¶l, etc.)
    - Priorities (m√¶rkesager)
    - 19 candidate test answers
    """
    print(f"Scraping: {candidate_url}")
    
    try:
        driver.get(candidate_url)
        
        # Wait for main content to load
        time.sleep(3)
        
        # Scroll to load dynamic content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)
        
        # Get page source
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Extract all data
        candidate_data = extract_basic_info(soup, candidate_url)
        
        # Add Om section
        om_data = extract_om_section(soup)
        candidate_data.update(om_data)
        
        # Add priorities
        priorities = extract_priorities(soup)
        candidate_data['priorities'] = priorities
        candidate_data['num_priorities'] = len(priorities)
        
        # Add candidate test answers (19 svars)
        test_answers = extract_candidate_test_answers(driver, soup)
        candidate_data['test_answers'] = test_answers
        candidate_data['num_test_answers'] = len(test_answers)
        
        # Add individual answer fields for easier DataFrame creation
        for i in range(1, 20):
            candidate_data[f'svar_{i}'] = test_answers.get(i, '')
        
        print(f"  ‚úì Found {len(priorities)} priorities, {len(test_answers)} test answers")
        
        return candidate_data
        
    except Exception as e:
        print(f"  ‚úó Error scraping {candidate_url}: {e}")
        return {
            'url': candidate_url,
            'error': str(e)
        }

## Get Candidate Links from Municipality

In [6]:
def get_candidate_links(municipality_url: str, driver, wait_time=10) -> List[str]:
    """
    Extract all candidate links from a municipality page
    """
    print(f"\nFetching candidate links from: {municipality_url}")
    driver.get(municipality_url)
    time.sleep(3)
    
    # Scroll to load all candidates
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    
    try:
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "a"))
        )
        
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        candidate_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '/kandidater/kommune/' in href:
                full_url = urljoin('https://www.dr.dk', href)
                if full_url not in candidate_links:
                    candidate_links.append(full_url)
        
        print(f"‚úì Found {len(candidate_links)} candidate links")
        return candidate_links
        
    except Exception as e:
        print(f"‚úó Error finding candidate links: {e}")
        return []

## Main Scraping Workflow

In [7]:
def scrape_municipality_enhanced(municipality_url: str, max_candidates=None) -> List[Dict[str, Any]]:
    """
    Scrape all candidates from a municipality with enhanced data
    """
    candidate_links = get_candidate_links(municipality_url, driver)
    
    if max_candidates:
        candidate_links = candidate_links[:max_candidates]
        print(f"Limiting to {max_candidates} candidates for testing")
    
    all_candidates = []
    for i, link in enumerate(candidate_links, 1):
        print(f"\n[{i}/{len(candidate_links)}] ", end='')
        candidate_data = scrape_candidate_enhanced(link, driver)
        all_candidates.append(candidate_data)
        time.sleep(1)  # Be polite
    
    return all_candidates

## Example: Scrape Test Candidate

In [8]:
# Test with a single candidate first
test_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil"

print("Testing with single candidate...\n")
test_candidate = scrape_candidate_enhanced(test_url, driver)

# Display results
print(f"\n{'='*60}")
print("TEST RESULTS")
print(f"{'='*60}")
print(f"Name: {test_candidate.get('name', 'N/A')}")
print(f"Party: {test_candidate.get('party', 'N/A')}")
print(f"Municipality: {test_candidate.get('municipality', 'N/A')}")
print(f"Uddannelse: {test_candidate.get('uddannelse', 'N/A')}")
print(f"Bop√¶l: {test_candidate.get('bopael', 'N/A')}")
print(f"Priorities found: {test_candidate.get('num_priorities', 0)}")
print(f"Test answers found: {test_candidate.get('num_test_answers', 0)}")
print(f"{'='*60}")

Testing with single candidate...

Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil
  ‚úì Found 6 priorities, 0 test answers

TEST RESULTS
Name: Pernille Rosenkrantz-Theil
Party: A
Municipality: K√∏benhavns Kommune
Uddannelse: Bachelor-/diplomuddannelse
Bop√¶l: Br√∏nsh√∏j
Priorities found: 6
Test answers found: 0


## Scrape Municipality 124 (Copenhagen)

In [9]:
# Scrape municipality 124 with limit for testing
municipality_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/124"

# Start with 3 candidates to test
candidates = scrape_municipality_enhanced(municipality_url, max_candidates=3)

print(f"\n{'='*60}")
print(f"Successfully scraped {len([c for c in candidates if 'error' not in c])} candidates")
print(f"{'='*60}")


Fetching candidate links from: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/124
‚úì Found 261 candidate links
Limiting to 3 candidates for testing

[1/3] Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil
  ‚úì Found 6 priorities, 0 test answers

[2/3] Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6116-andreas-keil
  ‚úì Found 6 priorities, 0 test answers

[3/3] Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6132-laura-rosenvinge
  ‚úì Found 6 priorities, 0 test answers

Successfully scraped 3 candidates


## Create Comprehensive DataFrames

In [10]:
# Create main DataFrame with all basic info and Om section
df_main = pd.DataFrame([{
    'candidate_id': c.get('candidate_id', ''),
    'name': c.get('name', ''),
    'party': c.get('party', ''),
    'municipality': c.get('municipality', ''),
    'uddannelse': c.get('uddannelse', ''),
    'bopael': c.get('bopael', ''),
    'alder': c.get('alder', ''),
    'erhverv': c.get('erhverv', ''),
    'sociale_medier': ', '.join(c.get('sociale_medier', [])),
    'num_priorities': c.get('num_priorities', 0),
    'num_test_answers': c.get('num_test_answers', 0),
    'url': c.get('url', '')
} for c in candidates if 'error' not in c])

print("\nüìä Main Candidate Information:")
display(df_main)

# Save main DataFrame
df_main.to_csv('candidates_main.csv', index=False, encoding='utf-8')
print("\n‚úì Saved to candidates_main.csv")


üìä Main Candidate Information:


Unnamed: 0,candidate_id,name,party,municipality,uddannelse,bopael,alder,erhverv,sociale_medier,num_priorities,num_test_answers,url
0,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,Bachelor-/diplomuddannelse,Br√∏nsh√∏j,,,,6,0,https://www.dr.dk/nyheder/politik/kommunalvalg...
1,6116,Andreas Keil,A,K√∏benhavns Kommune,Erhvervsuddannelse,K√∏benhavn,,,,6,0,https://www.dr.dk/nyheder/politik/kommunalvalg...
2,6132,Laura Rosenvinge,A,K√∏benhavns Kommune,Kandidat-/masteruddannelse,K√∏benhavn,,,,6,0,https://www.dr.dk/nyheder/politik/kommunalvalg...



‚úì Saved to candidates_main.csv


## Create DataFrame with 19 Test Answers (Svars)

In [11]:
# Create DataFrame with all 19 svars as columns
svar_columns = ['candidate_id', 'name', 'party', 'municipality'] + [f'svar_{i}' for i in range(1, 20)]

df_svars = pd.DataFrame([{
    'candidate_id': c.get('candidate_id', ''),
    'name': c.get('name', ''),
    'party': c.get('party', ''),
    'municipality': c.get('municipality', ''),
    **{f'svar_{i}': c.get(f'svar_{i}', '') for i in range(1, 20)}
} for c in candidates if 'error' not in c])

print("\nüìä Candidate Test Answers (19 Svars):")
display(df_svars)

# Save svars DataFrame
df_svars.to_csv('candidates_svars.csv', index=False, encoding='utf-8')
print("\n‚úì Saved to candidates_svars.csv")


üìä Candidate Test Answers (19 Svars):


Unnamed: 0,candidate_id,name,party,municipality,svar_1,svar_2,svar_3,svar_4,svar_5,svar_6,...,svar_10,svar_11,svar_12,svar_13,svar_14,svar_15,svar_16,svar_17,svar_18,svar_19
0,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,,,,,,,...,,,,,,,,,,
1,6116,Andreas Keil,A,K√∏benhavns Kommune,,,,,,,...,,,,,,,,,,
2,6132,Laura Rosenvinge,A,K√∏benhavns Kommune,,,,,,,...,,,,,,,,,,



‚úì Saved to candidates_svars.csv


## Create Long-Format DataFrame for Test Answers

In [12]:
# Create long-format DataFrame (one row per answer)
svar_rows = []

for c in candidates:
    if 'error' not in c and 'test_answers' in c:
        for q_num, answer_text in c['test_answers'].items():
            svar_rows.append({
                'candidate_id': c.get('candidate_id', ''),
                'name': c.get('name', ''),
                'party': c.get('party', ''),
                'municipality': c.get('municipality', ''),
                'question_number': q_num,
                'answer': answer_text
            })

df_svars_long = pd.DataFrame(svar_rows)

if not df_svars_long.empty:
    print("\nüìä Test Answers (Long Format):")
    display(df_svars_long.head(10))
    
    df_svars_long.to_csv('candidates_svars_long.csv', index=False, encoding='utf-8')
    print("\n‚úì Saved to candidates_svars_long.csv")
else:
    print("\n‚ö†Ô∏è No test answers found (they may require additional interaction or different selectors)")


‚ö†Ô∏è No test answers found (they may require additional interaction or different selectors)


## Create DataFrame for Priorities

In [13]:
# Create priorities DataFrame
priority_rows = []

for c in candidates:
    if 'error' not in c and 'priorities' in c:
        for p in c['priorities']:
            priority_rows.append({
                'candidate_id': c.get('candidate_id', ''),
                'name': c.get('name', ''),
                'party': c.get('party', ''),
                'municipality': c.get('municipality', ''),
                'priority_number': p['number'],
                'priority_text': p['text']
            })

df_priorities = pd.DataFrame(priority_rows)

print("\nüìä Priorities (M√¶rkesager):")
display(df_priorities)

df_priorities.to_csv('candidates_priorities.csv', index=False, encoding='utf-8')
print("\n‚úì Saved to candidates_priorities.csv")


üìä Priorities (M√¶rkesager):


Unnamed: 0,candidate_id,name,party,municipality,priority_number,priority_text
0,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,1,/19 | √∏konomi | K√∏benhavns Kommune Flere opgav...
1,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,2,/19 | social & velf√¶rd | K√∏benhavns Kommune Of...
2,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,3,/19 | trafik & transport | K√∏benhavns Kommune ...
3,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,1,1.\tGratis vuggestuer og b√∏rnehaver : Det er b...
4,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,2,Sikre parkeringspladser nok i K√∏benhavn : Jeg ...
5,7250,Pernille Rosenkrantz-Theil,A,K√∏benhavns Kommune,3,Et mere retf√¶rdigt boligmarked : Vi har f√•et a...
6,6116,Andreas Keil,A,K√∏benhavns Kommune,1,/19 | √∏konomi | K√∏benhavns Kommune Flere opgav...
7,6116,Andreas Keil,A,K√∏benhavns Kommune,2,/19 | social & velf√¶rd | K√∏benhavns Kommune Of...
8,6116,Andreas Keil,A,K√∏benhavns Kommune,3,/19 | trafik & transport | K√∏benhavns Kommune ...
9,6116,Andreas Keil,A,K√∏benhavns Kommune,1,B√∏rne- ungdomspolitik : Alt for mange b√∏rn mis...



‚úì Saved to candidates_priorities.csv


## Save Complete Raw Data

In [14]:
# Save complete raw data to JSON
with open('candidates_complete.json', 'w', encoding='utf-8') as f:
    json.dump(candidates, f, ensure_ascii=False, indent=2)

print("\n‚úì Saved complete data to candidates_complete.json")


‚úì Saved complete data to candidates_complete.json


## Summary Statistics

In [15]:
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

print(f"\nTotal candidates scraped: {len(df_main)}")
print(f"\nCandidates by party:")
print(df_main['party'].value_counts())

print(f"\nCandidates by education level:")
print(df_main['uddannelse'].value_counts())

print(f"\nAverage priorities per candidate: {df_main['num_priorities'].mean():.2f}")
print(f"Average test answers per candidate: {df_main['num_test_answers'].mean():.2f}")

print(f"\nTop 5 locations (Bop√¶l):")
print(df_main['bopael'].value_counts().head())


SUMMARY STATISTICS

Total candidates scraped: 3

Candidates by party:
party
A    3
Name: count, dtype: int64

Candidates by education level:
uddannelse
Bachelor-/diplomuddannelse    1
Erhvervsuddannelse            1
Kandidat-/masteruddannelse    1
Name: count, dtype: int64

Average priorities per candidate: 6.00
Average test answers per candidate: 0.00

Top 5 locations (Bop√¶l):
bopael
K√∏benhavn    2
Br√∏nsh√∏j     1
Name: count, dtype: int64


## Function to Scrape Multiple Municipalities

In [None]:
def scrape_all_municipalities(municipality_ids: List[int], max_candidates_per_muni=None) -> List[Dict[str, Any]]:
    """
    Scrape candidates from multiple municipalities
    """
    all_candidates = []
    
    for i, muni_id in enumerate(municipality_ids, 1):
        print(f"\n{'='*60}")
        print(f"[{i}/{len(municipality_ids)}] Processing Municipality: {muni_id}")
        print(f"{'='*60}")
        
        muni_url = f"https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/{muni_id}"
        candidates = scrape_municipality_enhanced(muni_url, max_candidates_per_muni)
        all_candidates.extend(candidates)
        
        print(f"\nTotal candidates so far: {len(all_candidates)}")
        time.sleep(2)
    
    return all_candidates

# Example usage (uncomment to run):
# municipality_ids = [124, 101, 147]  # Copenhagen, Copenhagen suburbs, Aarhus, etc.
# all_candidates = scrape_all_municipalities(municipality_ids, max_candidates_per_muni=5)

## Data Quality Check

In [16]:
# Check data completeness
print("\nüìä DATA COMPLETENESS REPORT")
print("="*60)

for col in df_main.columns:
    non_empty = (df_main[col] != '').sum()
    pct = (non_empty / len(df_main) * 100) if len(df_main) > 0 else 0
    print(f"{col:20s}: {non_empty:3d}/{len(df_main)} ({pct:5.1f}%)")

# Check for missing critical data
print("\n‚ö†Ô∏è Potential Issues:")
if df_main['uddannelse'].eq('').sum() > 0:
    print(f"  - {df_main['uddannelse'].eq('').sum()} candidates missing education data")
if df_main['bopael'].eq('').sum() > 0:
    print(f"  - {df_main['bopael'].eq('').sum()} candidates missing residence data")
if df_main['num_test_answers'].eq(0).sum() > 0:
    print(f"  - {df_main['num_test_answers'].eq(0).sum()} candidates missing test answers (may need manual extraction)")


üìä DATA COMPLETENESS REPORT
candidate_id        :   3/3 (100.0%)
name                :   3/3 (100.0%)
party               :   3/3 (100.0%)
municipality        :   3/3 (100.0%)
uddannelse          :   3/3 (100.0%)
bopael              :   3/3 (100.0%)
alder               :   0/3 (  0.0%)
erhverv             :   0/3 (  0.0%)
sociale_medier      :   0/3 (  0.0%)
num_priorities      :   3/3 (100.0%)
num_test_answers    :   3/3 (100.0%)
url                 :   3/3 (100.0%)

‚ö†Ô∏è Potential Issues:
  - 3 candidates missing test answers (may need manual extraction)


## Export Summary Report

In [17]:
# Create a summary report
summary_report = {
    'scrape_date': pd.Timestamp.now().isoformat(),
    'total_candidates': len(df_main),
    'municipalities': df_main['municipality'].unique().tolist(),
    'parties': df_main['party'].value_counts().to_dict(),
    'files_created': [
        'candidates_main.csv',
        'candidates_svars.csv',
        'candidates_svars_long.csv',
        'candidates_priorities.csv',
        'candidates_complete.json'
    ],
    'data_completeness': {
        'uddannelse': (df_main['uddannelse'] != '').sum(),
        'bopael': (df_main['bopael'] != '').sum(),
        'priorities': df_main['num_priorities'].sum(),
        'test_answers': df_main['num_test_answers'].sum()
    }
}

with open('scrape_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, ensure_ascii=False, indent=2)

print("\n‚úì Summary report saved to scrape_summary.json")
print("\n" + json.dumps(summary_report, indent=2, ensure_ascii=False))

TypeError: Object of type int64 is not JSON serializable

## Clean Up: Close WebDriver

In [18]:
# Close the driver when done
driver.quit()
print("\n‚úì WebDriver closed")
print("\n" + "="*60)
print("SCRAPING COMPLETE!")
print("="*60)


‚úì WebDriver closed

SCRAPING COMPLETE!
