# DR Candidate Scraper - REFINED VERSION
## Enhanced with Interactive Elements

This version includes:
- ✅ **Clicks 'vis alle' button** to reveal all 19 test answers
- ✅ **Improved mærkesager extraction** with better parsing
- ✅ Complete Om section (Uddannelse, Bopæl, etc.)
- ✅ Robust error handling and retry logic

## Installation

In [None]:
%pip install selenium webdriver-manager beautifulsoup4 pandas lxml

## Import Libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import json
from typing import Dict, List, Any, Optional

## Initialize WebDriver

In [None]:
def setup_driver(headless=True):
    """Set up Chrome WebDriver with optimized options"""
    options = Options()
    if headless:
        options.add_argument('--headless=new')  # Use new headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    # Disable automation flags
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    
    # Remove webdriver property
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    
    return driver

driver = setup_driver(headless=True)
print("✅ WebDriver initialized successfully")

## Helper Functions - Improved Extraction

In [None]:
def extract_basic_info(soup: BeautifulSoup, url: str) -> Dict[str, str]:
    """
    Extract basic candidate information
    """
    url_parts = url.split('/')[-1]
    candidate_id = url_parts.split('-')[0] if '-' in url_parts else ''
    
    page_title = soup.find('title')
    title_text = page_title.text if page_title else ''
    
    name = ''
    party = ''
    municipality = ''
    
    if title_text:
        parts = title_text.split('|')[0].strip()
        if '(' in parts and ')' in parts:
            name = parts.split('(')[0].strip()
            party = parts.split('(')[1].split(')')[0].strip()
            municipality = parts.split(')')[1].strip() if len(parts.split(')')) > 1 else ''
    
    return {
        'candidate_id': candidate_id,
        'name': name,
        'party': party,
        'municipality': municipality,
        'url': url
    }


def extract_om_section(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Extract Om (About) section with improved parsing
    """
    om_data = {
        'uddannelse': '',
        'bopael': '',
        'alder': '',
        'erhverv': '',
        'sociale_medier': []
    }
    
    # Find Om section heading
    om_heading = soup.find(string=re.compile(r'Om\s+.+', re.IGNORECASE))
    
    if om_heading:
        # Get parent section
        om_section = om_heading.find_parent()
        if om_section:
            # Look for dl elements in this section or nearby
            dls = om_section.find_all('dl') or soup.find_all('dl')
            
            for dl in dls:
                dts = dl.find_all('dt')
                dds = dl.find_all('dd')
                
                for dt, dd in zip(dts, dds):
                    key = dt.get_text(strip=True).lower()
                    value = dd.get_text(strip=True)
                    
                    if 'uddannelse' in key:
                        om_data['uddannelse'] = value
                    elif 'bopæl' in key or 'bopael' in key:
                        om_data['bopael'] = value
                    elif 'alder' in key or 'år' in key:
                        om_data['alder'] = value
                    elif 'erhverv' in key or 'job' in key or 'beskæftigelse' in key or 'stilling' in key:
                        om_data['erhverv'] = value
    
    # Extract social media links
    social_heading = soup.find(string=re.compile('Sociale medier', re.I))
    if social_heading:
        parent = social_heading.find_parent()
        if parent:
            # Look in parent and siblings
            for sibling in parent.find_next_siblings():
                links = sibling.find_all('a', href=True)
                for link in links:
                    href = link['href']
                    if any(platform in href for platform in ['facebook', 'twitter', 'instagram', 'linkedin', 'x.com']):
                        om_data['sociale_medier'].append(href)
    
    return om_data


def extract_maerkesager_improved(soup: BeautifulSoup, driver) -> List[Dict[str, Any]]:
    """
    Improved extraction of mærkesager (policy priorities)
    """
    priorities = []
    
    # Method 1: Find mærkesager section by heading
    maerkesager_heading = soup.find(string=re.compile(r'mærkesager', re.IGNORECASE))
    
    if maerkesager_heading:
        # Get the parent container
        container = maerkesager_heading.find_parent()
        
        # Look for list items
        if container:
            # Try to find ul/ol with list items
            priority_list = container.find_next('ul') or container.find_next('ol')
            
            if priority_list:
                list_items = priority_list.find_all('li', recursive=False)
                
                for idx, li in enumerate(list_items, 1):
                    text = li.get_text(separator=' ', strip=True)
                    
                    # Remove leading number if present
                    text = re.sub(r'^\d+\.?\s*', '', text)
                    
                    # Try to split title from description
                    if ':' in text:
                        parts = text.split(':', 1)
                        title = parts[0].strip()
                        description = parts[1].strip() if len(parts) > 1 else ''
                    else:
                        # Take first sentence as title
                        sentences = text.split('.')
                        title = sentences[0].strip()
                        description = '. '.join(sentences[1:]).strip() if len(sentences) > 1 else ''
                    
                    priorities.append({
                        'number': idx,
                        'title': title,
                        'description': description,
                        'full_text': text
                    })
    
    # Method 2: Fallback - parse from text content if Method 1 fails
    if not priorities:
        text_content = soup.get_text(separator='\n', strip=True)
        lines = text_content.split('\n')
        
        current_priority = None
        for line in lines:
            line = line.strip()
            
            # Check if line is just a number (1-10)
            if line.isdigit() and 1 <= int(line) <= 10:
                if current_priority:
                    priorities.append(current_priority)
                current_priority = {
                    'number': int(line),
                    'title': '',
                    'description': '',
                    'full_text': ''
                }
            elif current_priority is not None and line and not line.isdigit():
                if current_priority['full_text']:
                    current_priority['full_text'] += ' '
                current_priority['full_text'] += line
                
                # Parse title and description
                if ':' in line and not current_priority['title']:
                    parts = line.split(':', 1)
                    current_priority['title'] = parts[0].strip()
                    current_priority['description'] = parts[1].strip() if len(parts) > 1 else ''
        
        if current_priority:
            priorities.append(current_priority)
    
    return priorities

## Click 'Vis Alle' Button and Extract Test Answers

In [None]:
def click_vis_alle_button(driver, max_attempts=3) -> bool:
    """
    Click the 'vis alle' (show all) button to reveal all 19 test answers
    Returns True if successful, False otherwise
    """
    # Possible button texts and selectors
    button_texts = [
        'vis alle',
        'vis alle svar',
        'se alle svar',
        'vis svar',
        'show all',
        'se alle'
    ]
    
    for attempt in range(max_attempts):
        try:
            # Scroll down to trigger any lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            time.sleep(1)
            
            # Try multiple methods to find and click the button
            button_found = False
            
            # Method 1: Find by button text (case insensitive)
            for text in button_texts:
                try:
                    button = driver.find_element(By.XPATH, 
                        f"//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅ', 'abcdefghijklmnopqrstuvwxyzæøå'), '{text}')]")
                    
                    # Scroll to button
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
                    time.sleep(0.5)
                    
                    # Try to click
                    try:
                        button.click()
                    except ElementClickInterceptedException:
                        # Try JavaScript click if regular click fails
                        driver.execute_script("arguments[0].click();", button)
                    
                    print(f"  ✓ Clicked '{text}' button")
                    time.sleep(2)  # Wait for content to load
                    button_found = True
                    break
                    
                except NoSuchElementException:
                    continue
            
            if button_found:
                return True
            
            # Method 2: Find buttons with specific classes
            try:
                buttons = driver.find_elements(By.TAG_NAME, 'button')
                for button in buttons:
                    button_text = button.text.lower()
                    if any(text in button_text for text in button_texts):
                        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
                        time.sleep(0.5)
                        driver.execute_script("arguments[0].click();", button)
                        print(f"  ✓ Clicked button: '{button.text}'")
                        time.sleep(2)
                        return True
            except Exception as e:
                pass
            
            # Method 3: Look for expandable sections
            try:
                expandable = driver.find_element(By.CSS_SELECTOR, '[aria-expanded="false"]')
                driver.execute_script("arguments[0].click();", expandable)
                print("  ✓ Clicked expandable section")
                time.sleep(2)
                return True
            except:
                pass
                
        except Exception as e:
            if attempt < max_attempts - 1:
                time.sleep(1)
                continue
    
    print("  ⚠ Could not find 'vis alle' button (answers may be visible by default)")
    return False


def extract_test_answers_after_click(driver, soup: BeautifulSoup) -> Dict[int, Dict[str, str]]:
    """
    Extract all 19 test answers after clicking 'vis alle'
    Returns dict with question number as key and dict with question text and answer
    """
    answers = {}
    
    # Get updated page source after button click
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    
    # Method 1: Look for question/answer pairs in structured elements
    # Common patterns: divs with data attributes, article elements, etc.
    
    # Try finding by data attributes
    for i in range(1, 20):
        try:
            # Multiple possible selectors
            selectors = [
                f'[data-question="{i}"]',
                f'[data-question-number="{i}"]',
                f'[data-question-id="{i}"]',
                f'#question-{i}',
                f'.question-{i}',
            ]
            
            for selector in selectors:
                try:
                    element = driver.find_element(By.CSS_SELECTOR, selector)
                    question_text = element.get_attribute('data-question-text') or ''
                    answer_text = element.text or element.get_attribute('data-answer') or ''
                    
                    if answer_text:
                        answers[i] = {
                            'question': question_text,
                            'answer': answer_text
                        }
                        break
                except:
                    continue
        except:
            continue
    
    # Method 2: Parse from visible text structure
    if len(answers) < 19:
        # Look for sections that might contain Q&A
        # Pattern: Find elements that contain both questions and answers
        
        # Try finding all article or section elements
        containers = soup.find_all(['article', 'section', 'div'], class_=re.compile(r'(question|answer|test|kandidat)', re.I))
        
        for container in containers:
            text = container.get_text(separator='\n', strip=True)
            
            # Look for numbered patterns
            # Pattern: "1. Question text" followed by answer
            lines = text.split('\n')
            
            current_q_num = None
            current_question = ''
            current_answer = ''
            
            for line in lines:
                # Check if line starts with a number (1-19)
                match = re.match(r'^(\d+)\.?\s+(.+)$', line.strip())
                if match:
                    # Save previous Q&A
                    if current_q_num and current_q_num not in answers:
                        answers[current_q_num] = {
                            'question': current_question,
                            'answer': current_answer
                        }
                    
                    # Start new question
                    current_q_num = int(match.group(1))
                    if current_q_num <= 19:
                        current_question = match.group(2)
                        current_answer = ''
                elif current_q_num:
                    # Add to current answer
                    if current_answer:
                        current_answer += ' '
                    current_answer += line.strip()
            
            # Save last question
            if current_q_num and current_q_num not in answers:
                answers[current_q_num] = {
                    'question': current_question,
                    'answer': current_answer
                }
    
    # Method 3: Look for specific HTML structure based on DR's layout
    if len(answers) < 19:
        # Find all elements that might be answer buttons or indicators
        answer_elements = soup.find_all(['button', 'div', 'span'], 
                                       class_=re.compile(r'(answer|svar|choice)', re.I))
        
        for elem in answer_elements:
            # Look for data attributes or text that indicates question number
            q_num_match = re.search(r'(\d+)', str(elem.get('class', '')) + str(elem.get('id', '')))
            if q_num_match:
                q_num = int(q_num_match.group(1))
                if 1 <= q_num <= 19 and q_num not in answers:
                    answers[q_num] = {
                        'question': '',
                        'answer': elem.get_text(strip=True)
                    }
    
    return answers

## Main Scraping Function - REFINED

In [None]:
def scrape_candidate_refined(candidate_url: str, driver, wait_time=15) -> Dict[str, Any]:
    """
    Refined scraping with button clicking and improved extraction
    """
    print(f"Scraping: {candidate_url}")
    
    try:
        driver.get(candidate_url)
        
        # Wait for initial page load
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)
        
        # Scroll to load content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)
        
        # Get initial page source
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Extract basic info
        candidate_data = extract_basic_info(soup, candidate_url)
        print(f"  ✓ Basic info: {candidate_data['name']} ({candidate_data['party']})")
        
        # Extract Om section
        om_data = extract_om_section(soup)
        candidate_data.update(om_data)
        print(f"  ✓ Om section: Uddannelse={bool(om_data['uddannelse'])}, Bopæl={bool(om_data['bopael'])}")
        
        # Extract mærkesager (improved)
        priorities = extract_maerkesager_improved(soup, driver)
        candidate_data['priorities'] = priorities
        candidate_data['num_priorities'] = len(priorities)
        print(f"  ✓ Found {len(priorities)} mærkesager")
        
        # Click 'vis alle' button to reveal test answers
        button_clicked = click_vis_alle_button(driver)
        
        # Extract test answers
        test_answers = extract_test_answers_after_click(driver, soup)
        candidate_data['test_answers'] = test_answers
        candidate_data['num_test_answers'] = len(test_answers)
        print(f"  ✓ Extracted {len(test_answers)} test answers")
        
        # Add individual answer fields
        for i in range(1, 20):
            if i in test_answers:
                candidate_data[f'svar_{i}_question'] = test_answers[i].get('question', '')
                candidate_data[f'svar_{i}_answer'] = test_answers[i].get('answer', '')
            else:
                candidate_data[f'svar_{i}_question'] = ''
                candidate_data[f'svar_{i}_answer'] = ''
        
        return candidate_data
        
    except Exception as e:
        print(f"  ✗ Error: {e}")
        return {
            'url': candidate_url,
            'error': str(e)
        }

## Get Candidate Links

In [None]:
def get_candidate_links(municipality_url: str, driver, wait_time=10) -> List[str]:
    """
    Extract all candidate links from municipality page
    """
    print(f"\nFetching candidate links from: {municipality_url}")
    driver.get(municipality_url)
    
    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
    time.sleep(3)
    
    # Scroll to load all candidates
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    
    soup = BeautifulSoup(driver.page_source, 'lxml')
    
    candidate_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/kandidater/kommune/' in href:
            full_url = urljoin('https://www.dr.dk', href)
            if full_url not in candidate_links:
                candidate_links.append(full_url)
    
    print(f"✓ Found {len(candidate_links)} candidate links")
    return candidate_links


def scrape_municipality_refined(municipality_url: str, max_candidates: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Scrape all candidates from municipality
    """
    candidate_links = get_candidate_links(municipality_url, driver)
    
    if max_candidates:
        candidate_links = candidate_links[:max_candidates]
        print(f"Limiting to {max_candidates} candidates\n")
    
    all_candidates = []
    for i, link in enumerate(candidate_links, 1):
        print(f"\n{'='*60}")
        print(f"[{i}/{len(candidate_links)}]")
        candidate_data = scrape_candidate_refined(link, driver)
        all_candidates.append(candidate_data)
        time.sleep(2)  # Be polite to server
    
    return all_candidates

## TEST: Single Candidate

In [None]:
# Test with single candidate
test_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil"

print("\n" + "="*60)
print("TESTING WITH SINGLE CANDIDATE")
print("="*60)

test_candidate = scrape_candidate_refined(test_url, driver)

# Display results
print("\n" + "="*60)
print("TEST RESULTS")
print("="*60)
print(f"Name: {test_candidate.get('name', 'N/A')}")
print(f"Party: {test_candidate.get('party', 'N/A')}")
print(f"Municipality: {test_candidate.get('municipality', 'N/A')}")
print(f"\nOm Section:")
print(f"  Uddannelse: {test_candidate.get('uddannelse', 'N/A')}")
print(f"  Bopæl: {test_candidate.get('bopael', 'N/A')}")
print(f"  Alder: {test_candidate.get('alder', 'N/A')}")
print(f"  Erhverv: {test_candidate.get('erhverv', 'N/A')}")
print(f"\nData Extracted:")
print(f"  Mærkesager: {test_candidate.get('num_priorities', 0)}")
print(f"  Test answers: {test_candidate.get('num_test_answers', 0)}")

if test_candidate.get('priorities'):
    print(f"\nFirst Mærkesag:")
    first = test_candidate['priorities'][0]
    print(f"  Title: {first.get('title', 'N/A')[:80]}...")
    print(f"  Description: {first.get('description', 'N/A')[:100]}...")

if test_candidate.get('test_answers'):
    print(f"\nSample Test Answer:")
    first_q = list(test_candidate['test_answers'].keys())[0]
    print(f"  Question {first_q}: {test_candidate['test_answers'][first_q]['question'][:80]}")
    print(f"  Answer: {test_candidate['test_answers'][first_q]['answer'][:80]}")

print("="*60)

## Scrape Full Municipality

In [23]:
# Scrape municipality (start with small number to test)
municipality_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/49"

print("\n" + "="*60)
print("SCRAPING MUNICIPALITY 49 (FBG)")
print("="*60)

# Start with 3 candidates to test
#candidates = scrape_municipality_refined(municipality_url, max_candidates=3)
# Ful municipality scrape
candidates = scrape_municipality_refined(municipality_url)

successful = [c for c in candidates if 'error' not in c]
print(f"\n\n{'='*60}")
print(f"✓ Successfully scraped {len(successful)}/{len(candidates)} candidates")
print(f"{'='*60}")


SCRAPING MUNICIPALITY 49 (FBG)

Fetching candidate links from: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/49
✓ Found 149 candidate links

[1/149]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/5721-michael-vindfeldt
  ✓ Basic info: Michael Vindfeldt (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 3 mærkesager
  ✓ Clicked expandable section
  ✓ Extracted 4 test answers

[2/149]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/5725-sine-heltberg
  ✓ Basic info: Sine Heltberg (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 3 mærkesager
  ✓ Clicked expandable section
  ✓ Extracted 4 test answers

[3/149]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/5719-malte-mathies-loecke
  ✓ Basic info: Malte Mathies Løcke (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 3 mærkesager
  ✓ Clicked expandable section
  ✓

## Create DataFrames

In [24]:
# Main DataFrame
df_main = pd.DataFrame([{
    'candidate_id': c.get('candidate_id', ''),
    'name': c.get('name', ''),
    'party': c.get('party', ''),
    'municipality': c.get('municipality', ''),
    'uddannelse': c.get('uddannelse', ''),
    'bopael': c.get('bopael', ''),
    'alder': c.get('alder', ''),
    'erhverv': c.get('erhverv', ''),
    'sociale_medier': ', '.join(c.get('sociale_medier', [])),
    'num_priorities': c.get('num_priorities', 0),
    'num_test_answers': c.get('num_test_answers', 0),
    'url': c.get('url', '')
} for c in candidates if 'error' not in c])

print("\n📊 Main Candidate DataFrame:")
display(df_main)

# Mærkesager DataFrame with improved structure
maerkesager_rows = []
for c in candidates:
    if 'error' not in c and 'priorities' in c:
        for p in c['priorities']:
            maerkesager_rows.append({
                'candidate_id': c.get('candidate_id', ''),
                'name': c.get('name', ''),
                'party': c.get('party', ''),
                'municipality': c.get('municipality', ''),
                'priority_number': p['number'],
                'priority_title': p.get('title', ''),
                'priority_description': p.get('description', ''),
                'priority_full_text': p.get('full_text', '')
            })

df_maerkesager = pd.DataFrame(maerkesager_rows)
print("\n📊 Mærkesager DataFrame:")
display(df_maerkesager)

# Test Answers DataFrame (wide format)
svar_columns = ['candidate_id', 'name', 'party', 'municipality']
for i in range(1, 20):
    svar_columns.extend([f'svar_{i}_question', f'svar_{i}_answer'])

df_svars_wide = pd.DataFrame([{
    'candidate_id': c.get('candidate_id', ''),
    'name': c.get('name', ''),
    'party': c.get('party', ''),
    'municipality': c.get('municipality', ''),
    **{f'svar_{i}_question': c.get(f'svar_{i}_question', '') for i in range(1, 20)},
    **{f'svar_{i}_answer': c.get(f'svar_{i}_answer', '') for i in range(1, 20)}
} for c in candidates if 'error' not in c])

print("\n📊 Test Answers DataFrame (Wide Format):")
print(f"Shape: {df_svars_wide.shape}")
display(df_svars_wide[['name', 'party', 'svar_1_question', 'svar_1_answer']].head())

# Test Answers DataFrame (long format)
svar_rows = []
for c in candidates:
    if 'error' not in c and 'test_answers' in c:
        for q_num, qa in c['test_answers'].items():
            svar_rows.append({
                'candidate_id': c.get('candidate_id', ''),
                'name': c.get('name', ''),
                'party': c.get('party', ''),
                'municipality': c.get('municipality', ''),
                'question_number': q_num,
                'question_text': qa.get('question', ''),
                'answer_text': qa.get('answer', '')
            })

df_svars_long = pd.DataFrame(svar_rows)
print("\n📊 Test Answers DataFrame (Long Format):")
display(df_svars_long.head(10))


📊 Main Candidate DataFrame:


Unnamed: 0,candidate_id,name,party,municipality,uddannelse,bopael,alder,erhverv,sociale_medier,num_priorities,num_test_answers,url
0,5721,Michael Vindfeldt,A,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg,,,"//instagram.com/michaelv, //facebook.com/vindf...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
1,5725,Sine Heltberg,A,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg,,,"https://www.instagram.com/sineheltberg, https:...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
2,5719,Malte Mathies Løcke,A,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg,,,"https://www.instagram.com/maltemathieslocke/, ...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
3,5710,Christina Sylvest-Noer,A,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg Kommune,,,"https://www.instagram.com/sylvest_noer/, https...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
4,5729,Thomas Frank,A,Frederiksberg Kommune,Erhvervsuddannelse,Frederiksberg,,,"https://www.instagram.com/stilladsarbejderen/,...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
...,...,...,...,...,...,...,...,...,...,...,...,...
144,4469,Mette Bram,Å,Frederiksberg Kommune,Kandidat-/masteruddannelse,FREDERIKSBERG,,,"https://www.instagram.com/alternativmette/, ht...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
145,4470,Kim Hjerrild,Å,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg,,,"https://www.instagram.com/kim_hjerrild/, https...",3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
146,4472,Tim Tue Wodskou,Å,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg C,,,"https://www.facebook.com/ Tim Wodskou, https:...",2,4,https://www.dr.dk/nyheder/politik/kommunalvalg...
147,4471,Annedorthe Kring,Å,Frederiksberg Kommune,Kandidat-/masteruddannelse,Frederiksberg C,,,https://www.facebook.com/annedorthe.kring/,3,4,https://www.dr.dk/nyheder/politik/kommunalvalg...



📊 Mærkesager DataFrame:


Unnamed: 0,candidate_id,name,party,municipality,priority_number,priority_title,priority_description,priority_full_text
0,5721,Michael Vindfeldt,A,Frederiksberg Kommune,1,Skolemad i skolerne og renovering af skoler og...,,Skolemad i skolerne og renovering af skoler og...
1,5721,Michael Vindfeldt,A,Frederiksberg Kommune,2,Rengøring hver uge til hjemmehjælpsmodtagere m...,,Rengøring hver uge til hjemmehjælpsmodtagere m...
2,5721,Michael Vindfeldt,A,Frederiksberg Kommune,3,"400 nye træer de næste fire år, flere lommepar...",,"400 nye træer de næste fire år, flere lommepar..."
3,5725,Sine Heltberg,A,Frederiksberg Kommune,1,Værdigt seniorliv,"Rengøring hver 2. uge i stedet for hver 3., me...",Værdigt seniorliv : Rengøring hver 2. uge i st...
4,5725,Sine Heltberg,A,Frederiksberg Kommune,2,Bedre forhold for børn og unge,Bedre normeringer i daginstitutionerne og max ...,Bedre forhold for børn og unge : Bedre normeri...
...,...,...,...,...,...,...,...,...
359,4471,Annedorthe Kring,Å,Frederiksberg Kommune,2,Fokus på dyrevelfærd i kommunenes indkøbspolitik,Frederiksbeg kommune bør benytte sin rolle som...,Fokus på dyrevelfærd i kommunenes indkøbspolit...
360,4471,Annedorthe Kring,Å,Frederiksberg Kommune,3,Åben Åen / Bispengbuen delvis ned,Ønsker sammen med Københavns Kommune at der ar...,Åben Åen / Bispengbuen delvis ned : Ønsker sam...
361,4473,Thomas Mølgaard Andersen,Å,Frederiksberg Kommune,1,Grøn udvikling,Frederiksberg skal være en grøn frontløber med...,Grøn udvikling : Frederiksberg skal være en gr...
362,4473,Thomas Mølgaard Andersen,Å,Frederiksberg Kommune,2,Social udvikling,Alternativet på Frederiksberg tror på mere til...,Social udvikling : Alternativet på Frederiksbe...



📊 Test Answers DataFrame (Wide Format):
Shape: (149, 42)


Unnamed: 0,name,party,svar_1_question,svar_1_answer
0,Michael Vindfeldt,A,,
1,Sine Heltberg,A,,
2,Malte Mathies Løcke,A,,
3,Christina Sylvest-Noer,A,,
4,Thomas Frank,A,,



📊 Test Answers DataFrame (Long Format):


Unnamed: 0,candidate_id,name,party,municipality,question_number,question_text,answer_text
0,5721,Michael Vindfeldt,A,Frederiksberg Kommune,6,,Se Michaels svar1/19| økonomi| Frederiksberg K...
1,5721,Michael Vindfeldt,A,Frederiksberg Kommune,2,,UenigMichaelssvarEnig
2,5721,Michael Vindfeldt,A,Frederiksberg Kommune,8,,UenigMichaelssvarEnig
3,5721,Michael Vindfeldt,A,Frederiksberg Kommune,1,,
4,5725,Sine Heltberg,A,Frederiksberg Kommune,6,,Se Sines svar1/19| økonomi| Frederiksberg Komm...
5,5725,Sine Heltberg,A,Frederiksberg Kommune,2,,UenigSinessvarEnig
6,5725,Sine Heltberg,A,Frederiksberg Kommune,8,,UenigSinessvarEnig
7,5725,Sine Heltberg,A,Frederiksberg Kommune,1,,
8,5719,Malte Mathies Løcke,A,Frederiksberg Kommune,6,,Se Malte Mathies' svar1/19| økonomi| Frederiks...
9,5719,Malte Mathies Løcke,A,Frederiksberg Kommune,2,,UenigMalte Mathies'svarEnig


## Save All Data

In [25]:
# Save to CSV
df_main.to_csv('candidates_main_refined.csv', index=False, encoding='utf-8')
df_maerkesager.to_csv('candidates_maerkesager_refined.csv', index=False, encoding='utf-8')
df_svars_wide.to_csv('candidates_svars_wide_refined.csv', index=False, encoding='utf-8')
if not df_svars_long.empty:
    df_svars_long.to_csv('candidates_svars_long_refined.csv', index=False, encoding='utf-8')

# Save raw JSON
with open('candidates_complete_refined.json', 'w', encoding='utf-8') as f:
    json.dump(candidates, f, ensure_ascii=False, indent=2)

print("\n✅ All data saved:")
print("  - candidates_main_refined.csv")
print("  - candidates_maerkesager_refined.csv")
print("  - candidates_svars_wide_refined.csv")
if not df_svars_long.empty:
    print("  - candidates_svars_long_refined.csv")
print("  - candidates_complete_refined.json")


✅ All data saved:
  - candidates_main_refined.csv
  - candidates_maerkesager_refined.csv
  - candidates_svars_wide_refined.csv
  - candidates_svars_long_refined.csv
  - candidates_complete_refined.json


## Summary Statistics

In [27]:
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

print(f"\nTotal candidates: {len(df_main)}")
print(f"\nData Completeness:")
print(f"  Name: {(df_main['name'] != '').sum()}/{len(df_main)}")
print(f"  Uddannelse: {(df_main['uddannelse'] != '').sum()}/{len(df_main)}")
print(f"  Bopæl: {(df_main['bopael'] != '').sum()}/{len(df_main)}")
print(f"  Mærkesager: {df_main['num_priorities'].sum()} total")
print(f"  Test answers: {df_main['num_test_answers'].sum()} total")

print(f"\nAverage per candidate:")
print(f"  Mærkesager: {df_main['num_priorities'].mean():.1f}")
print(f"  Test answers: {df_main['num_test_answers'].mean():.1f}")

if not df_maerkesager.empty:
    print(f"\nMærkesager with titles: {(df_maerkesager['priority_title'] != '').sum()}/{len(df_maerkesager)}")

print("\n" + "="*60)


SUMMARY STATISTICS

Total candidates: 149

Data Completeness:
  Name: 149/149
  Uddannelse: 110/149
  Bopæl: 113/149
  Mærkesager: 364 total
  Test answers: 488 total

Average per candidate:
  Mærkesager: 2.4
  Test answers: 3.3

Mærkesager with titles: 340/364



## Function to Scrape Multiple Municipalities

In [None]:
def scrape_multiple_municipalities(municipality_ids: List[int], max_per_muni: Optional[int] = None):
    """
    Scrape candidates from multiple municipalities
    """
    all_candidates = []
    
    for i, muni_id in enumerate(municipality_ids, 1):
        print(f"\n{'='*60}")
        print(f"MUNICIPALITY {i}/{len(municipality_ids)}: {muni_id}")
        print(f"{'='*60}")
        
        muni_url = f"https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/{muni_id}"
        candidates = scrape_municipality_refined(muni_url, max_per_muni)
        all_candidates.extend(candidates)
        
        print(f"\nTotal candidates so far: {len(all_candidates)}")
        time.sleep(3)
    
    return all_candidates

# Example (uncomment to use):
municipality_ids = [
    165,  # Albertslund
    153,  # Brøndby
    159,  # Gladsaxe
    161,  # Glostrup
    151,  # Ballerup
    190,  # Furesø
    253,  # Greve
    183,  # Ishøj
    173,  # Lyngby-Taarbæk
    230,  # Rudersdal
    187   # Vallensbæk
]
#all_candidates = scrape_multiple_municipalities(municipality_ids, max_per_muni=5)
#all_candidates = scrape_multiple_municipalities(municipality_ids)



MUNICIPALITY 1/11: 165

Fetching candidate links from: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/165
✓ Found 177 candidate links

[1/177]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7261-peter-rahbaek-juel
  ✓ Basic info: Peter Rahbæk Juel (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 3 mærkesager
  ✓ Clicked expandable section
  ✓ Extracted 4 test answers

[2/177]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6362-helle-raun-oddershede
  ✓ Basic info: Helle Raun Oddershede (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 5 mærkesager
  ✓ Clicked expandable section
  ✓ Extracted 4 test answers

[3/177]
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6366-mads-thomsen
  ✓ Basic info: Mads Thomsen (A)
  ✓ Om section: Uddannelse=True, Bopæl=True
  ✓ Found 3 mærkesager
  ✓ Clicked expandable section
  ✓ Extr

## Close WebDriver

In [28]:
# Always close when done
driver.quit()
print("\n✅ WebDriver closed")
print("\n" + "="*60)
print("SCRAPING COMPLETE!")
print("="*60)


✅ WebDriver closed

SCRAPING COMPLETE!
