# DR Municipal Election Candidate Scraper
This notebook scrapes candidate information from DR's municipal election pages.

## Installation
Run this cell first to install required packages:

In [None]:
%pip install selenium beautifulsoup4 pandas webdriver-manager requests lxml

## Import Libraries

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import json

## Initialize WebDriver
Using Chrome in headless mode for efficiency

In [7]:
def setup_driver(headless=True):
    """Set up Chrome WebDriver with options"""
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    
    driver = webdriver.Chrome(options=options)
    return driver

# Initialize driver
driver = setup_driver(headless=True)
print("WebDriver initialized successfully")

WebDriver initialized successfully


## Function to Extract Candidate Links from a Municipality Page

In [8]:
def get_candidate_links(municipality_url, driver, wait_time=10):
    """
    Extract all candidate links from a municipality page
    
    Args:
        municipality_url: URL of the municipality page (e.g., .../124)
        driver: Selenium WebDriver instance
        wait_time: Time to wait for page elements to load
    
    Returns:
        List of candidate URLs
    """
    print(f"\nFetching candidate links from: {municipality_url}")
    driver.get(municipality_url)
    
    # Wait for page to load
    time.sleep(3)
    
    # Try multiple selectors to find candidate links
    candidate_links = []
    
    try:
        # Wait for candidate elements to load
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "a"))
        )
        
        # Get page source and parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Look for links containing '/kandidater/kommune/'
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '/kandidater/kommune/' in href:
                full_url = urljoin('https://www.dr.dk', href)
                if full_url not in candidate_links:
                    candidate_links.append(full_url)
        
        print(f"Found {len(candidate_links)} candidate links")
        
    except Exception as e:
        print(f"Error finding candidate links: {e}")
    
    return candidate_links

## Function to Scrape Individual Candidate Data

In [10]:
def scrape_candidate_data(candidate_url, driver, wait_time=10):
    """
    Scrape data from an individual candidate page
    
    Args:
        candidate_url: URL of the candidate page
        driver: Selenium WebDriver instance
        wait_time: Time to wait for page elements to load
    
    Returns:
        Dictionary containing candidate data
    """
    print(f"Scraping: {candidate_url}")
    
    try:
        driver.get(candidate_url)
        time.sleep(2)
        
        # Get page source
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Extract candidate ID and name from URL
        url_parts = candidate_url.split('/')[-1]
        candidate_id = url_parts.split('-')[0] if '-' in url_parts else ''
        
        # Extract page title (contains name and party)
        page_title = soup.find('title')
        title_text = page_title.text if page_title else ''
        
        # Parse name and party from title
        name = ''
        party = ''
        municipality = ''
        
        if title_text:
            # Format: "Name (Party) Municipality | KV25 | DR"
            parts = title_text.split('|')[0].strip()
            if '(' in parts and ')' in parts:
                name_part = parts.split('(')[0].strip()
                party_part = parts.split('(')[1].split(')')[0].strip()
                municipality_part = parts.split(')')[1].strip() if len(parts.split(')')) > 1 else ''
                
                name = name_part
                party = party_part
                municipality = municipality_part
        
        # Extract policy priorities (numbered items)
        priorities = []
        
        # Look for numbered content
        text_content = soup.get_text(separator='\n', strip=True)
        lines = text_content.split('\n')
        
        current_priority = {}
        for i, line in enumerate(lines):
            line = line.strip()
            # Check if line is a number (priority number)
            if line.isdigit() and int(line) <= 10:
                # If we have a previous priority, save it
                if current_priority:
                    priorities.append(current_priority)
                current_priority = {'number': int(line), 'text': ''}
            elif current_priority and line and not line.isdigit():
                # Add text to current priority
                if current_priority['text']:
                    current_priority['text'] += ' '
                current_priority['text'] += line
        
        # Add last priority
        if current_priority:
            priorities.append(current_priority)
        
        # Extract any contact information
        email = ''
        phone = ''
        website = ''
        
        # Look for email
        email_match = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text_content)
        if email_match:
            email = email_match.group(0)
        
        # Look for phone numbers
        phone_match = re.search(r'\+?\d[\d\s-]{7,}\d', text_content)
        if phone_match:
            phone = phone_match.group(0)
        
        # Compile all data
        candidate_data = {
            'url': candidate_url,
            'candidate_id': candidate_id,
            'name': name,
            'party': party,
            'municipality': municipality,
            'email': email,
            'phone': phone,
            'website': website,
            'priorities': priorities,
            'num_priorities': len(priorities)
        }
        
        return candidate_data
        
    except Exception as e:
        print(f"Error scraping {candidate_url}: {e}")
        return {
            'url': candidate_url,
            'error': str(e)
        }

## Main Scraping Function

In [11]:
def scrape_municipality(municipality_url, max_candidates=None):
    """
    Scrape all candidates from a municipality
    
    Args:
        municipality_url: URL of the municipality page
        max_candidates: Maximum number of candidates to scrape (None for all)
    
    Returns:
        List of candidate data dictionaries
    """
    # Get candidate links
    candidate_links = get_candidate_links(municipality_url, driver)
    
    if max_candidates:
        candidate_links = candidate_links[:max_candidates]
    
    # Scrape each candidate
    all_candidates = []
    for i, link in enumerate(candidate_links, 1):
        print(f"\nProcessing candidate {i}/{len(candidate_links)}")
        candidate_data = scrape_candidate_data(link, driver)
        all_candidates.append(candidate_data)
        
        # Be polite to the server
        time.sleep(1)
    
    return all_candidates

## Example: Scrape Municipality 124

In [12]:
# Example municipality URL
municipality_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/124"

# Scrape candidates (limit to 5 for testing)
candidates = scrape_municipality(municipality_url, max_candidates=5)

print(f"\n{'='*60}")
print(f"Scraped {len(candidates)} candidates")
print(f"{'='*60}")


Fetching candidate links from: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/124
Found 261 candidate links

Processing candidate 1/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil
Found 261 candidate links

Processing candidate 1/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/7250-pernille-rosenkrantz-theil

Processing candidate 2/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6116-andreas-keil

Processing candidate 2/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6116-andreas-keil

Processing candidate 3/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6132-laura-rosenvinge

Processing candidate 3/5
Scraping: https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/kandidater/kommune/6132-laura-r

## Convert to DataFrame and Display

In [13]:
# Create DataFrame with basic info
df_basic = pd.DataFrame([{
    'candidate_id': c['candidate_id'],
    'name': c['name'],
    'party': c['party'],
    'municipality': c['municipality'],
    'email': c['email'],
    'phone': c['phone'],
    'num_priorities': c['num_priorities'],
    'url': c['url']
} for c in candidates if 'error' not in c])

print("\nBasic Candidate Information:")
display(df_basic)

# Show detailed priorities for first candidate
if candidates and 'priorities' in candidates[0]:
    print(f"\n\nDetailed Priorities for {candidates[0].get('name', 'Unknown')}:")
    for priority in candidates[0]['priorities']:
        print(f"\n{priority['number']}. {priority['text'][:200]}...")


Basic Candidate Information:


Unnamed: 0,candidate_id,name,party,municipality,email,phone,num_priorities,url
0,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,,35 20 30 40,6,https://www.dr.dk/nyheder/politik/kommunalvalg...
1,6116,Andreas Keil,A,Københavns Kommune,,35 20 30 40,6,https://www.dr.dk/nyheder/politik/kommunalvalg...
2,6132,Laura Rosenvinge,A,Københavns Kommune,,35 20 30 40,6,https://www.dr.dk/nyheder/politik/kommunalvalg...
3,6125,Jakob Hougaard,A,Københavns Kommune,,35 20 30 40,6,https://www.dr.dk/nyheder/politik/kommunalvalg...
4,6137,Maria Kornbek,A,Københavns Kommune,,35 20 30 40,5,https://www.dr.dk/nyheder/politik/kommunalvalg...




Detailed Priorities for Pernille Rosenkrantz-Theil:

1. /19 | økonomi | Københavns Kommune Flere opgaver i den offentlige sektor skal fremover løses af private virksomheder Uenig Pernilles svar Enig "Vores velfærd og driften af byen skal være på fællesskab...

2. /19 | social & velfærd | Københavns Kommune Offentlige institutioner tager for mange hensyn til religiøse minoriteter, for eksempel ved at tilbyde måltider uden svinekød Uenig Pernilles svar Enig "Køb...

3. /19 | trafik & transport | Københavns Kommune Investering i vejnettet haster mere end investering i kollektiv trafik Uenig Pernilles svar Enig "Man skal kunne have bil i København. Derfor vil vi også ...

1. 1.	Gratis vuggestuer og børnehaver : Det er blevet alt for dyrt at bo i København, og mange presses ud af byen. Man kan ikke sænke boligpriserne med et fingerknips, men jeg vil gøre det billigere at b...

2. Sikre parkeringspladser nok i København : Jeg vil sikre, at der er parkeringspladser nok i København. For mang

## Expand Priorities into Separate DataFrame

In [14]:
# Create expanded DataFrame with one row per priority
priority_rows = []

for candidate in candidates:
    if 'error' not in candidate and 'priorities' in candidate:
        for priority in candidate['priorities']:
            priority_rows.append({
                'candidate_id': candidate['candidate_id'],
                'name': candidate['name'],
                'party': candidate['party'],
                'municipality': candidate['municipality'],
                'priority_number': priority['number'],
                'priority_text': priority['text']
            })

df_priorities = pd.DataFrame(priority_rows)

print("\nExpanded Priorities DataFrame:")
display(df_priorities.head(10))

print(f"\nTotal priorities: {len(df_priorities)}")


Expanded Priorities DataFrame:


Unnamed: 0,candidate_id,name,party,municipality,priority_number,priority_text
0,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,1,/19 | økonomi | Københavns Kommune Flere opgav...
1,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,2,/19 | social & velfærd | Københavns Kommune Of...
2,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,3,/19 | trafik & transport | Københavns Kommune ...
3,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,1,1.\tGratis vuggestuer og børnehaver : Det er b...
4,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,2,Sikre parkeringspladser nok i København : Jeg ...
5,7250,Pernille Rosenkrantz-Theil,A,Københavns Kommune,3,Et mere retfærdigt boligmarked : Vi har fået a...
6,6116,Andreas Keil,A,Københavns Kommune,1,/19 | økonomi | Københavns Kommune Flere opgav...
7,6116,Andreas Keil,A,Københavns Kommune,2,/19 | social & velfærd | Københavns Kommune Of...
8,6116,Andreas Keil,A,Københavns Kommune,3,/19 | trafik & transport | Københavns Kommune ...
9,6116,Andreas Keil,A,Københavns Kommune,1,Børne- ungdomspolitik : Alt for mange børn mis...



Total priorities: 29


## Save Data to Files

In [15]:
# Save to CSV
df_basic.to_csv('candidates_basic.csv', index=False, encoding='utf-8')
df_priorities.to_csv('candidates_priorities.csv', index=False, encoding='utf-8')

# Save raw data to JSON
with open('candidates_raw.json', 'w', encoding='utf-8') as f:
    json.dump(candidates, f, ensure_ascii=False, indent=2)

print("Data saved to:")
print("- candidates_basic.csv")
print("- candidates_priorities.csv")
print("- candidates_raw.json")

Data saved to:
- candidates_basic.csv
- candidates_priorities.csv
- candidates_raw.json


## Function to Scrape Multiple Municipalities

In [None]:
def scrape_multiple_municipalities(municipality_ids, max_candidates_per_municipality=None):
    """
    Scrape candidates from multiple municipalities
    
    Args:
        municipality_ids: List of municipality IDs (e.g., [124, 101, 147])
        max_candidates_per_municipality: Max candidates per municipality
    
    Returns:
        Combined list of all candidates
    """
    all_candidates = []
    
    for muni_id in municipality_ids:
        print(f"\n{'='*60}")
        print(f"Processing Municipality: {muni_id}")
        print(f"{'='*60}")
        
        muni_url = f"https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/{muni_id}"
        candidates = scrape_municipality(muni_url, max_candidates_per_municipality)
        all_candidates.extend(candidates)
        
        print(f"\nTotal candidates so far: {len(all_candidates)}")
        time.sleep(2)  # Be polite between municipalities
    
    return all_candidates

# Example: Scrape multiple municipalities
# municipality_ids = [124, 101, 147]  # Add your municipality IDs
# all_candidates = scrape_multiple_municipalities(municipality_ids, max_candidates_per_municipality=5)

## Data Analysis Examples

In [None]:
# Count candidates by party
print("Candidates by Party:")
print(df_basic['party'].value_counts())

print("\nCandidates by Municipality:")
print(df_basic['municipality'].value_counts())

# Average number of priorities per candidate
print(f"\nAverage priorities per candidate: {df_basic['num_priorities'].mean():.2f}")

## Text Analysis of Priorities

In [None]:
# Find most common words in priorities
from collections import Counter
import re

# Combine all priority texts
all_priority_text = ' '.join(df_priorities['priority_text'].astype(str))

# Simple word frequency (you can enhance this with proper NLP)
words = re.findall(r'\b\w+\b', all_priority_text.lower())
word_freq = Counter(words)

# Remove common Danish stop words (add more as needed)
stop_words = {'og', 'i', 'til', 'at', 'det', 'er', 'en', 'for', 'med', 'på', 'som', 'der', 'af', 'de', 'vi'}
filtered_words = {word: count for word, count in word_freq.items() if word not in stop_words and len(word) > 3}

print("\nTop 20 Most Common Words in Priorities:")
for word, count in sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{word}: {count}")

## Clean Up: Close WebDriver

In [None]:
# Always close the driver when done
driver.quit()
print("WebDriver closed")