# DR Candidate Scraper (Simple Version)
This is a simpler version using requests + BeautifulSoup (no Selenium required).
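**Note:** `requests` only downloads the HTML the server sends and does not execute JavaScript, so this version may return no data if the website loads its content with JavaScript.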

## Installation

In [None]:
!pip install requests beautifulsoup4 pandas lxml

## Import Libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import json

## Set Up Session

In [None]:
# Shared HTTP session used by the scraping functions below; the browser-like
# User-Agent is attached to every request sent through it.
session = requests.Session()
session.headers['User-Agent'] = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
)

print("Session created")

## Function to Get Candidate Links

In [None]:
def get_candidate_links_simple(municipality_url):
    """
    Collect the candidate page URLs linked from a municipality page.

    The page is downloaded through the shared ``session`` and parsed with
    BeautifulSoup (lxml parser). Every ``<a href>`` whose target contains
    ``/kandidater/kommune/`` is resolved against ``https://www.dr.dk`` and
    kept once, in order of first appearance.

    Args:
        municipality_url: URL of the municipality page to fetch.

    Returns:
        List of absolute candidate URLs without duplicates. If anything
        raises while fetching or parsing, the error is printed and an
        empty list is returned instead.
    """
    print(f"\nFetching: {municipality_url}")

    try:
        page = session.get(municipality_url, timeout=10)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'lxml')

        # Dict keys keep insertion order, so this removes duplicates while
        # preserving the order in which the links appear on the page.
        unique_urls = {}
        for anchor in document.find_all('a', href=True):
            target = anchor['href']
            if '/kandidater/kommune/' not in target:
                continue
            unique_urls.setdefault(urljoin('https://www.dr.dk', target), None)

        candidate_links = list(unique_urls)
        print(f"Found {len(candidate_links)} candidate links")
        return candidate_links

    except Exception as e:
        # Best-effort scraper: report the problem and hand back "no links".
        print(f"Error: {e}")
        return []

## Function to Scrape Candidate Data

In [None]:
# E-mail address anywhere in the page text.
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# Phone-like run: optional '+', then at least nine characters made of digits,
# whitespace or dashes, starting and ending with a digit.
_PHONE_RE = re.compile(r'\+?\d[\d\s-]{7,}\d')


def _parse_title(title_text):
    """Split a '<name> (<party>) <municipality> | ...' title into (name, party, municipality)."""
    heading = title_text.split('|')[0].strip()
    name_part, opening, remainder = heading.partition('(')
    party_part, closing, tail = remainder.partition(')')
    if not opening or not closing:
        # Title does not follow the expected layout; leave all fields empty.
        return '', '', ''
    return name_part.strip(), party_part.strip(), tail.strip()


def _parse_priorities(text_content):
    """Group page text lines into priorities, each started by a line holding a number <= 10."""
    priorities = []
    current = None
    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        # isdecimal() (unlike isdigit()) is only true for characters int()
        # can convert, so lines such as '²' no longer raise ValueError.
        if line.isdecimal() and int(line) <= 10:
            if current is not None:
                priorities.append(current)
            current = {'number': int(line), 'text': ''}
        elif current is not None and line and not line.isdigit():
            # Other purely numeric lines are skipped; everything else is
            # appended to the text of the priority currently being read.
            current['text'] = f"{current['text']} {line}" if current['text'] else line
    if current is not None:
        priorities.append(current)
    return priorities


def _find_phone(text_content):
    """Return the first phone-like number that sits on a single line, or ''."""
    # Searching line by line stops the pattern's \s from joining numbers
    # that merely happen to be on consecutive lines of the page text.
    for line in text_content.splitlines():
        match = _PHONE_RE.search(line)
        if match:
            return match.group(0)
    return ''


def scrape_candidate_simple(candidate_url):
    """
    Scrape an individual candidate page.

    The page is fetched through the shared ``session``, parsed with
    BeautifulSoup (lxml parser) and reduced to a flat record.

    Args:
        candidate_url: Absolute URL of the candidate page.

    Returns:
        On success, a dict with the keys:
            url            -- the URL that was scraped
            candidate_id   -- text before the first '-' in the last URL
                              path segment ('' if that segment has no '-')
            name, party, municipality
                           -- taken from the page <title>, which is
                              expected to look like
                              'Name (Party) Municipality | ...'; all ''
                              when the title does not match
            email, phone   -- first match found in the page text, or ''
            priorities     -- list of {'number': int, 'text': str}
            num_priorities -- len(priorities)
        If anything raises, the error is printed and
        ``{'url': candidate_url, 'error': <message>}`` is returned.
    """
    print(f"Scraping: {candidate_url}")

    try:
        response = session.get(candidate_url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        # Last path segment; a trailing slash would otherwise give ''.
        slug = candidate_url.rstrip('/').split('/')[-1]
        candidate_id = slug.split('-')[0] if '-' in slug else ''

        title_tag = soup.find('title')
        name, party, municipality = _parse_title(title_tag.text if title_tag else '')

        # One text fragment per line; the priority and contact parsers
        # below work on this flattened view of the page.
        text_content = soup.get_text(separator='\n', strip=True)

        priorities = _parse_priorities(text_content)

        email_match = _EMAIL_RE.search(text_content)
        email = email_match.group(0) if email_match else ''
        phone = _find_phone(text_content)

        return {
            'url': candidate_url,
            'candidate_id': candidate_id,
            'name': name,
            'party': party,
            'municipality': municipality,
            'email': email,
            'phone': phone,
            'priorities': priorities,
            'num_priorities': len(priorities)
        }

    except Exception as e:
        # Best-effort scraper: one broken page must not abort a whole run.
        print(f"Error scraping {candidate_url}: {e}")
        return {'url': candidate_url, 'error': str(e)}

## Main Scraping Function

In [None]:
def scrape_municipality_simple(municipality_url, max_candidates=None, delay=1.0):
    """
    Scrape all candidates from a municipality.

    Args:
        municipality_url: URL of the municipality page whose candidate
            links are followed.
        max_candidates: Upper bound on the number of candidates to scrape.
            ``None`` (the default) scrapes every link found. ``0`` or a
            negative value scrapes nothing and returns ``[]`` without
            sending any request.
        delay: Seconds to wait between two consecutive candidate requests.

    Returns:
        List with one entry per processed link, each being whatever
        ``scrape_candidate_simple`` returned for it, in link order.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # max_candidates=0 as "no limit" and scrape the whole municipality.
    if max_candidates is not None and max_candidates <= 0:
        return []

    candidate_links = get_candidate_links_simple(municipality_url)

    if max_candidates is not None:
        candidate_links = candidate_links[:max_candidates]

    all_candidates = []
    total = len(candidate_links)
    for i, link in enumerate(candidate_links, 1):
        print(f"\nProcessing candidate {i}/{total}")
        all_candidates.append(scrape_candidate_simple(link))
        # Be polite to the server, but do not wait after the final request.
        if i < total and delay > 0:
            time.sleep(delay)

    return all_candidates

## Run Scraper

In [None]:
# Trial run against municipality 124, limited to the first five candidate links
municipality_url = "https://www.dr.dk/nyheder/politik/kommunalvalg/din-stemmeseddel/124"
candidates = scrape_municipality_simple(
    municipality_url=municipality_url,
    max_candidates=5,
)

# Summary line, separated from the per-candidate progress output by a rule
print(f"\n{'='*60}")
print(f"Scraped {len(candidates)} candidates")

## Convert to DataFrame

In [None]:
# Basic info DataFrame: one row per successfully scraped candidate.
# Maps each output column to the value used when a record lacks that key.
basic_defaults = {
    'candidate_id': '',
    'name': '',
    'party': '',
    'municipality': '',
    'email': '',
    'phone': '',
    'num_priorities': 0,
    'url': '',
}

basic_rows = [
    {column: c.get(column, default) for column, default in basic_defaults.items()}
    for c in candidates
    if 'error' not in c  # records of failed scrapes are left out
]

# Naming the columns explicitly keeps the table layout (and the CSV header)
# intact even when no candidate was scraped successfully; without it an
# empty row list yields a frame with no columns and a header-less CSV.
df = pd.DataFrame(basic_rows, columns=list(basic_defaults))

display(df)

# Save to CSV
df.to_csv('candidates_simple.csv', index=False, encoding='utf-8')
print("\nSaved to candidates_simple.csv")

## Priorities DataFrame

In [None]:
# Expand priorities: one row per (candidate, priority) pair
priority_columns = ['candidate_id', 'name', 'party', 'priority_number', 'priority_text']

priority_rows = []
for c in candidates:
    # Records of failed scrapes carry an 'error' key and are skipped, as are
    # records without a 'priorities' list.
    if 'error' in c or 'priorities' not in c:
        continue
    for p in c['priorities']:
        priority_rows.append({
            'candidate_id': c['candidate_id'],
            'name': c['name'],
            'party': c['party'],
            'priority_number': p['number'],
            'priority_text': p['text']
        })

# Naming the columns explicitly keeps the header in the CSV even when no
# priorities were found; an empty row list would otherwise produce a frame
# without columns.
df_priorities = pd.DataFrame(priority_rows, columns=priority_columns)
display(df_priorities)

# Save
df_priorities.to_csv('priorities_simple.csv', index=False, encoding='utf-8')
print("\nSaved to priorities_simple.csv")