<a href="https://colab.research.google.com/github/ryouy/election2026/blob/main/2026_Election_Yomiuri.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 候補者詳細URLを収集

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re # Import regex module

def get_prefecture_pages(main_url, timeout=30):
    """Collect the links to per-prefecture election pages found on ``main_url``.

    Args:
        main_url: URL of the page whose ``<a href>`` links are scanned. Relative
            links are resolved against it.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            fails instead of blocking the notebook indefinitely.

    Returns:
        A list of ``{'prefecture_name': ..., 'prefecture_url': ...}`` dicts in page
        order. A link is kept only when its absolute URL matches the prefecture
        pattern and its text is non-empty; an identical (name, URL) pair is
        returned once. An empty list is returned when the request or parsing fails.
    """
    # Matches pages such as 'https://www.yomiuri.co.jp/election/shugiin/YA01XXXXXX000/'.
    # (0[1-9]|[1-3][0-9]|4[0-7]) accepts the codes 01-47; the dots are escaped so
    # they only match a literal '.'.
    prefecture_pattern = re.compile(
        r'^https://www\.yomiuri\.co\.jp/election/shugiin/YA(0[1-9]|[1-3][0-9]|4[0-7])XXXXXX000/$'
    )

    try:
        response = requests.get(main_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        soup = BeautifulSoup(response.text, 'html.parser')

        prefecture_data = []
        seen_entries = set()
        for link_tag in soup.find_all('a', href=True):
            absolute_link = urljoin(main_url, link_tag.get('href'))
            if not prefecture_pattern.match(absolute_link):
                continue

            prefecture_name = link_tag.get_text(strip=True)
            if not prefecture_name:  # Links without text cannot be labelled
                continue

            # The same link can appear several times on the page. The key includes
            # the link text, so differently labelled links to one URL are all kept.
            entry_key = (prefecture_name, absolute_link)
            if entry_key in seen_entries:
                continue
            seen_entries.add(entry_key)
            prefecture_data.append({'prefecture_name': prefecture_name, 'prefecture_url': absolute_link})

        return prefecture_data

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {main_url}: {e}")
        return []
    except Exception as e:
        print(f"An error occurred during parsing {main_url}: {e}")
        return []

# Starting point of the crawl: the Yomiuri Shimbun election page.
main_election_url = 'https://www.yomiuri.co.jp/election/shugiin/'

# Scrape once; the result stays in the notebook namespace for the following cells.
prefecture_pages = get_prefecture_pages(main_election_url)

# Quick verification: report the number of links and preview the first few.
if not prefecture_pages:
    print(f"No prefecture pages found on {main_election_url}.")
else:
    print(f"Found {len(prefecture_pages)} prefecture pages.")
    print("First 5 entries:")
    for position, entry in enumerate(prefecture_pages[:5], start=1):
        print(f"  {position}. Name: {entry['prefecture_name']}, URL: {entry['prefecture_url']}")


Found 105 prefecture pages.
First 5 entries:
  1. Name: #選挙・東京, URL: https://www.yomiuri.co.jp/election/shugiin/YA13XXXXXX000/
  2. Name: 北海道, URL: https://www.yomiuri.co.jp/election/shugiin/YA01XXXXXX000/
  3. Name: 青森, URL: https://www.yomiuri.co.jp/election/shugiin/YA02XXXXXX000/
  4. Name: 岩手, URL: https://www.yomiuri.co.jp/election/shugiin/YA03XXXXXX000/
  5. Name: 宮城, URL: https://www.yomiuri.co.jp/election/shugiin/YA04XXXXXX000/


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re # Import regex module

def get_candidates_for_prefecture(prefecture_url, timeout=30):
    """Scrape the candidate detail links listed on one prefecture page.

    Args:
        prefecture_url: Prefecture page URL containing ``YA<code>XXXXXX000`` where
            ``<code>`` is a two-digit code from 01 to 47.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            fails instead of blocking the notebook indefinitely.

    Returns:
        A list of ``{'candidate_name': ..., 'candidate_url': ...}`` dicts, one per
        distinct candidate URL (first occurrence wins), in page order. An empty
        list is returned when the prefecture code cannot be extracted from the
        URL, when the request fails, or when parsing raises.
    """
    try:
        # The prefecture code is needed to build the candidate URL pattern, so it
        # is validated before any network traffic happens.
        prefecture_code_match = re.search(r'YA(0[1-9]|[1-3][0-9]|4[0-7])XXXXXX000', prefecture_url)
        if not prefecture_code_match:
            print(f"Could not extract prefecture code from URL: {prefecture_url}")
            return []
        prefecture_code = prefecture_code_match.group(1)

        # Candidate pages look like
        # 'https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/135847/';
        # the trailing \d+ is the candidate id. Dots are escaped to match literally.
        candidate_pattern = re.compile(
            rf'^https://www\.yomiuri\.co\.jp/election/shugiin/2026/YA{prefecture_code}XXXXXX000/\d+/$'
        )
        # Name = everything before "<digits>歳". \D is used on purpose: a class
        # containing \w also matches digits, which made the capture swallow the
        # first digit of the age (e.g. "道下　大樹5" instead of "道下　大樹").
        name_before_age = re.compile(r'\s*(\D+?)\s*\d+歳')

        response = requests.get(prefecture_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        soup = BeautifulSoup(response.text, 'html.parser')

        unique_candidate_data = []
        seen_urls = set()
        for link_tag in soup.find_all('a', href=True):
            absolute_link = urljoin(prefecture_url, link_tag.get('href'))
            if not candidate_pattern.match(absolute_link):
                continue

            candidate_name_full = link_tag.get_text(strip=True)
            name_match = name_before_age.match(candidate_name_full)
            # Fall back to the full link text when no "<digits>歳" marker is present.
            candidate_name = name_match.group(1).strip() if name_match else candidate_name_full

            # Skip links without text and hashtag-style navigation labels
            # ('#選挙・<prefecture>'), whichever prefecture they name.
            if not candidate_name or candidate_name.startswith('#選挙・'):
                continue

            # Keep only the first entry per candidate URL.
            if absolute_link in seen_urls:
                continue
            seen_urls.add(absolute_link)
            unique_candidate_data.append({'candidate_name': candidate_name, 'candidate_url': absolute_link})

        return unique_candidate_data

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {prefecture_url}: {e}")
        return []
    except Exception as e:
        print(f"An error occurred during parsing {prefecture_url}: {e}")
        return []

# Smoke test of get_candidates_for_prefecture on a single prefecture.
# It relies on `prefecture_pages` from the previous cell; index 1 is the Hokkaido entry there.
if not globals().get('prefecture_pages'):
    print("prefecture_pages variable not found or is empty. Please run the previous cell.")
else:
    hokkaido_entry = prefecture_pages[1]
    hokkaido_name, hokkaido_url = hokkaido_entry['prefecture_name'], hokkaido_entry['prefecture_url']

    print(f"\n--- Scraping candidates for {hokkaido_name} ({hokkaido_url}) ---")
    hokkaido_candidates = get_candidates_for_prefecture(hokkaido_url)

    if not hokkaido_candidates:
        print(f"No candidates found for {hokkaido_name}.")
    else:
        print(f"Found {len(hokkaido_candidates)} candidates for {hokkaido_name}.")
        print("First 5 candidates:")
        for position, candidate in enumerate(hokkaido_candidates[:5], start=1):
            print(f"  {position}. Name: {candidate['candidate_name']}, URL: {candidate['candidate_url']}")



--- Scraping candidates for 北海道 (https://www.yomiuri.co.jp/election/shugiin/YA01XXXXXX000/) ---
Found 38 candidates for 北海道.
First 5 candidates:
  1. Name: 道下　大樹5, URL: https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/135847/
  2. Name: 臼木　秀剛4, URL: https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/137660/
  3. Name: 森　英士4, URL: https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/138866/
  4. Name: 加藤　貴弘4, URL: https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/136367/
  5. Name: 加納　千津子5, URL: https://www.yomiuri.co.jp/election/shugiin/2026/YA01XXXXXX000/139207/


In [None]:
import os
import json

def save_candidates_to_json(prefecture_name, candidates_data, base_dir='prefecture_data'):
    """Write one prefecture's candidate list to ``<base_dir>/<name>/<name>_candidates.json``.

    The prefecture directory is created when missing. The JSON is written as
    UTF-8 with non-ASCII characters kept as-is and a 4-space indent.

    Returns:
        The number of entries in ``candidates_data``.
    """
    target_dir = os.path.join(base_dir, prefecture_name)
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, f"{prefecture_name}_candidates.json")
    with open(target_file, 'w', encoding='utf-8') as handle:
        json.dump(candidates_data, handle, ensure_ascii=False, indent=4)

    return len(candidates_data)


# Running totals and the in-memory copy of everything scraped in this cell.
total_prefectures_processed = 0
total_candidates_saved = 0
all_scraped_data = {}

# `prefecture_pages` must have been produced by an earlier cell.
if not globals().get('prefecture_pages'):
    print("prefecture_pages variable not found or is empty. Please ensure previous steps ran successfully.")
else:
    print(f"\nProcessing candidates for {len(prefecture_pages)} prefectures...")
    for prefecture_entry in prefecture_pages:
        prefecture_name = prefecture_entry['prefecture_name']
        prefecture_url = prefecture_entry['prefecture_url']

        # Names starting with '#' are hashtag links, not prefectures.
        if prefecture_name.startswith('#'):
            print(f"Skipping non-prefecture entry: {prefecture_name}")
            continue

        print(f"  Scraping candidates for {prefecture_name}...")
        candidates = get_candidates_for_prefecture(prefecture_url)

        if not candidates:
            print(f"    No candidates found or failed to scrape for {prefecture_name}.")
            continue

        candidates_count = save_candidates_to_json(prefecture_name, candidates)
        total_prefectures_processed += 1
        total_candidates_saved += candidates_count
        all_scraped_data[prefecture_name] = candidates  # Kept for the summary cells below
        print(f"    Saved {candidates_count} candidates for {prefecture_name}.")

    print(f"\n--- Summary ---")
    print(f"Total prefectures processed: {total_prefectures_processed}")
    print(f"Total candidate entries saved: {total_candidates_saved}")


Processing candidates for 105 prefectures...
Skipping non-prefecture entry: #選挙・東京
  Scraping candidates for 北海道...
    Saved 38 candidates for 北海道.
  Scraping candidates for 青森...
    Saved 11 candidates for 青森.
  Scraping candidates for 岩手...
    Saved 10 candidates for 岩手.
  Scraping candidates for 宮城...
    Saved 21 candidates for 宮城.
  Scraping candidates for 秋田...
    Saved 10 candidates for 秋田.
  Scraping candidates for 山形...
    Saved 10 candidates for 山形.
  Scraping candidates for 福島...
    Saved 15 candidates for 福島.
  Scraping candidates for 茨城...
    Saved 23 candidates for 茨城.
  Scraping candidates for 栃木...
    Saved 20 candidates for 栃木.
  Scraping candidates for 群馬...
    Saved 17 candidates for 群馬.
  Scraping candidates for 埼玉...
    Saved 55 candidates for 埼玉.
  Scraping candidates for 東京...
    Saved 154 candidates for 東京.
  Scraping candidates for 千葉...
    Saved 52 candidates for 千葉.
  Scraping candidates for 神奈川...
    Saved 70 candidates for 神奈川.
  Scraping cand

In [None]:
import os
import shutil

def clean_invalid_prefecture_folders(base_dir='prefecture_data'):
    """Delete sub-directories of ``base_dir`` whose name starts with '#選挙・'.

    Plain files and all other directories are left alone. A failure to remove a
    directory is printed and does not stop the scan. Nothing happens (apart
    from a message) when ``base_dir`` does not exist.
    """
    if not os.path.exists(base_dir):
        print(f"Directory '{base_dir}' does not exist. Nothing to clean.")
        return

    print(f"Checking for and removing invalid prefecture folders in '{base_dir}'...")
    for entry_name in os.listdir(base_dir):
        entry_path = os.path.join(base_dir, entry_name)
        is_invalid_folder = os.path.isdir(entry_path) and entry_name.startswith('#選挙・')
        if not is_invalid_folder:
            continue

        print(f"  Removing folder: {entry_name}")
        try:
            shutil.rmtree(entry_path)
        except Exception as e:
            print(f"  Error removing '{entry_name}': {e}")
        else:
            print(f"  Successfully removed '{entry_name}'.")
    print("Cleanup complete.")

# Run the cleanup against the default 'prefecture_data' directory.
clean_invalid_prefecture_folders()


Checking for and removing invalid prefecture folders in 'prefecture_data'...
Cleanup complete.


In [None]:
# Count the distinct candidate names across every scraped prefecture.
unique_names = set()
for prefecture, candidates_list in all_scraped_data.items():
    for candidate in candidates_list:
        name_full = candidate['candidate_name']
        # The stored name may still carry "<digits>歳..." after the name. \D is used
        # so the capture stops before the age; a class containing \w also matches
        # digits and would keep the first digit of the age inside the name.
        name_match = re.match(r'\s*(\D+?)\s*\d+歳', name_full)
        cleaned_name = name_match.group(1).strip() if name_match else name_full
        unique_names.add(cleaned_name)

total_unique_names = len(unique_names)
print(f"ユニークな候補者名の総数: {total_unique_names}")

ユニークな候補者名の総数: 1118


In [None]:
# One page per proportional district; the district codes run from 81 to 91 inclusive.
proportional_district_urls = [
    f"https://www.yomiuri.co.jp/election/shugiin/YC{block_code:02d}XXXXXX000/"
    for block_code in range(81, 92)
]

print(f"Generated {len(proportional_district_urls)} proportional district URLs:")
for url in proportional_district_urls:
    print(url)

Generated 11 proportional district URLs:
https://www.yomiuri.co.jp/election/shugiin/YC81XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC82XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC83XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC84XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC85XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC86XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC87XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC88XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC89XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC90XXXXXX000/
https://www.yomiuri.co.jp/election/shugiin/YC91XXXXXX000/


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re # Import regex module

def get_candidates_for_proportional_district(proportional_district_url, timeout=30):
    """Scrape the candidate detail links listed on one proportional district page.

    Args:
        proportional_district_url: District page URL containing
            ``YC<code>XXXXXX000`` where ``<code>`` is between 81 and 91.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            fails instead of blocking the notebook indefinitely.

    Returns:
        A list of ``{'candidate_name': ..., 'candidate_url': ...}`` dicts, one per
        distinct candidate URL (first occurrence wins), in page order. An empty
        list is returned when the district code cannot be extracted from the URL,
        when the request fails, or when parsing raises.
    """
    try:
        # The district code (81-91) is needed to build the candidate URL pattern,
        # so it is validated before any network traffic happens.
        district_code_match = re.search(r'YC(8[1-9]|9[0-1])XXXXXX000', proportional_district_url)
        if not district_code_match:
            print(f"Could not extract proportional district code from URL: {proportional_district_url}")
            return []
        district_code = district_code_match.group(1)

        # Candidate pages look like
        # 'https://www.yomiuri.co.jp/election/shugiin/2026/YC81XXXXXX000/135847/'.
        # Dots are escaped so they only match a literal '.'.
        candidate_pattern = re.compile(
            rf'^https://www\.yomiuri\.co\.jp/election/shugiin/2026/YC{district_code}XXXXXX000/\d+/$'
        )
        # Name = everything before "<digits>歳". \D is used on purpose: a class
        # containing \w also matches digits, which lets the capture swallow the
        # first digit of the age.
        name_before_age = re.compile(r'\s*(\D+?)\s*\d+歳')

        response = requests.get(proportional_district_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        soup = BeautifulSoup(response.text, 'html.parser')

        unique_candidate_data = []
        seen_urls = set()
        for link_tag in soup.find_all('a', href=True):
            absolute_link = urljoin(proportional_district_url, link_tag.get('href'))
            if not candidate_pattern.match(absolute_link):
                continue

            candidate_name_full = link_tag.get_text(strip=True)
            name_match = name_before_age.match(candidate_name_full)
            # Fall back to the full link text when no "<digits>歳" marker is present.
            candidate_name = name_match.group(1).strip() if name_match else candidate_name_full

            if not candidate_name:  # Links without text cannot be labelled
                continue

            # Keep only the first entry per candidate URL.
            if absolute_link in seen_urls:
                continue
            seen_urls.add(absolute_link)
            unique_candidate_data.append({'candidate_name': candidate_name, 'candidate_url': absolute_link})

        return unique_candidate_data

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {proportional_district_url}: {e}")
        return []
    except Exception as e:
        print(f"An error occurred during parsing {proportional_district_url}: {e}")
        return []

# Confirmation message only; no scraping happens in this cell.
print("Function `get_candidates_for_proportional_district` defined.")

Function `get_candidates_for_proportional_district` defined.


In [None]:
import os
import shutil
import re

def clean_invalid_proportional_folders(base_dir='proportional_data'):
    """Delete sub-directories of ``base_dir`` named ``YC<digits>``.

    Plain files and all other directories are left alone. A failure to remove a
    directory is printed and does not stop the scan. Nothing happens (apart
    from a message) when ``base_dir`` does not exist.
    """
    if not os.path.exists(base_dir):
        print(f"Directory '{base_dir}' does not exist. Nothing to clean.")
        return

    print(f"Checking for and removing invalid proportional folders in '{base_dir}'...")
    for entry_name in os.listdir(base_dir):
        entry_path = os.path.join(base_dir, entry_name)
        # Only directories whose whole name is a raw district code are removed.
        if not (os.path.isdir(entry_path) and re.match(r'^YC\d+$', entry_name)):
            continue

        print(f"  Removing folder: {entry_name}")
        try:
            shutil.rmtree(entry_path)
        except Exception as e:
            print(f"  Error removing '{entry_name}': {e}")
        else:
            print(f"  Successfully removed '{entry_name}'.")
    print("Cleanup complete.")

# Run the cleanup against the default 'proportional_data' directory.
clean_invalid_proportional_folders()

Directory 'proportional_data' does not exist. Nothing to clean.


In [None]:
# District code (e.g. 'YC81') -> list of candidate dicts scraped from that district page.
all_proportional_candidates = {}

print(f"\nScraping candidates from {len(proportional_district_urls)} proportional district pages...")

for url in proportional_district_urls:
    # Use the 'YC<digits>' part of the URL as the key; fall back to the URL itself.
    district_name_match = re.search(r'(YC\d+)XXXXXX000', url)
    if district_name_match:
        district_name = district_name_match.group(1)
    else:
        district_name = url

    print(f"  Scraping candidates for {district_name} ({url})...")
    candidates_list = get_candidates_for_proportional_district(url)

    if not candidates_list:
        print(f"    No candidates found for {district_name}.")
        continue

    all_proportional_candidates[district_name] = candidates_list
    print(f"    Found {len(candidates_list)} candidates for {district_name}.")

total_proportional_candidates = sum(map(len, all_proportional_candidates.values()))
print("\n--- Proportional District Scrape Summary ---")
print(f"Total proportional districts scraped: {len(all_proportional_candidates)}")
print(f"Total candidates found in proportional districts: {total_proportional_candidates}")



Scraping candidates from 11 proportional district pages...
  Scraping candidates for YC81 (https://www.yomiuri.co.jp/election/shugiin/YC81XXXXXX000/)...
    Found 44 candidates for YC81.
  Scraping candidates for YC82 (https://www.yomiuri.co.jp/election/shugiin/YC82XXXXXX000/)...
    Found 67 candidates for YC82.
  Scraping candidates for YC83 (https://www.yomiuri.co.jp/election/shugiin/YC83XXXXXX000/)...
    Found 93 candidates for YC83.
  Scraping candidates for YC84 (https://www.yomiuri.co.jp/election/shugiin/YC84XXXXXX000/)...
    Found 118 candidates for YC84.
  Scraping candidates for YC85 (https://www.yomiuri.co.jp/election/shugiin/YC85XXXXXX000/)...
    Found 103 candidates for YC85.
  Scraping candidates for YC86 (https://www.yomiuri.co.jp/election/shugiin/YC86XXXXXX000/)...
    Found 57 candidates for YC86.
  Scraping candidates for YC87 (https://www.yomiuri.co.jp/election/shugiin/YC87XXXXXX000/)...
    Found 109 candidates for YC87.
  Scraping candidates for YC88 (https://w

In [None]:
# District code -> Japanese block name, listed in code order (YC81 .. YC91).
proportional_district_name_map = dict([
    ('YC81', '北海道ブロック'),
    ('YC82', '東北ブロック'),
    ('YC83', '北関東ブロック'),
    ('YC84', '東京ブロック'),
    ('YC85', '南関東ブロック'),
    ('YC86', '北陸信越ブロック'),
    ('YC87', '東海ブロック'),
    ('YC88', '近畿ブロック'),
    ('YC89', '中国ブロック'),
    ('YC90', '四国ブロック'),
    ('YC91', '九州ブロック'),
])

比例代表地区名マップが定義されました。


In [None]:
import os
import json

def save_candidates_to_json(district_name, candidates_data, base_dir='proportional_data'):
    """Write one proportional district's candidates to ``<base_dir>/<label>/<label>_candidates.json``.

    ``<label>`` is looked up in ``proportional_district_name_map``; a
    ``district_name`` missing from the map is used unchanged. The directory is
    created when missing and the JSON is written as UTF-8 with non-ASCII
    characters kept as-is and a 4-space indent.

    Returns:
        The number of entries in ``candidates_data``.
    """
    label = proportional_district_name_map.get(district_name, district_name)
    output_dir = os.path.join(base_dir, label)
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{label}_candidates.json")
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(candidates_data, handle, ensure_ascii=False, indent=4)

    return len(candidates_data)


# Running totals for the proportional districts written in this cell.
total_proportional_districts_processed = 0
total_proportional_candidates_saved = 0

# `all_proportional_candidates` must have been produced by an earlier cell.
if not globals().get('all_proportional_candidates'):
    print("all_proportional_candidates variable not found or is empty. Please ensure previous steps ran successfully.")
else:
    print(f"\nSaving candidates for {len(all_proportional_candidates)} proportional districts...")
    for district_name, candidates_list in all_proportional_candidates.items():
        print(f"  Saving candidates for {district_name}...")
        if not candidates_list:
            print(f"    No candidates found to save for {district_name}.")
            continue

        candidates_count = save_candidates_to_json(district_name, candidates_list, base_dir='proportional_data')
        total_proportional_districts_processed += 1
        total_proportional_candidates_saved += candidates_count
        print(f"    Saved {candidates_count} candidates for {district_name}.")

    print(f"\n--- Proportional Data Save Summary ---")
    print(f"Total proportional districts processed: {total_proportional_districts_processed}")
    print(f"Total proportional candidate entries saved: {total_proportional_candidates_saved}")


Saving candidates for 11 proportional districts...
  Saving candidates for YC81...
    Saved 44 candidates for YC81.
  Saving candidates for YC82...
    Saved 67 candidates for YC82.
  Saving candidates for YC83...
    Saved 93 candidates for YC83.
  Saving candidates for YC84...
    Saved 118 candidates for YC84.
  Saving candidates for YC85...
    Saved 103 candidates for YC85.
  Saving candidates for YC86...
    Saved 57 candidates for YC86.
  Saving candidates for YC87...
    Saved 109 candidates for YC87.
  Saving candidates for YC88...
    Saved 138 candidates for YC88.
  Saving candidates for YC89...
    Saved 55 candidates for YC89.
  Saving candidates for YC90...
    Saved 33 candidates for YC90.
  Saving candidates for YC91...
    Saved 98 candidates for YC91.

--- Proportional Data Save Summary ---
Total proportional districts processed: 11
Total proportional candidate entries saved: 915


In [None]:
# Re-print the totals accumulated by the proportional save cell; nothing is recomputed here.
print(f"\n--- Final Summary for Proportional Districts ---")
print(f"Total proportional districts processed: {total_proportional_districts_processed}")
print(f"Total proportional candidate entries saved: {total_proportional_candidates_saved}")


--- Final Summary for Proportional Districts ---
Total proportional districts processed: 11
Total proportional candidate entries saved: 915


# 設問と回答を収集

In [None]:
# Refresh the apt package index, then install system shared libraries.
# NOTE(review): presumably these are the libraries the Chromium build installed
# in the next cell needs on Colab — confirm against Playwright's requirements.
!apt-get update -y
!apt-get install -y \
  libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
  libgtk-3-0 libgbm1 libnss3 libnspr4 \
  libcups2 libxkbcommon0 libxshmfence1 \
  libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxdamage1 libxrandr2 \
  libxext6 libxfixes3 \
  libpango-1.0-0 libpangocairo-1.0-0 libcairo2 \
  libdrm2 libdbus-1-3 \
  libasound2


0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Waiting for headers] [2 InRelease 3,632 B/3,632 B 1000% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [3 InRelease 14.2 kB/129 kB 11%] [Waiting for headers]                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://cli.github.com/packages stable/main a

In [None]:
# Install/upgrade the Playwright Python package, then download its Chromium browser.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip -q install --upgrade playwright
!playwright install chromium


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Chrome for Testing 145.0.7632.6 (playwright chromium v1208)[2m from https://cdn.playwright.dev/chrome-for-testing-public/145.0.7632.6/linux64/chrome-linux64.zip[22m
[1G167.3 MiB [] 0% 0.0s[0K[1G167.3 MiB [] 0% 26.7s[0K[1G167.3 MiB [] 0% 14.4s[0K[1G167.3 MiB [] 0% 13.3s[0K[1G167.3 MiB [] 0% 8.8s[0K[1G167.3 MiB [] 1% 6.1s[0K[1G167.3 MiB [] 1% 5.1s[0K[1G167.3 MiB [] 3% 3.8s[0K[1G167.3 MiB [] 3% 3.5s[0K[1G167.3 MiB [] 4% 3.4s[0K[1G167.3 MiB [] 5% 2.9s[0K[1G167.3 MiB [] 6% 2.6s[0K[1G167.3 MiB [] 7% 2.5s[0K[1G167.3 MiB [] 8% 2.3s[0K[1G167.3 MiB [] 9% 2.2s[0K[1G167.3 MiB [] 10% 2.1s[0K[1G167.3 MiB [] 12% 2.0s[0K[1G167.3 MiB [] 13% 1.9s[0K[1G167.3 MiB [] 14% 1.8s[0K[1G167.3 MiB [] 15% 1.8s[0K[1G167.3 MiB [] 16% 1.8s[0K[1G167.3 MiB [] 17% 1.7s[0K[1G167.3 MiB [] 18% 1.6s[0K[1G167.3 MiB [] 19% 1.6s[0K[1G167.3 Mi

In [None]:
import os, json, re, hashlib, random
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# =========================================================
# Settings (serial, stable version)
# =========================================================
LIMIT_PER_UNIT = None   # Only the first N entries per prefecture / block; None means all
DATA_ROOT_PREF = "prefecture_data"
DATA_ROOT_PROP = "proportional_data"

# Output locations on the mounted Google Drive.
DRIVE_BASE_DIR = "/content/drive/MyDrive/yomiuri_enquete_2026"
OUT_DIR_PREF = os.path.join(DRIVE_BASE_DIR, "prefecture")
OUT_DIR_PROP = os.path.join(DRIVE_BASE_DIR, "proportional")
OUT_DIR_ALL  = os.path.join(DRIVE_BASE_DIR, "all")

# Per-URL cache (protects against runtime restarts)
CACHE_DIR = os.path.join(DRIVE_BASE_DIR, "cache_candidates")  # one file per candidate

# Drop duplicate URLs at the end (a candidate is expected to appear in both pref and proportional)
DROP_DUPLICATE_BY_URL = True

# Avoid being blocked (spread requests out a little even when running serially)
JITTER_SEC = 1.2

# Retries
RETRIES = 1
BASE_BACKOFF_SEC = 1.0

# Timeouts (milliseconds)
GOTO_TIMEOUT_MS = 120000
WAIT_Q_WRAPPER_TIMEOUT_MS = 5000
WAIT_ANSWER_EVIDENCE_MAX_MS = 1500  # Maximum time to wait for answers to appear
WAIT_ANSWER_POLL_MS = 500

# When True, a page with no extracted answers is treated as a failure
# (prevents caching an empty "OK" row). Currently disabled.
TREAT_EMPTY_ANSWERS_AS_FAIL = False

# Q24 always gets the fixed sub-columns 1..11 (unanswered ones stay blank).
# NOTE(review): the code consuming this constant is outside this section — confirm.
FORCE_Q24_SUBKEYS = 11

# =========================================================

# Create every output/cache directory up front.
for d in [OUT_DIR_PREF, OUT_DIR_PROP, OUT_DIR_ALL, CACHE_DIR]:
    os.makedirs(d, exist_ok=True)

# ---------------------------------------------------------
# キャッシュ
# ---------------------------------------------------------
def url_to_cache_path(url: str) -> str:
    """Return the cache file path for ``url``: ``CACHE_DIR/<SHA-1 hex of the UTF-8 URL>.json``."""
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    file_name = f"{digest}.json"
    return os.path.join(CACHE_DIR, file_name)

def load_cached_row(url: str):
    """Return the cached JSON content for ``url``, or ``None`` when there is no usable cache.

    ``None`` covers both a missing cache file and a file that cannot be read or
    parsed; the error is deliberately swallowed.
    """
    cache_file = url_to_cache_path(url)
    if not os.path.exists(cache_file):
        return None

    try:
        with open(cache_file, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception:
        return None

def save_cached_row(url: str, row: dict):
    """Write ``row`` as UTF-8 JSON (non-ASCII kept as-is) to the cache file of ``url``."""
    cache_file = url_to_cache_path(url)
    with open(cache_file, "w", encoding="utf-8") as handle:
        json.dump(row, handle, ensure_ascii=False)

# ---------------------------------------------------------
# 文字処理
# ---------------------------------------------------------
def clean_text(s: str) -> str:
    """Collapse each whitespace run to a single space and trim both ends; falsy input gives ``""``."""
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

def first_int(text: str) -> str:
    """Return the first run of digits in ``text`` as a string, or ``""`` when there is none."""
    found = re.search(r"\d+", text or "")
    if not found:
        return ""
    return found.group(0)

# ---------------------------------------------------------
# プロフィール抽出
# ---------------------------------------------------------
def extract_party_dom_first(soup: BeautifulSoup) -> str:
    """Read the party from the DOM: a ``dt``/``dd`` pair first, then a ``th``/``td`` pair.

    A label element counts when its text is exactly "党派" or "政党". The
    cleaned text of its next matching sibling is returned as soon as one is
    non-empty; ``""`` is returned when nothing is found.
    """
    party_labels = ("党派", "政党")

    # All dt/dd pairs are tried before any th/td pair.
    for label_tag, value_tag in (("dt", "dd"), ("th", "td")):
        for label_node in soup.select(label_tag):
            if label_node.get_text(strip=True) not in party_labels:
                continue
            value_node = label_node.find_next_sibling(value_tag)
            if not value_node:
                continue
            value = clean_text(value_node.get_text(" ", strip=True))
            if value:
                return value

    return ""

def extract_profile(soup: BeautifulSoup) -> dict:
    """Extract name, age and party from a candidate page.

    Returns:
        ``{"氏名": name, "年齢": age, "政党": party}``; each value is ``""`` when
        it cannot be found. The name is the cleaned text of the first ``h1``, the
        age is the number in "年齢 <n> 歳" in the page text, and the party comes
        from the DOM or, failing that, from the text after "党派"/"政党".
    """
    page_text = soup.get_text("\n", strip=True)

    heading = soup.select_one("h1")
    name = clean_text(heading.get_text(" ", strip=True)) if heading else ""

    age_match = re.search(r"年齢\s*([0-9]{1,3})\s*歳", page_text)
    age = age_match.group(1) if age_match else ""

    party = extract_party_dom_first(soup)
    if not party:
        # Text fallback: the rest of the line after the party label.
        party_match = re.search(r"(党派|政党)\s*([^\n]+)", page_text)
        if party_match:
            party = clean_text(party_match.group(2))

    if party:
        # Cut the value at the first profile keyword that follows the party name.
        party = re.split(r"\s+新旧|\s+新|\s+前|\s+当選|\s+経歴", party)[0].strip()

    return {"氏名": name, "年齢": age, "政党": party}

# ---------------------------------------------------------
# 回答抽出（設問番号→回答番号）
# ---------------------------------------------------------
def parse_answers(soup: BeautifulSoup) -> dict:
    """Extract questionnaire answers as a mapping of question key to answer number.

    Every ``div.candidate_profile_enquete_wrapper`` whose id starts with
    "question" is inspected. Keys are ``"Q<n>"`` for a single answer and
    ``"Q<n>-<k>"`` (k starting at 1) for ranked, themed, multi-select and scale
    answers. Values are strings: the first integer found in the answer text (or
    the number in the active scale's ``scale<k>`` class), or ``""`` when missing.

    The layout checks run in a fixed order and the first one that matches wins.
    """
    out = {}
    for wrap in soup.select("div.candidate_profile_enquete_wrapper[id^='question']"):
        m = re.search(r"question(\d+)", wrap.get("id", ""))
        if not m:
            continue
        qnum = m.group(1)

        # Q1-style layout (ranking): one sub-key per rank item
        q01_ranks = wrap.select("li.candidate_profile_enquete_q01_answer_rank")
        if q01_ranks:
            rank_no = 1
            for li in q01_ranks:
                span = li.select_one("div.candidate_profile_enquete_q01_answer_text span")
                out[f"Q{qnum}-{rank_no}"] = first_int(span.get_text(strip=True)) if span else ""
                rank_no += 1
            continue

        # Q9-style layout (items a-f) -> numbered sequentially
        themes = wrap.select("li.candidate_profile_enquete_q09_answer_theme")
        ans_spans = wrap.select("li.candidate_profile_enquete_q09_answer_text span")
        if themes and ans_spans:
            idx = 1
            for sp in ans_spans:
                out[f"Q{qnum}-{idx}"] = first_int(sp.get_text(strip=True))
                idx += 1
            continue

        # Multiple selection (e.g. Q24); only taken when at least two answer spans exist
        multi_spans = wrap.select(f"#answer{qnum} .candidate_profile_enquete_answer_text span")
        if len(multi_spans) >= 2:
            i = 1
            for sp in multi_spans:
                out[f"Q{qnum}-{i}"] = first_int(sp.get_text(strip=True))
                i += 1
            continue

        # Q25-style layout: the scale element marked "active" (0-10)
        q25_graphs = wrap.select("div.candiate_profile_enquete_q25_answer_1axis_graph")
        q25_names = wrap.select("div.candiate_profile_enquete_q25_name")
        if q25_graphs and q25_names:
            idx = 1
            for graph in q25_graphs:
                active = graph.select_one(".scales .scale.active")
                ans_num = ""
                if active:
                    # The answer number is encoded in a class name such as "scale7".
                    cls = " ".join(active.get("class", []))
                    mscale = re.search(r"scale(\d+)", cls)
                    if mscale:
                        ans_num = mscale.group(1)
                out[f"Q{qnum}-{idx}"] = ans_num
                idx += 1
            continue

        # Regular single answer
        span = wrap.select_one("p.candidate_profile_enquete_answer_text span")
        out[f"Q{qnum}"] = first_int(span.get_text(strip=True)) if span else ""

    return out

# ---------------------------------------------------------
# Playwright：回答が「入った」ことを待つ（重要）
# ---------------------------------------------------------
# CSS selectors whose presence on the page is taken as evidence that answers
# have been rendered; one selector per answer layout.
ANSWER_EVIDENCE_SELECTORS = [
    "p.candidate_profile_enquete_answer_text span",           # regular single answer
    "li.candidate_profile_enquete_q01_answer_rank span",      # Q1 ranking
    "li.candidate_profile_enquete_q09_answer_text span",      # Q9 (a-f)
    "div.candidate_profile_enquete_answer_text span",         # multiple-selection answers
    "div.candiate_profile_enquete_q25_answer_1axis_graph .scale.active",  # Q25
]

from playwright.async_api import TimeoutError as PlaywrightTimeoutError

async def wait_answers_ready(page) -> bool:
    """
    Wait until the candidate page shows evidence of rendered answers.

    First waits for a question wrapper block to appear, then polls
    ANSWER_EVIDENCE_SELECTORS until one of them matches at least one element.

    Args:
        page: Playwright page that has already been navigated to the candidate URL.

    Returns:
        True as soon as any evidence selector matches; False when the question
        wrapper never appears within WAIT_Q_WRAPPER_TIMEOUT_MS or when no
        evidence shows up during the polling window (treated by the caller as
        "unanswered / different page structure" rather than as an error).
    """
    try:
        await page.wait_for_selector(
            "div.candidate_profile_enquete_wrapper[id^='question']",
            timeout=WAIT_Q_WRAPPER_TIMEOUT_MS
        )
    except PlaywrightTimeoutError:
        # A timeout is an expected outcome here, not a failure.
        return False

    # Always poll at least once, even if the configured window is shorter than one poll interval.
    loops = max(1, WAIT_ANSWER_EVIDENCE_MAX_MS // WAIT_ANSWER_POLL_MS)
    for _ in range(int(loops)):
        for sel in ANSWER_EVIDENCE_SELECTORS:
            try:
                if await page.locator(sel).count() > 0:
                    return True
            except Exception:
                # Best effort: a failing probe for one selector must not abort the
                # polling. Only Exception is caught so that task cancellation
                # (asyncio.CancelledError) and KeyboardInterrupt still propagate.
                continue
        await page.wait_for_timeout(WAIT_ANSWER_POLL_MS)

    return False

async def fetch_candidate_row(browser, url: str) -> dict:
    """
    Render one candidate page in a fresh tab and turn it into a flat row dict.

    Args:
        browser: Playwright browser used to open a new page.
        url: Candidate detail page URL.

    Returns:
        Dict holding the extract_profile() fields, the parse_answers() fields
        and the source "URL".

    Raises:
        RuntimeError: when TREAT_EMPTY_ANSWERS_AS_FAIL is truthy and
            parse_answers() produced no keys at all.
    """
    tab = await browser.new_page()
    try:
        await tab.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT_MS)
        # The readiness result is intentionally not inspected: the HTML is parsed
        # either way and the empty-answers check below decides success/failure.
        await wait_answers_ready(tab)
        rendered_html = await tab.content()
    finally:
        # Close the tab on success and on error alike.
        await tab.close()

    parsed = BeautifulSoup(rendered_html, "html.parser")

    row = {}
    row.update(extract_profile(parsed))
    answers = parse_answers(parsed)
    row.update(answers)
    row["URL"] = url

    answers_missing = len(answers) == 0
    if TREAT_EMPTY_ANSWERS_AS_FAIL and answers_missing:
        raise RuntimeError("no answers extracted (possibly JS not ready / blocked)")

    return row

# ---------------------------------------------------------
# DF生成（直列＋キャッシュ＋進捗表示）
# ---------------------------------------------------------
def q_sort_key(k: str):
    """
    Sort key that orders question columns numerically ("Q2" < "Q2-1" < "Q10").

    Returns a tuple (question number, sub-number, original name). A column
    without a sub-number gets sub-number 0; names that are not of the form
    "Q<n>" or "Q<n>-<m>" get a large sentinel so they sort after every question.
    """
    parsed = re.match(r"Q(\d+)(?:-(\d+))?$", k)
    if parsed is None:
        return (10**9, 10**9, k)

    major, minor = parsed.groups()
    return (int(major), int(minor) if minor else 0, k)

async def page_sleep(browser, sec: float):
    """
    Asynchronously pause for `sec` seconds.

    Nothing happens when `sec` is falsy (None, 0) or not greater than zero.
    `browser` is accepted for call-site symmetry but is not used.
    """
    should_wait = sec and sec > 0
    if not should_wait:
        return

    import asyncio
    await asyncio.sleep(sec)

async def build_df(URLS, group_label: str = ""):
    """
    Scrape every URL sequentially (with per-URL cache and retries) and return a DataFrame.

    For each URL:
      * a cached row whose "_status" is "ok" is reused without touching the network;
      * otherwise the page is fetched up to RETRIES times with exponential
        backoff plus jitter; success and final failure are both written to the cache.

    Args:
        URLS: Sequence of candidate page URLs.
        group_label: Value stored in the "グループ" column and used as the log prefix.

    Returns:
        DataFrame with columns ordered as: base profile columns, question
        columns in numeric order, then "URL" / "_status" / "_error". Columns
        outside those three groups are dropped by the final reindex. The columns
        Q24-1 .. Q24-FORCE_Q24_SUBKEYS always exist; bare "Q24" and "Q1" columns
        are removed.
    """
    total = len(URLS)
    rows = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"]
        )

        try:
            for i, url in enumerate(URLS, start=1):
                # Check the cache first: a cache hit makes no request, so it
                # should not pay the politeness delay below.
                cached = load_cached_row(url)
                if cached and cached.get("_status") == "ok":
                    cached["グループ"] = group_label
                    rows.append(cached)
                    print(f"[{group_label}] [{i}/{total}] SKIP(cached): {cached.get('氏名','')}", flush=True)
                    continue

                # Random jitter before a real fetch, to spread out requests.
                await page_sleep(browser, random.uniform(0, JITTER_SEC))

                last_err = None
                for attempt in range(1, RETRIES + 1):
                    try:
                        row = await fetch_candidate_row(browser, url)
                        row["グループ"] = group_label
                        row["_status"] = "ok"
                        save_cached_row(url, row)
                        rows.append(row)
                        print(f"[{group_label}] [{i}/{total}] OK: {row.get('氏名','')}", flush=True)
                        break
                    except Exception as e:
                        last_err = e
                        # Exponential backoff (BASE, 2*BASE, 4*BASE, ...) plus jitter.
                        backoff = BASE_BACKOFF_SEC * (2 ** (attempt - 1)) + random.uniform(0, JITTER_SEC)
                        print(
                            f"[{group_label}] [{i}/{total}] RETRY {attempt}/{RETRIES}: {type(e).__name__} sleep {backoff:.1f}s",
                            flush=True
                        )
                        await page_sleep(browser, backoff)

                else:
                    # for/else: reached only when no attempt hit `break`, i.e. every attempt failed.
                    fail_row = {"URL": url, "グループ": group_label, "_status": "fail", "_error": str(last_err)}
                    save_cached_row(url, fail_row)
                    rows.append(fail_row)
                    print(f"[{group_label}] [{i}/{total}] FAIL: {url}", flush=True)

        finally:
            await browser.close()

    df = pd.DataFrame(rows)

    # Fixed Q24 sub-columns: keep the columns even when nobody answered.
    for i in range(1, FORCE_Q24_SUBKEYS + 1):
        col = f"Q24-{i}"
        if col not in df.columns:
            df[col] = ""
    df = df.drop(columns=["Q24"], errors="ignore")

    all_cols = set(df.columns)
    base_cols = [c for c in ["グループ", "氏名", "年齢", "政党"] if c in all_cols]
    q_cols = sorted([c for c in all_cols if re.match(r"^Q\d+(-\d+)?$", c)], key=q_sort_key)
    other_cols = [c for c in ["URL", "_status", "_error"] if c in all_cols]
    df = df.reindex(columns=base_cols + q_cols + other_cols)

    df = df.drop(columns=["Q1"], errors="ignore")
    return df.reset_index(drop=True)

# ---------------------------------------------------------
# JSON走査
# ---------------------------------------------------------
def collect_json_files(root_dir: str):
    """
    Recursively list every file under `root_dir` whose name ends with ".json".

    Returns the full paths as a sorted list (empty when nothing matches or the
    directory does not exist).
    """
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(".json")
    ]
    found.sort()
    return found

def load_urls_from_json(json_path: str):
    """
    Read a UTF-8 JSON file and return the non-empty "candidate_url" values in file order.

    The file content is iterated and `.get("candidate_url")` is called on each
    item, so it is presumably a list of dicts (verify against the producer).
    When LIMIT_PER_UNIT is not None, only the first LIMIT_PER_UNIT URLs are returned.
    """
    with open(json_path, "r", encoding="utf-8") as fh:
        records = json.load(fh)

    urls = []
    for record in records:
        candidate_url = record.get("candidate_url")
        if candidate_url:
            urls.append(candidate_url)

    return urls if LIMIT_PER_UNIT is None else urls[:LIMIT_PER_UNIT]

def safe_unit_name(json_path: str):
    """Return the name of the directory that directly contains `json_path` ("" if none)."""
    parent_dir, _filename = os.path.split(json_path)
    return os.path.split(parent_dir)[1]

def save_df_csv(df: pd.DataFrame, out_path: str):
    """
    Write `df` to `out_path` as CSV without the index, creating parent directories as needed.

    Args:
        df: DataFrame to persist.
        out_path: Destination file path. May be a bare file name, in which case
            the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(out_path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(out_path, index=False)

# ---------------------------------------------------------
# 政党「多い順」ソート（party csv）
# ---------------------------------------------------------
def sort_by_party_frequency(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` ordered by how many rows share each party ("政党").

    Ordering: rows with a non-empty party first, then larger parties first,
    then party name ascending, then candidate name ("氏名") ascending when that
    column exists. Missing values in "政党" (and "氏名", if present) are
    replaced by "" and the columns are cast to str in the result.

    An empty frame or one without a "政党" column is returned unsorted with a
    fresh index.
    """
    if df.empty or "政党" not in df.columns:
        return df.reset_index(drop=True)

    tmp = df.copy()
    tmp["政党"] = tmp["政党"].fillna("").astype(str)

    sort_cols = ["_is_empty_party", "_party_count", "政党"]
    ascending = [True, False, True]
    # The name column is optional: use it as a tie-breaker only when it exists.
    if "氏名" in tmp.columns:
        tmp["氏名"] = tmp["氏名"].fillna("").astype(str)
        sort_cols.append("氏名")
        ascending.append(True)

    counts = tmp["政党"].value_counts()
    tmp["_party_count"] = tmp["政党"].map(counts).fillna(0).astype(int)
    # Flag rows without a party so they sort last regardless of how many there are.
    tmp["_is_empty_party"] = (tmp["政党"] == "").astype(int)

    tmp = tmp.sort_values(
        by=sort_cols,
        ascending=ascending,
        kind="stable"
    ).drop(columns=["_party_count", "_is_empty_party"], errors="ignore")

    return tmp.reset_index(drop=True)

# ---------------------------------------------------------
# ★追加：ユニット全URLがキャッシュokか判定
# ---------------------------------------------------------
def unit_all_cached_ok(urls) -> bool:
    """
    Tell whether every URL of a unit has a cached row with "_status" == "ok".

    Returns False for an empty/None `urls`, and stops at the first URL whose
    cache entry is missing or not "ok".
    """
    if not urls:
        return False

    def _cached_ok(url) -> bool:
        entry = load_cached_row(url)
        return bool(entry) and entry.get("_status") == "ok"

    # all() over a generator short-circuits on the first failing URL.
    return all(_cached_ok(url) for url in urls)

# ---------------------------------------------------------
# ★追加：ユニットCSVが読めるか（安全策）
# ---------------------------------------------------------
def try_read_unit_csv(path: str):
    """
    Best-effort CSV load: return the DataFrame read from `path`, or None.

    None is returned both when the file does not exist and when checking or
    reading it raises any Exception (e.g. an unreadable or malformed file).
    """
    try:
        frame = pd.read_csv(path) if os.path.exists(path) else None
    except Exception:
        # Deliberately lenient: a broken CSV is handled by the caller as "missing".
        frame = None
    return frame

# ---------------------------------------------------------
# メイン：単位CSV → 全体CSV → party版CSV
# ---------------------------------------------------------
async def run_all():
    """
    Build per-unit CSVs, per-group aggregate CSVs and the combined CSVs.

    Processes the prefecture JSON tree and the proportional JSON tree with the
    same procedure, then concatenates both results (optionally de-duplicated by
    "URL" when DROP_DUPLICATE_BY_URL is truthy) and writes the combined files
    into OUT_DIR_ALL.

    Returns:
        Tuple (df_pref_all, df_prop_all, df_all); each is an empty DataFrame
        when there was nothing to aggregate.
    """
    # --- prefecture ---
    df_pref_all = await _run_group(
        DATA_ROOT_PREF, OUT_DIR_PREF, "pref", "prefecture",
        "_prefecture_all.csv", "_prefecture_all_party.csv",
    )

    # --- proportional ---
    df_prop_all = await _run_group(
        DATA_ROOT_PROP, OUT_DIR_PROP, "prop", "proportional",
        "_proportional_all.csv", "_proportional_all_party.csv",
    )

    # --- all candidates (pref + prop) ---
    frames = [df for df in [df_pref_all, df_prop_all] if not df.empty]
    df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if not df_all.empty and DROP_DUPLICATE_BY_URL and "URL" in df_all.columns:
        df_all = df_all.drop_duplicates(subset=["URL"], keep="first").reset_index(drop=True)

    if not df_all.empty:
        save_df_csv(df_all, os.path.join(OUT_DIR_ALL, "_all_candidates.csv"))
        save_df_csv(sort_by_party_frequency(df_all), os.path.join(OUT_DIR_ALL, "_all_candidates_party.csv"))

    return df_pref_all, df_prop_all, df_all


async def _run_group(json_root, out_dir, tag, label, all_csv_name, party_csv_name):
    """
    Process one group of units (one JSON file per unit) and return the group-wide DataFrame.

    Args:
        json_root: Directory scanned recursively for unit JSON files.
        out_dir: Directory receiving the unit CSVs and the two aggregate CSVs.
        tag: Short prefix used in log lines and group labels (e.g. "pref").
        label: Word used in the "<label> json files: N" log line.
        all_csv_name: File name of the aggregate CSV.
        party_csv_name: File name of the aggregate CSV sorted by party frequency.
    """
    unit_dfs = []
    json_paths = collect_json_files(json_root)
    print(f"{label} json files: {len(json_paths)}")

    for jp in json_paths:
        unit = safe_unit_name(jp)
        urls = load_urls_from_json(jp)
        if not urls:
            print(f"[{tag}:{unit}] skip (no urls)")
            continue

        unit_csv_path = os.path.join(out_dir, f"{unit}.csv")

        # Shortcut: when every URL is cached as ok and a usable unit CSV exists,
        # reuse the CSV and do not launch Playwright at all.
        if unit_all_cached_ok(urls):
            df_cached = try_read_unit_csv(unit_csv_path)
            if df_cached is not None and not df_cached.empty:
                unit_dfs.append(df_cached)
                print(f"[{tag}:{unit}] SKIP (all cached ok, use CSV): {len(urls)} urls", flush=True)
                continue
            # CSV missing or unreadable: rebuild it (this does start Playwright).
            print(f"[{tag}:{unit}] all cached ok but CSV missing/broken -> rebuild CSV", flush=True)

        df_unit = await build_df(urls, group_label=f"{tag}:{unit}")
        save_df_csv(df_unit, unit_csv_path)
        unit_dfs.append(df_unit)

    df_group = pd.concat(unit_dfs, ignore_index=True) if unit_dfs else pd.DataFrame()
    if not df_group.empty:
        save_df_csv(df_group, os.path.join(out_dir, all_csv_name))
        save_df_csv(sort_by_party_frequency(df_group), os.path.join(out_dir, party_csv_name))

    return df_group

# To execute the pipeline, run the following in a separate cell:
# df_pref_all, df_prop_all, df_all = await run_all()


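任意：Drive 上のキャッシュをリセットするためのセル。`_status` が `ok` なのに回答（`Q` で始まるキー）が1件も入っていないキャッシュファイルだけを削除する。既定では無効化されているので、必要なときだけ有効にして実行すること。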

In [None]:
import glob, json, os

# Optional maintenance cell: delete cache files that are marked "_status": "ok"
# but contain no answer key (no key starting with "Q"), so that those
# candidates are fetched again on the next run.
# The deletion is destructive, so it is off by default; set the flag to True
# for a single run when a reset is really wanted.
RUN_CACHE_RESET = False

if RUN_CACHE_RESET:
    n_del = 0
    for path in glob.glob("/content/drive/MyDrive/yomiuri_enquete_2026/cache_candidates/*.json"):
        with open(path, encoding="utf-8") as f:
            d = json.load(f)
        has_q = any(k.startswith("Q") for k in d.keys())
        if d.get("_status") == "ok" and (not has_q):
            os.remove(path)
            n_del += 1

    print("deleted:", n_del)

In [None]:
# Run the whole pipeline; run_all() returns the prefecture, proportional and combined DataFrames.
df_pref_all, df_prop_all, df_all = await run_all()

prefecture json files: 47
[pref:三重] SKIP (all cached ok, use CSV): 14 urls
[pref:京都] SKIP (all cached ok, use CSV): 27 urls
[pref:佐賀] SKIP (all cached ok, use CSV): 5 urls
[pref:兵庫] SKIP (all cached ok, use CSV): 53 urls
[pref:北海道] SKIP (all cached ok, use CSV): 38 urls
[pref:千葉] SKIP (all cached ok, use CSV): 52 urls
[pref:和歌山] SKIP (all cached ok, use CSV): 9 urls
[pref:埼玉] SKIP (all cached ok, use CSV): 55 urls
[pref:大分] SKIP (all cached ok, use CSV): 12 urls
[pref:大阪] SKIP (all cached ok, use CSV): 84 urls
[pref:奈良] SKIP (all cached ok, use CSV): 11 urls
[pref:宮城] SKIP (all cached ok, use CSV): 21 urls
[pref:宮崎] SKIP (all cached ok, use CSV): 9 urls
[pref:富山] SKIP (all cached ok, use CSV): 10 urls
[pref:山口] SKIP (all cached ok, use CSV): 10 urls
[pref:山形] SKIP (all cached ok, use CSV): 10 urls
[pref:山梨] SKIP (all cached ok, use CSV): 6 urls
[pref:岐阜] SKIP (all cached ok, use CSV): 20 urls
[pref:岡山] SKIP (all cached ok, use CSV): 15 urls
[pref:岩手] SKIP (all cached ok, use CSV): 10 u