In [None]:
import requests
from datetime import datetime, timedelta
import pandas as pd
from dotenv import load_dotenv
import os
from tqdm import tqdm
import zipfile
from pathlib import Path
import glob
import jquantsapi
import inspect
from bs4 import BeautifulSoup
from io import StringIO
import re
import json

In [None]:
url = "https://tochidai.info/"

# Fetch the top page. requests has no default timeout, so one is set
# explicitly to keep a stalled connection from hanging the kernel forever.
response = requests.get(url, timeout=30)
response.raise_for_status()  # raise on HTTP 4xx/5xx

# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Locate <table id="prefecture-list"> and its <tbody>. Fail with an explicit
# message if either is missing instead of an AttributeError on None.
table = soup.find('table', id='prefecture-list')
if table is None:
    raise RuntimeError(f"table#prefecture-list not found at {url}")

tbody = table.find('tbody')
if tbody is None:
    raise RuntimeError(f"table#prefecture-list has no <tbody> at {url}")

# One dict per prefecture row: link href, display name, raw price text
prefecture_data = []

rows = tbody.find_all('tr')
for row in rows:
    # <td class="prefecture"> holds an <a> with the prefecture link and name;
    # either element may be absent, in which case both fields stay empty.
    prefecture_td = row.find('td', class_='prefecture')
    a_tag = prefecture_td.find('a') if prefecture_td is not None else None
    if a_tag is not None:
        href = a_tag.get('href', '')
        text = a_tag.get_text(strip=True)
    else:
        href = ''
        text = ''

    # <td class="land-price"> holds the price as display text
    land_price_td = row.find('td', class_='land-price')
    price = land_price_td.get_text(strip=True) if land_price_td is not None else ''

    # Keep the row only if at least one field was found
    if href or text or price:
        prefecture_data.append({
            'href': href,
            'name': text,
            'price': price
        })

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Accumulates one record per municipality across all prefectures
all_city_data = []

# Visit each prefecture page and collect its municipality table
for prefecture in prefecture_data:
    prefecture_name = prefecture['name']
    prefecture_href = prefecture['href']
    prefecture_price = prefecture['price']

    # A row without a link has no prefecture page; the URL below would
    # collapse to the site root, so skip it instead of fetching that.
    if not prefecture_href:
        continue

    # Build the prefecture page URL
    city_url = f"https://tochidai.info{prefecture_href}"

    print(f"Processing: {prefecture_name} - {city_url}")

    # requests has no default timeout; set one so a single stalled
    # connection cannot hang the whole crawl.
    response = requests.get(city_url, timeout=30)
    response.raise_for_status()

    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # <table id="city-list"> -> <tbody> -> rows; pages lacking either
    # element simply contribute no rows.
    table = soup.find('table', id='city-list')
    tbody = table.find('tbody') if table is not None else None
    rows = tbody.find_all('tr') if tbody is not None else []

    for row in rows:
        # <td class="city"> holds an <a> with the municipality link and name
        city_td = row.find('td', class_='city')
        a_tag = city_td.find('a') if city_td is not None else None
        if a_tag is not None:
            city_href = a_tag.get('href', '')
            city_name = a_tag.get_text(strip=True)
        else:
            city_href = ''
            city_name = ''

        # <td class="land-price"> holds the price as display text
        land_price_td = row.find('td', class_='land-price')
        city_price = land_price_td.get_text(strip=True) if land_price_td is not None else ''

        # Keep the row only if at least one field was found
        if city_href or city_name or city_price:
            all_city_data.append({
                'prefecture_name': prefecture_name,
                'prefecture_href': prefecture_href,
                'prefecture_price': prefecture_price,
                'city_href': city_href,
                'city_name': city_name,
                'city_price': city_price
            })

    # Pause briefly between requests to avoid loading the server
    time.sleep(0.5)

print(f"Total cities collected: {len(all_city_data)}")

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Accumulators for area-level and point-level land prices
all_area_data = []
all_point_data = []

# Visit each municipality page and collect its area and point tables
for city in all_city_data:
    prefecture_name = city['prefecture_name']
    prefecture_href = city['prefecture_href']
    city_name = city['city_name']
    city_href = city['city_href']
    city_price = city['city_price']

    # Without a municipality link the URL below collapses to the prefecture
    # page, so skip the row rather than re-fetching that page.
    if not city_href:
        continue

    # Build the municipality page URL.
    # NOTE(review): plain concatenation assumes city_href is relative to the
    # prefecture path - confirm against the site's markup.
    detail_url = f"https://tochidai.info{prefecture_href}{city_href}"

    print(f"Processing: {prefecture_name} - {city_name} - {detail_url}")

    # requests has no default timeout; set one so a single stalled
    # connection cannot hang the whole crawl.
    response = requests.get(detail_url, timeout=30)
    response.raise_for_status()

    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Fields shared by every record produced from this page
    city_fields = {
        'prefecture_name': prefecture_name,
        'prefecture_href': prefecture_href,
        'city_name': city_name,
        'city_href': city_href,
        'city_price': city_price,
    }

    # 1. Area prices: <table id="area-list">; a missing table or <tbody>
    #    contributes no rows.
    area_table = soup.find('table', id='area-list')
    tbody = area_table.find('tbody') if area_table is not None else None
    rows = tbody.find_all('tr') if tbody is not None else []
    for row in rows:
        # <td class="area"> holds an <a> with the area link and name
        area_td = row.find('td', class_='area')
        a_tag = area_td.find('a') if area_td is not None else None
        if a_tag is not None:
            area_href = a_tag.get('href', '')
            area_name = a_tag.get_text(strip=True)
        else:
            area_href = ''
            area_name = ''

        # <td class="land-price"> holds the price as display text
        land_price_td = row.find('td', class_='land-price')
        area_price = land_price_td.get_text(strip=True) if land_price_td is not None else ''

        # Keep the row only if at least one field was found
        if area_href or area_name or area_price:
            all_area_data.append({
                **city_fields,
                'area_href': area_href,
                'area_name': area_name,
                'area_price': area_price
            })

    # 2. Point prices: <table id="point-list">; a missing table or <tbody>
    #    contributes no rows.
    point_table = soup.find('table', id='point-list')
    tbody = point_table.find('tbody') if point_table is not None else None
    rows = tbody.find_all('tr') if tbody is not None else []
    for row in rows:
        # <td class="address"> is read as plain text (no <a> lookup)
        address_td = row.find('td', class_='address')
        address_text = address_td.get_text(strip=True) if address_td is not None else ''

        # <td class="land-price"> holds the price as display text
        land_price_td = row.find('td', class_='land-price')
        point_price = land_price_td.get_text(strip=True) if land_price_td is not None else ''

        # Keep the row only if at least one field was found
        if address_text or point_price:
            all_point_data.append({
                **city_fields,
                'address': address_text,
                'point_price': point_price
            })

    # Pause briefly between requests to avoid loading the server
    time.sleep(0.5)

print(f"Total areas collected: {len(all_area_data)}")
print(f"Total points collected: {len(all_point_data)}")


In [None]:
# Bundle the four scraped levels under one top-level object
all_data = dict(
    prefectures=prefecture_data,
    cities=all_city_data,
    areas=all_area_data,
    points=all_point_data,
)

# Serialise to land_price.json in the working directory
with open('land_price.json', 'w') as json_file:
    json_file.write(json.dumps(all_data))

In [None]:
def _name_price_records(records, name_keys, price_key):
    """Yield ``{'name', 'price'}`` dicts for one level of scraped records.

    Args:
        records: list of dicts produced by one of the scraping cells.
        name_keys: keys whose string values are concatenated, in order,
            to form the ``name`` field.
        price_key: key holding the raw price text for that level.

    Yields:
        One dict per input record with keys ``name`` and ``price``.
    """
    for record in records:
        yield {
            'name': ''.join(record[key] for key in name_keys),
            'price': record[price_key],
        }


# (records, keys concatenated into the name, key holding the raw price),
# listed in the order the rows should appear: prefecture, city, area, point.
_name_price_sources = [
    (prefecture_data, ['name'], 'price'),
    (all_city_data, ['prefecture_name', 'city_name'], 'city_price'),
    (all_area_data, ['prefecture_name', 'city_name', 'area_name'], 'area_price'),
    (all_point_data, ['prefecture_name', 'city_name', 'address'], 'point_price'),
]

# Build the frame once from all rows. Passing `columns` explicitly keeps the
# two columns present even when a scrape level (or all of them) is empty,
# where indexing a column-less frame would raise KeyError.
all_df = pd.DataFrame(
    [
        row
        for records, name_keys, price_key in _name_price_sources
        for row in _name_price_records(records, name_keys, price_key)
    ],
    columns=['name', 'price'],
)

# Drop exact (name, price) duplicates; the surviving rows keep their index labels
all_df = all_df.drop_duplicates().copy()

def convert_price_to_number(price_str):
    """Convert a Japanese price string to an integer number of yen.

    Examples:
        '37万6222円/m2'       -> 376222
        '3800円/m2'           -> 3800
        '3,800円/m2'          -> 3800
        '1億2345万6789円/m2'  -> 123456789

    Args:
        price_str: price text, or a missing value (None / NaN).

    Returns:
        The price as an int, or None when the input is missing or contains
        no parsable amount.
    """
    if pd.isna(price_str):
        return None

    # Work on a string and drop thousands separators (ASCII and full-width)
    # so that '3,800' is read as 3800 rather than 3.
    text = str(price_str).replace(',', '').replace('，', '')

    if '億' in text or '万' in text:
        # Start at the first digit run that is followed by a unit character,
        # then read optional 億 (10^8) and 万 (10^4) parts and a trailing
        # remainder. Digits after 円 (such as the 2 in 'm2') are never reached.
        match = re.search(r'(?=\d+[億万])(?:(\d+)億)?(?:(\d+)万)?(\d*)', text)
        if match is None:
            return None
        oku, man, remainder = (int(part) if part else 0 for part in match.groups())
        return oku * 100_000_000 + man * 10_000 + remainder

    # No unit characters: take the first run of digits
    match = re.search(r'\d+', text)
    if match is None:
        return None
    return int(match.group())

# Replace the raw price text with its numeric value, row by row
numeric_prices = all_df['price'].apply(convert_price_to_number)
all_df['price'] = numeric_prices

# Write the name/price table to an Excel workbook without the index column
all_df.to_excel(excel_writer='land_price.xlsx', index=False)