# 一 安装环境

In [None]:
# 执行代码
# !uv init . --name "shanghai_road_clustering_analysis"
# !uv sync

# 二 收集并且检查数据
原始数据分为两个，解压获取。
[道路数据：](data/SHP格式路网.7z)
[省份与市区县的字典原始数据](data/省份与市区县的数据.7z)

## 道路数据
为图方便，这里直接使用了网上下载的数据：https://mp.weixin.qq.com/s/TEOYs2PJwX4PaN6tKJ3udg
> 说明：“本次分享上海市路网数据，包含SHP格式和“模型格式”。GCJ02坐标系，共363398条路段，累计36666公里。已经处理好拓扑关系和连通性，可导入ArcMap、TransCAD或SUMO等交通模型工具生成路网模型，用于路径规划和交通仿真。路网示意图见图1。”

In [None]:
# 检查现有的 `SH_LINK.shp` 数据质量

import geopandas as gpd

# Load the Shanghai road-link shapefile. The relative path uses forward
# slashes so it resolves on Linux/macOS as well as Windows (a backslash
# separated path is only understood on Windows).
roads_gdf = gpd.read_file('data/上海路网数据/上海路网数据/SHP格式路网/SH_LINK.shp')
roads_gdf.head()

In [None]:
# 检查空间参考 显示是4326 因为我们做路网聚类，我就不去深究坐标系对不对了
roads_gdf.crs

In [None]:
# 预处理数据
# 选取需要的字段
# Keep only the road-name and geometry columns and rename ROAD -> name in a
# single chained, non-inplace expression. Renaming in place on a column slice
# can trigger pandas' SettingWithCopyWarning; chaining returns a fresh frame.
roads_gdf = roads_gdf[['ROAD', 'geometry']].rename(columns={'ROAD': 'name'})

roads_gdf.head()

In [None]:
# 找找南京西路
roads_gdf.query("name == '南京西路'")

In [None]:
# 删掉 name 为空的行
roads_gdf = roads_gdf[roads_gdf['name'].notna()]
roads_gdf.head()

### 限定研究范围到上海市中心的几个区

In [None]:
# 1. 限定道路范围 只对上海中心城区进行匹配，这是1945年那场大规模路名改革的核心地带
# Load the county/district-level boundary layer. Forward slashes keep the
# relative path portable across operating systems.
districts_gdf = gpd.read_file('data/省份与市区县的数据/分年龄、性别的人口_区县等级.shp')

In [None]:
# 检查空间参考
districts_gdf.crs

In [None]:
# Restrict the district layer to rows whose province-level field is Shanghai.
districts_gdf = districts_gdf[districts_gdf['省级'] == '上海市'].copy()

# Keep only the district-name and geometry columns.
districts_gdf = districts_gdf[['地名', 'geometry']].copy()

# District names treated as the central urban area in this analysis.
CENTRAL_DISTRICTS = ['黄浦区', '徐汇区', '长宁区', '静安区', '普陀区', '虹口区', '杨浦区']

# Filter down to the central districts.
central_districts_gdf = districts_gdf[districts_gdf["地名"].isin(CENTRAL_DISTRICTS)]

# Union the central-district polygons into a single geometry: the area of interest (AOI).
aoi_polygon = central_districts_gdf.union_all()

# Bare expression so the notebook shows the AOI geometry as the cell output.
aoi_polygon

In [None]:
aoi_polygon_df = gpd.GeoDataFrame(geometry=[aoi_polygon], crs=central_districts_gdf.crs)

In [None]:
output_filename = "data/shanghai_roads_merged.gpkg"

aoi_polygon_df.to_file(output_filename, layer='shanghai_selected_districts', driver="GPKG", engine='fiona')

In [None]:
# Select the roads that intersect the AOI polygon with an inner spatial join.
# The AOI is wrapped in a one-row GeoDataFrame carrying the district layer's CRS.
aoi_frame = gpd.GeoDataFrame(geometry=[aoi_polygon], crs=central_districts_gdf.crs)
roads_in_aoi_gdf = gpd.sjoin(
    roads_gdf,
    aoi_frame,
    how="inner",
    predicate='intersects',
).drop(columns=['index_right'])  # sjoin appends an 'index_right' column we do not need

In [None]:
roads_in_aoi_gdf.plot()

In [None]:
roads_in_aoi_gdf.crs

从OpenStreetMap等标准来源下载的路网数据，为了保证网络的拓扑关系（即，知道哪条路和哪条路是连通的），在每一个交叉口，路段（LineString）都必须被打断成独立的小段。如果我们为“南京西路”的每一小段都计算一个质心，那么在“南京西路”这条路上就会产生密密麻麻几十个点，这会给聚类分析带来巨大的噪声和权重偏差。也会产生可视化混乱。
要解决这个问题，我们需要在分析流程中增加一个关键的数据预处理步骤——道路合并。

### 道路合并
这个操作的核心思想是：将所有拥有相同路名 (name 字段相同) 并且在空间上能够首尾相接的零散线段，合并成一个单一的、更长的几何对象（MultiLineString）。

In [None]:
# 使用 Geopandas 的 Dissolve 功能
# 我们告诉geopandas，按'name'字段进行分组。
# 对于每个分组（即所有同名的路段），它会自动将它们的几何图形合并。
# reset_index() 是为了将'name'从索引变回普通的列
dissolved_roads_in_aoi = roads_in_aoi_gdf.dissolve(by='name').reset_index()
dissolved_roads_in_aoi.head()

In [None]:
# 查看道路合并前后的数据差距
print(f"原始数据行数: {len(roads_in_aoi_gdf)}")
print(f"合并后数据行数: {len(dissolved_roads_in_aoi)}")

In [None]:
# 处理合并后的几何对象 尝试将 MultiLineString 合并为单一的 LineString
from shapely.ops import linemerge, unary_union

def merge_lines(geom):
    """
    Try to collapse a MultiLineString into a single LineString.

    Only geometries whose ``geom_type`` is 'MultiLineString' are processed:
    they are passed through ``unary_union`` and then ``linemerge``. The result
    may still be a MultiLineString when the parts cannot be joined. Every other
    geometry type (including 'LineString') is returned untouched. If anything
    raises, a message is printed and the input geometry is returned as-is.
    """
    try:
        if geom.geom_type != 'MultiLineString':
            # LineStrings and all other geometry types pass straight through.
            return geom
        # Union first, then stitch connected segments together.
        return linemerge(unary_union(geom))
    except Exception as e:
        print(f"合并失败，保留原始几何: {e}")
        return geom

# 应用合并函数前，先检查几何类型分布
print("=== 合并前几何类型分布 ===")
geom_types = dissolved_roads_in_aoi['geometry'].apply(lambda x: x.geom_type).value_counts()
print(geom_types)

# 应用合并函数
print("\n正在处理几何合并...")
dissolved_roads_in_aoi['geometry'] = dissolved_roads_in_aoi['geometry'].apply(merge_lines)

# 检查合并后的几何类型分布
print("\n=== 合并后几何类型分布 ===")
geom_types_after = dissolved_roads_in_aoi['geometry'].apply(lambda x: x.geom_type).value_counts()
print(geom_types_after)

print("✅ 几何合并处理完成！")

In [None]:
print("✅ 道路合并处理完成！")

# --- 4. 保存结果 ---

dissolved_roads_in_aoi.to_file(output_filename, layer='merged_roads_in_aoi', driver="GPKG", engine='fiona') # 确保使用 'fiona' 引擎 不然用arcgispro打不开 bug？
print(f"✅ 合并后的路网数据已保存到 '{output_filename}'")

print("\n📊 合并后数据预览:")
print(dissolved_roads_in_aoi[['name', 'geometry']].head().to_string())
print(f"\n📈 数据统计:")
print(f"原始道路数量: {len(roads_in_aoi_gdf)}")
print(f"合并后道路数量: {len(dissolved_roads_in_aoi)}")
print(f"数据压缩率: {(1 - len(dissolved_roads_in_aoi)/len(roads_in_aoi_gdf))*100:.1f}%")

In [None]:
# 原始路网我们融合之后也保存一份
all_roads_gdf_dissolve = roads_gdf.dissolve(by='name').reset_index()
all_roads_gdf_dissolve.to_file(output_filename, layer='all_shanghai_roads', driver="GPKG", engine='fiona')
print(f"✅ 原始路网数据已保存到 '{output_filename}'")


## 构建地名词典
从以往的省市县shp中提取


In [None]:
# Load the county/district-level layer that the gazetteer is built from.
# Forward slashes keep the relative path portable across operating systems.
places_gdf = gpd.read_file("data/省份与市区县的数据/分年龄、性别的人口_区县等级.shp")

places_gdf.head()

In [None]:
# 检查空间参考
places_gdf.crs

In [None]:
# 先切片再复制
places_gdf = places_gdf[['地名', '省级', 'geometry']].copy()

# 重命名字段
places_gdf.rename(columns={'地名': 'place_name', '省级': 'province'}, inplace=True)

In [None]:
places_gdf

In [None]:
import pandas as pd

province_gdf = places_gdf[['province']].drop_duplicates().copy()

# 剔除上海市
province_gdf = province_gdf[province_gdf['province'] != '上海市'].copy()

# 将省份本身也加入到地名总表中
province_gdf['place_name'] = province_gdf['province']


# 调整列顺序与原表一致
province_gdf = province_gdf[['place_name', 'province']]

# 合并省份和原有地名表
places_gdf_all = pd.concat([places_gdf, province_gdf], ignore_index=True)

places_gdf_all

In [None]:
# 按省份分组 按place_name排序
places_gdf_all.groupby('province')['place_name'].apply(lambda x: ', '.join(sorted(x))).reset_index()

In [None]:
# 最后一步清洗 为了方便匹配路名（南京路），需要把 place_name 中的 省 自治区 市 地区 等后缀去掉，而 province 不变
def clean_place_name(name):
    """
    Strip one trailing administrative suffix from a place name.

    Suffixes are tried in order (multi-character ones first) and only the
    first matching suffix is removed, e.g. '南京市' -> '南京'. A name with no
    listed suffix is returned unchanged. If removing the suffix would leave an
    empty string (the name *is* the suffix), the name is also returned
    unchanged: an empty cleaned name would be a substring of every road name
    and would therefore match everything.
    """
    suffixes = ('自治区', '自治州', '自治县', '地区', '省', '县', '区', '市')
    for suffix in suffixes:
        if name.endswith(suffix):
            stem = name[:-len(suffix)]
            # Never clean a name down to the empty string.
            return stem if stem else name
    return name
# 应用清洗函数
places_gdf_all['cleaned_place_name'] = places_gdf_all['place_name'].apply(clean_place_name)

In [None]:
places_gdf_all.groupby('province')['cleaned_place_name'].apply(lambda x: ', '.join(sorted(x))).reset_index()


In [None]:
# 删除 province为不统计的 以及港澳台
places_gdf_all = places_gdf_all[~places_gdf_all['province'].isin(['上海市', '港澳台', '澳门特别行政区', '香港特别行政区', '不统计', '台湾省'])].copy()

In [None]:
places_gdf_all.groupby('province')['cleaned_place_name'].apply(lambda x: ', '.join(sorted(x))).reset_index()


In [None]:
# 可选 保存数据
places_gdf_all.to_file(output_filename, layer='province_and_places', driver="GPKG", engine='fiona')

## 三 开始匹配
思考：
1. 地名来源多样性：路名核心词既可能是城市，也可能是省份。例如“南京路”对应城市，“西藏中路”则直接对应省份。
2. 后缀词汇丰富: 不仅仅是“路”，还有“道”、“街”、“巷”、“弄”、“浜”、“桥”等等，一个固定的后缀列表很难做到完全覆盖。
3. 前后缀并存：“陕西南路”有前缀“南”和后缀“路”；“东宝兴路”的“东”是方位词；这都增加了提取核心词的难度。
核心解决措施：不尝试去“猜”和“剥离”路名的前后缀。相反，我们拿着一个权威的、包含所有可能地名（城市+省份）的“字典” ，去路名这个“长字符串”里，寻找最长、最优先的匹配子串。
所以我叫：**基于优先级的最大子串匹配**，为此还需要重新处理地名表 places_gdf_all


In [None]:
# 计算地名长度，并按长度降序排列 后续先匹配地名长的
# 因为总表是按长度降序的，所以第一个找到的匹配项，必然是这条路名中可能存在的最长地名。例如，对于“陕西南路”，它会先尝试匹配“陕西”，一旦成功，就立即返回“陕西”，绝不会有机会去匹配更短的“陕”。

places_gdf_all['name_len'] = places_gdf_all['cleaned_place_name'].str.len()
master_gazetteer_df = places_gdf_all.sort_values(by='name_len', ascending=False).reset_index(drop=True)

In [None]:
master_gazetteer_df

In [None]:
# 预处理：提取地名和省份为列表
place_names = master_gazetteer_df['cleaned_place_name'].tolist()
provinces = master_gazetteer_df['province'].tolist()

def find_best_match_fast(road_name):
    """
    Return the first gazetteer entry whose cleaned name occurs in *road_name*.

    ``place_names`` and ``provinces`` are scanned in lockstep, so the result
    is the first hit in their existing order. Returns a ``(place, province)``
    tuple, or ``(None, None)`` when *road_name* is not a string or nothing
    matches.
    """
    if not isinstance(road_name, str):
        return None, None
    hit = next(
        (pair for pair in zip(place_names, provinces) if pair[0] in road_name),
        None,
    )
    return hit if hit is not None else (None, None)

In [None]:
# # 迭代匹配（慢）
# from tqdm import tqdm
#
# # tqdm显示进度条
# match_results = [find_best_match_fast(name) for name in tqdm(roads_gdf['name'], desc="地名匹配")]
#
# match_results_df = pd.DataFrame(match_results, columns=['matched_place', 'province'])
# final_gdf = pd.concat([roads_gdf, match_results_df], axis=1)
# final_gdf = final_gdf.dropna(subset=['province'])
# print(f"精确匹配完成！共找到 {len(final_gdf)} 条与中国地名相关的道路。")

In [None]:
# final_gdf

上述结果不是很好：
1.  **自我参照噪声**:
    *   `121, G1503上海绕城高速, 上海, 上海市`
    *   `939, 上海东路, 海东, 青海省`
    *   **病因**: 路名中包含“上海”本身。这些道路并不是以外地地名命名的，它们是我们研究中的“噪声”，必须被剔除。第二个例子更糟糕，它把“上海东路”错误地匹配给了青海的“海东”，这是算法的误判。

2.  **贪婪的子串匹配**:
    *   `376, 金沙江西路, 江西, 江西省`
    *   **病因**: 这是最经典、最棘手的错误。“金沙江”是一个完整的地理名词（一条江），但我们的算法因为字典里有“江西”，就贪婪地把它匹配上了。算法没有理解“词”的边界。

3.  **不合理的优先顺序**:
    *   `982, 松江中山东路, 山东, 山东省`
    *   **病因**: 这条路的核心词明显是“中山”，但算法却匹配了“山东”。这可能是因为“山东”和“中山”长度一样，而在我们的排序中，“山东”排在了“中山”前面，导致了错误匹配。也可以看出来松江区可能不是我们的研究重点，需要剔除上海外围地区。


## 优化匹配
优化思路
去掉路名常见后缀，减少干扰；
用 Aho-Corasick 自动机一次性批量匹配，提升查找效率；
收集所有匹配结果后，按最长优先、最早出现位置来选取最优候选。

In [None]:
import ahocorasick

# 1. 清洗路名后缀 字数多的在前！
_suffixes = ['路桥', '辅路', '北路', '东路', '南路', '西路', '大道', '路', '道', '街', '巷', '弄', '浜', '桥', '线', '段']


def _clean_road(road):
    """
    Remove one trailing road-type suffix from a road name.

    Names of two characters or fewer are returned unchanged. Otherwise the
    first entry of ``_suffixes`` that the name ends with is stripped; the list
    puts two-character suffixes ahead of one-character ones, so the longer
    suffix wins. Names with no listed suffix are returned unchanged.
    """
    if len(road) <= 2:
        return road
    matched = next((s for s in _suffixes if road.endswith(s)), None)
    return road if matched is None else road[:-len(matched)]

# Minimum length (in characters) a cleaned place name must have to be
# registered for road-name matching.
MIN_MATCH_LEN = 2

# Build the Aho-Corasick automaton, adding only place names whose length is
# at least MIN_MATCH_LEN. Each key stores a (name, province) tuple as payload.
A = ahocorasick.Automaton()

for name, prov in zip(place_names, provinces):
    if len(name) >= MIN_MATCH_LEN:
        # NOTE(review): if the same cleaned name occurs more than once, a later
        # add_word presumably replaces the earlier payload - confirm against
        # the pyahocorasick docs.
        A.add_word(name, (name, prov))
# Finalise the automaton so it can be searched with A.iter().
A.make_automaton()

# 3. 最优匹配函数：最长优先、最早出现
def find_best_match_aho(road):
    """
    Pick the best gazetteer match for a road name using the automaton ``A``.

    The road name is first passed through ``_clean_road``; every match that
    ``A.iter`` reports in the cleaned text is then a candidate. The longest
    place name wins, and among equally long names the one that starts
    earliest wins. Returns ``(place, province)``, or ``(None, None)`` when
    *road* is not a string or nothing matches.
    """
    if not isinstance(road, str):
        return None, None

    cleaned = _clean_road(road)
    winner = None  # (place, province, length, start)

    for end_idx, (place, prov) in A.iter(cleaned):
        n = len(place)
        # A.iter yields the index of the match's last character.
        begin = end_idx - n + 1
        if winner:
            longer = n > winner[2]
            same_len_but_earlier = n == winner[2] and begin < winner[3]
            if not (longer or same_len_but_earlier):
                continue
        winner = (place, prov, n, begin)

    if winner:
        return winner[0], winner[1]
    return None, None

In [None]:
from tqdm import tqdm

# 1. 批量匹配
results = [
    find_best_match_aho(name)
    for name in tqdm(dissolved_roads_in_aoi['name'], desc="地名匹配")
]

# 2. 转为 DataFrame
match_df = pd.DataFrame(results, columns=['matched_place', 'province'])


# 构建 clean->原始 place_name 映射
mapping = dict(zip(master_gazetteer_df['cleaned_place_name'], master_gazetteer_df['place_name']))

# 在 match_df 中新增原始地名列
match_df['original_place_name'] = match_df['matched_place'].map(mapping)

# 然后再合并最终结果
final_gdf = pd.concat([dissolved_roads_in_aoi.reset_index(drop=True), match_df], axis=1)
final_gdf = final_gdf.dropna(subset=['province'])

In [None]:
final_gdf

大多数都是准确的：而且搜索之后才发现：**“长宁”确实是四川省的一个地名**，准确来说是**四川省宜宾市下辖的“长宁县”**，并非“长宁市”。

> 长宁县位于四川盆地南缘，地处宜宾市腹地，因历史上“地势边远宁静”或“希冀民族和睦”而得名，素有“竹子之乡”的美誉。此外，**上海的长宁区**和**长宁路**正是得名于四川的这个“长宁县”。

这里补充一些未成功匹配的数据，比如：
天山西路,"LINESTRING (121.34698 31.21907, 121.34755 31.21902, 121.34813 31.21898)",山西,山西省


# 空间分析与聚类

## 1. 按省份聚合为“代表点”或“中心线”
想做“中国地图缩影”，中观空间分布

- 每省份的“所有道路”合并成一个MultiLineString，求其 geometry.centroid 得到“代表点”。
- 还可以求“代表线/省份聚合线条”，也就是每省所有命名路段的联合图形（比如用于可视化描边/区域聚合）。
- 除几何聚合外，可以统计每省份匹配路段数量，并一起聚合输出。

In [None]:
from shapely.ops import unary_union

# 按省份分组，合并所有道路几何
province_group = final_gdf.groupby('province').apply(
    lambda df: pd.Series({
        'geometry': unary_union(df['geometry']),
        'road_count': df['name'].count(),  # 改名为 road_count
        'road_names': ', '.join(sorted(df['name'].unique())),  # 保留所有道路名称
        'matched_places': ', '.join(sorted(set(df['matched_place'])))  # 改名为 matched_places
    })
).reset_index()

# 计算每省的质心
province_group['centroid'] = province_group['geometry'].centroid

In [None]:
province_group.head()

In [None]:
# 分别保存线和点
province_group[['province', 'geometry', 'road_count', 'road_names']].to_file(output_filename, layer='province_roads', driver="GPKG", engine='fiona',crs="EPSG:4326")

In [None]:
province_group[['province', 'centroid', 'road_count', 'road_names']].set_geometry('centroid').to_file(output_filename, layer='province_centroids', driver="GPKG", engine='fiona',crs="EPSG:4326")


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# 设置中文字体和更好的显示参数
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

# 创建更大的图形
fig, ax = plt.subplots(figsize=(18, 14))

# 设置背景色
fig.patch.set_facecolor('white')
ax.set_facecolor('#f8f9fa')


# 1. 绘制聚合线（底层）
# 1. 为每个省份分配不同颜色
n_provinces = len(province_group)
# 使用tab20颜色映射，它有20种不同的颜色


colors = cm.tab20(np.linspace(0, 1, n_provinces))

# 2. 逐个绘制每个省份的聚合线
for idx, (_, row) in enumerate(province_group.iterrows()):
    # 创建单个省份的GeoDataFrame
    single_province = gpd.GeoDataFrame([row], crs=province_group.crs if hasattr(province_group, 'crs') else None)
    
    # 绘制该省份的线条
    single_province.plot(ax=ax, 
                        color=colors[idx], 
                        linewidth=2.5, 
                        alpha=0.8,
                        label=row['province'], 
                        zorder=1)

# 2. 绘制代表点（顶层）- 使用更醒目的红色，增加边框
# sizes = province_group['name'] * 15 + 50  # 基础大小50，按道路数量调整

province_group.set_geometry('centroid').plot(ax=ax, 
                                            color='#F24236', 
                                            markersize=80,
                                            edgecolor='white',
                                            linewidth=3,
                                            label='代表点', 
                                            zorder=3)

# 3. 标注省份名称（最顶层）- 改善文字样式
for idx, row in province_group.iterrows():
    x, y = row['centroid'].x, row['centroid'].y
    
    # 添加文字背景框
    ax.text(x, y, row['province'], 
           fontsize=11, 
           color='#2c3e50',
           ha='center', 
           va='center',
           weight='bold',
           bbox=dict(boxstyle="round,pad=0.3", 
                    facecolor='white', 
                    edgecolor='#bdc3c7',
                    alpha=0.9),
           zorder=4)

# 4. 美化图表
ax.set_title("上海地名路网空间分布分析\n各省份道路聚合与代表点位置", 
            fontsize=16, 
            fontweight='bold', 
            color='#2c3e50',
            pad=20)

# 设置坐标轴
ax.set_xlabel('经度', fontsize=12, color='#34495e')
ax.set_ylabel('纬度', fontsize=12, color='#34495e')

# 美化网格
ax.grid(True, alpha=0.3, color='#bdc3c7', linestyle='--', linewidth=0.5)

# 美化图例
legend = ax.legend(loc='upper right', 
                  frameon=True, 
                  fancybox=True, 
                  shadow=True,
                  fontsize=11)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_alpha(0.9)

# 设置坐标轴刻度颜色
ax.tick_params(colors='#34495e', labelsize=10)

# 移除上边框和右边框
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#bdc3c7')
ax.spines['bottom'].set_color('#bdc3c7')

# 调整布局
plt.tight_layout()

# 显示图表
plt.show()

## 4. 空间聚类分析

现在我们来分析每个省份的道路在上海的空间分布模式：
- 使用KMeans聚类分析每省道路的核心分布区域
- 使用DBSCAN识别高密度聚集区
- 分析哪些省份呈现"多核心分布"特征

In [None]:
# 4.1 准备聚类数据：计算每条道路的质心
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import json

# 为每条道路计算质心坐标
final_gdf['centroid'] = final_gdf['geometry'].centroid
final_gdf['lon'] = final_gdf['centroid'].x
final_gdf['lat'] = final_gdf['centroid'].y

print("✅ 道路质心计算完成")
print(f"总计 {len(final_gdf)} 条道路待分析")

# 按省份统计道路数量，选择道路数量较多的省份进行聚类
province_stats = final_gdf['province'].value_counts()
print("\n📊 各省份道路数量统计：")
print(province_stats.head(10))

In [None]:
# 4.2 KMeans聚类分析函数
def analyze_province_clusters_kmeans(province_name, n_clusters=3, min_roads=5):
    """
    Run KMeans on the road centroids of one province.

    Parameters:
    - province_name: value of the 'province' column to select from final_gdf
    - n_clusters: requested number of clusters (capped at the number of roads)
    - min_roads: provinces with fewer roads than this are skipped

    Returns:
    - None when the province has fewer than min_roads roads (a message is
      printed); otherwise a dict with the province name, total road count,
      number of clusters, per-cluster summaries ('cluster_centers') and the
      province's rows with an added 'cluster' column ('roads_with_clusters').
    """
    subset = final_gdf[final_gdf['province'] == province_name].copy()
    n_roads = len(subset)

    if n_roads < min_roads:
        print(f"⚠️ {province_name} 道路数量不足 ({n_roads} < {min_roads})，跳过聚类")
        return None

    # Raw lon/lat pairs; clustering runs on a standardised copy of them.
    lonlat = subset[['lon', 'lat']].values
    scaled = StandardScaler().fit_transform(lonlat)

    model = KMeans(n_clusters=min(n_clusters, n_roads), random_state=42)
    subset.loc[:, 'cluster'] = model.fit_predict(scaled)

    def _summarise(label):
        # Cluster centre is the mean of the members' original (unscaled) coordinates.
        in_cluster = subset['cluster'] == label
        members = lonlat[in_cluster]
        return {
            'cluster_id': label,
            'center_lon': members[:, 0].mean(),
            'center_lat': members[:, 1].mean(),
            'road_count': len(members),
            'roads': subset[in_cluster]['name'].tolist()
        }

    centers = [_summarise(label) for label in range(model.n_clusters)]

    return {
        'province': province_name,
        'total_roads': n_roads,
        'n_clusters': model.n_clusters,
        'cluster_centers': centers,
        'roads_with_clusters': subset
    }

print("✅ KMeans聚类分析函数定义完成")

In [None]:
# 4.3 DBSCAN聚类分析函数
def analyze_province_clusters_dbscan(province_name, eps=0.01, min_samples=2, min_roads=5):
    """
    Run DBSCAN on the road centroids of one province.

    Parameters:
    - province_name: value of the 'province' column to select from final_gdf
    - eps: DBSCAN neighbourhood radius, in the same units as the lon/lat
      columns (the original note equates 0.01 degrees with roughly 1 km)
    - min_samples: minimum number of samples needed to form a cluster
    - min_roads: provinces with fewer roads than this are skipped

    Returns:
    - None when the province has fewer than min_roads roads (a message is
      printed); otherwise a dict with the province name, total road count,
      number of non-noise clusters, number of noise points, per-cluster
      summaries ('cluster_centers') and the province's rows with an added
      'cluster' column ('roads_with_clusters').
    """
    subset = final_gdf[final_gdf['province'] == province_name].copy()
    n_roads = len(subset)

    if n_roads < min_roads:
        print(f"⚠️ {province_name} 道路数量不足 ({n_roads} < {min_roads})，跳过聚类")
        return None

    # DBSCAN is fed the raw lon/lat pairs, with no standardisation.
    lonlat = subset[['lon', 'lat']].values
    model = DBSCAN(eps=eps, min_samples=min_samples)
    subset.loc[:, 'cluster'] = model.fit_predict(lonlat)

    unique_clusters = set(subset['cluster'])
    noise_count = sum(subset['cluster'] == -1)  # label -1 marks noise points

    centers = []
    for label in unique_clusters:
        if label != -1:  # noise points do not get a centre
            in_cluster = subset['cluster'] == label
            members = lonlat[in_cluster]
            centers.append({
                'cluster_id': label,
                'center_lon': members[:, 0].mean(),
                'center_lat': members[:, 1].mean(),
                'road_count': len(members),
                'roads': subset[in_cluster]['name'].tolist()
            })

    return {
        'province': province_name,
        'total_roads': n_roads,
        'n_clusters': len(centers),
        'noise_points': noise_count,
        'cluster_centers': centers,
        'roads_with_clusters': subset
    }

print("✅ DBSCAN聚类分析函数定义完成")

In [None]:
# 4.4 批量执行聚类分析
# 选择道路数量 >= 8 的省份进行分析
target_provinces = province_stats[province_stats >= 8].index.tolist()

print(f"🎯 目标省份：{target_provinces}")
print(f"总计 {len(target_provinces)} 个省份将进行聚类分析\n")

# 存储聚类结果
kmeans_results = {}
dbscan_results = {}

print("=" * 50)
print("开始 KMeans 聚类分析...")
print("=" * 50)

for province in target_provinces:
    print(f"\n🔍 分析 {province}...")
    
    # KMeans分析
    kmeans_result = analyze_province_clusters_kmeans(province, n_clusters=1)
    if kmeans_result:
        kmeans_results[province] = kmeans_result
        print(f"✅ KMeans完成: {kmeans_result['n_clusters']} 个聚类中心")
        
        # 显示聚类详情
        for center in kmeans_result['cluster_centers']:
            print(f"   聚类 {center['cluster_id']}: {center['road_count']} 条道路")

print("\n" + "=" * 50)
print("开始 DBSCAN 聚类分析...")
print("=" * 50)

for province in target_provinces:
    print(f"\n🔍 分析 {province}...")
    
    # DBSCAN分析
    dbscan_result = analyze_province_clusters_dbscan(province, eps=0.01, min_samples=2)
    if dbscan_result:
        dbscan_results[province] = dbscan_result
        print(f"✅ DBSCAN完成: {dbscan_result['n_clusters']} 个聚类中心, {dbscan_result['noise_points']} 个噪声点")
        
        # 显示聚类详情
        for center in dbscan_result['cluster_centers']:
            print(f"   聚类 {center['cluster_id']}: {center['road_count']} 条道路")

print("\n🎉 聚类分析全部完成！")
print(f"KMeans结果: {len(kmeans_results)} 个省份")
print(f"DBSCAN结果: {len(dbscan_results)} 个省份")

In [None]:
province_stats

In [None]:
# NOTE: `roads` is not defined by any earlier cell (it only exists as a local
# variable inside plot_province_clusters), so evaluating it here raised
# NameError when the notebook was run top to bottom. Kept for reference:
# roads

In [None]:
# 4.5 可视化聚类结果
def plot_province_clusters(province_name, method='both', figsize=(16, 6), show_plot=True):
    """
    Plot the clustering result(s) for one province on top of the road base map.

    Parameters:
    - province_name: province to plot; it is looked up in province_stats
    - method: 'kmeans', 'dbscan' or 'both' (two side-by-side panels)
    - figsize: figure size for the two-panel ('both') layout; a single-panel
      figure always uses (8, 6)
    - show_plot: call plt.show() when True; pass False when the caller only
      wants to save the returned figure

    Returns:
    - The matplotlib Figure that was drawn.
    """
    if method == 'both':
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        axes = [ax1, ax2]
        methods = ['kmeans', 'dbscan']
    else:
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        axes = [ax]
        methods = [method]

    # Everything below up to the loop is identical for every panel, so it is
    # computed once instead of once per axis.
    road_count = province_stats[province_name]
    province_rows = final_gdf[final_gdf['province'] == province_name]
    road_names = province_rows['name'].unique()
    sample_roads = ', '.join(road_names[:5])
    if len(road_names) > 5:
        sample_roads += "..."
    province_geoms = province_rows.copy().geometry
    aoi_gdf = gpd.GeoDataFrame(geometry=[aoi_polygon], crs="EPSG:4326")

    for ax, current_method in zip(axes, methods):
        # Base layers: AOI outline (boundary only, no fill), all merged roads
        # in light gray, then this province's roads in black on top.
        aoi_gdf.boundary.plot(ax=ax, color='black', linewidth=1.5, alpha=0.8, zorder=0, label='上海行政区划')
        dissolved_roads_in_aoi.plot(ax=ax, color='lightgray', linewidth=0.5, alpha=0.5, zorder=0, label='所有道路（合并后）')
        province_geoms.plot(ax=ax, color='black', linewidth=1.5, alpha=1, zorder=1, label='所属道路')

        if current_method == 'kmeans' and province_name in kmeans_results:
            result = kmeans_results[province_name]
            title = f"{province_name} - KMeans聚类 ({result['n_clusters']} 个聚类)\n道路数量: {road_count} 条 | 示例: {sample_roads}"
        elif current_method == 'dbscan' and province_name in dbscan_results:
            result = dbscan_results[province_name]
            title = f"{province_name} - DBSCAN聚类 ({result['n_clusters']} 个聚类)\n道路数量: {road_count} 条 | 示例: {sample_roads}"
        else:
            # No stored result for this method/province: show a placeholder panel.
            ax.text(0.5, 0.5, f"无 {current_method.upper()} 聚类结果",
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f"{province_name} - {current_method.upper()}")
            continue

        roads_data = result['roads_with_clusters']

        # Road centroids, coloured by cluster label.
        unique_clusters = roads_data['cluster'].unique()
        colors = cm.tab10(np.linspace(0, 1, len(unique_clusters)))

        for j, cluster_id in enumerate(unique_clusters):
            cluster_roads = roads_data[roads_data['cluster'] == cluster_id]

            if cluster_id == -1:  # DBSCAN noise points
                ax.scatter(cluster_roads['lon'], cluster_roads['lat'],
                           c='green', s=10, alpha=0.8, label='噪声点')
            else:
                ax.scatter(cluster_roads['lon'], cluster_roads['lat'],
                           c=[colors[j]], s=15, alpha=1, label=f'聚类 {cluster_id}')

        # Cluster centres; only the first one carries a legend label so the
        # legend shows a single '聚类中心' entry.
        for k, center in enumerate(result['cluster_centers']):
            ax.scatter(center['center_lon'], center['center_lat'],
                       c='red', s=50, marker='*',
                       label='聚类中心' if k == 0 else "")

        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('经度')
        ax.set_ylabel('纬度')
        ax.grid(True, alpha=0.3)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()

    # Only display when requested; callers that save the figure pass show_plot=False.
    if show_plot:
        plt.show()

    return fig

# 可视化几个代表性省份
representative_provinces = ['江苏省', '浙江省', '山东省', '吉林省']

for province in representative_provinces:
    if province in target_provinces:
        print(f"\n📊 {province} 聚类结果可视化:")
        plot_province_clusters(province)

In [None]:
# 保存所有省份的图标可视化到文件夹
from pathlib import Path
output_folder = 'image/province_clusters_visualization'

Path(output_folder).mkdir(parents=True, exist_ok=True)

for province in target_provinces:
    print(f"📊 正在保存 {province} 的聚类可视化图...")
    
    try:
        # 创建图形，不显示
        fig = plot_province_clusters(province, method='both', figsize=(16, 6), show_plot=False)
        
        # 保存图形
        save_path = Path(output_folder) / f"{province}_clusters.png"
        fig.savefig(save_path, 
                    bbox_inches='tight', 
                    dpi=300,
                    facecolor='white',
                    edgecolor='none')
        
        print(f"   ✅ 已保存: {save_path}")
        
        # 关闭图形释放内存
        plt.close(fig)
        
    except Exception as e:
        print(f"   ❌ 保存失败: {e}")
        # 确保即使出错也关闭图形
        plt.close('all')

print(f"✅ 所有聚类可视化图已保存到 '{output_folder}' 文件夹")


In [None]:
# 4.6 聚类结果分析与总结
def _format_roads_preview(roads, limit=3):
    """Join at most *limit* road names, appending '...' when some were left out."""
    preview = ', '.join(roads[:limit])
    return preview + '...' if len(roads) > limit else preview


def analyze_clustering_patterns():
    """
    Print a text report of the clustering pattern of every analysed province.

    Reads the module-level kmeans_results and dbscan_results dicts. For
    KMeans, each province is labelled single-, dual- or multi-core from its
    cluster count and the balance of its cluster sizes. For DBSCAN, each
    province is labelled by the share of roads flagged as noise. Each cluster
    line shows a preview of at most three road names. Returns None.
    """
    print("=" * 60)
    print("🔍 聚类模式分析报告")
    print("=" * 60)

    # --- KMeans ---
    print("\n📊 KMeans聚类分析:")
    print("-" * 40)

    for province, result in kmeans_results.items():
        n_clusters = result['n_clusters']
        total_roads = result['total_roads']

        # Balance score = coefficient of variation of the cluster sizes
        # (std / mean); 0 when there is only one cluster.
        cluster_sizes = [center['road_count'] for center in result['cluster_centers']]
        balance_score = np.std(cluster_sizes) / np.mean(cluster_sizes) if len(cluster_sizes) > 1 else 0

        distribution_type = "多核心分布" if n_clusters >= 3 and balance_score < 0.5 else \
                           "双核心分布" if n_clusters == 2 else "单核心分布"

        print(f"{province:8s}: {total_roads:2d}条道路 → {n_clusters}个聚类 ({distribution_type})")

        for center in result['cluster_centers']:
            roads_preview = _format_roads_preview(center['roads'])
            print(f"         聚类{center['cluster_id']}: {center['road_count']}条 - {roads_preview}")

    # --- DBSCAN ---
    print(f"\n📊 DBSCAN聚类分析:")
    print("-" * 40)

    for province, result in dbscan_results.items():
        n_clusters = result['n_clusters']
        total_roads = result['total_roads']
        noise_points = result['noise_points']

        # Percentage of this province's roads that DBSCAN flagged as noise.
        noise_ratio = noise_points / total_roads * 100

        density_type = "高密度聚集" if noise_ratio < 20 else \
                      "中等密度" if noise_ratio < 50 else "分散分布"

        print(f"{province:8s}: {total_roads:2d}条道路 → {n_clusters}个聚类 + {noise_points}噪声点 ({density_type})")

        for center in result['cluster_centers']:
            roads_preview = _format_roads_preview(center['roads'])
            print(f"         聚类{center['cluster_id']}: {center['road_count']}条 - {roads_preview}")

# 执行聚类模式分析
analyze_clustering_patterns()

# 4.7 Save the clustering results
print(f"\n💾 保存聚类分析结果...")

# Collect every cluster centre (KMeans first, then DBSCAN) as one row each,
# with a point geometry built from the centre's lon/lat.
all_cluster_centers = []

for method_label, method_results in (('KMeans', kmeans_results), ('DBSCAN', dbscan_results)):
    for province, result in method_results.items():
        for center in result['cluster_centers']:
            all_cluster_centers.append({
                'province': province,
                'method': method_label,
                'cluster_id': center['cluster_id'],
                'road_count': center['road_count'],
                'center_lon': center['center_lon'],
                'center_lat': center['center_lat'],
                'geometry': gpd.points_from_xy([center['center_lon']], [center['center_lat']])[0]
            })

# Convert to a GeoDataFrame
cluster_centers_gdf = gpd.GeoDataFrame(all_cluster_centers, crs='EPSG:4326')

# Write the layer with the same 'fiona' engine used for every other layer of
# this GeoPackage, so all layers are produced the same way.
cluster_centers_gdf.to_file(output_filename, layer='cluster_centers', driver="GPKG", engine='fiona')

print(f"✅ 聚类中心已保存到 '{output_filename}' 的 'cluster_centers' 图层")
print(f"📈 总计保存了 {len(cluster_centers_gdf)} 个聚类中心点")

In [None]:
from pathlib import Path
import geopandas as gpd

def create_comprehensive_cluster_map(figsize=(20, 16)):
    """
    Draw one map with the AOI boundary, the merged road base map and the
    KMeans cluster centres of every analysed province, then show it.

    Parameters:
    - figsize: size of the matplotlib figure

    Each province gets one colour; marker size grows with the number of roads
    in the cluster. Returns None.
    """
    # Font/dpi settings go first so they are in effect when the figure is created.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['figure.dpi'] = 100

    fig, ax = plt.subplots(figsize=figsize)
    fig.patch.set_facecolor('white')
    ax.set_facecolor('#f8f9fa')

    # 1. AOI boundary
    aoi_gdf = gpd.GeoDataFrame(geometry=[aoi_polygon], crs="EPSG:4326")
    aoi_gdf.boundary.plot(ax=ax, color='black', linewidth=1.5, alpha=0.8, zorder=0)

    # 2. All merged roads as the base layer
    dissolved_roads_in_aoi.plot(ax=ax, color='lightgray', linewidth=0.5, alpha=0.5, zorder=2)

    # 3. Cluster centres. Province names are sorted so each province gets the
    #    same colour on every run (plain set iteration order is not stable
    #    across interpreter sessions).
    unique_provinces = sorted(set(kmeans_results) | set(dbscan_results))
    colors = cm.tab20(np.linspace(0, 1, len(unique_provinces)))
    province_colors = dict(zip(unique_provinces, colors))

    for province, result in kmeans_results.items():
        color = province_colors[province]
        for center in result['cluster_centers']:
            ax.scatter(center['center_lon'], center['center_lat'],
                       c=[color], s=center['road_count'] * 10 + 50,  # size scales with road count
                       marker='o', alpha=0.8, edgecolor='white', linewidth=2,
                       # only cluster 0 carries a label, so each province appears once in the legend
                       label=f"{province} (KMeans)" if center['cluster_id'] == 0 else "",
                       zorder=4)

    # Legend and styling
    ax.set_title("上海地名路网聚类分析\n各省份道路空间分布模式", fontsize=18, fontweight='bold', pad=20)
    ax.set_xlabel('经度')
    ax.set_ylabel('纬度')
    ax.grid(True, alpha=0.3, linestyle='--')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8, frameon=True)
    ax.text(0.02, 0.98,
            "● 圆形: KMeans 中心\n",
            transform=ax.transAxes, verticalalignment='top',
            bbox=dict(facecolor='white', alpha=0.9))

    plt.tight_layout()
    plt.show()

print("🗺️ 生成带行政区划、道路和聚类中心的综合地图...")
create_comprehensive_cluster_map()

In [None]:
# Save the matched roads without the helper centroid/lon/lat columns, using
# the same 'fiona' engine as the other layers written to this GeoPackage.
final_gdf.drop(columns=['centroid', 'lon', 'lat']).to_file(output_filename, layer="matched_roads", driver="GPKG", engine='fiona')

In [None]:
# 查找地名
final_gdf.query("name == '四川北路'")

In [None]:
# TODO 可以根据道路线的长度

In [None]:
# 4.8 区域化分析 - 从点集到区域多边形
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.ops import unary_union
from scipy.spatial import ConvexHull
import alphashape
import warnings
warnings.filterwarnings('ignore')

def _build_cluster_regions(roads_data, buffer_distance, skip_noise=False):
    """
    Build one region record per cluster label found in ``roads_data``.

    Parameters:
    - roads_data: table with the columns 'cluster', 'lon', 'lat' and 'name'
    - buffer_distance: buffer passed through to ``create_region_polygon``
    - skip_noise: when True, the label -1 (DBSCAN noise) is ignored

    Returns:
    - list of dicts with the keys 'cluster_id', 'geometry', 'road_count',
      'roads' and 'area_km2'; clusters for which no usable polygon could be
      built are left out
    """
    regions = []
    for cluster_id in roads_data['cluster'].unique():
        if skip_noise and cluster_id == -1:
            continue

        cluster_roads = roads_data[roads_data['cluster'] == cluster_id]
        # Column-wise zip avoids the per-row Series construction of iterrows().
        points = list(zip(cluster_roads['lon'], cluster_roads['lat']))

        region_poly = create_region_polygon(points, buffer_distance)
        if region_poly is None or region_poly.is_empty:
            continue

        regions.append({
            'cluster_id': cluster_id,
            'geometry': region_poly,
            'road_count': len(cluster_roads),
            'roads': cluster_roads['name'].tolist(),
            'area_km2': calculate_polygon_area_km2(region_poly)
        })
    return regions


def create_cluster_regions(province_name, method='both', buffer_distance=0.005):
    """
    Create region polygons for the clusters of one province.

    Parameters:
    - province_name: province to process; looked up in the module-level
      ``kmeans_results`` / ``dbscan_results`` dictionaries
    - method: which clustering result to use ('kmeans', 'dbscan', 'both');
      any other value yields empty region lists
    - buffer_distance: buffer distance in coordinate units (degrees of
      lon/lat; 0.005 degrees is roughly 500 m)

    Returns:
    - dict with the keys 'kmeans_regions', 'dbscan_regions' (lists of region
      records) and 'province'
    """
    regions_data = {
        'kmeans_regions': [],
        'dbscan_regions': [],
        'province': province_name
    }

    # KMeans assigns every road to a cluster, so there is no noise label to skip.
    if method in ('kmeans', 'both') and province_name in kmeans_results:
        regions_data['kmeans_regions'] = _build_cluster_regions(
            kmeans_results[province_name]['roads_with_clusters'],
            buffer_distance,
        )

    # DBSCAN marks noise points with the label -1; they do not form a region.
    if method in ('dbscan', 'both') and province_name in dbscan_results:
        regions_data['dbscan_regions'] = _build_cluster_regions(
            dbscan_results[province_name]['roads_with_clusters'],
            buffer_distance,
            skip_noise=True,
        )

    return regions_data

def _buffered_convex_hull(points, buffer_distance):
    """Return the convex hull of ``points`` as a polygon grown by ``buffer_distance``."""
    hull = ConvexHull(points)
    hull_points = [points[i] for i in hull.vertices]
    return Polygon(hull_points).buffer(buffer_distance)


def create_region_polygon(points, buffer_distance=0.005, method='convex_hull'):
    """
    Create a region polygon from a set of points.

    Parameters:
    - points: list of coordinates [(lon, lat), ...]
    - buffer_distance: buffer distance, in the same units as the coordinates
    - method: 'convex_hull', 'alpha_shape' or 'buffer_union'

    Returns:
    - a Shapely geometry covering the points, or None when ``points`` is
      empty, ``method`` is not one of the supported values, or every
      construction attempt failed
    """
    # Fewer than three points cannot span a hull: buffer the point / segment instead.
    if len(points) < 3:
        if len(points) == 1:
            return Point(points[0]).buffer(buffer_distance)
        if len(points) == 2:
            from shapely.geometry import LineString
            return LineString(points).buffer(buffer_distance)
        return None

    try:
        if method == 'convex_hull':
            # Method 1: convex hull + buffer (the most stable option).
            return _buffered_convex_hull(points, buffer_distance)

        if method == 'alpha_shape':
            # Method 2: alpha shape (follows the point cloud more closely,
            # but may fail or degenerate).
            try:
                alpha_shape = alphashape.alphashape(points, 0.1)
                if alpha_shape.geom_type == 'Polygon' and not alpha_shape.is_empty:
                    return alpha_shape.buffer(buffer_distance)
                if alpha_shape.geom_type == 'MultiPolygon':
                    # Keep only the largest part.
                    largest = max(alpha_shape.geoms, key=lambda g: g.area)
                    return largest.buffer(buffer_distance)
            except Exception:
                # Deliberate best effort: any alpha-shape failure falls
                # through to the convex hull below.
                pass
            # Also reached when the alpha shape is degenerate (empty, a point,
            # a line, ...); previously this case silently returned None.
            return _buffered_convex_hull(points, buffer_distance)

        if method == 'buffer_union':
            # Method 3: union of per-point buffers (suits scattered points).
            return unary_union([Point(p).buffer(buffer_distance) for p in points])

    except Exception as e:
        print(f"创建多边形失败: {e}")
        # Last resort (e.g. the hull failed on collinear points): union of point buffers.
        try:
            return unary_union([Point(p).buffer(buffer_distance) for p in points])
        except Exception:
            return None

    # Unsupported ``method`` value.
    return None

def calculate_polygon_area_km2(polygon):
    """
    Approximate the area of a lon/lat polygon in square kilometres.

    The polygon's area in squared degrees is scaled by the ground length of
    one degree of latitude and of one degree of longitude. The longitude
    factor is evaluated at the polygon's mid-latitude (it shrinks with the
    cosine of the latitude), so this is a local flat-earth approximation
    that is only suitable for small regions.

    Parameters:
    - polygon: geometry exposing ``area``, ``bounds`` and ``is_empty``
      (coordinates assumed to be degrees of longitude/latitude), or None

    Returns:
    - area in km², rounded to 3 decimals; 0 for None or an empty geometry
    """
    import math

    if polygon is None or polygon.is_empty:
        return 0

    lat_to_m = 111_000  # 1 degree of latitude ≈ 111 km

    # 1 degree of longitude ≈ 111.32 km * cos(latitude). Around 31°N that is
    # ≈ 95 km, not the 91 km that used to be hard-coded here.
    _, min_lat, _, max_lat = polygon.bounds
    mid_lat = (min_lat + max_lat) / 2
    lon_to_m = 111_320 * math.cos(math.radians(mid_lat))

    area_m2 = polygon.area * lat_to_m * lon_to_m
    return round(area_m2 / 1_000_000, 3)

print("✅ 区域化分析函数定义完成")

# Build the cluster region polygons for every target province
print("\n🗺️ 开始创建聚类区域多边形...")

# Maps province name -> the dict returned by create_cluster_regions
all_regions_data = {}

for province in target_provinces:
    print(f"🔍 处理 {province} 的区域化...")
    # Uses a buffer of 0.003 here instead of the function default of 0.005
    regions = create_cluster_regions(province, method='both', buffer_distance=0.003)
    all_regions_data[province] = regions
    
    kmeans_count = len(regions['kmeans_regions'])
    dbscan_count = len(regions['dbscan_regions'])
    print(f"   ✅ KMeans: {kmeans_count} 个区域, DBSCAN: {dbscan_count} 个区域")

print("\n🎉 区域化处理完成！")

In [None]:
# 4.9 可视化聚类区域
def plot_province_cluster_regions(province_name, method='both', figsize=(16, 6), show_plot=True):
    """
    Plot the cluster region polygons of one province over the study area.

    Each panel shows the study-area boundary, all roads as a grey backdrop,
    the buffered cluster polygons (annotated with cluster id, road count and
    area) and the clustered road points. A polygon and the points of the same
    cluster always share one colour.

    Parameters:
    - province_name: key into ``all_regions_data``, ``kmeans_results`` and
      ``dbscan_results``; also used to filter ``final_gdf`` by 'province'
    - method: 'both' draws a KMeans panel and a DBSCAN panel side by side;
      'kmeans' draws a single KMeans panel; any other value draws a single
      panel from the DBSCAN regions
    - figsize: figure size of the two-panel layout
      (NOTE: the single-panel layout uses a fixed (8, 6))
    - show_plot: call ``plt.show()`` when True

    Returns:
    - the matplotlib Figure that was drawn
    """
    # Local import: polygon collections drawn by geopandas have no default
    # legend handler, so the legend is built from explicit proxy patches.
    from matplotlib.patches import Patch

    if method == 'both':
        fig, axes = plt.subplots(1, 2, figsize=figsize)
        methods = ['kmeans', 'dbscan']
    else:
        fig, single_ax = plt.subplots(1, 1, figsize=(8, 6))
        axes = [single_ax]
        methods = [method]

    # Region data and a short sample of road names for the panel titles
    regions_data = all_regions_data.get(province_name, {})
    road_names = final_gdf[final_gdf['province'] == province_name]['name'].unique()
    sample_roads = ', '.join(road_names[:5])
    if len(road_names) > 5:
        sample_roads += "..."

    # The study-area outline is identical for every panel: build it once.
    aoi_gdf = gpd.GeoDataFrame(geometry=[aoi_polygon], crs="EPSG:4326")
    # Colour for points whose cluster has no polygon (polygon creation failed).
    fallback_color = (0.5, 0.5, 0.5, 1.0)

    for ax, current_method in zip(axes, methods):
        # Base map: study-area boundary and all roads as background
        aoi_gdf.boundary.plot(ax=ax, color='black', linewidth=1.5, alpha=0.8, zorder=0)
        roads_in_aoi_gdf.plot(ax=ax, color='lightgray', linewidth=0.3, alpha=0.4, zorder=1)

        if current_method == 'kmeans':
            regions = regions_data.get('kmeans_regions', [])
            title = f"{province_name} - KMeans 聚类区域 ({len(regions)} 个区域)\n 示例: {sample_roads}"
        else:
            regions = regions_data.get('dbscan_regions', [])
            title = f"{province_name} - DBSCAN 聚类区域 ({len(regions)} 个区域)\n 示例: {sample_roads}"

        if not regions:
            ax.text(0.5, 0.5, f"无 {current_method.upper()} 聚类区域",
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_title(title)
            continue

        # One colour per region, remembered by cluster id so that the points
        # drawn below use the colour of their own polygon regardless of the
        # order in which the regions were created.
        colors = cm.tab10(np.linspace(0, 1, len(regions)))
        color_by_cluster = {}
        legend_handles = []

        for color, region in zip(colors, regions):
            cluster_id = region['cluster_id']
            color_by_cluster[cluster_id] = color

            # Semi-transparent polygon fill
            gpd.GeoSeries([region['geometry']], crs='EPSG:4326').plot(
                ax=ax,
                color=color,
                alpha=0.3,
                edgecolor=color,
                linewidth=2,
                zorder=3)
            legend_handles.append(Patch(
                facecolor=color,
                edgecolor=color,
                alpha=0.3,
                linewidth=2,
                label=f"聚类 {cluster_id} ({region['road_count']}条路)"))

            # Label at the polygon centroid
            centroid = region['geometry'].centroid
            ax.annotate(f"C{cluster_id}\n{region['road_count']}条\n{region['area_km2']}km²",
                        (centroid.x, centroid.y),
                        ha='center', va='center',
                        fontsize=8, weight='bold',
                        bbox=dict(boxstyle="round,pad=0.3",
                                  facecolor='white',
                                  alpha=0.8,
                                  edgecolor=color),
                        zorder=5)

        # Road points of the matching clustering result
        if current_method == 'kmeans' and province_name in kmeans_results:
            roads_data = kmeans_results[province_name]['roads_with_clusters']
        elif current_method == 'dbscan' and province_name in dbscan_results:
            roads_data = dbscan_results[province_name]['roads_with_clusters']
        else:
            roads_data = None

        if roads_data is not None:
            for cluster_id in roads_data['cluster'].unique():
                if cluster_id == -1:  # skip DBSCAN noise points
                    continue
                cluster_roads = roads_data[roads_data['cluster'] == cluster_id]
                point_color = color_by_cluster.get(cluster_id, fallback_color)
                ax.scatter(cluster_roads['lon'], cluster_roads['lat'],
                           c=[point_color], s=15, alpha=0.8,
                           edgecolor='white', linewidth=0.5,
                           zorder=4)

        ax.set_title(title, fontsize=10, fontweight='bold')
        ax.set_xlabel('经度')
        ax.set_ylabel('纬度')
        ax.grid(True, alpha=0.3)
        ax.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)

    plt.tight_layout()
    if show_plot:
        plt.show()
    return fig

# Visualise the regions of a few representative provinces
representative_provinces = ['江苏省', '浙江省', '山东省', '安徽省']

for province in representative_provinces:
    # Only provinces that are part of the analysed target set are plotted
    if province in target_provinces:
        print(f"\n📊 {province} 聚类区域可视化:")
        plot_province_cluster_regions(province)

In [None]:
# 4.10 保存区域化结果
def save_cluster_regions():
    """
    Write every cluster region to the GeoPackage at ``output_filename``.

    Three layers are written, each only when it has at least one row:
    'kmeans_regions', 'dbscan_regions' and 'all_cluster_regions' (the KMeans
    rows followed by the DBSCAN rows). Rows are taken from the module-level
    ``all_regions_data``.
    """

    def _collect_rows(regions_key, method_label):
        # Flatten province -> regions into one attribute row per region.
        rows = []
        for province, regions_data in all_regions_data.items():
            for region in regions_data[regions_key]:
                road_names = region['roads']
                ellipsis = '...' if len(road_names) > 5 else ''
                rows.append({
                    'province': province,
                    'method': method_label,
                    'cluster_id': region['cluster_id'],
                    'road_count': region['road_count'],
                    'area_km2': region['area_km2'],
                    'roads_sample': ', '.join(road_names[:5]) + ellipsis,
                    'geometry': region['geometry']
                })
        return rows

    def _write_layer(rows, layer_name):
        # Store the rows as one GeoPackage layer and report how many were written.
        layer_gdf = gpd.GeoDataFrame(rows, crs='EPSG:4326')
        layer_gdf.to_file(output_filename, layer=layer_name, driver="GPKG", engine="fiona")
        return len(layer_gdf)

    kmeans_rows = _collect_rows('kmeans_regions', 'KMeans')
    dbscan_rows = _collect_rows('dbscan_regions', 'DBSCAN')

    if kmeans_rows:
        saved = _write_layer(kmeans_rows, 'kmeans_regions')
        print(f"✅ KMeans区域已保存: {saved} 个区域")

    if dbscan_rows:
        saved = _write_layer(dbscan_rows, 'dbscan_regions')
        print(f"✅ DBSCAN区域已保存: {saved} 个区域")

    # Combined layer holding the rows of both methods
    combined_rows = kmeans_rows + dbscan_rows
    if combined_rows:
        saved = _write_layer(combined_rows, 'all_cluster_regions')
        print(f"✅ 所有聚类区域已保存: {saved} 个区域")

# Run the export
print("\n💾 保存区域化结果...")
save_cluster_regions()

# Summarise the regionalisation result
print("\n📊 区域化统计总结:")
print("=" * 50)

# Running totals over all provinces
total_kmeans_regions = 0
total_dbscan_regions = 0
total_area_kmeans = 0
total_area_dbscan = 0

for province, regions_data in all_regions_data.items():
    kmeans_count = len(regions_data['kmeans_regions'])
    dbscan_count = len(regions_data['dbscan_regions'])
    
    # Plain sum of the per-region areas; any overlap between regions is counted more than once
    kmeans_area = sum(r['area_km2'] for r in regions_data['kmeans_regions'])
    dbscan_area = sum(r['area_km2'] for r in regions_data['dbscan_regions'])
    
    total_kmeans_regions += kmeans_count
    total_dbscan_regions += dbscan_count
    total_area_kmeans += kmeans_area
    total_area_dbscan += dbscan_area
    
    print(f"{province:8s}: KMeans {kmeans_count:2d}区域({kmeans_area:5.1f}km²) | DBSCAN {dbscan_count:2d}区域({dbscan_area:5.1f}km²)")

print("-" * 50)
print(f"总计: KMeans {total_kmeans_regions}区域({total_area_kmeans:.1f}km²) | DBSCAN {total_dbscan_regions}区域({total_area_dbscan:.1f}km²)")

In [None]:
# 4.11 保存区域可视化图片
print("\n📊 保存所有省份的区域可视化图...")

# Create the output folder for the region figures
regions_output_folder = 'image/province_cluster_regions_visualization'
Path(regions_output_folder).mkdir(parents=True, exist_ok=True)

for province in target_provinces:
    if province in all_regions_data:
        print(f"📊 正在保存 {province} 的聚类区域图...")

        # Build the figure without displaying it
        fig = plot_province_cluster_regions(province, method='both', figsize=(16, 6), show_plot=False)

        try:
            # Save through the returned figure handle instead of relying on
            # pyplot's implicit "current figure".
            fig.savefig(Path(regions_output_folder) / f"{province}_cluster_regions.png",
                        bbox_inches='tight',
                        dpi=300,
                        facecolor='white')
        finally:
            # Always release the figure, even when saving fails
            plt.close(fig)

print(f"✅ 所有聚类区域可视化图已保存到 '{regions_output_folder}' 文件夹")