# Load analysis Data

In [2]:

import geopandas as gpd
from shapely.geometry import Point
import pandas as pd


# Function to clip dataframes using the US states shapefile
def clip_data_with_us_states(df, lon_col='lon', lat_col='lat'):
    """
    Clip a dataframe using US states shapefile
    """
    # Create geometry points from coordinates
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    
    # Create GeoDataFrame
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')
    
    # Ensure US states has the same CRS
    us_states_4326 = us_states.to_crs('EPSG:4326')
    
    # Perform spatial join to find points within US states
    clipped_gdf = gpd.sjoin(gdf, us_states_4326, how='inner', predicate='within')
    
    # Remove geometry columns and keep original data
    clipped_df = clipped_gdf.drop(columns=['geometry', 'index_right'])
    
    # Remove any additional columns from the shapefile that might have been added
    shapefile_columns = us_states.columns.tolist()
    for col in shapefile_columns:
        if col in clipped_df.columns:
            clipped_df = clipped_df.drop(columns=[col])
    
    return clipped_df

# # Clip all your dataframes
# df_economic = clip_data_with_us_states(df_economic)
# df_weight = clip_data_with_us_states(df_weight)
# df_strategies = clip_data_with_us_states(df_strategies)
# df_net_benefit = clip_data_with_us_states(df_net_benefit)
# df_pv_npv = clip_data_with_us_states(df_pv_npv)
# df_agricultural_npv = clip_data_with_us_states(df_agricultural_npv)
# df_afforestation_npv = clip_data_with_us_states(df_afforestation_npv)
# df_natural_npv = clip_data_with_us_states(df_natural_npv)

df_economic = pd.read_csv('data/US_data/df_economic.csv')
df_weight = pd.read_csv('data/US_data/df_weight.csv')
df_strategies = pd.read_csv('data/US_data/df_strategies.csv')
df_net_benefit = pd.read_csv('data/US_data/df_net_benefit.csv')
df_pv_npv = pd.read_csv('data/US_data/df_pv_npv.csv')
df_agricultural_npv = pd.read_csv('data/US_data/df_agricultural_npv.csv')
df_afforestation_npv = pd.read_csv('data/US_data/df_afforestation_npv.csv')
df_natural_npv = pd.read_csv('data/US_data/df_natural_npv.csv')
df_pixel_optimized_data = pd.read_csv('data/US_data/df_pixel_optimized_data.csv')
df_transformer = pd.read_csv('data/US_data/df_transformer.csv')


us_nation = gpd.read_file(r'data\US_data\cb_2018_us_nation_5m.shp')
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
us_counties = gpd.read_file('data/cb_2018_us_county_500k.shp')


us_nation_4326 = us_nation.to_crs('EPSG:4326')
us_states_4326 = us_states.to_crs('EPSG:4326')
us_counties_4326 = us_counties.to_crs('EPSG:4326')





In [3]:
df_strategies['area_m2'].sum()


47033571818.09639

In [4]:
(df_strategies['acc_forest'] * df_strategies['area_m2']).sum()


12232831152747.332

In [5]:
df_weight

Unnamed: 0,lat,lon,time,acc_forest,cap_forest,final_forest,weighted_density_Forest,weighted_density_Agricultural,weighted_density_Vegetation,acc_agro,...,final_veg,Revenue_ratio,gmm_logp,predicted_label,predicted_prob,sample_type,gmm_density,LNCS_expect,Expectation_net_benefit,pv_potential_dens
0,25.295834,-80.287500,2020-01-01,811.84186,859.4,811.84186,0.998714,0.000367,0.000918,772.493469,...,844.84770,8.858564,44.174661,1,0.947286,prediction,2.806133e+19,811.854595,3268.029468,4079.884063
1,25.437500,-80.537500,2020-01-01,1391.45790,1436.6,1391.45790,0.829869,0.007830,0.162302,1102.152466,...,1138.81860,5.218384,51.390177,1,0.999397,prediction,5.399806e+21,1348.189022,2642.915217,3991.104240
2,25.437500,-80.495834,2020-01-01,1028.14750,1072.6,1028.14750,0.771417,0.022229,0.206354,750.145691,...,782.05790,7.232520,46.699314,1,0.963775,prediction,1.508036e+20,971.186274,3013.539266,3984.725540
3,25.445833,-80.454170,2020-01-01,1028.14750,1073.7,1028.14750,0.770923,0.007369,0.221708,756.315613,...,787.59076,7.213776,37.429584,0,0.330575,prediction,1.800785e+16,972.811005,3008.236626,3981.047631
4,25.445833,-80.404170,2020-01-01,1053.12240,1093.3,1053.12240,0.714253,0.099801,0.185946,728.004333,...,772.76874,7.274306,52.967203,1,0.999345,prediction,6.792866e+22,968.544647,3028.301877,3996.846524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70332,48.995834,-99.995834,2020-01-01,377.83080,405.3,377.83080,1.000000,0.000000,0.000000,324.812927,...,357.41420,15.812380,34.561048,1,0.989598,prediction,5.692829e+14,377.830800,3011.397640,3389.228440
70333,48.995834,-99.987500,2020-01-01,307.51660,332.1,307.51660,1.000000,0.000000,0.000000,262.768982,...,266.84415,19.403884,33.200528,1,0.985191,prediction,1.413612e+14,307.516600,3077.520875,3385.037475
70334,48.995834,-99.895836,2020-01-01,324.67432,347.0,324.67432,1.000000,0.000000,0.000000,273.972321,...,343.00000,18.239319,32.026889,1,0.965398,prediction,3.161397e+13,324.674320,3034.733949,3359.408269
70335,48.995834,-99.887500,2020-01-01,323.37466,349.2,323.37466,1.000000,0.000000,0.000000,273.972321,...,338.53870,18.325695,32.786314,1,0.969823,prediction,6.767585e+13,323.374660,3038.431607,3361.806267


In [6]:
df_weight.columns

Index(['lat', 'lon', 'time', 'acc_forest', 'cap_forest', 'final_forest',
       'weighted_density_Forest', 'weighted_density_Agricultural',
       'weighted_density_Vegetation', 'acc_agro', 'cap_agro', 'final_agro',
       'acc_veg', 'cap_veg', 'final_veg', 'Revenue_ratio', 'gmm_logp',
       'predicted_label', 'predicted_prob', 'sample_type', 'gmm_density',
       'LNCS_expect', 'Expectation_net_benefit', 'pv_potential_dens'],
      dtype='object')

In [7]:
# 合并数据
merged = pd.merge(
    df_weight[['lat', 'lon', 'pv_potential_dens', 'Expectation_net_benefit']],
    df_strategies[['lat', 'lon', 'area_m2']],
    on=['lat', 'lon'],
    how='inner'
)

# 计算总量
total_MgC = (merged['pv_potential_dens'] * merged['area_m2']).sum() / 10000
total_GtC = total_MgC / 1e9
total_GtCO2 = total_GtC * 3.667

# 计算标准误
# 先计算每个像素的MgC
pixel_MgC = merged['pv_potential_dens'] * merged['area_m2'] / 10000
# 标准误 = 标准差 / sqrt(N)
std_MgC = pixel_MgC.std(ddof=1)
se_MgC = std_MgC / (len(pixel_MgC) ** 0.5)
se_GtC = se_MgC / 1e9
se_GtCO2 = se_GtC * 3.667

print(f"Total Mg C: {total_MgC:.2e} ± {se_MgC:.2e} Mg C")
print(f"Total Gt C: {total_GtC:.4f} ± {se_GtC:.4f} Gt C")
print(f"Total Gt CO2: {total_GtCO2:.4f} ± {se_GtCO2:.4f} Gt CO2")


Total Mg C: 1.74e+10 ± 1.37e+02 Mg C
Total Gt C: 17.3651 ± 0.0000 Gt C
Total Gt CO2: 63.6777 ± 0.0000 Gt CO2


In [8]:
# Display the proportion of pixels where weighted_density_Forest is greater than 90%
proportion_above_90 = (df_weight['weighted_density_Forest'] > 0.7).mean()
print(f"Proportion of pixels with weighted_density_Forest > 70%: {proportion_above_90:.2%}")

Proportion of pixels with weighted_density_Forest > 70%: 66.69%


In [9]:
# 期望净效益的单位是 Mg（兆克），将其转化为 Gt C 和 Gt CO2
# 1. 先计算总的 Mg
total_Mg = ((merged['Expectation_net_benefit'] * merged['area_m2']).sum()) / 10000  # 先转为ha后求和，单位仍为Mg

# 2. Mg 转 Gt C（1 Gt = 1e9 Mg）
total_GtC = total_Mg / 1e9

# 3. Gt C 转 Gt CO2（1 t C = 3.667 t CO2）
total_GtCO2 = total_GtC * 3.667

print(f"Total Mg: {total_Mg:.2e} Mg")
print(f"Total Gt C: {total_GtC:.4f} Gt C")
print(f"Total Gt CO2: {total_GtCO2:.4f} Gt CO2")
 

Total Mg: 1.62e+10 Mg
Total Gt C: 16.2089 Gt C
Total Gt CO2: 59.4379 Gt CO2


In [10]:
opp=104.9934 - 101.1162 
opp

3.8771999999999878

In [11]:
total_pv_potential = (merged['pv_potential_dens'] * merged['area_m2']).sum() / 10000  
expected_net_benefit =((merged['Expectation_net_benefit'] * merged['area_m2']).sum()) / 10000

total_pv_potential/(total_pv_potential-expected_net_benefit)

15.018904045446806

In [12]:
df_economic

Unnamed: 0,lat,lon,pv_category,pv_model,pv_scenario,policy_category,rcp_category,net_npv_usd,analysis_year
0,33.595833,-117.587500,C1,REMIND 2.1,CEMICS_GDPgrowth_1p5,P2a,RCP2.6,-2.389517e+06,2020
1,33.629166,-117.579170,C1,REMIND 2.1,CEMICS_GDPgrowth_1p5,P2a,RCP2.6,-2.389462e+06,2020
2,33.720833,-117.737500,C1,REMIND 2.1,CEMICS_GDPgrowth_1p5,P2a,RCP2.6,-2.389492e+06,2020
3,33.904167,-117.820830,C1,REMIND 2.1,CEMICS_GDPgrowth_1p5,P2a,RCP2.6,-2.390356e+06,2020
4,33.920834,-117.620834,C1,REMIND 2.1,CEMICS_GDPgrowth_1p5,P2a,RCP2.6,-2.390268e+06,2020
...,...,...,...,...,...,...,...,...,...
74557215,47.262500,-68.379166,C7,TIAM-ECN 1.1,EN_NPi2100_COV,P1b,RCP8.5,2.464577e+06,2050
74557216,47.262500,-68.370834,C7,TIAM-ECN 1.1,EN_NPi2100_COV,P1b,RCP8.5,2.481259e+06,2050
74557217,47.270832,-68.387500,C7,TIAM-ECN 1.1,EN_NPi2100_COV,P1b,RCP8.5,2.431978e+06,2050
74557218,47.270832,-68.370834,C7,TIAM-ECN 1.1,EN_NPi2100_COV,P1b,RCP8.5,2.431043e+06,2050


In [13]:
economic_2050 = df_economic[df_economic['analysis_year'] == 2050]
avg_npv = economic_2050.groupby(['lon', 'lat'])['net_npv_usd'].mean().reset_index()
avg_npv.columns = ['lon', 'lat', 'avg_npv']

In [14]:
# Calculate the proportion of grid cells where avg_npv is greater than zero
proportion_gt_zero = (avg_npv['avg_npv'] > 0).mean()
print(f"Proportion of grid cells with avg_npv > 0: {proportion_gt_zero:.2%}")

Proportion of grid cells with avg_npv > 0: 40.95%


# Result

## 1、3E assessment of deploying PV on abandoned cropland 

这个文档应该严格按照文章的分析顺序，标记文章涉及具体的位置、具体数值、计算说明，例如

```
位置指针：3E assessment of deploying PV on abandoned cropland ，第一段第一句
撂荒地总量：
计算说明：输入state_analysis_df，对各个州的abandonment进行求和
```

In [27]:
import pandas as pd
import geopandas as gpd

'''
指针：Result 有关光伏站点大于with a capacity of 1 megawatt的总量统计
数据：光伏站点面积、撂荒地与现有光伏的倍数关系
计算：利用总量、排序即可
'''

# 读取光伏站点数据
uspv_data = gpd.read_file('D:/Photovoltaic_data/uspv_data/uspvdb_v3_0_20250430.shp')

# 计算总的光伏站点面积（公顷）
total_pv_area = uspv_data['p_area'].sum() / 10000  # 单位：公顷
total_pv_area_kha = total_pv_area / 1e3  # 单位：千公顷
total_pv_area_million_ha = total_pv_area / 1e6  # 单位：百万公顷

print(f"总光伏电站面积: {total_pv_area:.2f} 公顷 ({total_pv_area_kha:.2f} 千公顷, {total_pv_area_million_ha:.4f} 百万公顷)")

# 计算df_net_benefit面积与现有光伏的倍数关系
area_abandoned_million_ha = df_net_benefit['area_m2'].sum() / 10000 / 1e6  # 单位：百万公顷
area_abandoned_kha = df_net_benefit['area_m2'].sum() / 10000 / 1e3  # 单位：千公顷
times = area_abandoned_million_ha / total_pv_area_million_ha

print(f"df_net_benefit撂荒地面积: {area_abandoned_million_ha:.4f} 百万公顷 ({area_abandoned_kha:.2f} 千公顷)")
print(f"撂荒地面积/现有光伏面积（倍数）: {times:.2f} 倍")

总光伏电站面积: 225357.50 公顷 (225.36 千公顷, 0.2254 百万公顷)
df_net_benefit撂荒地面积: 4.7034 百万公顷 (4703.36 千公顷)
撂荒地面积/现有光伏面积（倍数）: 20.87 倍


In [28]:
import numpy as np 
import pandas as pd 

'''
指针：Result 第一段有关撂荒地的数据统计
数据：撂荒地总量、前3个州的统计量与百分比（Mha）为单位
计算：利用总量、排序即可
'''
state_analysis_df = pd.read_csv('data/US_data/US_analysis_reslut/state_level_analysis_with_wccd.csv')

# 耕地撂荒求和
top3_states = state_analysis_df.nlargest(3, 'abandoned_land_ha')[['State_name', 'abandoned_land_ha']]
total_abandoned = state_analysis_df['abandoned_land_ha'].sum()
top3_states['abandoned_land_million_ha'] = (top3_states['abandoned_land_ha'] / 1e6).round(2)
top3_states['abandoned_land_kha'] = (top3_states['abandoned_land_ha'] / 1e3).round(2)
top3_states['percent_of_total'] = (top3_states['abandoned_land_ha'] / total_abandoned * 100).round(2)
print(top3_states[['State_name', 'abandoned_land_million_ha', 'abandoned_land_kha', 'percent_of_total']])
print(f"The total area of abandoned land: {(total_abandoned/1e6).round(2)} million ha, {(total_abandoned/1e3).round(2)} kha")


   State_name  abandoned_land_million_ha  abandoned_land_kha  percent_of_total
0       Texas                       0.52              515.96             10.97
1    Illinois                       0.30              295.40              6.28
2  California                       0.27              272.49              5.79
The total area of abandoned land: 4.7 million ha, 4703.36 kha


In [47]:
import numpy as np 
import pandas as pd 

'''
指针：Result第一段有关政策期望产出
数据：主要相关各个维度的排序方案以及期望产出
计算：
'''


df_analysis=pd.read_csv(r'data\US_data\df_merged_data_for_analysis.csv')
df_economic = pd.read_csv('data/US_data/df_economic.csv')
df_analysis


# 环境维度的变量
env_data = df_analysis[['lat', 'lon', 'predicted_prob','gmm_density','sample_type']].copy()

# 碳减排维度的变量

emission_data = df_analysis[['lat', 'lon', 'Expectation_net_benefit','area_m2']].copy()
emission_data = emission_data.rename(columns={'Expectation_net_benefit': 'Expectation_net_benefit'})

# 经济维度的变量
economic_2050 = df_economic[df_economic['analysis_year'] == 2050]
avg_npv = economic_2050.groupby(['lat', 'lon'])['net_npv_usd'].mean().reset_index().rename(columns={'net_npv_usd': 'avg_npv'})

# 协同指数的变量
coordinate_data = df_analysis[['lat', 'lon', 'ccd_optimized']].copy()

# 合并所有变量（增加协同指数维度）
merged_data_for_plot = env_data.merge(emission_data, on=['lat', 'lon'], how='outer') \
                      .merge(avg_npv, on=['lat', 'lon'], how='outer') \
                      .merge(coordinate_data, on=['lat', 'lon'], how='outer')




In [55]:
# 计算各个维度优先区方案的平均效率、变异系数和置信区间
import numpy as np
import pandas as pd
from scipy import stats

def calculate_integration_scores_with_cv(merged_data_for_plot, confidence_level=0.95):
    """
    计算各个维度优先区方案的平均效率、变异系数和置信区间
    优化：统一列名，消除NaN，添加95%置信区间
    新增：如果是avg_npv变量，统计不同排序下的最小最大、累积曲线之间分位数最大差距
    """
    
    # ===== 变量定义 =====
    variables = [
        ('predicted_prob', 'Environmental Suitability', 'Score', 'Environmental'),
        ('Expectation_net_benefit', 'Emission Mitigation', 'Gt', 'Emission'),
        ('avg_npv', 'Economic Feasibility', 'USD', 'Economic')
    ]
    
    area_values = merged_data_for_plot['area_m2'].values / 10000  # ha
    total_area = area_values.sum()  # 总面积

    # ===== 积分计算工具函数 =====
    def percentage_to_01(pct): 
        return pct / 100.0
    
    def calculate_integral(x, y):
        if len(x) < 2: 
            return 0.0
        if x[0] > x[-1]:
            x = x[::-1]
            y = y[::-1]
        return np.trapz(y, x)
    
    # ===== 计算积分 =====
    all_integrals = {}
    economic_extra_stats = {}

    for variable_name, variable_label, unit, short_name in variables:
        dens = merged_data_for_plot[variable_name].values.astype(float)
        if variable_name == 'Expectation_net_benefit':
            dens = dens / 0.27
        total_vals = dens * area_values

        integrals = {}
        # 若是avg_npv变量，收集排序用到的数据
        economic_minmax = {}
        economic_cumcurves = {}

        # 1. 目标曲线（自身排序, 按密度排序）
        idx_sort = np.argsort(dens)[::-1]  # 按密度排序
        cum_ben = np.cumsum(total_vals[idx_sort])  # 累积总效益
        cum_pct = np.arange(1, len(cum_ben)+1) / len(cum_ben) * 100
        cum_ben = np.concatenate([[0], cum_ben])
        cum_pct = np.concatenate([[0], cum_pct])
        x01 = percentage_to_01(cum_pct)
        integral_main = calculate_integral(x01, cum_ben) / total_area
        integrals[short_name] = integral_main

        if variable_name == 'avg_npv':
            min_self = cum_ben.min()
            max_self = cum_ben.max()
            economic_minmax[short_name] = (min_self, max_self)
            economic_cumcurves[short_name] = (cum_pct.copy(), cum_ben.copy())
            print(f"[打印] variable_name: {variable_name}, 排序方法: {short_name} (自身), cum_ben min: {min_self}, max: {max_self}")
        # 2. 交叉曲线（按其他维度的“密度”排序）
        for other_var_name, other_var_label, _, other_short_name in variables:
            if other_var_name == variable_name:
                continue
            other_dens = merged_data_for_plot[other_var_name].values.astype(float)
            if other_var_name == 'Expectation_net_benefit':
                other_dens = other_dens / 0.27

            idx_cross = np.argsort(other_dens)[::-1]  # 按密度排序
            cross_cum_ben = np.cumsum(total_vals[idx_cross])  # 累积仍用自身total_vals
            cross_cum_pct = np.arange(1, len(cross_cum_ben)+1) / len(cross_cum_ben) * 100
            cross_cum_ben = np.concatenate([[0], cross_cum_ben])
            cross_cum_pct = np.concatenate([[0], cross_cum_pct])
            x01 = percentage_to_01(cross_cum_pct)
            integral_cross = calculate_integral(x01, cross_cum_ben) / total_area
            integrals[other_short_name] = integral_cross

            if variable_name == 'avg_npv':
                min_cross = cross_cum_ben.min()
                max_cross = cross_cum_ben.max()
                min_cross_idx = np.argmin(cross_cum_ben)
                max_cross_idx = np.argmax(cross_cum_ben)
                min_cross_pct = cross_cum_pct[min_cross_idx]
                max_cross_pct = cross_cum_pct[max_cross_idx]
                economic_minmax[other_short_name] = (min_cross, max_cross)
                economic_cumcurves[other_short_name] = (cross_cum_pct.copy(), cross_cum_ben.copy())
                print(f"[打印] variable_name: {variable_name}, 排序方法: {other_short_name}, cross_cum_ben min: {min_cross} (分位数: {min_cross_pct}%), max: {max_cross} (分位数: {max_cross_pct}%)")
            all_integrals[variable_name] = integrals

        # 若为avg_npv，收集额外统计量
        if variable_name == 'avg_npv':
            # 统计不同排序下的min/max
            economic_extra_stats['minmax'] = economic_minmax

            # 统计不同排序下分位点累积最大差距
            # 这里只比较Environmental和Economic排序
            if ('Economic' in economic_cumcurves) and ('Environmental' in economic_cumcurves):
                pct_e, ben_e = economic_cumcurves['Economic']
                pct_env, ben_env = economic_cumcurves['Environmental']
                # 取二者交集的分位点
                pcts_shared = np.linspace(0, 100, min(len(pct_e), len(pct_env)))
                # 插值到相同分位点
                ben_e_interp = np.interp(pcts_shared, pct_e, ben_e)
                ben_env_interp = np.interp(pcts_shared, pct_env, ben_env)
                abs_diff_arr = np.abs(ben_e_interp - ben_env_interp)
                idx_maxdiff = np.argmax(abs_diff_arr)
                max_diff = abs_diff_arr[idx_maxdiff]
                pct_at_max = pcts_shared[idx_maxdiff]
                val_e = ben_e_interp[idx_maxdiff]
                val_env = ben_env_interp[idx_maxdiff]
                economic_extra_stats['max_cum_curve_diff'] = {
                    'max_abs_diff': max_diff,
                    'percentile': pct_at_max,
                    'economic_at_pct': val_e,
                    'environmental_at_pct': val_env,
                }
                print('【Economic/Environmental 累积分位曲线最大差统计】')
                print('max_abs_diff:', max_diff)
                print('percentile:', pct_at_max)
                print('economic_at_pct:', val_e)
                print('environmental_at_pct:', val_env)
            else:
                economic_extra_stats['max_cum_curve_diff'] = {}
                print('【Economic/Environmental 累积分位曲线最大差统计】数据不足')
        # end for each variable

    # ===== 提取结果并计算统计量 =====
    sort_methods = ['Environmental', 'Emission', 'Economic']
    results = []
    
    alpha = 1 - confidence_level
    
    for variable_name, variable_label, unit, short_name in variables:
        integrals = all_integrals[variable_name]
        
        # 按统一的顺序提取值
        values = [integrals[method] for method in sort_methods]
        
        # 判断是否为经济维度
        is_economic = variable_label.strip().startswith("Economic Feasibility")
        if is_economic:
            values = [val / 1000 for val in values]  # 转换为K thousand
        
        # 计算统计量
        values_array = np.array(values)
        n = len(values_array)
        mean_val = np.mean(values_array)
        std_val = np.std(values_array, ddof=1)
        cv = (std_val / mean_val) if mean_val != 0 else 0
        
        # 计算95%置信区间
        sem = std_val / np.sqrt(n)
        df_freedom = n - 1
        t_critical = stats.t.ppf(1 - alpha/2, df_freedom)
        margin_of_error = t_critical * sem
        ci_lower = mean_val - margin_of_error
        ci_upper = mean_val + margin_of_error
        relative_error = (margin_of_error / abs(mean_val) * 100) if mean_val != 0 else np.inf
        
        # 构建结果行
        result_row = {
            'Target': variable_label,
            'Variable_Name': variable_name,
            'Unit': unit,
        }
        
        # 添加各个排序方案的积分值（统一列名）
        for method, val in zip(sort_methods, values):
            result_row[f'Sort_by_{method}'] = val
        
        # 添加统计量
        result_row['Mean'] = mean_val
        result_row['±Error'] = margin_of_error
        result_row['Mean±Error'] = f"{mean_val:.6e} ± {margin_of_error:.6e}" if not is_economic else f"{mean_val:.2f} ± {margin_of_error:.2f}"
        result_row['CI_Lower'] = ci_lower
        result_row['CI_Upper'] = ci_upper
        result_row['Std'] = std_val
        result_row['CV'] = cv
        result_row['Relative_Error_%'] = relative_error
        result_row['Sample_Size'] = n
        
        results.append(result_row)
    
    # 转换为DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

# 执行分析
results_df = calculate_integration_scores_with_cv(merged_data_for_plot, confidence_level=0.95)

# 显示完整的DataFrame（无NaN）
display(results_df)

[打印] variable_name: avg_npv, 排序方法: Economic (自身), cum_ben min: 0.0, max: 1462032284597.9695
[打印] variable_name: avg_npv, 排序方法: Environmental, cross_cum_ben min: -206169885325.66077 (分位数: 37.12270924264612%), max: 339628562491.8943 (分位数: 100.0%)
[打印] variable_name: avg_npv, 排序方法: Emission, cross_cum_ben min: 0.0 (分位数: 0.0%), max: 1072751265065.2577 (分位数: 35.36261142784026%)
【Economic/Environmental 累积分位曲线最大差统计】
max_abs_diff: 1663865912884.7397
percentile: 39.04488391600438
economic_at_pct: 1460162041078.166
environmental_at_pct: -203703871806.57364


Unnamed: 0,Target,Variable_Name,Unit,Sort_by_Environmental,Sort_by_Emission,Sort_by_Economic,Mean,±Error,Mean±Error,CI_Lower,CI_Upper,Std,CV,Relative_Error_%,Sample_Size
0,Environmental Suitability,predicted_prob,Score,0.484335,0.402025,0.428236,0.438199,0.104457,4.381987e-01 ± 1.044571e-01,0.333742,0.542656,0.04205,0.09596,23.837846,3
1,Emission Mitigation,Expectation_net_benefit,Gt,6189.636243,6922.194054,6867.905898,6659.912065,1013.961335,6.659912e+03 ± 1.013961e+03,5645.95073,7673.8734,408.174366,0.061288,15.224846,3
2,Economic Feasibility,avg_npv,USD,-9.925117,170.48259,226.842512,129.133328,307.241227,129.13 ± 307.24,-178.107899,436.374555,123.681238,0.957779,237.925585,3


In [35]:
# 计算各排序方案的期望值

import numpy as np 
import pandas as pd 
from scipy import stats

def calculate_integration_scores_with_cv_and_ccd(merged_data_for_plot, confidence_level=0.95):
    """
    计算各个维度优先区方案的平均效率、变异系数和置信区间
    增加基于 ccd_optimized 的排序统计
    优化：统一列名，消除NaN
    """
    
    # ===== 变量定义 =====
    variables = [
        ('predicted_prob', 'Environmental Suitability', 'Score', 'Environmental'),
        ('Expectation_net_benefit', 'Emission Mitigation', 't', 'Emission'),
        ('avg_npv', 'Economic Feasibility', 'USD', 'Economic'),
        ('ccd_optimized', 'Coordination Index', 'CCD', 'Coordination')
    ]
    
    area_values = merged_data_for_plot['area_m2'].values / 10000  # ha
    total_area = area_values.sum()
    
    # ===== 积分计算工具函数 =====
    def percentage_to_01(pct): 
        return pct / 100.0
    
    def calculate_integral(x, y):
        if len(x) < 2: 
            return 0.0
        if x[0] > x[-1]:
            x = x[::-1]
            y = y[::-1]
        return np.trapz(y, x)
    
    # ===== 计算积分（扩展到四个维度）=====
    all_integrals = {}
    
    for variable_name, variable_label, unit, short_name in variables:
        dens = merged_data_for_plot[variable_name].values.astype(float)

        # 特殊处理：Emission需要除以0.27转换单位
        if variable_name == 'Expectation_net_benefit':
            dens = dens / 0.27

        total_vals = dens * area_values
        integrals = {}

        # 1. 目标曲线（自身排序，按密度排序，不用total_vals排序）
        idx_sort = np.argsort(dens)[::-1]  # 参照“密度排序”
        cum_ben = np.cumsum(total_vals[idx_sort])
        cum_pct = np.arange(1, len(cum_ben) + 1) / len(cum_ben) * 100
        cum_ben = np.concatenate([[0], cum_ben])
        cum_pct = np.concatenate([[0], cum_pct])
        x01 = percentage_to_01(cum_pct)
        integral_main = calculate_integral(x01, cum_ben) / total_area
        integrals[short_name] = integral_main

        # 2. 交叉曲线（按其他维度的密度排序）
        for other_var_name, other_var_label, _, other_short_name in variables:
            if other_var_name == variable_name:
                continue

            other_dens = merged_data_for_plot[other_var_name].values.astype(float)
            if other_var_name == 'Expectation_net_benefit':
                other_dens = other_dens / 0.27

            idx_cross = np.argsort(other_dens)[::-1]  # 按密度排序
            cross_cum_ben = np.cumsum(total_vals[idx_cross])
            cross_cum_pct = np.arange(1, len(cross_cum_ben) + 1) / len(cross_cum_ben) * 100
            cross_cum_ben = np.concatenate([[0], cross_cum_ben])
            cross_cum_pct = np.concatenate([[0], cross_cum_pct])
            x01 = percentage_to_01(cross_cum_pct)
            integral_cross = calculate_integral(x01, cross_cum_ben) / total_area
            integrals[other_short_name] = integral_cross

        all_integrals[variable_name] = integrals
    
    # ===== 提取结果并计算统计量 =====
    # 统一的列名顺序
    sort_methods = ['Environmental', 'Emission', 'Economic', 'Coordination']
    results = []
    
    alpha = 1 - confidence_level
    
    for variable_name, variable_label, unit, short_name in variables:
        integrals = all_integrals[variable_name]
        
        # 按统一的顺序提取值（现在所有值都在integrals字典中，键名一致）
        values = [integrals[method] for method in sort_methods]
        
        # 判断是否为经济维度
        is_economic = variable_label.strip().startswith("Economic Feasibility")
        if is_economic:
            values = [val / 1000 for val in values]  # 转换为K thousand
        
        # 计算统计量
        values_array = np.array(values)
        n = len(values_array)
        mean_val = np.mean(values_array)
        std_val = np.std(values_array, ddof=1)
        cv = (std_val / mean_val) if mean_val != 0 else 0
        
        # 计算95%置信区间
        sem = std_val / np.sqrt(n)
        df_freedom = n - 1
        t_critical = stats.t.ppf(1 - alpha/2, df_freedom)
        margin_of_error = t_critical * sem
        ci_lower = mean_val - margin_of_error
        ci_upper = mean_val + margin_of_error
        relative_error = (margin_of_error / abs(mean_val) * 100) if mean_val != 0 else np.inf
        
        # 构建结果行
        result_row = {
            'Target': variable_label,
            'Variable_Name': variable_name,
            'Unit': unit,
        }
        
        # 添加各个排序方案的积分值（统一列名）
        for method, val in zip(sort_methods, values):
            result_row[f'Sort_by_{method}'] = val
        
        # 添加统计量
        result_row['Mean'] = mean_val
        result_row['±Error'] = margin_of_error
        result_row['Mean±Error'] = f"{mean_val:.6e} ± {margin_of_error:.6e}" if not is_economic else f"{mean_val:.2f} ± {margin_of_error:.2f}"
        result_row['CI_Lower'] = ci_lower
        result_row['CI_Upper'] = ci_upper
        result_row['Std'] = std_val
        result_row['CV'] = cv
        result_row['Relative_Error_%'] = relative_error
        result_row['Sample_Size'] = n
        
        results.append(result_row)
    
    # 转换为DataFrame
    results_df = pd.DataFrame(results)
    
    # ===== 格式化输出 =====
    print("\n" + "="*100)
    print("各目标维度下四种排序方案的积分效率及统计量（含95%置信区间）")
    print("="*100)
    
    for idx, row in results_df.iterrows():
        print(f"\n【{row['Target']}】")
        print(f"  变量名: {row['Variable_Name']:<30} 单位: {row['Unit']}")
        
        is_economic = str(row['Target']).strip().startswith("Economic Feasibility")
        
        # 输出四种排序方案的积分值
        for method in sort_methods:
            col_name = f'Sort_by_{method}'
            if is_economic:
                print(f"    按{method}排序{' '*(20-len(method))}: {row[col_name]:>15.2f} K USD")
            else:
                print(f"    按{method}排序{' '*(20-len(method))}: {row[col_name]:>15.6e}")
        
        # 输出统计量
        print(f"\n  统计量:")
        if is_economic:
            print(f"    {'均值 ± 95%CI':<25s}: {row['Mean']:.2f} ± {row['±Error']:.2f} K USD")
            print(f"    {'置信区间':<25s}: [{row['CI_Lower']:.2f}, {row['CI_Upper']:.2f}] K USD")
            print(f"    {'标准差':<25s}: {row['Std']:.2f} K USD")
        else:
            print(f"    {'均值 ± 95%CI':<25s}: {row['Mean']:.6e} ± {row['±Error']:.6e}")
            print(f"    {'置信区间':<25s}: [{row['CI_Lower']:.6e}, {row['CI_Upper']:.6e}]")
            print(f"    {'标准差':<25s}: {row['Std']:.6e}")
        
        print(f"    {'变异系数(CV)':<25s}: {row['CV']:.4f}")
        print(f"    {'相对误差(%)':<25s}: {row['Relative_Error_%']:.2f}%")
        print(f"    {'样本量(排序方案数)':<25s}: {row['Sample_Size']}")
    
    print("\n" + "="*100)
    
    # ===== 简洁的论文格式输出 =====
    print("\n论文用简洁格式（可直接复制）:")
    print("="*100)
    for idx, row in results_df.iterrows():
        print(f"{row['Target']}: {row['Mean±Error']} (CV={row['CV']:.4f})")
    print("="*100)
    
    return results_df

# 执行分析
results_df = calculate_integration_scores_with_cv_and_ccd(merged_data_for_plot, confidence_level=0.95)

# 显示完整的DataFrame（无NaN）
print("\n完整结果表（已优化，无NaN值）:")
display(results_df)


各目标维度下四种排序方案的积分效率及统计量（含95%置信区间）

【Environmental Suitability】
  变量名: predicted_prob                 单位: Score
    按Environmental排序       :    4.843351e-01
    按Emission排序            :    4.020252e-01
    按Economic排序            :    4.282359e-01
    按Coordination排序        :    4.654277e-01

  统计量:
    均值 ± 95%CI               : 4.450060e-01 ± 5.877058e-02
    置信区间                     : [3.862354e-01, 5.037765e-01]
    标准差                      : 3.693422e-02
    变异系数(CV)                 : 0.0830
    相对误差(%)                  : 13.21%
    样本量(排序方案数)               : 4

【Emission Mitigation】
  变量名: Expectation_net_benefit        单位: t
    按Environmental排序       :    6.189636e+03
    按Emission排序            :    6.922194e+03
    按Economic排序            :    6.867906e+03
    按Coordination排序        :    6.830002e+03

  统计量:
    均值 ± 95%CI               : 6.702434e+03 ± 5.473056e+02
    置信区间                     : [6.155129e+03, 7.249740e+03]
    标准差                      : 3.439528e+02
    变异系数(CV)

Unnamed: 0,Target,Variable_Name,Unit,Sort_by_Environmental,Sort_by_Emission,Sort_by_Economic,Sort_by_Coordination,Mean,±Error,Mean±Error,CI_Lower,CI_Upper,Std,CV,Relative_Error_%,Sample_Size
0,Environmental Suitability,predicted_prob,Score,0.484335,0.402025,0.428236,0.465428,0.445006,0.058771,4.450060e-01 ± 5.877058e-02,0.386235,0.503777,0.036934,0.082997,13.206696,4
1,Emission Mitigation,Expectation_net_benefit,t,6189.636243,6922.194054,6867.905898,6830.001731,6702.434482,547.30559,6.702434e+03 ± 5.473056e+02,6155.128892,7249.740072,343.952757,0.051318,8.165773,4
2,Economic Feasibility,avg_npv,USD,-9.925117,170.48259,226.842512,179.124669,141.631163,165.539342,141.63 ± 165.54,-23.908179,307.170506,104.032764,0.734533,116.880592,4
3,Coordination Index,ccd_optimized,CCD,0.424915,0.441564,0.450793,0.454591,0.442966,0.021034,4.429658e-01 ± 2.103365e-02,0.421932,0.463999,0.013219,0.029841,4.748369,4


In [None]:
# 仅计算单方案的期望值
import numpy as np
import pandas as pd
from scipy import stats

def analyze_coordination_performance_significance(results_df):
    """
    分析Sort_by_Coordination在其他目标维度下的性能提升及显著性
    仅分析Environmental, Emission, Economic三个目标维度
    """
    
    print("\n" + "="*100)
    print("Coordination排序方案在其他目标维度下的性能提升及显著性分析")
    print("="*100)
    
    # 定义排序方案（不包括Coordination）
    other_sort_methods = ['Environmental', 'Emission', 'Economic']
    
    # 存储所有维度的性能提升分析
    dimension_analyses = []
    
    # 只分析前3个目标维度（排除Coordination Index本身）
    for idx, row in results_df.iterrows():
        target = row['Target']
        variable_name = row['Variable_Name']
        
        # 跳过Coordination Index作为目标的情况
        if 'Coordination' in target:
            print(f"\n跳过 {target}（Coordination作为目标本身，不需要分析）")
            continue
        
        print(f"\n{'='*100}")
        print(f"【{target}】")
        print(f"{'='*100}")
        
        # 获取Coordination的积分值
        coord_value = row['Sort_by_Coordination']
        
        # 获取其他排序方案的积分值
        other_values = {}
        for method in other_sort_methods:
            col_name = f'Sort_by_{method}'
            other_values[method] = row[col_name]
        
        # 计算性能提升
        improvements = {}
        improvement_details = []
        for method, other_value in other_values.items():
            if abs(other_value) > 1e-10:  # 避免除零
                # 性能提升 = (Coordination - 其他) / |其他| * 100%
                improvement = (coord_value - other_value) / abs(other_value) * 100
                improvements[method] = improvement
                improvement_details.append(f"{improvement:+.2f}%")
                print(f"  Coordination vs {method}: {improvement:+.2f}%")
            else:
                improvements[method] = 0
                improvement_details.append("N/A")
                print(f"  Coordination vs {method}: 无法计算（分母接近0）")
        
        # 计算平均性能提升和标准误
        valid_improvements = [imp for imp in improvements.values() if imp != 0]
        
        if len(valid_improvements) > 0:
            mean_improvement = np.mean(valid_improvements)
            std_improvement = np.std(valid_improvements, ddof=1) if len(valid_improvements) > 1 else 0
            std_error = std_improvement / np.sqrt(len(valid_improvements)) if len(valid_improvements) > 1 else 0
            
            # 计算95%置信区间
            df_freedom = len(valid_improvements) - 1 if len(valid_improvements) > 1 else 0
            if df_freedom > 0:
                t_critical = stats.t.ppf(0.975, df_freedom)  # 95% CI
                margin_of_error = t_critical * std_error
                ci_lower = mean_improvement - margin_of_error
                ci_upper = mean_improvement + margin_of_error
                
                # t检验：检验平均性能提升是否显著大于0
                if std_error > 0:
                    t_statistic = mean_improvement / std_error
                    p_value = 1 - stats.t.cdf(t_statistic, df_freedom)  # 单侧检验
                    
                    # 判断显著性
                    if p_value < 0.001:
                        significance = "***"
                        sig_desc = "极显著"
                    elif p_value < 0.01:
                        significance = "**"
                        sig_desc = "高度显著"
                    elif p_value < 0.05:
                        significance = "*"
                        sig_desc = "显著"
                    else:
                        significance = "ns"
                        sig_desc = "不显著"
                else:
                    t_statistic = 0
                    p_value = 1.0
                    significance = "ns"
                    sig_desc = "无法检验"
            else:
                t_critical = 0
                margin_of_error = 0
                ci_lower = mean_improvement
                ci_upper = mean_improvement
                t_statistic = 0
                p_value = 1.0
                significance = "ns"
                sig_desc = "样本不足"
            
            # 输出统计量
            print(f"\n  统计量:")
            print(f"    平均性能提升: {mean_improvement:+.2f}%")
            print(f"    标准差: {std_improvement:.2f}%")
            print(f"    标准误: ±{std_error:.2f}%")
            print(f"    95%置信区间: [{ci_lower:.2f}%, {ci_upper:.2f}%]")
            print(f"    t统计量: {t_statistic:.3f}")
            print(f"    p值（单侧）: {p_value:.4f}")
            print(f"    显著性: {sig_desc} {significance}")
            
            # 存储结果
            dimension_analyses.append({
                'Target': target,
                'Variable_Name': variable_name,
                'Coord_Value': coord_value,
                'Mean_Improvement_%': mean_improvement,
                'Std_Error_%': std_error,
                'CI_95_Lower_%': ci_lower,
                'CI_95_Upper_%': ci_upper,
                't_statistic': t_statistic,
                'p_value': p_value,
                'Significance': significance,
                'Significance_Desc': sig_desc,
                'Sample_Size': len(valid_improvements),
                'vs_Environmental_%': improvements.get('Environmental', 0),
                'vs_Emission_%': improvements.get('Emission', 0),
                'vs_Economic_%': improvements.get('Economic', 0)
            })
        else:
            print(f"\n  无法计算性能提升（所有对比值都为0）")
    
    # 转换为DataFrame
    coord_analysis_df = pd.DataFrame(dimension_analyses)
    
    # ===== 总体性能提升分析（使用误差传播理论）=====
    print("\n" + "="*100)
    print("总体性能提升分析（跨3个目标维度，使用误差传播理论）")
    print("="*100)
    
    if len(dimension_analyses) > 0:
        # 计算总体均值
        n_dimensions = len(dimension_analyses)
        overall_mean = np.mean([a['Mean_Improvement_%'] for a in dimension_analyses])
        
        # 使用误差传播理论计算总体标准误
        # 总体标准误 = sqrt(Σ(se_i²) / n²)
        squared_std_errors = [a['Std_Error_%']**2 for a in dimension_analyses]
        overall_variance = np.sum(squared_std_errors) / (n_dimensions**2)
        overall_std_error = np.sqrt(overall_variance)
        
        print(f"\n误差传播计算:")
        print(f"  目标维度数量: {n_dimensions}")
        std_errors_str = ', '.join([f"{a['Std_Error_%']:.2f}%" for a in dimension_analyses])
        print(f"  各维度标准误: [{std_errors_str}]")
        print(f"  标准误平方和: {np.sum(squared_std_errors):.4f}")
        print(f"  总体方差: {overall_variance:.6f}")
        print(f"  总体标准误: {overall_std_error:.2f}%")
        
        # 计算95%置信区间
        overall_ci_95 = 1.96 * overall_std_error
        overall_cv = (overall_std_error / abs(overall_mean) * 100) if overall_mean != 0 else 0
        
        # 总体显著性检验
        if overall_std_error > 0:
            overall_z_stat = overall_mean / overall_std_error
            # 使用正态分布（样本量足够或误差传播后）
            overall_p_value = 1 - stats.norm.cdf(overall_z_stat)
            
            if overall_p_value < 0.001:
                overall_significance = "***"
                overall_sig_desc = "极显著"
            elif overall_p_value < 0.01:
                overall_significance = "**"
                overall_sig_desc = "高度显著"
            elif overall_p_value < 0.05:
                overall_significance = "*"
                overall_sig_desc = "显著"
            else:
                overall_significance = "ns"
                overall_sig_desc = "不显著"
        else:
            overall_z_stat = 0
            overall_p_value = 1.0
            overall_significance = "ns"
            overall_sig_desc = "无法检验"
        
        # 输出总体结果
        print(f"\n总体性能提升结果:")
        print(f"  均值: {overall_mean:+.2f}%")
        print(f"  标准误: ±{overall_std_error:.2f}%")
        print(f"  95%置信区间: [{overall_mean - overall_ci_95:.2f}%, {overall_mean + overall_ci_95:.2f}%]")
        print(f"  变异系数: {overall_cv:.2f}%")
        print(f"  Z统计量: {overall_z_stat:.3f}")
        print(f"  p值（单侧）: {overall_p_value:.4f}")
        print(f"  显著性: {overall_sig_desc} {overall_significance}")
        
        # 结论
        print(f"\n结论:")
        if overall_p_value < 0.05:
            print(f"  ✓ Coordination排序相对于单目标排序有显著改进")
            print(f"    平均改进幅度: {overall_mean:+.2f}% ± {overall_std_error:.2f}%")
        else:
            print(f"  ✗ Coordination排序相对于单目标排序无显著改进（p={overall_p_value:.4f}）")
        
        # 与错误方法的对比
        print(f"\n方法对比:")
        wrong_std = np.std([a['Mean_Improvement_%'] for a in dimension_analyses], ddof=1) if n_dimensions > 1 else 0
        wrong_std_error = wrong_std / np.sqrt(n_dimensions) if n_dimensions > 1 else 0
        
        print(f"  错误方法（忽略误差传播）:")
        print(f"    标准误: ±{wrong_std_error:.2f}%")
        print(f"    95%置信区间: [{overall_mean - 1.96*wrong_std_error:.2f}%, {overall_mean + 1.96*wrong_std_error:.2f}%]")
        
        print(f"  正确方法（误差传播）:")
        print(f"    标准误: ±{overall_std_error:.2f}%")
        print(f"    95%置信区间: [{overall_mean - overall_ci_95:.2f}%, {overall_mean + overall_ci_95:.2f}%]")
        
        # 误差传播的影响
        if wrong_std_error > 0:
            error_ratio = overall_std_error / wrong_std_error
            print(f"\n误差传播影响:")
            print(f"  标准误比率（传播/简单）: {error_ratio:.2f}")
        
        # 创建总体汇总DataFrame
        overall_summary = pd.DataFrame([{
            'Analysis': 'OVERALL (3 Dimensions)',
            'Mean_Improvement_%': overall_mean,
            'Std_Error_%': overall_std_error,
            'CI_95': f'[{overall_mean - overall_ci_95:.2f}%, {overall_mean + overall_ci_95:.2f}%]',
            'Z_statistic': overall_z_stat,
            'p_value': overall_p_value,
            'Significance': overall_significance,
            'Result': overall_sig_desc,
            'n_dimensions': n_dimensions
        }])
        
    else:
        print("\n无有效数据用于总体分析")
        overall_summary = None
    
    # ===== 各维度性能提升排序 =====
    print("\n" + "="*100)
    print("各目标维度性能提升排序:")
    print("="*100)
    
    if len(coord_analysis_df) > 0:
        sorted_dims = coord_analysis_df.sort_values('Mean_Improvement_%', ascending=False)
        for i, (idx, row) in enumerate(sorted_dims.iterrows(), 1):
            print(f"  {i}. {row['Target']}: {row['Mean_Improvement_%']:+.2f}% ± {row['Std_Error_%']:.2f}% ({row['Significance_Desc']}, {row['Significance']})")
    
    print("\n" + "="*100)
    
    return coord_analysis_df, overall_summary

# 执行分析
coord_analysis_df, overall_summary = analyze_coordination_performance_significance(results_df)

# 显示详细结果表
print("\n完整的Coordination性能分析表（仅Environmental, Emission, Economic）:")
display(coord_analysis_df)

if overall_summary is not None:
    print("\n总体性能提升汇总:")
    display(overall_summary)


Coordination排序方案在其他目标维度下的性能提升及显著性分析

【Environmental Suitability】
  Coordination vs Environmental: -3.90%
  Coordination vs Emission: +15.77%
  Coordination vs Economic: +8.68%

  统计量:
    平均性能提升: +6.85%
    标准差: 9.96%
    标准误: ±5.75%
    95%置信区间: [-17.90%, 31.60%]
    t统计量: 1.191
    p值（单侧）: 0.1780
    显著性: 不显著 ns

【Emission Mitigation】
  Coordination vs Environmental: +10.35%
  Coordination vs Emission: -1.33%
  Coordination vs Economic: -0.55%

  统计量:
    平均性能提升: +2.82%
    标准差: 6.53%
    标准误: ±3.77%
    95%置信区间: [-13.40%, 19.04%]
    t统计量: 0.748
    p值（单侧）: 0.2661
    显著性: 不显著 ns

【Economic Feasibility】
  Coordination vs Environmental: +1904.76%
  Coordination vs Emission: +5.07%
  Coordination vs Economic: -21.04%

  统计量:
    平均性能提升: +629.60%
    标准差: 1104.40%
    标准误: ±637.63%
    95%置信区间: [-2113.89%, 3373.08%]
    t统计量: 0.987
    p值（单侧）: 0.2138
    显著性: 不显著 ns

跳过 Coordination Index（Coordination作为目标本身，不需要分析）

总体性能提升分析（跨3个目标维度，使用误差传播理论）

误差传播计算:
  目标维度数量: 3
  各维度标准误: [5.75%, 3.77

Unnamed: 0,Target,Variable_Name,Coord_Value,Mean_Improvement_%,Std_Error_%,CI_95_Lower_%,CI_95_Upper_%,t_statistic,p_value,Significance,Significance_Desc,Sample_Size,vs_Environmental_%,vs_Emission_%,vs_Economic_%
0,Environmental Suitability,predicted_prob,0.465428,6.850626,5.75313,-17.903096,31.604348,1.190765,0.177956,ns,不显著,3,-3.903782,15.770788,8.684872
1,Emission Mitigation,Expectation_net_benefit,6830.001731,2.820677,3.769277,-13.397212,19.038565,0.748334,0.266146,ns,不显著,3,10.345769,-1.331837,-0.551903
2,Economic Feasibility,avg_npv,179.124669,629.598285,637.62606,-2113.885224,3373.081794,0.98741,0.213763,ns,不显著,3,1904.76134,5.069186,-21.03567



总体性能提升汇总:


Unnamed: 0,Analysis,Mean_Improvement_%,Std_Error_%,CI_95,Z_statistic,p_value,Significance,Result,n_dimensions
0,OVERALL (3 Dimensions),213.089863,212.554385,"[-203.52%, 629.70%]",1.002519,0.158046,ns,不显著,3


In [None]:
'''
主要逻辑：描述在740GW的能源转型目标下，美国单准则的累积效应差距，重点读取适宜性和减排的百分比
指针变量：指向Result第四部分
'''

# 740GW目标下的效率差异
import numpy as np
import pandas as pd

def analyze_pv_capacity_target_efficiency(merged_data_for_plot, target_pv_gw=740):
    """
    分析在给定PV容量目标下，不同排序方式的效率和效益
    仅分析Environmental, Emission, Economic三种排序方式
    """
    
    print("\n" + "="*100)
    print(f"PV容量目标分析：建设 {target_pv_gw} GW")
    print("="*100)
    
    # ===== 1. 计算每个像素的PV容量（kW） =====
    pv_capacity_kw = merged_data_for_plot['area_m2'].values * 0.17
    total_pv_capacity_gw = pv_capacity_kw.sum() / 1e6
    
    print(f"\n【总体PV潜力】")
    print(f"  总像素数: {len(pv_capacity_kw):,}")
    print(f"  总PV潜力: {total_pv_capacity_gw:.2f} GW")
    print(f"  目标容量: {target_pv_gw:.2f} GW")
    print(f"  目标占总潜力比: {target_pv_gw/total_pv_capacity_gw*100:.2f}%")
    
    if target_pv_gw > total_pv_capacity_gw:
        print(f"\n⚠️ 警告：目标容量超过总潜力")
        return None
    
    # ===== 2. 定义排序方案和目标维度 =====
    sort_methods = {
        'Environmental': 'predicted_prob',
        'Emission': 'Expectation_net_benefit',
        'Economic': 'avg_npv'
    }
    
    target_dimensions = {
        'Environmental': ('predicted_prob', 'Score'),
        'Emission': ('Expectation_net_benefit', 'Mg C'),
        'Economic': ('avg_npv', 'USD')
    }
    
    # 准备面积数据
    area_values = merged_data_for_plot['area_m2'].values / 10000  # 转换为公顷
    target_pv_kw = target_pv_gw * 1e6
    
    # ===== 3. 对每种排序方式进行分析 =====
    results = []
    
    for sort_name, sort_var in sort_methods.items():
        print(f"\n{'='*100}")
        print(f"【按 {sort_name} 排序】")
        print(f"{'='*100}")
        
        # 获取排序变量（密度值）
        sort_values = merged_data_for_plot[sort_var].values.astype(float)
        
        # 对于 Expectation_net_benefit，如果需要统一单位，除以 0.27
        if sort_var == 'Expectation_net_benefit':
            sort_values = sort_values / 0.27
        
        # 统一按密度值排序（不再使用总值排序）
        sorted_indices = np.argsort(sort_values)[::-1]
        
        # 计算累积PV容量，找到达到目标的位置
        cumulative_pv_kw = np.cumsum(pv_capacity_kw[sorted_indices])
        target_idx = np.searchsorted(cumulative_pv_kw, target_pv_kw)
        
        if target_idx >= len(cumulative_pv_kw):
            print(f"  ⚠️ 无法达到目标容量")
            continue
        
        # 基本信息
        pixels_needed = target_idx + 1
        percentage_needed = pixels_needed / len(pv_capacity_kw) * 100
        actual_pv_gw = cumulative_pv_kw[target_idx] / 1e6
        selected_indices = sorted_indices[:pixels_needed]
        
        # 使用的总面积（公顷）
        area_used_ha = area_values[selected_indices].sum()
        
        print(f"  所需像素数: {pixels_needed:,} / {len(pv_capacity_kw):,}")
        print(f"  所需百分比: {percentage_needed:.2f}%")
        print(f"  使用面积: {area_used_ha:,.2f} 公顷")
        print(f"  实际PV容量: {actual_pv_gw:.2f} GW")
        
        # ===== 4. 计算三个维度的累积效益（绝对值和单位面积） =====
        print(f"\n  【累积效益 - 绝对值】")
        
        dimension_benefits = {}
        dimension_per_area = {}
        
        for dim_name, (dim_var, dim_unit) in target_dimensions.items():
            dim_values = merged_data_for_plot[dim_var].values.astype(float)
            
            # 计算累积效益（绝对值）
            if dim_var == 'Expectation_net_benefit':
                dim_total_values = dim_values / 0.27 * area_values
                cumulative_benefit = dim_total_values[selected_indices].sum()
            elif dim_var == 'predicted_prob':
                cumulative_benefit = dim_values[selected_indices].sum()
            else:
                dim_total_values = dim_values * area_values
                cumulative_benefit = dim_total_values[selected_indices].sum()
            
            # 计算单位面积效益
            benefit_per_ha = cumulative_benefit / area_used_ha
            
            dimension_benefits[dim_name] = cumulative_benefit
            dimension_per_area[dim_name] = benefit_per_ha
            
            # 格式化输出
            if dim_var == 'avg_npv':
                print(f"    {dim_name}: {cumulative_benefit/1e9:.2f} B USD")
            else:
                print(f"    {dim_name}: {cumulative_benefit:.6e}")
        
        print(f"\n  【单位面积效益（每公顷）】")
        for dim_name in target_dimensions.keys():
            dim_var = target_dimensions[dim_name][0]
            if dim_var == 'avg_npv':
                print(f"    {dim_name}: {dimension_per_area[dim_name]/1e6:.2f} M USD/ha")
            else:
                print(f"    {dim_name}: {dimension_per_area[dim_name]:.6e} /ha")
        
        # 存储结果
        result_row = {
            'Sort_Method': sort_name,
            'Percentage_Needed_%': percentage_needed,
            'Pixels_Needed': pixels_needed,
            'Area_Used_ha': area_used_ha,
            'Actual_PV_GW': actual_pv_gw
        }
        
        # 添加绝对效益
        for dim_name, benefit in dimension_benefits.items():
            result_row[f'{dim_name}_Total'] = benefit
        
        # 添加单位面积效益
        for dim_name, per_area in dimension_per_area.items():
            result_row[f'{dim_name}_per_ha'] = per_area
        
        results.append(result_row)
    
    # ===== 5. 转换为DataFrame =====
    results_df = pd.DataFrame(results)
    
    # ===== 6. 效率对比 =====
    print("\n" + "="*100)
    print("【效率对比】")
    print("="*100)
    
    min_pct = results_df['Percentage_Needed_%'].min()
    max_pct = results_df['Percentage_Needed_%'].max()
    min_method = results_df.loc[results_df['Percentage_Needed_%'].idxmin(), 'Sort_Method']
    max_method = results_df.loc[results_df['Percentage_Needed_%'].idxmax(), 'Sort_Method']
    
    print(f"  最高效（所需优先级最少）: {min_method} - {min_pct:.2f}%")
    print(f"  最低效（所需优先级最多）: {max_method} - {max_pct:.2f}%")
    print(f"  效率差异: {max_pct - min_pct:.2f} 百分点")
    
    print(f"\n  各方法所需百分比:")
    for _, row in results_df.iterrows():
        efficiency_vs_best = (row['Percentage_Needed_%'] - min_pct) / min_pct * 100 if min_pct > 0 else 0
        print(f"    {row['Sort_Method']}: {row['Percentage_Needed_%']:.2f}% (比最优多 {efficiency_vs_best:+.2f}%)")
    
    # ===== 7. 交叉分析 - 累积效益（绝对值） =====
    print("\n" + "="*100)
    print("【交叉分析 - 累积效益（绝对值）】")
    print("="*100)
    
    for dim_name in target_dimensions.keys():
        col_name = f'{dim_name}_Total'
        dim_var = target_dimensions[dim_name][0]
        
        print(f"\n  【{dim_name} 目标维度】")
        
        max_benefit = results_df[col_name].max()
        min_benefit = results_df[col_name].min()
        max_method = results_df.loc[results_df[col_name].idxmax(), 'Sort_Method']
        min_method = results_df.loc[results_df[col_name].idxmin(), 'Sort_Method']
        
        benefit_gap = max_benefit - min_benefit
        benefit_gap_percent = (benefit_gap / max_benefit * 100) if max_benefit != 0 else 0
        
        if dim_var == 'avg_npv':
            print(f"    最高: {max_benefit/1e9:.2f} B USD ({max_method})")
            print(f"    最低: {min_benefit/1e9:.2f} B USD ({min_method})")
            print(f"    差距: {benefit_gap/1e9:.2f} B USD ({benefit_gap_percent:.2f}%)")
        else:
            print(f"    最高: {max_benefit:.6e} ({max_method})")
            print(f"    最低: {min_benefit:.6e} ({min_method})")
            print(f"    差距: {benefit_gap:.6e} ({benefit_gap_percent:.2f}%)")
        
        print(f"\n    各排序方式效益:")
        for _, row in results_df.iterrows():
            benefit = row[col_name]
            vs_max = (benefit - max_benefit) / max_benefit * 100 if max_benefit != 0 else 0
            
            if dim_var == 'avg_npv':
                print(f"      {row['Sort_Method']}: {benefit/1e9:.2f} B USD ({vs_max:+.2f}%)")
            else:
                print(f"      {row['Sort_Method']}: {benefit:.6e} ({vs_max:+.2f}%)")
    
    # ===== 8. 交叉分析 - 单位面积效益 =====
    print("\n" + "="*100)
    print("【交叉分析 - 单位面积效益】")
    print("="*100)
    
    for dim_name in target_dimensions.keys():
        col_name = f'{dim_name}_per_ha'
        dim_var = target_dimensions[dim_name][0]
        
        print(f"\n  【{dim_name} 目标维度（每公顷）】")
        
        max_per_ha = results_df[col_name].max()
        min_per_ha = results_df[col_name].min()
        max_method = results_df.loc[results_df[col_name].idxmax(), 'Sort_Method']
        min_method = results_df.loc[results_df[col_name].idxmin(), 'Sort_Method']
        
        per_ha_gap = max_per_ha - min_per_ha
        per_ha_gap_percent = (per_ha_gap / max_per_ha * 100) if max_per_ha != 0 else 0
        
        if dim_var == 'avg_npv':
            print(f"    最高: {max_per_ha/1e6:.2f} M USD/ha ({max_method})")
            print(f"    最低: {min_per_ha/1e6:.2f} M USD/ha ({min_method})")
            print(f"    差距: {per_ha_gap/1e6:.2f} M USD/ha ({per_ha_gap_percent:.2f}%)")
        else:
            print(f"    最高: {max_per_ha:.6e} /ha ({max_method})")
            print(f"    最低: {min_per_ha:.6e} /ha ({min_method})")
            print(f"    差距: {per_ha_gap:.6e} /ha ({per_ha_gap_percent:.2f}%)")
        
        print(f"\n    各排序方式单位效益:")
        for _, row in results_df.iterrows():
            per_ha = row[col_name]
            vs_max = (per_ha - max_per_ha) / max_per_ha * 100 if max_per_ha != 0 else 0
            
            if dim_var == 'avg_npv':
                print(f"      {row['Sort_Method']}: {per_ha/1e6:.2f} M USD/ha ({vs_max:+.2f}%)")
            else:
                print(f"      {row['Sort_Method']}: {per_ha:.6e} /ha ({vs_max:+.2f}%)")
    
    print("\n" + "="*100)
    
    return results_df

# 执行分析
pv_target_results = analyze_pv_capacity_target_efficiency(merged_data_for_plot, target_pv_gw=740)

# 显示完整结果
if pv_target_results is not None:
    print("\n完整结果表:")
    display(pv_target_results)


PV容量目标分析：建设 740 GW

【总体PV潜力】
  总像素数: 70,337
  总PV潜力: 7995.71 GW
  目标容量: 740.00 GW
  目标占总潜力比: 9.25%

【按 Environmental 排序】
  所需像素数: 6,668 / 70,337
  所需百分比: 9.48%
  使用面积: 435,295.16 公顷
  实际PV容量: 740.00 GW

  【累积效益 - 绝对值】
    Environmental: 6.666479e+03
    Emission: 5.053439e+09
    Economic: -106.75 B USD

  【单位面积效益（每公顷）】
    Environmental: 1.531485e-02 /ha
    Emission: 1.160922e+04 /ha
    Economic: -0.25 M USD/ha

【按 Emission 排序】
  所需像素数: 6,316 / 70,337
  所需百分比: 8.98%
  使用面积: 435,320.83 公顷
  实际PV容量: 740.05 GW

  【累积效益 - 绝对值】
    Environmental: 4.229324e+03
    Emission: 6.983368e+09
    Economic: 576.56 B USD

  【单位面积效益（每公顷）】
    Environmental: 9.715419e-03 /ha
    Emission: 1.604189e+04 /ha
    Economic: 1.32 M USD/ha

【按 Economic 排序】
  所需像素数: 6,224 / 70,337
  所需百分比: 8.85%
  使用面积: 435,340.98 公顷
  实际PV容量: 740.08 GW

  【累积效益 - 绝对值】
    Environmental: 4.560264e+03
    Emission: 6.768526e+09
    Economic: 714.38 B USD

  【单位面积效益（每公顷）】
    Environmental: 1.047515e-02 /ha
    Emission: 1.

Unnamed: 0,Sort_Method,Percentage_Needed_%,Pixels_Needed,Area_Used_ha,Actual_PV_GW,Environmental_Total,Emission_Total,Economic_Total,Environmental_per_ha,Emission_per_ha,Economic_per_ha
0,Environmental,9.480074,6668,435295.159945,740.001772,6666.478655,5053439000.0,-106746100000.0,0.015315,11609.224538,-245226.8
1,Emission,8.979627,6316,435320.834883,740.045419,4229.324441,6983368000.0,576557800000.0,0.009715,16041.887973,1324443.0
2,Economic,8.848828,6224,435340.975933,740.079659,4560.263798,6768526000.0,714381700000.0,0.010475,15547.643475,1640971.0


## 2、Environmental suitability for PV deploying 

In [29]:
# 对二阶段判断模型均值进行参数估计（置信区间）
import scipy.stats as stats

'''
指针：2.2 有关高斯components的统计
数据：均值在前面的几个州
计算：利用总量、排序即可
'''
df_weight = pd.read_csv('data/US_data/df_weight.csv')

mean_pred_prob = df_weight['predicted_prob'].mean() * 100
std_pred_prob = df_weight['predicted_prob'].std(ddof=1) * 100
n_pred_prob = df_weight['predicted_prob'].count()

conf_level = 0.95
alpha = 1 - conf_level
t_crit = stats.t.ppf(1 - alpha/2, df=n_pred_prob-1)
se_pred_prob = std_pred_prob / (n_pred_prob ** 0.5)

# 用±表示百分数
print(f"predicted_prob均值: {mean_pred_prob:.2f}% ± {t_crit * se_pred_prob:.2f}% (95% CI)")

# highly suitable >90%统计（即prob>0.9）
num_highly_suitable = (df_weight['predicted_prob'] > 0.9).sum()
percent_highly_suitable = num_highly_suitable / n_pred_prob * 100
print(f"被归类为 highly suitable(>90%) 的像元数量: {num_highly_suitable} ({percent_highly_suitable:.2f}%)")

# 去除0值后
df_nonzero = df_weight[df_weight['predicted_prob'] != 0]
mean_pred_prob_nonzero = df_nonzero['predicted_prob'].mean() * 100
std_pred_prob_nonzero = df_nonzero['predicted_prob'].std(ddof=1) * 100
n_pred_prob_nonzero = df_nonzero['predicted_prob'].count()
t_crit_nonzero = stats.t.ppf(1 - alpha/2, df=n_pred_prob_nonzero-1)
se_pred_prob_nonzero = std_pred_prob_nonzero / (n_pred_prob_nonzero ** 0.5)

print(f"去除0值后predicted_prob均值: {mean_pred_prob_nonzero:.2f}% ± {t_crit_nonzero * se_pred_prob_nonzero:.2f}% (95% CI)")

# highly suitable >90%（去除0值后）
num_highly_suitable_nonzero = (df_nonzero['predicted_prob'] > 0.9).sum()
percent_highly_suitable_nonzero = num_highly_suitable_nonzero / n_pred_prob_nonzero * 100
print(f"去除0值后被归类为 highly suitable(>90%) 的像元数量: {num_highly_suitable_nonzero} ({percent_highly_suitable_nonzero:.2f}%)")

predicted_prob均值: 84.03% ± 0.22% (95% CI)
被归类为 highly suitable(>90%) 的像元数量: 52576 (74.75%)
去除0值后predicted_prob均值: 84.03% ± 0.22% (95% CI)
去除0值后被归类为 highly suitable(>90%) 的像元数量: 52576 (74.75%)


In [38]:
# 适宜性概率的州排名
import numpy as np 
import pandas as pd 

'''
指针：2.2 有关适宜性概率的统计
数据：均值在前面的几个州
计算：利用总量、排序即可
'''
state_analysis_df = pd.read_csv('data/US_data/US_analysis_reslut/state_level_analysis_with_wccd.csv')



# 计算各州环境适宜性百分比，并排序
state_analysis_df['Environmental_suitability_per_ha_percent'] = (state_analysis_df['Environmental_suitability_per_ha'] * 100).round(2)

# 按环境适宜性降序排序
sorted_df = state_analysis_df.sort_values('Environmental_suitability_per_ha_percent', ascending=False).reset_index(drop=True)

# 找加利福尼亚州排名
california_mask = sorted_df['State_name'].str.lower().str.contains('california')
if california_mask.any():
    california_idx = sorted_df[california_mask].index[0] + 1  # 排名从1开始
    california_percent = sorted_df.loc[california_mask, 'Environmental_suitability_per_ha_percent'].values[0]
    print(f"加利福尼亚州在环境适宜性中的排名为: 第{california_idx}位，百分比为{california_percent}%")
else:
    print("未找到加利福尼亚州数据。")

# 若需要环境适宜性高于90%州的列表：
states_over_90 = sorted_df[sorted_df['Environmental_suitability_per_ha_percent'] > 90][['State_name', 'Environmental_suitability_per_ha_percent']]
print(states_over_90)



加利福尼亚州在环境适宜性中的排名为: 第39位，百分比为73.62%
        State_name  Environmental_suitability_per_ha_percent
0     Rhode Island                                     99.94
1          Vermont                                     99.64
2         New York                                     99.48
3            Maine                                     99.28
4          Georgia                                     99.11
5   South Carolina                                     99.09
6    Massachusetts                                     99.03
7   North Carolina                                     98.66
8         Michigan                                     98.48
9      Connecticut                                     98.48
10    Pennsylvania                                     98.04
11         Alabama                                     97.99
12       Louisiana                                     97.80
13   West Virginia                                     97.60
14     Mississippi                                

In [33]:
# 一阶段模型的环境模式统计以及df生成
import joblib
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import geopandas as gpd
from shapely.geometry import Point
from function import *

us_nation = gpd.read_file(r'data\US_data\cb_2018_us_nation_5m.shp')
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
us_counties = gpd.read_file('data/cb_2018_us_county_500k.shp')
us_argscv = gpd.read_file("figure/draw_shp/ASD_2012_5m.shp")


us_nation_4326 = us_nation.to_crs('EPSG:4326')
us_states_4326 = us_states.to_crs('EPSG:4326')
us_counties_4326 = us_counties.to_crs('EPSG:4326')
us_argscv_4326 = us_argscv.to_crs('EPSG:4326')

def clip_data_with_us_states(df, us_states_gdf, lon_col='lon', lat_col='lat'):
    """
    使用美国州界 shapefile 剪裁点数据；兼容不同版本的 geopandas 参数名
    """
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')
    us_states_4326 = us_states_gdf.to_crs('EPSG:4326')

    try:
        clipped = gpd.sjoin(gdf, us_states_4326, how='inner', predicate='within')
    except TypeError:
        # 旧版本 geopandas 使用 op 参数
        clipped = gpd.sjoin(gdf, us_states_4326, how='inner', op='within')

    # 清理 shapefile 附加字段
    clipped = clipped.drop(columns=['geometry', 'index_right'], errors='ignore')
    for col in us_states_gdf.columns:
        if col in clipped.columns:
            clipped = clipped.drop(columns=[col], errors='ignore')
    return clipped

# ---------------------------
# 0) 基础地理范围与底图
# ---------------------------
usa_bounds_main = dict(lon_min=-125, lon_max=-65, lat_min=25, lat_max=49)
# ---------------------------
# 1) 读取数据
# ---------------------------
df_abandon = load_abandon(PATHS['us_abandon'])
df_embedding = load_embedding(PATHS['us_pv_embedding'])
df_net_benefit = pd.read_csv('data/US_data/df_net_benefit.csv')

# 初步经纬度范围过滤
df_embedding = df_embedding[
    (df_embedding['lon'] >= usa_bounds_main['lon_min']) &
    (df_embedding['lon'] <= usa_bounds_main['lon_max']) &
    (df_embedding['lat'] >= usa_bounds_main['lat_min']) &
    (df_embedding['lat'] <= usa_bounds_main['lat_max'])
]

df_abandon = df_abandon[
    (df_abandon['lon'] >= usa_bounds_main['lon_min']) &
    (df_abandon['lon'] <= usa_bounds_main['lon_max']) &
    (df_abandon['lat'] >= usa_bounds_main['lat_min']) &
    (df_abandon['lat'] <= usa_bounds_main['lat_max'])
]


# 第二次用州界矢量更精确裁剪
df_abandon = clip_data_with_us_states(df_abandon, us_nation)
df_embedding = clip_data_with_us_states(df_embedding, us_nation)

# ---------------------------
# 2) 缺失值填充 + 去重
# ---------------------------
df_abandon_fill   = fill_nonpositive_with_nearest(df_abandon)
df_embedding_fill = fill_nonpositive_with_nearest(df_embedding)

df_abandon_filtered = filter_duplicates(df_abandon_fill, df_embedding_fill)
df_abandon_filtered=df_abandon_filtered.merge(df_net_benefit[['lon', 'lat', 'area_m2']], on=['lat', 'lon'], how='left')
features_no_coords = [f for f in (NUMERIC_FEATURES + CAT_COLS) if f not in ['lat', 'lon']]
features_no_coords = [c for c in features_no_coords if c in df_embedding_fill.columns]



def calculate_cluster_probabilities(
    df_embedding_fill,
    features_no_coords,
    model_path="gmm_model_23c_fixed.pkl",
    distance_threshold=1.5,
    return_component_probs=False
):
    """
    计算每个样本属于各个hierarchical cluster的概率
    
    核心思想：
    1. GMM给出每个样本属于各component的概率 P(component|sample)
    2. 层次聚类将components分组为clusters
    3. 聚合概率：P(cluster|sample) = Σ P(component|sample) for all components in cluster
    
    Parameters:
    -----------
    df_embedding_fill : DataFrame
        包含特征的数据框
    features_no_coords : list
        特征列表（不包含经纬度）
    model_path : str
        GMM模型路径
    distance_threshold : float
        层次聚类的距离阈值
    return_component_probs : bool
        是否返回component级别的概率
        
    Returns:
    --------
    df_result : DataFrame
        包含cluster概率列的数据框
    cluster_mapping : dict
        component到cluster的映射关系
    """
    
    # 加载GMM模型
    gmm_pipeline = joblib.load(model_path)
    gmm = gmm_pipeline.named_steps['gmm']
    preprocessor = gmm_pipeline.named_steps['preprocessor']
    
    n_components = gmm.n_components
    means = gmm.means_
    numeric_features = preprocessor.numeric_features
    categorical_features = preprocessor.categorical_features
    n_numeric = len(numeric_features)
    
    # 提取component特征用于层次聚类
    if categorical_features:
        landcover_means = means[:, n_numeric:]
        dominant_landcover_values = np.take_along_axis(
            landcover_means, 
            np.argmax(landcover_means, axis=1)[:, np.newaxis], 
            axis=1
        ).squeeze()
        component_features_std = np.column_stack([
            means[:, :n_numeric], 
            dominant_landcover_values[:, np.newaxis]
        ])
    else:
        component_features_std = means
    
    # 层次聚类
    distance_matrix = pdist(component_features_std, metric='correlation')
    linkage_matrix = linkage(distance_matrix, method='ward')
    cluster_labels = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')
    
    n_clusters = len(np.unique(cluster_labels))
    
    # 构建component到cluster的映射
    component_to_cluster = {comp_idx: cluster_labels[comp_idx] for comp_idx in range(n_components)}
    cluster_mapping = {}
    for cluster_id in range(1, n_clusters + 1):
        components_in_cluster = [comp for comp, clus in component_to_cluster.items() if clus == cluster_id]
        cluster_mapping[cluster_id] = components_in_cluster
    
    # 计算样本的component概率
    X_processed = preprocessor.transform(df_embedding_fill[features_no_coords])
    component_probs = gmm.predict_proba(X_processed)
    
    # 聚合到cluster概率
    cluster_probs = np.zeros((len(df_embedding_fill), n_clusters))
    for cluster_id, components_in_cluster in cluster_mapping.items():
        cluster_idx = cluster_id - 1
        for comp_idx in components_in_cluster:
            cluster_probs[:, cluster_idx] += component_probs[:, comp_idx]
    
    # 创建结果DataFrame
    df_result = df_embedding_fill.copy()
    
    # 添加cluster概率列
    for cluster_id in range(1, n_clusters + 1):
        cluster_idx = cluster_id - 1
        col_name = f'cluster_{cluster_id}_prob'
        df_result[col_name] = cluster_probs[:, cluster_idx]
    
    # 添加主导cluster信息
    df_result['dominant_cluster'] = np.argmax(cluster_probs, axis=1) + 1
    df_result['dominant_cluster_prob'] = np.max(cluster_probs, axis=1)
    
    # 添加component级别的概率（可选）
    if return_component_probs:
        for comp_idx in range(n_components):
            col_name = f'component_{comp_idx}_prob'
            df_result[col_name] = component_probs[:, comp_idx]
        df_result['dominant_component'] = np.argmax(component_probs, axis=1)
        df_result['dominant_component_prob'] = np.max(component_probs, axis=1)
    
    # 计算不确定性（熵）
    cluster_probs_safe = np.clip(cluster_probs, 1e-10, 1.0)
    entropy = -np.sum(cluster_probs_safe * np.log(cluster_probs_safe), axis=1)
    max_entropy = np.log(n_clusters)
    
    df_result['cluster_entropy'] = entropy
    df_result['cluster_uncertainty'] = entropy / max_entropy
    
    # 简洁统计摘要
    print(f"Clusters: {n_clusters} | Samples: {len(df_result):,}")
    for cluster_id in range(1, n_clusters + 1):
        n_dominant = (df_result['dominant_cluster'] == cluster_id).sum()
        pct = n_dominant / len(df_result) * 100
        print(f"Cluster {cluster_id}: {n_dominant:,} ({pct:.2f}%)")
    
    return df_result, cluster_mapping



# # ==================== 基础用法 ====================

# df_result, cluster_mapping = calculate_cluster_probabilities(
#     df_embedding_fill=df_embedding_fill,
#     features_no_coords=features_no_coords,
#     model_path="gmm_model_23c_fixed.pkl",
#     distance_threshold=1.5,
#     return_component_probs=False
# )


# ==================== 高级用法：包含component概率 ====================

df_result_full, cluster_mapping = calculate_cluster_probabilities(
    df_embedding_fill=df_embedding_fill,
    features_no_coords=features_no_coords,
    model_path="gmm_model_23c_fixed.pkl",
    distance_threshold=1.5,
    return_component_probs=True
)





You want to predict the year: 2020.0
列 GDPpc 没有需要填充的非正值或NaN值
列 GDPpc 没有需要填充的非正值或NaN值
Clusters: 3 | Samples: 10,473
Cluster 1: 4,072 (38.88%)
Cluster 2: 5,536 (52.86%)
Cluster 3: 865 (8.26%)


In [37]:
# 二阶段模型的环境结构后验概率（仅统计信息输出）
import joblib
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import geopandas as gpd
from shapely.geometry import Point
from function import *

us_nation = gpd.read_file(r'data\US_data\cb_2018_us_nation_5m.shp')
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
us_counties = gpd.read_file('data/cb_2018_us_county_500k.shp')
us_argscv = gpd.read_file("figure/draw_shp/ASD_2012_5m.shp")


us_nation_4326 = us_nation.to_crs('EPSG:4326')
us_states_4326 = us_states.to_crs('EPSG:4326')
us_counties_4326 = us_counties.to_crs('EPSG:4326')
us_argscv_4326 = us_argscv.to_crs('EPSG:4326')

def clip_data_with_us_states(df, us_states_gdf, lon_col='lon', lat_col='lat'):
    """
    使用美国州界 shapefile 剪裁点数据；兼容不同版本的 geopandas 参数名
    """
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')
    us_states_4326 = us_states_gdf.to_crs('EPSG:4326')

    try:
        clipped = gpd.sjoin(gdf, us_states_4326, how='inner', predicate='within')
    except TypeError:
        # 旧版本 geopandas 使用 op 参数
        clipped = gpd.sjoin(gdf, us_states_4326, how='inner', op='within')

    # 清理 shapefile 附加字段
    clipped = clipped.drop(columns=['geometry', 'index_right'], errors='ignore')
    for col in us_states_gdf.columns:
        if col in clipped.columns:
            clipped = clipped.drop(columns=[col], errors='ignore')
    return clipped

# ---------------------------
# 0) 基础地理范围与底图
# ---------------------------
usa_bounds_main = dict(lon_min=-125, lon_max=-65, lat_min=25, lat_max=49)
# ---------------------------
# 1) 读取数据
# ---------------------------
df_abandon = load_abandon(PATHS['us_abandon'])
df_embedding = load_embedding(PATHS['us_pv_embedding'])
df_net_benefit = pd.read_csv('data/US_data/df_net_benefit.csv')

# 初步经纬度范围过滤
df_embedding = df_embedding[
    (df_embedding['lon'] >= usa_bounds_main['lon_min']) &
    (df_embedding['lon'] <= usa_bounds_main['lon_max']) &
    (df_embedding['lat'] >= usa_bounds_main['lat_min']) &
    (df_embedding['lat'] <= usa_bounds_main['lat_max'])
]

df_abandon = df_abandon[
    (df_abandon['lon'] >= usa_bounds_main['lon_min']) &
    (df_abandon['lon'] <= usa_bounds_main['lon_max']) &
    (df_abandon['lat'] >= usa_bounds_main['lat_min']) &
    (df_abandon['lat'] <= usa_bounds_main['lat_max'])
]

# 第二次用州界矢量更精确裁剪
df_abandon = clip_data_with_us_states(df_abandon, us_nation)
df_embedding = clip_data_with_us_states(df_embedding, us_nation)

# ---------------------------
# 2) 缺失值填充 + 去重
# ---------------------------
df_abandon_fill   = fill_nonpositive_with_nearest(df_abandon)
df_embedding_fill = fill_nonpositive_with_nearest(df_embedding)

df_abandon_filtered = filter_duplicates(df_abandon_fill, df_embedding_fill)
df_abandon_filtered=df_abandon_filtered.merge(df_net_benefit[['lon', 'lat', 'area_m2']], on=['lat', 'lon'], how='left')
features_no_coords = [f for f in (NUMERIC_FEATURES + CAT_COLS) if f not in ['lat', 'lon']]
features_no_coords = [c for c in features_no_coords if c in df_embedding_fill.columns]


def abandon_component_prob_stats(
    df_abandon_filtered,
    model_path="gmm_model_34c_fixed.pkl",
    features_no_coords=None,
    distance_threshold=1.5
):
    """
    统计撂荒地样本在各个GMM component及cluster上的概率分布（无绘图，仅统计信息）
    """
    print("=" * 80)
    print("统计撂荒地样本的Component和Cluster概率分布")
    print("=" * 80)
    
    # 1. 加载GMM模型
    print("\n步骤1: 加载GMM模型...")
    gmm_pipeline = joblib.load(model_path)
    gmm = gmm_pipeline.named_steps['gmm']
    preprocessor = gmm_pipeline.named_steps['preprocessor']
    means = gmm.means_
    n_components = gmm.n_components
    print(f"  模型组件数: {n_components}")
    
    # 2. 层次聚类
    print("\n步骤2: 执行层次聚类...")
    numeric_features = preprocessor.numeric_features
    categorical_features = preprocessor.categorical_features
    n_numeric = len(numeric_features)
    # if categorical_features:
    #     landcover_means = means[:, n_numeric:]
    #     dominant_landcover_values = np.take_along_axis(
    #         landcover_means, 
    #         np.argmax(landcover_means, axis=1)[:, np.newaxis], 
    #         axis=1
    #     ).squeeze()
    #     component_features_std = np.column_stack([
    #         means[:, :n_numeric], 
    #         dominant_landcover_values[:, np.newaxis]
    #     ])
    # else:
    #     component_features_std = means
    # distance_matrix = pdist(component_features_std, metric='correlation')
    # linkage_matrix = linkage(distance_matrix, method='ward')
    # cluster_labels = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')

    # 构建用于聚类的特征矩阵：使用完整的landcover概率分布
    if categorical_features:
        landcover_means = means[:, n_numeric:]
        # 使用完整的landcover概率分布，而不是只取dominant值
        component_features_for_clustering = np.column_stack([
            means[:, :n_numeric],      # numeric特征
            landcover_means            # 完整的landcover概率分布（所有类别）
        ])
        n_features_for_clustering = n_numeric + landcover_means.shape[1]
        print(f"  聚类特征: numeric ({n_numeric}) + landcover ({landcover_means.shape[1]} 类别)")
    else:
        component_features_for_clustering = means
        n_features_for_clustering = n_numeric
        print(f"  聚类特征: numeric ({n_numeric})，无landcover")

    distance_matrix = pdist(component_features_for_clustering, metric='correlation')
    linkage_matrix = linkage(distance_matrix, method='ward')
    cluster_labels = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')
    print(f"  聚类数量: {len(set(cluster_labels))}")
    clusters_dict = {}
    for comp_idx in range(n_components):
        cluster_id = cluster_labels[comp_idx]
        if cluster_id not in clusters_dict:
            clusters_dict[cluster_id] = []
        clusters_dict[cluster_id].append(comp_idx)
    for cluster_id, components in clusters_dict.items():
        print(f"  Cluster {cluster_id}: Components {components}")
    
    # 3. 计算撂荒地样本的component概率
    print(f"\n步骤3: 计算撂荒地样本的component概率...")
    print(f"  撂荒地样本数量: {len(df_abandon_filtered):,}")
    X_abandon = preprocessor.transform(df_abandon_filtered[features_no_coords])
    component_probs = gmm.predict_proba(X_abandon)
    print(f"  概率矩阵形状: {component_probs.shape}")  # (n_samples, n_components)
    
    # 4. 统计(每个component、每个cluster)撂荒地属于这个component/cluster的概率均值、最大隶属情况
    print("\n步骤4: 统计各component的概率均值及dominant比例...")
    comp_means = component_probs.mean(axis=0)
    comp_dominant_counts = (np.argmax(component_probs, axis=1).reshape(-1,1) == np.arange(n_components)).sum(axis=0)
    for comp_idx in range(n_components):
        print(f"Component {comp_idx}: 平均概率={comp_means[comp_idx]:.4f}, "
              f"dominant数量={comp_dominant_counts[comp_idx]}, "
              f"占比={comp_dominant_counts[comp_idx]/len(df_abandon_filtered)*100:.2f}%")
    
    print("\n步骤5: 统计各cluster的总概率加和均值")
    cluster_avg_probs = {}
    cluster_dominant_counts = {}
    dominant_components_all = np.argmax(component_probs, axis=1)
    for cluster_id, components in clusters_dict.items():
        probs_sum = component_probs[:, components].sum(axis=1)
        cluster_avg_probs[cluster_id] = probs_sum.mean()
        is_dominant = np.isin(dominant_components_all, components)
        cluster_dominant_counts[cluster_id] = is_dominant.sum()
        print(f"Cluster {cluster_id}: 平均概率加和={cluster_avg_probs[cluster_id]:.4f}, "
              f"dominant数量={cluster_dominant_counts[cluster_id]}, "
              f"占比={cluster_dominant_counts[cluster_id]/len(df_abandon_filtered)*100:.2f}%")
    
    stats = {
        'n_components': n_components,
        'n_samples': len(df_abandon_filtered),
        'n_clusters': len(clusters_dict),
        'clusters_dict': clusters_dict,
        'component_probs_mean': comp_means,
        'component_dominant_counts': comp_dominant_counts,
        'cluster_avg_probs': cluster_avg_probs,
        'cluster_dominant_counts': cluster_dominant_counts,
        'component_probs': component_probs,
    }
    print("\n统计完成。")
    print("=" * 80)
    return stats

# ==================== 运行 ====================
try:
    stats = abandon_component_prob_stats(
        df_abandon_filtered=df_abandon_filtered,
        model_path="gmm_model_23c_fixed.pkl",
        features_no_coords=features_no_coords,
        distance_threshold=1.5
    )

    print(f"\n统计信息:")
    print(f"  组件数: {stats['n_components']}")
    print(f"  样本数: {stats['n_samples']}")
    print(f"  聚类数: {stats['n_clusters']}")
    print(f"  clusters_dict: {stats['clusters_dict']}")
except Exception as e:
    print(f"❌ 统计失败: {e}")
    import traceback
    traceback.print_exc()

You want to predict the year: 2020.0
列 GDPpc 没有需要填充的非正值或NaN值
列 GDPpc 没有需要填充的非正值或NaN值
统计撂荒地样本的Component和Cluster概率分布

步骤1: 加载GMM模型...
  模型组件数: 23

步骤2: 执行层次聚类...
  聚类特征: numeric (14) + landcover (9 类别)
  聚类数量: 3
  Cluster 2: Components [0, 5, 6, 12, 14, 18, 22]
  Cluster 1: Components [1, 4, 7, 10, 13, 15, 17]
  Cluster 3: Components [2, 3, 8, 9, 11, 16, 19, 20, 21]

步骤3: 计算撂荒地样本的component概率...
  撂荒地样本数量: 70,337
  概率矩阵形状: (70337, 23)

步骤4: 统计各component的概率均值及dominant比例...
Component 0: 平均概率=0.0006, dominant数量=44, 占比=0.06%
Component 1: 平均概率=0.0036, dominant数量=251, 占比=0.36%
Component 2: 平均概率=0.1676, dominant数量=11684, 占比=16.61%
Component 3: 平均概率=0.0285, dominant数量=2007, 占比=2.85%
Component 4: 平均概率=0.0005, dominant数量=38, 占比=0.05%
Component 5: 平均概率=0.0005, dominant数量=34, 占比=0.05%
Component 6: 平均概率=0.0014, dominant数量=101, 占比=0.14%
Component 7: 平均概率=0.0380, dominant数量=2676, 占比=3.80%
Component 8: 平均概率=0.0377, dominant数量=2646, 占比=3.76%
Component 9: 平均概率=0.0169, dominant数量=1200, 占比=1.71%
Component 10

## 3、Emission abatement analysis

In [2]:
import xarray as xr
from function import *

ds_merge=load_all_ds()
ds_merge


gogogo
[OK] TensorFlow available
[OK] scikeras available
[OK] SHAP available


  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(


Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 38.60 MiB Shape (31, 21600, 43200) (31, 600, 544) Dask graph 3741 chunks in 2499 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 38.60 MiB Shape (31, 21600, 43200) (31, 600, 544) Dask graph 3741 chunks in 2499 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 38.60 MiB Shape (31, 21600, 43200) (31, 600, 544) Dask graph 3741 chunks in 2499 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 38.60 MiB Shape (31, 21600, 43200) (31, 600, 544) Dask graph 3741 chunks in 2499 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,38.60 MiB
Shape,"(31, 21600, 43200)","(31, 600, 544)"
Dask graph,3741 chunks in 2499 graph layers,3741 chunks in 2499 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,3.80 MiB
Shape,"(31, 21600, 43200)","(16, 250, 249)"
Dask graph,30276 chunks in 2579 graph layers,30276 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 3.80 MiB Shape (31, 21600, 43200) (16, 250, 249) Dask graph 30276 chunks in 2579 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,3.80 MiB
Shape,"(31, 21600, 43200)","(16, 250, 249)"
Dask graph,30276 chunks in 2579 graph layers,30276 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 490.43 MiB Shape (31, 21600, 43200) (31, 1440, 2880) Dask graph 225 chunks in 4 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,64.02 MiB
Shape,"(31, 21600, 43200)","(1, 1554, 5400)"
Dask graph,6076 chunks in 24 graph layers,6076 chunks in 24 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 64.02 MiB Shape (31, 21600, 43200) (1, 1554, 5400) Dask graph 6076 chunks in 24 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,64.02 MiB
Shape,"(31, 21600, 43200)","(1, 1554, 5400)"
Dask graph,6076 chunks in 24 graph layers,6076 chunks in 24 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,72.75 MiB
Shape,"(31, 21600, 43200)","(1, 3090, 6172)"
Dask graph,1519 chunks in 19 graph layers,1519 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 72.75 MiB Shape (31, 21600, 43200) (1, 3090, 6172) Dask graph 1519 chunks in 19 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,72.75 MiB
Shape,"(31, 21600, 43200)","(1, 3090, 6172)"
Dask graph,1519 chunks in 19 graph layers,1519 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 843.75 kiB Shape (31, 21600, 43200) (1, 432, 500) Dask graph 144150 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.01 MiB
Shape,"(31, 21600, 43200)","(1, 530, 500)"
Dask graph,118668 chunks in 21 graph layers,118668 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 1.01 MiB Shape (31, 21600, 43200) (1, 530, 500) Dask graph 118668 chunks in 21 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.01 MiB
Shape,"(31, 21600, 43200)","(1, 530, 500)"
Dask graph,118668 chunks in 21 graph layers,118668 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 490.43 MiB Shape (31, 21600, 43200) (31, 1440, 2880) Dask graph 225 chunks in 4 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 843.75 kiB Shape (31, 21600, 43200) (1, 432, 500) Dask graph 144150 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,120.97 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1024)"
Dask graph,946 chunks in 4 graph layers,946 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 120.97 MiB Shape (31, 21600, 43200) (31, 999, 1024) Dask graph 946 chunks in 4 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,120.97 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1024)"
Dask graph,946 chunks in 4 graph layers,946 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 464.04 MiB Shape (31, 21600, 43200) (31, 999, 1964) Dask graph 484 chunks in 4 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,72.75 MiB
Shape,"(31, 21600, 43200)","(1, 3090, 6172)"
Dask graph,1519 chunks in 19 graph layers,1519 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 72.75 MiB Shape (31, 21600, 43200) (1, 3090, 6172) Dask graph 1519 chunks in 19 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,72.75 MiB
Shape,"(31, 21600, 43200)","(1, 3090, 6172)"
Dask graph,1519 chunks in 19 graph layers,1519 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 464.04 MiB Shape (31, 21600, 43200) (31, 999, 1964) Dask graph 484 chunks in 4 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 490.43 MiB Shape (31, 21600, 43200) (31, 1440, 2880) Dask graph 225 chunks in 4 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,490.43 MiB
Shape,"(31, 21600, 43200)","(31, 1440, 2880)"
Dask graph,225 chunks in 4 graph layers,225 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,111.24 MiB
Shape,"(31, 21600, 43200)","(1, 2700, 5400)"
Dask graph,1984 chunks in 19 graph layers,1984 chunks in 19 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 111.24 MiB Shape (31, 21600, 43200) (1, 2700, 5400) Dask graph 1984 chunks in 19 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,111.24 MiB
Shape,"(31, 21600, 43200)","(1, 2700, 5400)"
Dask graph,1984 chunks in 19 graph layers,1984 chunks in 19 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 464.04 MiB Shape (31, 21600, 43200) (31, 999, 1964) Dask graph 484 chunks in 4 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,464.04 MiB
Shape,"(31, 21600, 43200)","(31, 999, 1964)"
Dask graph,484 chunks in 4 graph layers,484 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 843.75 kiB Shape (31, 21600, 43200) (1, 432, 500) Dask graph 144150 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,843.75 kiB
Shape,"(31, 21600, 43200)","(1, 432, 500)"
Dask graph,144150 chunks in 22 graph layers,144150 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [3]:
import geopandas as gpd

# 计算各个州的辐射均值、以及总体均值
us_nation = gpd.read_file(r'data\US_data\cb_2018_us_nation_5m.shp')
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
us_counties = gpd.read_file('data/cb_2018_us_county_500k.shp')
usa_bounds_main = dict(lon_min=-125, lon_max=-65, lat_min=25, lat_max=49)

us_nation_4326 = us_nation.to_crs('EPSG:4326')
us_states_4326 = us_states.to_crs('EPSG:4326')
us_counties_4326 = us_counties.to_crs('EPSG:4326')

# Use bounds to clip us_nation_4326 and us_states_4326
from shapely.geometry import box

bounds_geom = box(
    usa_bounds_main['lon_min'],
    usa_bounds_main['lat_min'],
    usa_bounds_main['lon_max'],
    usa_bounds_main['lat_max']
)

us_nation_4326 = us_nation_4326.clip(bounds_geom)
us_states_4326 = us_states_4326.clip(bounds_geom)

import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rasterstats import zonal_stats

# Determine time value as before
if '2020-01-01' in ds_merge['rsds'].coords['time'].values:
    time_str = '2020-01-01'
else:
    time_str = '2020'

# Select 2020 rsds (surface downwelling shortwave radiation)
rsds_2020 = ds_merge['rsds'].sel(time=time_str).squeeze()  # 添加 .squeeze() 移除多余维度

# Use rasterstats.zonal_stats to compute mean RSDS per state
from rasterstats import zonal_stats

# Get the geometries and state names
state_geoms = us_states_4326['geometry']
state_names = us_states_4326['NAME']

# To use zonal_stats, we need a raster file or an in-memory raster.
# We'll create an in-memory raster using rasterio and write the rsds_2020 DataArray to it.
import rasterio
import rasterio.features
from affine import Affine

# Prepare data for raster creation
rsds_data = rsds_2020.values.astype(np.float32)

# Ensure data is 2D (防止多余维度问题)
if rsds_data.ndim > 2:
    rsds_data = rsds_data.squeeze()

lat = rsds_2020['lat'].values
lon = rsds_2020['lon'].values

# Create transform for (lat, lon) grid, assuming ascending order
res_lat = abs(lat[1] - lat[0])
res_lon = abs(lon[1] - lon[0])
transform = Affine.translation(lon[0] - res_lon / 2, lat[0] - res_lat / 2) * Affine.scale(res_lon, res_lat)

# Rasterio expects (bands, rows, cols) and origin at top-left
# But our lat may be ascending (South-to-North); for north-up raster, flip array vertically if needed
if lat[0] < lat[-1]:
    rsds_data_rio = rsds_data[::-1, :]
    raster_transform = Affine.translation(lon[0] - res_lon / 2, lat[-1] + res_lat / 2) * Affine.scale(res_lon, -res_lat)
else:
    rsds_data_rio = rsds_data
    raster_transform = transform

# Write to an in-memory raster for zonal_stats
import tempfile
import os

with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as tmpfile:
    raster_path = tmpfile.name
    with rasterio.open(
        raster_path, 'w',
        driver='GTiff',
        height=rsds_data_rio.shape[0],
        width=rsds_data_rio.shape[1],
        count=1,
        dtype=rsds_data_rio.dtype,
        crs='EPSG:4326',
        transform=raster_transform,
        nodata=np.nan,
    ) as dst:
        dst.write(rsds_data_rio, 1)

    # Calculate zonal means using state geometries
    zs = zonal_stats(
        state_geoms,
        raster_path,
        stats='mean',
        nodata=np.nan,
        geojson_out=False,
        all_touched=True
    )

# Remove temp file
os.remove(raster_path)

# Combine results into a dataframe (state name, mean rsds)
state_rsds_means = pd.DataFrame({
    'NAME': state_names,
    'mean_rsds_MJ/m²/d': [z['mean'] for z in zs]
}).set_index('NAME').sort_values('mean_rsds_MJ/m²/d', ascending=False)

print("\nMean rsds per state (descending):")

state_rsds_means["rsds_kWh_m2_yr"] = state_rsds_means["mean_rsds_MJ/m²/d"] * 101.48
print(state_rsds_means)

# 计算均值、方差与变异系数
mean_rsds = state_rsds_means["rsds_kWh_m2_yr"].mean()
std_rsds = state_rsds_means["rsds_kWh_m2_yr"].std()
cv_rsds = std_rsds / mean_rsds

print("\nrsds_kWh_m2_yr 均值: {:.3f}".format(mean_rsds))
print("rsds_kWh_m2_yr 方差: {:.3f}".format(std_rsds))
print("rsds_kWh_m2_yr 变异系数: {:.3f}".format(cv_rsds))




Mean rsds per state (descending):
                      mean_rsds_MJ/m²/d  rsds_kWh_m2_yr
NAME                                                   
Arizona                       20.689999     2099.621147
New Mexico                    20.449741     2075.239721
Nevada                        19.524868     1981.383589
Utah                          19.433736     1972.135503
California                    19.396842     1968.391499
Colorado                      19.151214     1943.465217
Texas                         18.029445     1829.628083
Wyoming                       17.645435     1790.658739
Florida                       17.169969     1742.408423
Oklahoma                      16.978087     1722.936260
Kansas                        16.940154     1719.086784
Idaho                         16.634842     1688.103730
Georgia                       16.570740     1681.598671
Alabama                       16.339107     1658.092580
Louisiana                     16.290754     1653.185683
South Carolin

In [4]:
# 计算各个州的LNCS碳期望

import geopandas as gpd
from shapely.geometry import Point

# 读取分析数据
df_analysis = pd.read_csv(r'data\US_data\df_merged_data_for_analysis.csv')

# 读取州边界shapefile
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')

# 转换州边界shapefile为4326坐标系
if us_states.crs != "EPSG:4326":
    us_states = us_states.to_crs(epsg=4326)

# 创建点图层
geometry = [Point(xy) for xy in zip(df_analysis['lon'], df_analysis['lat'])]
pixel_gdf = gpd.GeoDataFrame(df_analysis, geometry=geometry, crs='EPSG:4326')

# 空间连接，获得州标签
pixel_with_states = gpd.sjoin(pixel_gdf, us_states, how='left', predicate='within')
df_analysis['state'] = pixel_with_states['NAME']   # 假设州字段名为'NAME'

# 按州分组计算加权均值
weighted_mean = (
    df_analysis
    .groupby('state')
    .apply(lambda x: (x['LNCS_expect'] * x['area_m2']).sum() / x['area_m2'].sum())
    .reset_index(name='weighted_mean_LNCS_expect')
    .sort_values('weighted_mean_LNCS_expect', ascending=False)
)

print(weighted_mean)

             state  weighted_mean_LNCS_expect
7          Florida                 585.536259
26   New Hampshire                 316.526621
19        Michigan                 305.505603
6         Delaware                 300.514837
32            Ohio                 289.201996
20       Minnesota                 288.967192
40           Texas                 285.854202
42         Vermont                 280.718636
31    North Dakota                 264.266313
11         Indiana                 262.383645
46       Wisconsin                 258.220230
12            Iowa                 254.639923
15       Louisiana                 254.129915
29        New York                 252.188971
37  South Carolina                 251.960064
16           Maine                 250.949926
18   Massachusetts                 246.089034
10        Illinois                 243.542965
36    Rhode Island                 241.405732
17        Maryland                 239.670371
30  North Carolina                

  .apply(lambda x: (x['LNCS_expect'] * x['area_m2']).sum() / x['area_m2'].sum())


In [5]:
# 美国总体撂荒地有发电能力T kWh yr，以及装机容量GW
import numpy as np
import pandas as pd
merged_data_for_analysis = pd.read_csv('data/US_data/df_merged_data_for_analysis.csv')
state_analysis_df = pd.read_csv('data/US_data/US_analysis_reslut/state_level_analysis_with_wccd.csv')
# 这个单位是kw,请你转为tw
# 计算全美总发电量
power_sum_kw = (state_analysis_df['Power_generation_per_ha'] * state_analysis_df['abandoned_land_ha']).sum()
power_sum_tkw_per_year = (power_sum_kw / 1e12) / 30
# 4,453 trillion Btu 转化为 kWh（1 Btu = 0.00029307107 kWh, 1 trillion = 1e12)
btu_trillion = 4453
btu_total = btu_trillion * 1e12  # Btu
btu_to_kwh = btu_total * 0.00029307107
btu_to_tkwh = btu_to_kwh / 1e12

print("4,453 trillion Btu 等于 {:.2f} T kWh".format(btu_to_tkwh))

print("全美年均发电量(单位：T kWh yr)：", round(power_sum_tkw_per_year, 2))

multiple = power_sum_tkw_per_year / btu_to_tkwh
print("全美撂荒地发电潜力约为4,453 trillion Btu的{:.2f}倍".format(multiple,btu_to_tkwh))

# 计算各州年均发电量
state_analysis_df['annual_generation_kw'] = state_analysis_df['Power_generation_per_ha'] * state_analysis_df['abandoned_land_ha'] / 30
state_analysis_df['annual_generation_tw'] = state_analysis_df['annual_generation_kw'] / 1e12
state_generation = state_analysis_df[['State_name', 'annual_generation_tw']].copy()
state_generation = state_generation.sort_values('annual_generation_tw', ascending=False).reset_index(drop=True)
print("各州年均发电量(单位：T kWh yr):")
print(state_generation)

# 计算各州的最大装机容量（GW）
# 先转为平方米（1 ha = 10,000 m²），再每平米*0.17kW，然后转为GW（1 GW = 1e6kW）
state_analysis_df['abandoned_land_m2'] = state_analysis_df['abandoned_land_ha'] * 10000
state_analysis_df['generation_kw_max'] = state_analysis_df['abandoned_land_m2'] * 0.17
state_analysis_df['generation_gw_max'] = state_analysis_df['generation_kw_max'] / 1e6
state_gw = state_analysis_df[['State_name', 'generation_gw_max']].copy().sort_values('generation_gw_max', ascending=False).reset_index(drop=True)
print("各州最大装机容量(单位：GW):")
print(state_gw)

# 汇总全美总装机容量（GW）
generation_gw = state_analysis_df['generation_gw_max'].sum()
generation_gw

import numpy as np
import scipy.stats as stats

# 这10个数字是在0.17效率、100%土地利用率下，多情景下的减排能力（单位：假设为Gt CO2）
reduction_samples = np.array([87.78, 29.87, 45.37, 70.6, 68.11, 82.65, 60.09, 65.52, 52.85, 65.47])

# 参数估计：均值、标准差、95%置信区间
mean_reduction = np.mean(reduction_samples)
std_reduction = np.std(reduction_samples, ddof=1)
n = len(reduction_samples)
confidence = 0.95
t_critical = stats.t.ppf((1 + confidence) / 2, n - 1)
margin_of_error = t_critical * (std_reduction / np.sqrt(n))
ci_lower = mean_reduction - margin_of_error
ci_upper = mean_reduction + margin_of_error

print(f"多情景下减排能力的参数估计（单位：与原始一致）：")
print(f"样本均值: {mean_reduction:.2f}")
print(f"标准差: {std_reduction:.2f}")
print(f"{int(confidence*100)}%置信区间: [{ci_lower:.2f}, {ci_upper:.2f}]")

4,453 trillion Btu 等于 1.31 T kWh
全美年均发电量(单位：T kWh yr)： 10.74
全美撂荒地发电潜力约为4,453 trillion Btu的8.23倍
各州年均发电量(单位：T kWh yr):
        State_name  annual_generation_tw
0            Texas              1.268097
1       California              0.731503
2         Illinois              0.621968
3          Georgia              0.567661
4         Michigan              0.493270
5          Indiana              0.462840
6        Wisconsin              0.421527
7          Montana              0.389981
8   North Carolina              0.377429
9          Florida              0.357419
10            Ohio              0.323148
11        Colorado              0.298077
12            Utah              0.292762
13       Louisiana              0.283926
14           Idaho              0.268704
15     Mississippi              0.235324
16          Kansas              0.228949
17            Iowa              0.214051
18         Alabama              0.210146
19        Missouri              0.203136
20       Minnesota  

In [6]:
# 统计各州的碳减排量 'pv_potential_total(t)'
# 按州分组汇总，并将单位从C（t）换算为CO2当量（t），再转换为10^9吨（Gt）

if 'State_name' not in merged_data_for_analysis.columns:
    merged_data_for_analysis = merged_data_for_analysis.merge(
        pixel_with_states[['lat', 'lon', 'NAME']],
        left_on=['lat', 'lon'],
        right_on=['lat', 'lon'],
        how='left'
    )
    merged_data_for_analysis.rename(columns={'NAME': 'State_name'}, inplace=True)

# 注意：'pv_potential_dens'是密度单位，需要乘以'area_m2'
merged_data_for_analysis['pv_potential_total_sumC'] = merged_data_for_analysis['pv_potential_dens'] * merged_data_for_analysis['area_m2'] /10000

state_co2_reduction = (
    merged_data_for_analysis
    .groupby('State_name')['pv_potential_total_sumC']
    .sum()
    .reset_index(name='total_co2_reduction_C_t')
)

# 先除以0.27，将C转为CO2 (t)，再转换为Gt（1Gt = 1e9 t）
state_co2_reduction['total_co2_reduction_Gt'] = state_co2_reduction['total_co2_reduction_C_t'] / 0.27 / 1e9
state_co2_reduction = state_co2_reduction.sort_values('total_co2_reduction_Gt', ascending=False)

print("各州总二氧化碳减排量（单位：Gt）:")
print(state_co2_reduction[['State_name', 'total_co2_reduction_Gt']])

# 计算全美总二氧化碳减排量（Gt）
total_co2_reduction_gt = state_co2_reduction['total_co2_reduction_Gt'].sum()
print(f"全美总二氧化碳减排量（单位：Gt）: {total_co2_reduction_gt}")

各州总二氧化碳减排量（单位：Gt）:
        State_name  total_co2_reduction_Gt
40           Texas               10.368143
3       California                5.980871
10        Illinois                5.085296
8          Georgia                4.641274
19        Michigan                4.033047
11         Indiana                3.784245
46       Wisconsin                3.446467
23         Montana                3.188543
30  North Carolina                3.085909
7          Florida                2.922312
32            Ohio                2.642105
4         Colorado                2.437119
41            Utah                2.393661
15       Louisiana                2.321420
9            Idaho                2.196961
21     Mississippi                1.924046
13          Kansas                1.871916
12            Iowa                1.750110
0          Alabama                1.718183
22        Missouri                1.660865
20       Minnesota                1.650718
44      Washington                1

In [16]:
# LNCS_expect单位是 Mg C/ha（兆克碳每公顷），计算总体与各州LNCS_expect换算的CO2（Gt）

# Step 1: 计算每像素的LNCS_expect*area（总Mg C），并加到表中
merged_data_for_analysis['LNCS_expect_sumC_Mg'] = merged_data_for_analysis['LNCS_expect'] * merged_data_for_analysis['area_m2'] / 10000  # Mg C

# Step 2: 按州统计
state_lncsexpect_c = (
    merged_data_for_analysis
    .groupby('State_name')['LNCS_expect_sumC_Mg']
    .sum()
    .reset_index(name='total_lncsexpect_c_Mg')
)
state_lncsexpect_c['total_lncsexpect_co2_Mg'] = state_lncsexpect_c['total_lncsexpect_c_Mg'] / 0.27
state_lncsexpect_c['total_lncsexpect_co2_Gt'] = state_lncsexpect_c['total_lncsexpect_co2_Mg'] / 1e9
state_lncsexpect_c = state_lncsexpect_c.sort_values('total_lncsexpect_co2_Gt', ascending=False)

print("各州总LNCS_expect CO2当量（单位：Gt）:")
print(state_lncsexpect_c[['State_name', 'total_lncsexpect_co2_Gt']])

# Step 3: 计算全美总LNCS_expect CO2当量（Gt）
usa_total_c_Mg = merged_data_for_analysis['LNCS_expect_sumC_Mg'].sum()
usa_total_co2_Mg = usa_total_c_Mg / 0.27
usa_total_co2_Gt = usa_total_co2_Mg / 1e9
print(f"全美总LNCS_expect CO2当量（单位：Gt）: {usa_total_co2_Gt}")

# Step 4: 计算total_co2_reduction_gt是全美LNCS_expect CO2当量的倍数，并计算二者的差值
if 'total_co2_reduction_gt' in locals():
    multiple = total_co2_reduction_gt / usa_total_co2_Gt
    diff = total_co2_reduction_gt - usa_total_co2_Gt
    print(f"此前基于pv_potential_total(t)的全美CO2减排总量({total_co2_reduction_gt:.3f} Gt)是LNCS_expect ({usa_total_co2_Gt:.3f} Gt) 的 {multiple:.2f} 倍。")
    print(f"两者的差值为: {diff:.3f} Gt")
else:
    print("变量total_co2_reduction_gt未定义。")

# Step 5: 如果用 mean_reduction 的估算结果（与 1-63 行一致）进行对比
# mean_reduction 单位同 reduction_samples，是 Gt CO2？（假定为Gt CO2，见1-63细节）
if 'mean_reduction' in locals():
    print(f"\n===== Step 5: 使用 mean_reduction 结果对比 =====")
    # total LNCS_expect CO2 （Gt）已在上面算出：usa_total_co2_Gt
    mean_reduction_value = mean_reduction
    if usa_total_co2_Gt != 0:
        multiple2 = mean_reduction_value / usa_total_co2_Gt
        diff2 = mean_reduction_value - usa_total_co2_Gt
        print(f"mean_reduction ({mean_reduction_value:.3f} Gt) 是 LNCS_expect CO2 ({usa_total_co2_Gt:.3f} Gt) 的 {multiple2:.2f} 倍。")
        print(f"两者的差值为: {diff2:.3f} Gt")
    else:
        print("LNCS_expect CO2总量为0，无法比较。")
else:
    print("变量 mean_reduction 未定义，无法 step5 对比。")

各州总LNCS_expect CO2当量（单位：Gt）:
        State_name  total_lncsexpect_co2_Gt
40           Texas                 0.546261
7          Florida                 0.331057
19        Michigan                 0.278361
10        Illinois                 0.266451
11         Indiana                 0.217256
46       Wisconsin                 0.200165
3       California                 0.176074
8          Georgia                 0.175706
32            Ohio                 0.167307
30  North Carolina                 0.148327
23         Montana                 0.127794
15       Louisiana                 0.118455
20       Minnesota                 0.104682
12            Iowa                 0.092879
41            Utah                 0.086978
9            Idaho                 0.076749
22        Missouri                 0.074899
21     Mississippi                 0.074595
35    Pennsylvania                 0.073342
44      Washington                 0.073023
37  South Carolina                 0.071328
4  

In [18]:
# 是实现1.5℃目标百分之多少
print(diff / 250)
print(diff2 / 250)

0.33398735042539845
0.23419489424838286


In [9]:
# -*- coding: utf-8 -*-
import xarray as xr
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import box
from shapely.validation import make_valid
from rasterstats import zonal_stats
from rasterio.transform import from_origin
from scipy.spatial import cKDTree

# -----------------------------
# 路径参数
# -----------------------------
NC_PATH   = r'figure\draw_shp\Base_Pot_AGB_BGB_SOC_MgCha_500m.nc'
SHP_PATH  = r'data\cb_2018_us_state_500k.shp'
MERGED_CSV= r'data/US_data/df_merged_data_for_analysis.csv'

# -----------------------------
# 读取数据
# -----------------------------
ds = xr.open_dataset(NC_PATH)
var_name = 'Base_Pot_AGB_BGB_SOC_MgCha_500m'
carbon_var = ds[var_name] if var_name in ds else list(ds.data_vars.values())[0]
us_states = gpd.read_file(SHP_PATH).to_crs('EPSG:4326')

# 美国本土范围
usa_bbox = dict(lon_min=-125, lon_max=-65, lat_min=25, lat_max=49)
buffer_deg = 2
lon_min = usa_bbox['lon_min'] - buffer_deg
lon_max = usa_bbox['lon_max'] + buffer_deg
lat_min = usa_bbox['lat_min'] - buffer_deg
lat_max = usa_bbox['lat_max'] + buffer_deg

# -----------------------------
# 预裁剪栅格
# -----------------------------
lat = carbon_var['lat'].values
lon = carbon_var['lon'].values
if lat[0] < lat[-1]:
    carbon_sub = carbon_var.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, lon_max))
    flip_needed = True
else:
    carbon_sub = carbon_var.sel(lat=slice(lat_max, lat_min), lon=slice(lon_min, lon_max))
    flip_needed = False

arr = np.asarray(carbon_sub.squeeze().values, dtype=np.float32)
lat_sub = carbon_sub['lat'].values
lon_sub = carbon_sub['lon'].values
if arr.ndim > 2:
    arr = arr.squeeze()

if arr.size == 0 or len(lat_sub) < 2 or len(lon_sub) < 2:
    raise ValueError("裁剪后数据为空或网格分辨率不足")

# -----------------------------
# 构造 transform
# -----------------------------
res_lat = float(abs(lat_sub[1] - lat_sub[0]))
res_lon = float(abs(lon_sub[1] - lon_sub[0]))

if flip_needed:
    arr_rio = arr[::-1, :]
    lat_north = float(lat_sub.max())
else:
    arr_rio = arr
    lat_north = float(lat_sub[0])

west  = float(lon_sub.min()) - res_lon / 2.0
north = lat_north + res_lat / 2.0
transform = from_origin(west, north, res_lon, res_lat)

# -----------------------------
# 裁剪州边界
# -----------------------------
bbox_geom = box(usa_bbox['lon_min'], usa_bbox['lat_min'], usa_bbox['lon_max'], usa_bbox['lat_max'])
states_clip = us_states.clip(bbox_geom).copy()
states_clip = states_clip[states_clip.geometry.notnull() & ~states_clip.geometry.is_empty]

state_geoms = []
state_names = []
for name, geom in zip(states_clip['NAME'], states_clip['geometry']):
    try:
        gv = make_valid(geom)
    except Exception:
        gv = geom.buffer(0)
    if gv.is_empty:
        continue
    state_geoms.append(gv)
    state_names.append(name)

if not state_geoms:
    raise RuntimeError("没有有效的州几何")

# -----------------------------
# 全国像元分位数与区间分布
# -----------------------------
vals_all = arr_rio.ravel()
vals_all = vals_all[~np.isnan(vals_all)]
if vals_all.size == 0:
    raise ValueError("美国本土范围内没有有效像元")

quantiles = np.arange(0.1, 1.01, 0.1)
q_vals = np.quantile(vals_all, quantiles)
bins = [vals_all.min()] + list(q_vals)
labels = [f'{i*10}-{(i+1)*10}%' for i in range(10)]

binned = pd.cut(vals_all, bins=bins, labels=labels, include_lowest=True, duplicates='drop')
bin_counts = binned.value_counts().sort_index()
percentile_distribution = pd.DataFrame({
    'Percentile_Range': bin_counts.index,
    'Pixel_Count': bin_counts.values,
    'Percentage': (bin_counts.values / vals_all.size * 100).round(2),
    'Lower_Bound': bins[:-1],
    'Upper_Bound': bins[1:]
})

print("碳储量分位数区间分布：")
print(percentile_distribution)

# -----------------------------
# 州级统计
# -----------------------------
pct_stats = [f'percentile_{int(q*100)}' for q in quantiles]
stats_list = ['mean'] + pct_stats

zs = zonal_stats(
    state_geoms,
    arr_rio,
    affine=transform,
    stats=stats_list,
    nodata=np.nan,
    all_touched=True
)

state_carbon_stats = pd.DataFrame([
    {'NAME': n, **{k: z.get(k, np.nan) for k in stats_list}}
    for n, z in zip(state_names, zs)
])

print(f"\n各州碳储量统计（前10名）：")
print(state_carbon_stats[['NAME','mean']].sort_values('mean', ascending=False).head(10))

# -----------------------------
# LNCS_expect 各分位数组的碳储量分布交叉分析
# -----------------------------
merged = pd.read_csv(MERGED_CSV)

if all(col in merged.columns for col in ['LNCS_expect', 'lat', 'lon']):
    # 1. 计算 LNCS_expect 的分位数并分组
    lncs_vals = merged['LNCS_expect'].dropna()
    lncs_q_vals = np.quantile(lncs_vals, quantiles)
    lncs_bins = [lncs_vals.min()] + list(lncs_q_vals)
    lncs_labels = [f'{i*10}-{(i+1)*10}%' for i in range(10)]
    
    merged_valid = merged.dropna(subset=['LNCS_expect', 'lat', 'lon']).copy()
    merged_valid['LNCS_group'] = pd.cut(
        merged_valid['LNCS_expect'], 
        bins=lncs_bins, 
        labels=lncs_labels, 
        include_lowest=True, 
        duplicates='drop'
    )
    
    # 2. 提取每个站点对应的碳储量值（利用最近邻填充nan）
    print("\n提取站点碳储量值...")

    valid_mask = ~np.isnan(arr_rio)
    if not valid_mask.any():
        raise ValueError("栅格中没有任何有效像元，无法构建 KDTree")
    lat_grid, lon_grid = np.meshgrid(carbon_sub['lat'].values,
                                 carbon_sub['lon'].values,
                                 indexing='ij')
    valid_points = np.column_stack([
        lat_grid[valid_mask],
        lon_grid[valid_mask]
    ])
    valid_values = arr_rio[valid_mask].astype(float)
    tree = cKDTree(valid_points)

    print("\n提取站点碳储量值（KDTree 最近邻）...")
    carbon_vals = []
    for _, row in merged_valid.iterrows():
        if np.isnan(row['lat']) or np.isnan(row['lon']):
            carbon_vals.append(None)
            continue
        dist, idx = tree.query([row['lat'], row['lon']])
        carbon_vals.append(float(valid_values[idx]))

    merged_valid['carbon_val'] = carbon_vals
    nan_count = merged_valid['carbon_val'].isna().sum()
    print(f"carbon_val 中的 NaN 个数: {nan_count}")

    
    merged_valid = merged_valid.dropna(subset=['carbon_val'])
    
    # 3. 将碳储量值分到碳储量分位数区间
    merged_valid['carbon_group'] = pd.cut(
        merged_valid['carbon_val'], 
        bins=bins, 
        labels=labels, 
        include_lowest=True, 
        duplicates='drop'
    )
    
    # 4. 交叉统计表
    cross_tab = pd.crosstab(
        merged_valid['LNCS_group'], 
        merged_valid['carbon_group'], 
        normalize='index'
    ) * 100
    cross_tab = cross_tab.round(2)
    
    print(f"\nLNCS_expect 各分位数组的碳储量分布（%）(n={len(merged_valid):,}):")
    print(cross_tab)
    
    # 5. 汇总统计
    lncs_carbon_summary = []
    for lncs_grp in lncs_labels:
        group_data = merged_valid[merged_valid['LNCS_group'] == lncs_grp]
        if len(group_data) > 0:
            lncs_carbon_summary.append({
                'LNCS_Group': lncs_grp,
                'Site_Count': len(group_data),
                'Mean_Carbon': group_data['carbon_val'].mean().round(2),
                'Median_Carbon': group_data['carbon_val'].median().round(2)
            })
    
    lncs_carbon_summary_df = pd.DataFrame(lncs_carbon_summary)
    print(f"\nLNCS_expect 分组汇总:")
    print(lncs_carbon_summary_df)
    
else:
    print("\n提示：缺少必要列（LNCS_expect, lat, lon）")

碳储量分位数区间分布：
  Percentile_Range  Pixel_Count  Percentage  Lower_Bound  Upper_Bound
0            0-10%      1581612       10.08          0.0        114.0
1           10-20%      1560746        9.94        114.0        155.0
2           20-30%      1585471       10.10        155.0        195.0
3           30-40%      1566217        9.98        195.0        223.0
4           40-50%      1579965       10.06        223.0        248.0
5           50-60%      1554234        9.90        248.0        273.0
6           60-70%      1606019       10.23        273.0        301.0
7           70-80%      1533153        9.77        301.0        334.0
8           80-90%      1561310        9.95        334.0        385.0
9          90-100%      1568938        9.99        385.0       5391.0

各州碳储量统计（前10名）：
             NAME        mean
14        Florida  471.658148
46        Vermont  390.765548
47  New Hampshire  382.867005
40     Washington  362.591068
44  Massachusetts  355.409418
43   Rhode Island  354

In [10]:
cross_tab

carbon_group,0-10%,10-20%,20-30%,30-40%,40-50%,50-60%,60-70%,70-80%,80-90%,90-100%
LNCS_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0-10%,34.56,54.45,6.8,2.09,1.29,0.36,0.23,0.06,0.17,0.0
10-20%,0.04,8.73,42.82,22.46,18.6,5.66,1.32,0.31,0.06,0.0
20-30%,0.0,0.0,19.29,36.9,26.59,13.45,3.07,0.65,0.04,0.0
30-40%,0.01,0.0,2.87,35.93,30.0,22.41,7.41,1.02,0.33,0.03
40-50%,0.0,0.01,0.01,14.66,27.47,33.42,19.89,3.87,0.57,0.1
50-60%,0.18,0.0,0.01,0.64,15.06,31.21,35.76,13.81,3.01,0.31
60-70%,0.03,0.0,0.0,0.0,1.76,17.33,34.99,33.35,11.06,1.48
70-80%,0.06,0.0,0.0,0.0,0.0,3.33,20.49,37.78,33.37,4.98
80-90%,0.16,0.0,0.0,0.0,0.0,0.01,3.27,17.86,58.42,20.29
90-100%,0.03,0.0,0.0,0.0,0.0,0.01,0.0,0.01,12.6,87.35


In [21]:
# 三维交叉分析：PV分位数 → LNCS分位数 → 碳储量分位数
# -----------------------------
if all(col in merged.columns for col in ['pv_potential_dens', 'LNCS_expect', 'lat', 'lon']):
    print("\n" + "="*70)
    print("三维交叉分析：PV → LNCS → Carbon")
    print("="*70)
    
    # 1. 计算各指标的分位数并分组
    merged_valid = merged.dropna(subset=['pv_potential_dens', 'LNCS_expect', 'lat', 'lon']).copy()
    
    # PV 分位数分组
    pv_vals = merged_valid['pv_potential_dens']
    pv_q_vals = np.quantile(pv_vals, quantiles)
    pv_bins = [pv_vals.min()] + list(pv_q_vals)
    pv_labels = [f'PV_{i*10}-{(i+1)*10}%' for i in range(10)]
    merged_valid['PV_group'] = pd.cut(pv_vals, bins=pv_bins, labels=pv_labels, include_lowest=True, duplicates='drop')
    
    # LNCS 分位数分组
    lncs_vals = merged_valid['LNCS_expect']
    lncs_q_vals = np.quantile(lncs_vals, quantiles)
    lncs_bins = [lncs_vals.min()] + list(lncs_q_vals)
    lncs_labels = [f'LNCS_{i*10}-{(i+1)*10}%' for i in range(10)]
    merged_valid['LNCS_group'] = pd.cut(lncs_vals, bins=lncs_bins, labels=lncs_labels, include_lowest=True, duplicates='drop')
    
    # 2. 提取碳储量值
    print("提取碳储量值...")

    valid_mask = ~np.isnan(arr_rio)
    if not valid_mask.any():
        raise ValueError("栅格中没有任何有效像元，无法构建 KDTree")
    lat_grid, lon_grid = np.meshgrid(carbon_sub['lat'].values,
                                 carbon_sub['lon'].values,
                                 indexing='ij')
    valid_points = np.column_stack([
        lat_grid[valid_mask],
        lon_grid[valid_mask]
    ])
    valid_values = arr_rio[valid_mask].astype(float)
    tree = cKDTree(valid_points)

    print("\n提取站点碳储量值（KDTree 最近邻）...")
    carbon_vals = []
    for _, row in merged_valid.iterrows():
        if np.isnan(row['lat']) or np.isnan(row['lon']):
            carbon_vals.append(None)
            continue
        dist, idx = tree.query([row['lat'], row['lon']])
        carbon_vals.append(float(valid_values[idx]))

    merged_valid['carbon_val'] = carbon_vals
    nan_count = merged_valid['carbon_val'].isna().sum()
    print(f"carbon_val 中的 NaN 个数: {nan_count}")
    
    # Carbon 分位数分组（使用全局bins）
    carbon_labels = [f'C_{i*10}-{(i+1)*10}%' for i in range(10)]
    merged_valid['Carbon_group'] = pd.cut(merged_valid['carbon_val'], bins=bins, labels=carbon_labels, include_lowest=True, duplicates='drop')
    
    print(f"有效样本数: {len(merged_valid):,}")
    
    # 3. 生成三维交叉统计
    # 方法1: 完整的三维交叉表（可能很大）
    three_way_counts = merged_valid.groupby(['PV_group', 'LNCS_group', 'Carbon_group'], observed=True).size().reset_index(name='Count')
    three_way_counts = three_way_counts.sort_values(['PV_group', 'LNCS_group', 'Carbon_group'])
    
    print("\n完整三维交叉表（前20行）:")
    print(three_way_counts.head(20))
    
    # 4. 分层分析：PV → LNCS → Carbon
    print("\n" + "-"*70)
    print("分层分析：各PV分位数组内的 LNCS-Carbon 关系")
    print("-"*70)
    
    for pv_grp in pv_labels[:3]:  # 示例：只显示前3个PV组
        pv_data = merged_valid[merged_valid['PV_group'] == pv_grp]
        if len(pv_data) > 0:
            cross_tab = pd.crosstab(
                pv_data['LNCS_group'], 
                pv_data['Carbon_group'], 
                normalize='index'
            ) * 100
            
            print(f"\n{pv_grp} (n={len(pv_data):,}):")
            print(cross_tab.round(1))
    
    # 5. 汇总统计：每个PV-LNCS组合的平均碳储量
    summary_3d = merged_valid.groupby(['PV_group', 'LNCS_group'], observed=True).agg({
        'carbon_val': ['count', 'mean', 'median'],
        'pv_potential_dens': 'mean',
        'LNCS_expect': 'mean'
    }).round(2)
    summary_3d.columns = ['Site_Count', 'Mean_Carbon', 'Median_Carbon', 'Mean_PV', 'Mean_LNCS']
    summary_3d = summary_3d.reset_index()
    
    print("\n" + "-"*70)
    print("PV-LNCS组合的碳储量统计（前20组）:")
    print("-"*70)
    print(summary_3d.head(20))
    
    # 6. 关键洞察：高PV高LNCS的碳储量分布，以及高PV&高LNCS&高Carbon的分布

    # 高PV + 高LNCS
    high_pv_high_lncs = merged_valid[
        (merged_valid['PV_group'].isin([pv_labels[-3], pv_labels[-2], pv_labels[-1]])) &  # Top 30% PV
        (merged_valid['LNCS_group'].isin([lncs_labels[-3], lncs_labels[-2], lncs_labels[-1]]))  # Top 30% LNCS
    ]
    
    if len(high_pv_high_lncs) > 0:
        high_carbon_dist = high_pv_high_lncs['Carbon_group'].value_counts(normalize=True).sort_index() * 100
        
        print("\n" + "="*70)
        print(f"关键发现：高PV & 高LNCS站点的碳储量分布 (n={len(high_pv_high_lncs):,}):")
        print("="*70)
        for carbon_grp, pct in high_carbon_dist.items():
            print(f"  {carbon_grp}: {pct:.1f}%")
        
        print(f"\n  平均碳储量: {high_pv_high_lncs['carbon_val'].mean():.2f} MgC/ha")
        print(f"  中位碳储量: {high_pv_high_lncs['carbon_val'].median():.2f} MgC/ha")
        print(f"  平均PV潜力密度: {high_pv_high_lncs['pv_potential_dens'].mean():.2f}")
        print(f"  平均LNCS期望值: {high_pv_high_lncs['LNCS_expect'].mean():.2f}")


        
        # 额外统计：高PV&高LNCS&高Carbon分布（top 30% carbon，即最后3组label）
        carbon_top_labels = [carbon_labels[-3], carbon_labels[-2], carbon_labels[-1]]
        high_pv_high_lncs_high_carbon = high_pv_high_lncs[high_pv_high_lncs['Carbon_group'].isin(carbon_top_labels)]
        high_count = len(high_pv_high_lncs_high_carbon)
        percent_high = high_count / len(high_pv_high_lncs) * 100 if len(high_pv_high_lncs) > 0 else 0
        
        print("\n" + "="*70)
        print(f"高PV & 高LNCS & 高Carbon站点（top 30% carbon）：")
        print(f"数量: {high_count:,}，占高PV&高LNCS站点比例: {percent_high:.1f}%")
        if high_count > 0:
            print(f"  平均碳储量: {high_pv_high_lncs_high_carbon['carbon_val'].mean():.2f} MgC/ha")
            print(f"  中位碳储量: {high_pv_high_lncs_high_carbon['carbon_val'].median():.2f} MgC/ha")
    

    
else:
    print("\n提示：缺少必要列")


三维交叉分析：PV → LNCS → Carbon
提取碳储量值...

提取站点碳储量值（KDTree 最近邻）...
carbon_val 中的 NaN 个数: 0
有效样本数: 70,337

完整三维交叉表（前20行）:
    PV_group   LNCS_group Carbon_group  Count
0   PV_0-10%   LNCS_0-10%      C_0-10%     25
1   PV_0-10%   LNCS_0-10%     C_10-20%      2
2   PV_0-10%   LNCS_0-10%     C_20-30%      1
3   PV_0-10%   LNCS_0-10%     C_30-40%      2
4   PV_0-10%   LNCS_0-10%     C_40-50%      2
5   PV_0-10%  LNCS_10-20%     C_10-20%      8
6   PV_0-10%  LNCS_10-20%     C_20-30%      1
7   PV_0-10%  LNCS_10-20%     C_30-40%     49
8   PV_0-10%  LNCS_10-20%     C_40-50%    142
9   PV_0-10%  LNCS_10-20%     C_50-60%     14
10  PV_0-10%  LNCS_10-20%     C_60-70%      2
11  PV_0-10%  LNCS_20-30%     C_20-30%      3
12  PV_0-10%  LNCS_20-30%     C_30-40%      5
13  PV_0-10%  LNCS_20-30%     C_40-50%    223
14  PV_0-10%  LNCS_20-30%     C_50-60%    144
15  PV_0-10%  LNCS_20-30%     C_60-70%      8
16  PV_0-10%  LNCS_30-40%     C_20-30%      5
17  PV_0-10%  LNCS_30-40%     C_40-50%     72
18  PV_0-1

In [12]:
len(high_pv_high_lncs) / len(merged_valid)

0.06202994156702731

In [13]:
# 桑基图：PV → LNCS → Carbon 三级流向
import plotly.graph_objects as go

if 'merged_valid' in locals() and len(merged_valid) > 0:
    # 1. 将10个分位数区间合并为3个大区间
    def merge_to_3groups(group_col):
        """将10个分位数组合并为低(0-30%)、中(30-70%)、高(70-100%)"""
        mapping = {}
        for i in range(10):
            old_label = group_col.cat.categories[i]
            if i < 3:
                mapping[old_label] = 'Low'
            elif i < 7:
                mapping[old_label] = 'Mid'
            else:
                mapping[old_label] = 'High'
        return group_col.map(mapping)
    
    df_sankey = merged_valid.copy()
    df_sankey['PV_3group'] = merge_to_3groups(df_sankey['PV_group'])
    df_sankey['LNCS_3group'] = merge_to_3groups(df_sankey['LNCS_group'])
    df_sankey['Carbon_3group'] = merge_to_3groups(df_sankey['Carbon_group'])
    
    # 2. 计算流量
    flow_pv_lncs = df_sankey.groupby(['PV_3group', 'LNCS_3group']).size().reset_index(name='count')
    flow_lncs_carbon = df_sankey.groupby(['LNCS_3group', 'Carbon_3group']).size().reset_index(name='count')
    
    # 3. 创建节点
    nodes = []
    node_labels = []
    node_colors = []
    
    pv_groups = ['Low', 'Mid', 'High']
    for g in pv_groups:
        nodes.append(f'PV_{g}')
        node_labels.append(f'PV\n{g}')
        node_colors.append('#e74c3c' if g=='Low' else '#f39c12' if g=='Mid' else '#27ae60')
    
    lncs_groups = ['Low', 'Mid', 'High']
    for g in lncs_groups:
        nodes.append(f'LNCS_{g}')
        node_labels.append(f'LNCS\n{g}')
        node_colors.append('#3498db' if g=='Low' else '#9b59b6' if g=='Mid' else '#e67e22')
    
    carbon_groups = ['Low', 'Mid', 'High']
    for g in carbon_groups:
        nodes.append(f'Carbon_{g}')
        node_labels.append(f'Carbon\n{g}')
        node_colors.append('#95a5a6' if g=='Low' else '#34495e' if g=='Mid' else '#2c3e50')
    
    # 4. 节点索引映射
    node_dict = {name: i for i, name in enumerate(nodes)}
    
    # 5. 创建连接
    sources = []
    targets = []
    values = []
    link_colors = []
    
    for _, row in flow_pv_lncs.iterrows():
        source_idx = node_dict[f"PV_{row['PV_3group']}"]
        target_idx = node_dict[f"LNCS_{row['LNCS_3group']}"]
        sources.append(source_idx)
        targets.append(target_idx)
        values.append(row['count'])
        link_colors.append('rgba(150, 150, 150, 0.3)')
    
    for _, row in flow_lncs_carbon.iterrows():
        source_idx = node_dict[f"LNCS_{row['LNCS_3group']}"]
        target_idx = node_dict[f"Carbon_{row['Carbon_3group']}"]
        sources.append(source_idx)
        targets.append(target_idx)
        values.append(row['count'])
        link_colors.append('rgba(100, 100, 100, 0.3)')
    
    # 6. 创建桑基图
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=20,
            thickness=30,
            line=dict(color='white', width=2),
            label=node_labels,
            color=node_colors,
            x=[0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.9, 0.9, 0.9],
            y=[0.1, 0.5, 0.9, 0.1, 0.5, 0.9, 0.1, 0.5, 0.9]
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors
        )
    )])
    
    fig.update_layout(
        title=f"PV → LNCS → Carbon 三级流向关系 (n={len(df_sankey):,})",
        font=dict(size=12),
        height=600,
        width=1000
    )
    
    # 使用浏览器渲染（会自动打开浏览器）
    fig.show(renderer='browser')
    
    # 7. 输出统计摘要
    print("\n流向统计摘要:")
    print("-" * 60)
    
    pv_dist = df_sankey['PV_3group'].value_counts().sort_index()
    print(f"\nPV分布:")
    for g, cnt in pv_dist.items():
        print(f"  {g:4s}: {cnt:6,} ({cnt/len(df_sankey)*100:5.1f}%)")
    
    high_pv = df_sankey[df_sankey['PV_3group'] == 'High']
    if len(high_pv) > 0:
        lncs_dist = high_pv['LNCS_3group'].value_counts(normalize=True).sort_index() * 100
        print(f"\n高PV站点的LNCS分布:")
        for g, pct in lncs_dist.items():
            print(f"  → LNCS {g:4s}: {pct:5.1f}%")
        
        high_pv_high_lncs = high_pv[high_pv['LNCS_3group'] == 'High']
        if len(high_pv_high_lncs) > 0:
            carbon_dist = high_pv_high_lncs['Carbon_3group'].value_counts(normalize=True).sort_index() * 100
            print(f"\n高PV & 高LNCS站点的Carbon分布:")
            for g, pct in carbon_dist.items():
                print(f"  → Carbon {g:4s}: {pct:5.1f}%")
    
else:
    print("\n提示：请先运行三维交叉分析代码")


流向统计摘要:
------------------------------------------------------------

PV分布:
  High: 21,101 ( 30.0%)
  Low : 21,101 ( 30.0%)
  Mid : 28,135 ( 40.0%)

高PV站点的LNCS分布:
  → LNCS High:  20.7%
  → LNCS Low :  53.0%
  → LNCS Mid :  26.3%

高PV & 高LNCS站点的Carbon分布:
  → Carbon High:  87.2%
  → Carbon Low :   0.0%
  → Carbon Mid :  12.8%


## 4、Economic anlysis


In [3]:
import pandas as pd 
import numpy as np 
from scipy import stats

def analyze_policy_paths_by_pixel_with_ci(df_economic, target_year=2050, confidence_level=0.95):
    """精简版政策路径分析，包含95%置信区间评估"""
    
    def match_policy(row):
        if 'P1' in str(row):
            return 'P1'
        elif 'P2' in str(row):
            return 'P2'
        elif 'P3' in str(row):
            return 'P3'
        else:
            return 'P4'
    
    if 'policy_group' not in df_economic.columns:
        df_economic['policy_group'] = df_economic['policy_category'].astype(str).apply(match_policy)
    
    df_target_year = df_economic[df_economic['analysis_year'] == target_year].copy()
    unique_policies = sorted(df_target_year['policy_category'].unique())
    
    alpha = 1 - confidence_level
    results = {}
    
    # ========== 统计每个pixel的各政策小组数量 ==========
    pixel_policy_counts = df_target_year.groupby(['lat', 'lon', 'policy_group']).size().unstack(fill_value=0)
    pixel_policy_counts = pixel_policy_counts.reset_index()
    pixel_policy_counts.columns.name = None
    
    # 大组分析（P1-P4）
    for policy in ['P1', 'P2', 'P3', 'P4']:
        policy_data = df_target_year[df_target_year['policy_group'] == policy]
        
        if len(policy_data) > 0:
            pixel_stats = policy_data.groupby(['lat', 'lon']).agg({
                'net_npv_usd': ['count', 'mean', 'max', 'min', 'std']
            }).round(2)
            pixel_stats.columns = ['path_count', 'mean_npv', 'max_npv', 'min_npv', 'std_npv']
            pixel_stats = pixel_stats.reset_index()
            
            positive_paths = policy_data[policy_data['net_npv_usd'] > 0]
            positive_pixel_count = positive_paths.groupby(['lat', 'lon']).size().reset_index(name='positive_paths')
            final_stats = pixel_stats.merge(positive_pixel_count, on=['lat', 'lon'], how='left').fillna(0)
            
            # 合并政策小组数量统计
            final_stats = final_stats.merge(
                pixel_policy_counts[['lat', 'lon', 'P1', 'P2', 'P3', 'P4']], 
                on=['lat', 'lon'], 
                how='left'
            ).fillna(0)
            
            positive_mean_pixels = (final_stats['mean_npv'] > 0).sum()
            positive_max_pixels = (final_stats['max_npv'] > 0).sum()
            
            # 计算均值和标准差
            mean_npv = final_stats['mean_npv'].mean()
            std_npv = final_stats['mean_npv'].std()
            n = len(final_stats)
            
            # 标准误差 (Standard Error of Mean)
            sem = std_npv / np.sqrt(n)
            
            # T分布置信区间
            df_freedom = n - 1
            t_critical = stats.t.ppf(1 - alpha/2, df_freedom)
            
            # 置信区间半宽（margin of error）
            margin_of_error = t_critical * sem
            
            ci_lower = mean_npv - margin_of_error
            ci_upper = mean_npv + margin_of_error
            
            # 相对误差（置信区间宽度相对于均值的百分比）
            relative_error = (margin_of_error / abs(mean_npv) * 100) if mean_npv != 0 else np.inf
            
            cv = (std_npv / mean_npv) if mean_npv != 0 else 0
            
            # 统计每个pixel的政策小组数量分布
            policy_count_stats = {
                'pixels_with_P1': (final_stats['P1'] > 0).sum(),
                'pixels_with_P2': (final_stats['P2'] > 0).sum(),
                'pixels_with_P3': (final_stats['P3'] > 0).sum(),
                'pixels_with_P4': (final_stats['P4'] > 0).sum(),
                'avg_P1_per_pixel': final_stats['P1'].mean(),
                'avg_P2_per_pixel': final_stats['P2'].mean(),
                'avg_P3_per_pixel': final_stats['P3'].mean(),
                'avg_P4_per_pixel': final_stats['P4'].mean()
            }
            
            results[policy] = {
                'total_pixels': n,
                'mean_npv': mean_npv,
                'std_npv': std_npv,
                'sem': sem,
                'margin_of_error': margin_of_error,  # ± 后面的值
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'relative_error_pct': relative_error,
                't_critical': t_critical,
                'df': df_freedom,
                'positive_mean_ratio': (positive_mean_pixels / n * 100) if n > 0 else 0,
                'positive_max_ratio': (positive_max_pixels / n * 100) if n > 0 else 0,
                'cv': cv,
                'max_npv': final_stats['max_npv'].max(),
                'min_npv': final_stats['min_npv'].min(),
                **policy_count_stats  # 添加政策小组数量统计
            }
    
    # 小组分析（具体政策类别）
    detailed_results = {}
    for policy_cat in unique_policies:
        if pd.notna(policy_cat) and str(policy_cat) != 'nan':
            cat_data = df_target_year[df_target_year['policy_category'] == policy_cat]
            
            if len(cat_data) > 0:
                pixel_stats = cat_data.groupby(['lat', 'lon']).agg({
                    'net_npv_usd': ['count', 'mean', 'max', 'min', 'std']
                }).round(2)
                pixel_stats.columns = ['path_count', 'mean_npv', 'max_npv', 'min_npv', 'std_npv']
                pixel_stats = pixel_stats.reset_index()
                
                positive_paths = cat_data[cat_data['net_npv_usd'] > 0]
                positive_pixel_count = positive_paths.groupby(['lat', 'lon']).size().reset_index(name='positive_paths')
                final_stats = pixel_stats.merge(positive_pixel_count, on=['lat', 'lon'], how='left').fillna(0)
                
                # 合并政策小组数量统计
                final_stats = final_stats.merge(
                    pixel_policy_counts[['lat', 'lon', 'P1', 'P2', 'P3', 'P4']], 
                    on=['lat', 'lon'], 
                    how='left'
                ).fillna(0)
                
                positive_mean_pixels = (final_stats['mean_npv'] > 0).sum()
                positive_max_pixels = (final_stats['max_npv'] > 0).sum()
                
                mean_npv = final_stats['mean_npv'].mean()
                std_npv = final_stats['mean_npv'].std()
                n = len(final_stats)
                
                sem = std_npv / np.sqrt(n)
                df_freedom = n - 1
                t_critical = stats.t.ppf(1 - alpha/2, df_freedom)
                
                margin_of_error = t_critical * sem
                ci_lower = mean_npv - margin_of_error
                ci_upper = mean_npv + margin_of_error
                relative_error = (margin_of_error / abs(mean_npv) * 100) if mean_npv != 0 else np.inf
                
                cv = (std_npv / mean_npv) if mean_npv != 0 else 0
                
                # 统计每个pixel的政策小组数量分布
                policy_count_stats = {
                    'pixels_with_P1': (final_stats['P1'] > 0).sum(),
                    'pixels_with_P2': (final_stats['P2'] > 0).sum(),
                    'pixels_with_P3': (final_stats['P3'] > 0).sum(),
                    'pixels_with_P4': (final_stats['P4'] > 0).sum(),
                    'avg_P1_per_pixel': final_stats['P1'].mean(),
                    'avg_P2_per_pixel': final_stats['P2'].mean(),
                    'avg_P3_per_pixel': final_stats['P3'].mean(),
                    'avg_P4_per_pixel': final_stats['P4'].mean()
                }
                
                detailed_results[str(policy_cat)] = {
                    'total_pixels': n,
                    'mean_npv': mean_npv,
                    'std_npv': std_npv,
                    'sem': sem,
                    'margin_of_error': margin_of_error,
                    'ci_lower': ci_lower,
                    'ci_upper': ci_upper,
                    'relative_error_pct': relative_error,
                    't_critical': t_critical,
                    'df': df_freedom,
                    'positive_mean_ratio': (positive_mean_pixels / n * 100) if n > 0 else 0,
                    'positive_max_ratio': (positive_max_pixels / n * 100) if n > 0 else 0,
                    'cv': cv,
                    'max_npv': final_stats['max_npv'].max(),
                    'min_npv': final_stats['min_npv'].min(),
                    **policy_count_stats  # 添加政策小组数量统计
                }
    
    return results, detailed_results
# 执行分析
policy_results, detailed_policy_results = analyze_policy_paths_by_pixel_with_ci(df_economic, target_year=2050)

# 生成政策层级大组对比表（含 ± 格式）
policy_comparison = []
for policy in ['P1', 'P2', 'P3', 'P4']:
    if policy in policy_results:
        stats = policy_results[policy]
        # 创建 mean ± error 格式
        mean_pm_error = f"{stats['mean_npv']:.2f} ± {stats['margin_of_error']:.2f}"
        
        policy_comparison.append({
            '政策层级': policy,
            '总像素数': stats['total_pixels'],
            '均值NPV (±95%CI)': mean_pm_error,
            '均值': round(stats['mean_npv'], 2),
            '±误差': round(stats['margin_of_error'], 2),
            '相对误差(%)': round(stats['relative_error_pct'], 2),
            '均值>0比例(%)': round(stats['positive_mean_ratio'], 2),
            '最优>0比例(%)': round(stats['positive_max_ratio'], 2),
            '标准差': round(stats['std_npv'], 2),
            '变异系数': round(stats['cv'], 3),
            '最大NPV': round(stats['max_npv'], 2),
            '最小NPV': round(stats['min_npv'], 2)
        })

policy_level_df = pd.DataFrame(policy_comparison)

# 生成具体政策类别对比表（含 ± 格式）
detailed_comparison = []
for policy_cat, stats in detailed_policy_results.items():
    mean_pm_error = f"{stats['mean_npv']:.2f} ± {stats['margin_of_error']:.2f}"
    
    detailed_comparison.append({
        '政策类别': policy_cat,
        '总像素数': stats['total_pixels'],
        '均值NPV (±95%CI)': mean_pm_error,
        '均值': round(stats['mean_npv'], 2),
        '±误差': round(stats['margin_of_error'], 2),
        '相对误差(%)': round(stats['relative_error_pct'], 2),
        '均值>0比例(%)': round(stats['positive_mean_ratio'], 2),
        '最优>0比例(%)': round(stats['positive_max_ratio'], 2),
        '标准差': round(stats['std_npv'], 2),
        '变异系数': round(stats['cv'], 3),
        '最大NPV': round(stats['max_npv'], 2),
        '最小NPV': round(stats['min_npv'], 2)
    })

detailed_policy_df = pd.DataFrame(detailed_comparison)

# 输出两个对比表
print("=" * 100)
print("政策层级大组对比表 (含95%置信区间 ± 格式):")
print("=" * 100)
display(policy_level_df)

print("\n" + "=" * 100)
print("具体政策类别对比表 (含95%置信区间 ± 格式):")
print("=" * 100)
display(detailed_policy_df)

# 输出简洁的论文用格式
print("\n" + "=" * 100)
print("论文用格式 (可直接复制):")
print("=" * 100)
for policy in ['P1', 'P2', 'P3', 'P4']:
    if policy in policy_results:
        s = policy_results[policy]
        print(f"{policy}: {s['mean_npv']:.2f} ± {s['margin_of_error']:.2f} USD (n={s['total_pixels']}, 相对误差={s['relative_error_pct']:.2f}%)")

print("\n具体政策类别:")
for policy_cat, s in detailed_policy_results.items():
    print(f"{policy_cat}: {s['mean_npv']:.2f} ± {s['margin_of_error']:.2f} USD (n={s['total_pixels']}, 相对误差={s['relative_error_pct']:.2f}%)")



政策层级大组对比表 (含95%置信区间 ± 格式):


Unnamed: 0,政策层级,总像素数,均值NPV (±95%CI),均值,±误差,相对误差(%),均值>0比例(%),最优>0比例(%),标准差,变异系数,最大NPV,最小NPV
0,P1,70337,-1645507.42 ± 3114.96,-1645507.42,3114.96,0.19,0.06,100.0,421491.84,-0.256,21664629.84,-6481719.0
1,P2,70337,-269874.92 ± 4870.12,-269874.92,4870.12,1.8,29.59,100.0,658985.94,-2.442,27474809.82,-6548864.92
2,P3,70337,2173710.77 ± 7967.94,2173710.77,7967.94,0.37,99.85,100.0,1078158.05,0.496,28488274.13,-3723591.92
3,P4,70337,-2717126.60 ± 2916.04,-2717126.6,2916.04,0.11,0.0,0.0,394575.41,-0.145,-198914.41,-3804076.66



具体政策类别对比表 (含95%置信区间 ± 格式):


Unnamed: 0,政策类别,总像素数,均值NPV (±95%CI),均值,±误差,相对误差(%),均值>0比例(%),最优>0比例(%),标准差,变异系数,最大NPV,最小NPV
0,P1a,70337,-2726258.63 ± 2378.73,-2726258.63,2378.73,0.09,0.0,31.4,321870.39,-0.118,1865763.44,-4410847.06
1,P1b,70337,-2207730.91 ± 2757.92,-2207730.91,2757.92,0.12,0.0,100.0,373179.19,-0.169,15065387.68,-6481719.0
2,P1c,70337,-75866.44 ± 4969.59,-75866.44,4969.59,6.55,35.27,100.0,672444.59,-8.864,21664629.84,-3682397.54
3,P1d,70337,-1444327.44 ± 2593.24,-1444327.44,2593.24,0.18,0.02,72.8,350896.08,-0.243,2738796.65,-5743406.85
4,P2,70337,-759582.12 ± 3542.10,-759582.12,3542.1,0.47,8.23,92.6,479288.87,-0.631,4292145.24,-3108933.76
5,P2a,70337,-23592.40 ± 5207.89,-23592.4,5207.89,22.07,37.14,100.0,704690.59,-29.869,27474809.82,-6548864.92
6,P2c,70337,-2118899.12 ± 3132.14,-2118899.12,3132.14,0.15,0.0,0.42,423816.81,-0.2,966299.38,-3786001.05
7,P3a,70337,-1731335.68 ± 3076.50,-1731335.68,3076.5,0.18,0.03,0.03,416287.6,-0.24,516842.69,-2700912.03
8,P3b,70337,2424489.04 ± 8413.35,2424489.04,8413.35,0.35,99.97,100.0,1138427.2,0.47,28488274.13,-3723591.92
9,P3c,70337,438744.63 ± 4780.09,438744.63,4780.09,1.09,76.91,80.92,646803.45,1.474,3076931.65,-1219782.54



论文用格式 (可直接复制):
P1: -1645507.42 ± 3114.96 USD (n=70337, 相对误差=0.19%)
P2: -269874.92 ± 4870.12 USD (n=70337, 相对误差=1.80%)
P3: 2173710.77 ± 7967.94 USD (n=70337, 相对误差=0.37%)
P4: -2717126.60 ± 2916.04 USD (n=70337, 相对误差=0.11%)

具体政策类别:
P1a: -2726258.63 ± 2378.73 USD (n=70337, 相对误差=0.09%)
P1b: -2207730.91 ± 2757.92 USD (n=70337, 相对误差=0.12%)
P1c: -75866.44 ± 4969.59 USD (n=70337, 相对误差=6.55%)
P1d: -1444327.44 ± 2593.24 USD (n=70337, 相对误差=0.18%)
P2: -759582.12 ± 3542.10 USD (n=70337, 相对误差=0.47%)
P2a: -23592.40 ± 5207.89 USD (n=70337, 相对误差=22.07%)
P2c: -2118899.12 ± 3132.14 USD (n=70337, 相对误差=0.15%)
P3a: -1731335.68 ± 3076.50 USD (n=70337, 相对误差=0.18%)
P3b: 2424489.04 ± 8413.35 USD (n=70337, 相对误差=0.35%)
P3c: 438744.63 ± 4780.09 USD (n=70337, 相对误差=1.09%)
P4: -2717126.60 ± 2916.04 USD (n=70337, 相对误差=0.11%)


In [5]:
detailed_policy_results

{'P1a': {'total_pixels': 70337,
  'mean_npv': -2726258.632064631,
  'std_npv': 321870.3900472014,
  'sem': 1213.637831552,
  'margin_of_error': 2378.7273740175474,
  'ci_lower': -2728637.359438649,
  'ci_upper': -2723879.9046906135,
  'relative_error_pct': 0.08725244721980417,
  't_critical': 1.9599977128066541,
  'df': 70336,
  'positive_mean_ratio': 0.0,
  'positive_max_ratio': 31.395993573794733,
  'cv': -0.11806304297822426,
  'max_npv': 1865763.44,
  'min_npv': -4410847.06,
  'pixels_with_P1': 70337,
  'pixels_with_P2': 70337,
  'pixels_with_P3': 70337,
  'pixels_with_P4': 70337,
  'avg_P1_per_pixel': 65.0,
  'avg_P2_per_pixel': 126.0,
  'avg_P3_per_pixel': 72.0,
  'avg_P4_per_pixel': 2.0},
 'P1b': {'total_pixels': 70337,
  'mean_npv': -2207730.907371938,
  'std_npv': 373179.1923607033,
  'sem': 1407.1017397112967,
  'margin_of_error': 2757.9161915204054,
  'ci_lower': -2210488.823563459,
  'ci_upper': -2204972.9911804176,
  'relative_error_pct': 0.12492084892734515,
  't_critical

In [None]:
# 按州统计各州累积盈亏情况、纯PV盈亏（使用空间连接方法）

import geopandas as gpd
from shapely.geometry import Point

# 1. 仅选择2050年的经济结果
df_economic_2050 = df_economic[df_economic['analysis_year'] == 2050].copy()

# 2. 读取面积信息
merged_data_for_analysis = pd.read_csv('data/US_data/df_merged_data_for_analysis.csv')

if 'area_m2' not in merged_data_for_analysis.columns:
    print("错误：merged_data_for_analysis 中缺少 area_m2 列")
else:
    # 3. 合并面积信息
    pixel_info = merged_data_for_analysis[['lat', 'lon', 'area_m2']].drop_duplicates(subset=['lat', 'lon'])
    df_economic_2050 = df_economic_2050.merge(pixel_info, on=['lat', 'lon'], how='left')
    
    # 检查面积数据完整性
    missing_area = df_economic_2050['area_m2'].isna().sum()
    if missing_area > 0:
        df_economic_2050 = df_economic_2050.dropna(subset=['area_m2'])
    
    # 4. 读取州边界shapefile
    us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
    
    # 转换州边界shapefile为4326坐标系
    if us_states.crs != "EPSG:4326":
        us_states = us_states.to_crs(epsg=4326)
    
    # 5. 创建点图层（从经济数据的经纬度）
    geometry = [Point(xy) for xy in zip(df_economic_2050['lon'], df_economic_2050['lat'])]
    pixel_gdf = gpd.GeoDataFrame(df_economic_2050, geometry=geometry, crs='EPSG:4326')
    
    # 6. 空间连接，获得州标签
    pixel_with_states = gpd.sjoin(pixel_gdf, us_states[['NAME', 'geometry']], how='left', predicate='within')
    
    # 检查空间连接结果
    unmatched = pixel_with_states['NAME'].isna().sum()
    if unmatched > 0:
        print(f"警告：有 {unmatched} 个像素未匹配到州信息，将被排除")
        pixel_with_states = pixel_with_states.dropna(subset=['NAME'])
    
    # 7. 计算面积（仅公顷）及累积NPV
    # 1公顷 = 10000平方米
    pixel_with_states['area_ha'] = pixel_with_states['area_m2'] / 10000
    pixel_with_states['total_npv_usd'] = pixel_with_states['net_npv_usd'] * pixel_with_states['area_ha']
    
    # 8. 按州分组统计
    state_npv_summary = pixel_with_states.groupby('NAME', as_index=False).agg({
        'total_npv_usd': 'sum',
        'area_ha': 'sum',
        'lat': 'count'  # 统计像素数量
    })
    state_npv_summary.columns = ['State_name', 'total_npv_usd', 'total_area_ha', 'pixel_count']
    
    # 9. 计算派生指标
    state_npv_summary['total_npv_billion_usd'] = state_npv_summary['total_npv_usd'] / 1e8
    state_npv_summary['npv_per_ha'] = state_npv_summary['total_npv_usd'] / state_npv_summary['total_area_ha']
    
    # 10. 排序和格式化
    state_npv_summary = state_npv_summary.sort_values('total_npv_usd', ascending=False)
    
    # 四舍五入
    state_npv_summary['total_npv_usd'] = state_npv_summary['total_npv_usd'].round(2)
    state_npv_summary['total_area_ha'] = state_npv_summary['total_area_ha'].round(2)
    state_npv_summary['total_npv_billion_usd'] = state_npv_summary['total_npv_billion_usd'].round(2)
    state_npv_summary['npv_per_ha'] = state_npv_summary['npv_per_ha'].round(2)
    
    # 11. 统计盈利和亏损州
    profit_states = (state_npv_summary['total_npv_usd'] > 0).sum()
    loss_states = (state_npv_summary['total_npv_usd'] < 0).sum()
    zero_states = (state_npv_summary['total_npv_usd'] == 0).sum()
    
    # 12. 输出结果
    print("=" * 80)
    print("各州累积盈亏情况 (2050年) - 基于空间连接")
    print("=" * 80)
    display(state_npv_summary[[
        'State_name', 'pixel_count', 'total_area_ha', 
        'total_npv_billion_usd', 'npv_per_ha'
    ]])
    
    print("\n" + "=" * 80)
    print("汇总统计 (2050年)")
    print("=" * 80)
    print(f"盈利州数量: {profit_states}")
    print(f"亏损州数量: {loss_states}")
    print(f"盈亏平衡州数量: {zero_states}")
    print(f"总州数: {len(state_npv_summary)}")
    print(f"\n全美累积NPV: ${state_npv_summary['total_npv_usd'].sum():,.2f} USD")
    print(f"全美累积NPV: {state_npv_summary['total_npv_billion_usd'].sum():.2f} 亿美元")
    print(f"\n全美总面积: {state_npv_summary['total_area_ha'].sum():,.2f} 公顷")
    print(f"全美总像素数: {state_npv_summary['pixel_count'].sum():,}")
    print(f"\n全美平均每公顷NPV: ${state_npv_summary['total_npv_usd'].sum() / state_npv_summary['total_area_ha'].sum():,.2f} USD")
    print("=" * 80)

各州累积盈亏情况 (2050年) - 基于空间连接


Unnamed: 0,State_name,pixel_count,total_area_ha,total_npv_billion_usd,npv_per_ha
40,Texas,1859240,136730500.0,1361605.17,995831.21
3,California,1047545,72209010.0,1201500.41,1663920.3
8,Georgia,890665,64815630.0,333698.49,514842.64
7,Florida,532915,40453780.0,263913.37,652382.47
4,Colorado,443875,29327440.0,205365.78,700251.33
30,North Carolina,636000,44439940.0,156239.5,351574.49
1,Arizona,115010,8262459.0,95751.67,1158876.15
41,Utah,436455,28580480.0,94744.34,331500.15
33,Oklahoma,161385,11252450.0,32285.02,286915.46
28,New Mexico,84535,5994601.0,20935.14,349233.26



汇总统计 (2050年)
盈利州数量: 14
亏损州数量: 34
盈亏平衡州数量: 0
总州数: 48

全美累积NPV: $90,001,569,060,352.56 USD
全美累积NPV: 900015.72 亿美元

全美总面积: 1,246,389,653.21 公顷
全美总像素数: 18,639,305

全美平均每公顷NPV: $72,209.82 USD


In [None]:
'''
指针：2.3的第四段、有关经济收益的统计，并且对比LNCS的成本
'''

# 按州统计各州累积盈亏情况（基于net_npv_usd字段，实际2050年净现值）- 使用空间连接方法
import geopandas as gpd
from shapely.geometry import Point

# 1. 读取数据（net_npv_usd已经是2050年数据，无需筛选年份）
merged_data_for_analysis = pd.read_csv('data/US_data/df_merged_data_for_analysis.csv')

# 不检查数据完整性，直接计算
df_analysis = merged_data_for_analysis[['lat', 'lon', 'area_m2', 'net_npv_usd']].copy()

# 2. 读取州边界shapefile
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')

# 转换州边界shapefile为4326坐标系
if us_states.crs != "EPSG:4326":
    us_states = us_states.to_crs(epsg=4326)

# 3. 创建点图层
geometry = [Point(xy) for xy in zip(df_analysis['lon'], df_analysis['lat'])]
pixel_gdf = gpd.GeoDataFrame(df_analysis, geometry=geometry, crs='EPSG:4326')

# 4. 空间连接，获得州标签
pixel_with_states = gpd.sjoin(pixel_gdf, us_states[['NAME', 'geometry']], how='left', predicate='within')

# 移除无州信息数据行（直接，且不输出警告）
pixel_with_states = pixel_with_states.dropna(subset=['NAME'])

# 5. 计算面积（只用公顷）及累积净现值
# 1公顷 = 10000平方米
pixel_with_states['area_ha'] = pixel_with_states['area_m2'] / 10000
pixel_with_states['total_npv_usd'] = pixel_with_states['net_npv_usd'] * pixel_with_states['area_ha']

# 6. 按州分组统计
state_npv_summary = pixel_with_states.groupby('NAME', as_index=False).agg({
    'total_npv_usd': 'sum',
    'area_ha': 'sum',
    'lat': 'count'  # 统计像素数量
})
state_npv_summary.columns = ['State_name', 'total_npv_usd', 'total_area_ha', 'pixel_count']

# 计算派生指标
state_npv_summary['total_npv_billion_usd'] = state_npv_summary['total_npv_usd'] / 1e9
state_npv_summary['npv_per_ha'] = state_npv_summary['total_npv_usd'] / state_npv_summary['total_area_ha']

# 排序和格式化
state_npv_summary = state_npv_summary.sort_values('total_npv_usd', ascending=False)

# 四舍五入
state_npv_summary['total_npv_usd'] = state_npv_summary['total_npv_usd'].round(2)
state_npv_summary['total_area_ha'] = state_npv_summary['total_area_ha'].round(2)
state_npv_summary['total_npv_billion_usd'] = state_npv_summary['total_npv_billion_usd'].round(2)
state_npv_summary['npv_per_ha'] = state_npv_summary['npv_per_ha'].round(2)

# 统计盈利和亏损州
profit_states = (state_npv_summary['total_npv_usd'] > 0).sum()
loss_states = (state_npv_summary['total_npv_usd'] < 0).sum()
zero_states = (state_npv_summary['total_npv_usd'] == 0).sum()

# 统计 npv_per_ha 的最小值和最大值
npv_per_ha_min = state_npv_summary['npv_per_ha'].min()
npv_per_ha_max = state_npv_summary['npv_per_ha'].max()

# 新增：计算最低和最高十分位的平均累积NPV
deciles = 10
sorted_by_npv = state_npv_summary.sort_values('total_npv_billion_usd', ascending=True).reset_index(drop=True)
n_states = len(sorted_by_npv)
decile_size = n_states // deciles

# lowest decile: first 10%
lowest_decile = sorted_by_npv.iloc[:decile_size]
lowest_decile_avg = lowest_decile['total_npv_billion_usd'].mean()

# highest decile: last 10%
highest_decile = sorted_by_npv.iloc[-decile_size:]
highest_decile_avg = highest_decile['total_npv_billion_usd'].mean()

print("=" * 80)
print("各州累积盈亏情况 (2050年净现值net_npv_usd) - 基于空间连接")
print("=" * 80)
display(state_npv_summary[[
    'State_name', 'pixel_count', 'total_area_ha',
    'total_npv_billion_usd', 'npv_per_ha'
]])

print("\n" + "=" * 80)
print("汇总统计 (2050年净现值net_npv_usd)")
print("=" * 80)
print(f"盈利州数量: {profit_states}")
print(f"亏损州数量: {loss_states}")
print(f"盈亏平衡州数量: {zero_states}")
print(f"总州数: {len(state_npv_summary)}")
print(f"\n全美累积NPV: ${state_npv_summary['total_npv_usd'].sum():,.2f} USD")
print(f"全美累积NPV: {state_npv_summary['total_npv_billion_usd'].sum():.2f} 亿美元")
print(f"\n全美总面积: {state_npv_summary['total_area_ha'].sum():,.2f} 公顷")
print(f"全美总像素数: {state_npv_summary['pixel_count'].sum():,}")
print(f"\n全美平均每公顷NPV: ${state_npv_summary['total_npv_usd'].sum() / state_npv_summary['total_area_ha'].sum():,.2f} USD")
print(f"npv_per_ha 最小值: {npv_per_ha_min:,.2f} USD")
print(f"npv_per_ha 最大值: {npv_per_ha_max:,.2f} USD")

# 增加decile对比输出
print(f"\n最低十分位（lowest decile）州均值累积NPV: {lowest_decile_avg:,.2f} 亿美元")
print(f"最高十分位（highest decile）州均值累积NPV: {highest_decile_avg:,.2f} 亿美元")
print("=" * 80)


各州累积盈亏情况 (2050年净现值net_npv_usd) - 基于空间连接


Unnamed: 0,State_name,pixel_count,total_area_ha,total_npv_billion_usd,npv_per_ha
40,Texas,7016,515964.22,513.81,995831.21
3,California,3953,272486.84,453.4,1663920.3
8,Georgia,3361,244587.28,125.92,514842.64
7,Florida,2011,152655.77,99.59,652382.47
4,Colorado,1675,110669.58,77.5,700251.33
30,North Carolina,2400,167697.89,58.96,351574.49
1,Arizona,434,31179.09,36.13,1158876.15
41,Utah,1647,107850.87,35.75,331500.15
33,Oklahoma,609,42462.07,12.18,286915.46
28,New Mexico,319,22621.14,7.9,349233.26



汇总统计 (2050年净现值net_npv_usd)
盈利州数量: 14
亏损州数量: 34
盈亏平衡州数量: 0
总州数: 48

全美累积NPV: $339,628,562,491.89 USD
全美累积NPV: 339.62 亿美元

全美总面积: 4,703,357.20 公顷
全美总像素数: 70,337

全美平均每公顷NPV: $72,209.82 USD
npv_per_ha 最小值: -1,266,171.27 USD
npv_per_ha 最大值: 1,663,920.30 USD

最低十分位（lowest decile）州均值累积NPV: -116.62 亿美元
最高十分位（highest decile）州均值累积NPV: 298.18 亿美元


## 5、3E-synergy intra-inter state analysis 

In [1]:
import pandas as pd 

state_analysis_df = pd.read_csv('data/US_data/US_analysis_reslut/state_level_analysis_with_wccd.csv')
multi_data = pd.read_csv('data/US_data/US_analysis_reslut/state_integration_analysis_simplified.csv')
sorted_states = state_analysis_df.sort_values('CCD_Mean', ascending=False)[['State_name', 'Overall_improvement_mean']]
print(sorted_states)

multi_data

        State_name  Overall_improvement_mean
36         Arizona                 47.382776
39      New Mexico                 66.075623
2       California                 36.387923
12        Colorado                 50.958862
0            Texas                 49.746795
4          Georgia                 15.532334
14            Utah                 67.582435
33        Oklahoma                 47.714996
18          Kansas                 60.370690
10         Florida                 19.964771
40          Nevada                105.594215
8   North Carolina                  8.060611
20         Alabama                  8.383989
23  South Carolina                  4.583928
31        Virginia                  8.144738
11       Louisiana                  4.316376
27       Tennessee                  4.453314
15     Mississippi                  3.094682
35         Wyoming                 33.577278
24        Arkansas                  3.268328
32        Maryland                  4.486485
22    Penn

Unnamed: 0,State_name,Solution_Type,Environmental_sustainability,Emission_mitigation_ability,Economic_feasibility
0,Alabama,Economic,54800.811287,1.978046e+08,2.792126e+09
1,Alabama,Emission_mitigation,61913.315653,2.235089e+08,3.317529e+09
2,Alabama,Environmental,55361.907643,1.974559e+08,2.902024e+09
3,Alabama,WCCD,62586.338170,2.232493e+08,3.199690e+09
4,Arizona,Economic,24118.433048,1.285646e+08,3.330161e+10
...,...,...,...,...,...
187,Wisconsin,WCCD,40774.809190,1.237927e+08,-2.521406e+10
188,Wyoming,Economic,4163.079119,4.717130e+07,-2.628205e+09
189,Wyoming,Emission_mitigation,10942.351031,1.092590e+08,-8.640717e+09
190,Wyoming,Environmental,3316.182512,1.836984e+07,-1.752189e+09


In [2]:
# 生成典型区量表
'''

指针：典型案例样，2.5第一段展示
内容：生成案例表
逻辑：精简column

'''
state_analysis_df['PV installed capacity_GW'] = state_analysis_df['abandoned_land_ha'] * 10000 * 0.17 / 1e6
state_analysis_df['abandoned_land_kha'] = state_analysis_df['abandoned_land_ha'] / 1000
state_analysis_df['Economic_viability_k_ha'] = state_analysis_df['Economic_NPV_per_ha'] / 1000
state_analysis_df['Emission_mitigation_ability_t_ha'] = state_analysis_df['Emission_mitigation_per_ha']
target_columns = ['State_name', 'abandoned_land_kha', 'PV installed capacity_GW',
                  'Environmental_suitability_per_ha', 'Emission_mitigation_ability_t_ha',
                  'Economic_viability_k_ha', 'CCD_Mean', 'Overall_improvement_mean']

# 1. 先生成原表
state_analysis_df = state_analysis_df.copy()
df_states = state_analysis_df[target_columns]

# 2. 生成Overall行，需要各指标按说明聚合
tot_abandoned_land_kha = df_states['abandoned_land_kha'].sum()
tot_PV_installed_capacity_GW = df_states['PV installed capacity_GW'].sum()

def weighted_avg(col):
    return (df_states[col] * df_states['abandoned_land_kha']).sum() / tot_abandoned_land_kha if tot_abandoned_land_kha != 0 else float('nan')

overall_row = {
    'State_name': 'Overall',
    'abandoned_land_kha': round(tot_abandoned_land_kha, 2),
    'PV installed capacity_GW': round(tot_PV_installed_capacity_GW, 2),
    'Environmental_suitability_per_ha': round(weighted_avg('Environmental_suitability_per_ha'), 2),
    'Emission_mitigation_ability_t_ha': round(weighted_avg('Emission_mitigation_ability_t_ha'), 2),
    'Economic_viability_k_ha': round(weighted_avg('Economic_viability_k_ha'), 2),
    'CCD_Mean': round(weighted_avg('CCD_Mean'), 2),
    'Overall_improvement_mean': round(df_states['Overall_improvement_mean'].mean(), 2)
}

# 3. 行合并
df_states_rounded = df_states.copy()
for col in ['abandoned_land_kha', 'PV installed capacity_GW',
            'Environmental_suitability_per_ha', 'Emission_mitigation_ability_t_ha',
            'Economic_viability_k_ha', 'CCD_Mean', 'Overall_improvement_mean']:
    df_states_rounded[col] = df_states_rounded[col].round(2)

full_df = pd.concat(
    [df_states_rounded, pd.DataFrame([overall_row])],
    ignore_index=True
)
full_df


Unnamed: 0,State_name,abandoned_land_kha,PV installed capacity_GW,Environmental_suitability_per_ha,Emission_mitigation_ability_t_ha,Economic_viability_k_ha,CCD_Mean,Overall_improvement_mean
0,Texas,515.96,877.14,0.72,3689.42,995.83,0.91,49.75
1,Illinois,295.4,502.17,0.93,3162.07,-42.82,0.82,8.78
2,California,272.49,463.23,0.74,4167.68,1663.92,0.94,36.39
3,Michigan,246.01,418.22,0.98,2937.64,-441.21,0.79,10.19
4,Georgia,244.59,415.8,0.99,3559.99,514.84,0.9,15.53
5,Indiana,223.56,380.06,0.95,3086.24,-440.81,0.79,4.23
6,Wisconsin,209.3,355.8,0.95,2999.38,-620.92,0.77,3.77
7,Montana,171.38,291.34,0.42,3479.28,-755.14,0.71,1.75
8,North Carolina,167.7,285.09,0.99,3401.52,351.57,0.87,8.06
9,Ohio,156.2,265.54,0.93,3057.05,-151.05,0.8,1.87


In [3]:
# 计算每个州的2050需求
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point


# ===== 步骤1: 加载撂荒地的发电数据 =====
merged_data_for_analysis = pd.read_csv('data/US_data/df_merged_data_for_analysis.csv')
us_states = gpd.read_file(r'data\cb_2018_us_state_500k.shp')
us_states_4326 = us_states.to_crs('EPSG:4326')
# 创建geometry
geometry = [Point(xy) for xy in zip(merged_data_for_analysis['lon'], merged_data_for_analysis['lat'])]
pixel_gdf = gpd.GeoDataFrame(merged_data_for_analysis, geometry=geometry, crs='EPSG:4326')

# Spatial join with us_states_4326
try:
    pixel_with_states = gpd.sjoin(pixel_gdf, us_states_4326[['NAME', 'geometry']], 
                                  how='left', predicate='within')
except TypeError:
    # 兼容旧版本geopandas
    pixel_with_states = gpd.sjoin(pixel_gdf, us_states_4326[['NAME', 'geometry']], 
                                  how='left', op='within')

# Convert state name to uppercase to match energy data
pixel_with_states['STATE'] = pixel_with_states['NAME'].str.upper()

# Remove geometry and index_right columns, keep data
merged_data_with_states = pixel_with_states.drop(columns=['geometry', 'index_right'], errors='ignore').copy()

# 计算power_pixel per year: power_generation_kwha * area_m2 / 10000 (单位: kWh)
merged_data_with_states['power_pixel'] = merged_data_with_states['power_generation_kwha'] * merged_data_with_states['area_m2'] / 10000 /30



# ===== 步骤2:  汇总NREL的平均场景数据=====


# 直接加载能源数据
energy_df = pd.read_csv('data/US_data/US_electricity/NREL/energy.csv.gzip', compression='gzip')
print(f"✅ 能源数据加载成功: {len(energy_df):,} 条记录")

if energy_df is not None:
    # 过滤电力数据
    energy_electricity = energy_df[energy_df['FINAL_ENERGY'] == 'ELECTRICITY'].copy()
    
    # 过滤有效年份 (2017-2050)
    valid_years = list(range(2017, 2051))
    energy_electricity = energy_electricity[energy_electricity['YEAR'].isin(valid_years)].copy()
    
    # 按州、场景、年份汇总
    energy_summary_by_state_scenario_year = energy_electricity.groupby(
        ['STATE', 'SCENARIO', 'YEAR']
    )['MMBTU'].sum().reset_index()
    
    print(f"✅ 能源数据汇总完成")
    print(f"   • 场景数: {energy_summary_by_state_scenario_year['SCENARIO'].nunique()}")
    print(f"   • 年份范围: {energy_summary_by_state_scenario_year['YEAR'].min()}-{energy_summary_by_state_scenario_year['YEAR'].max()}")
    
    # 单位转换: MMBTU → kWh (1 MMBTU = 293,071 kWh)
    MMBTU_TO_KWH = 293.07107
    energy_summary_by_state_scenario_year['MMBTU_kWh'] = energy_summary_by_state_scenario_year['MMBTU'] * MMBTU_TO_KWH
    
    # 生成平均情景: 对5个场景在各州做.mean()
    # 先获取2050年的数据
    energy_2050 = energy_summary_by_state_scenario_year[
        energy_summary_by_state_scenario_year['YEAR'] == 2050
    ].copy()
    
    # 计算各州的平均需求（跨场景）
    mean_scenario_2050 = energy_2050.groupby('STATE')['MMBTU_kWh'].mean().reset_index()
    mean_scenario_2050['SCENARIO'] = 'MEAN SCENARIO'
    mean_scenario_2050['YEAR'] = 2050
    
    print(f"✅ 平均情景生成完成")
    print(f"   • 平均情景包含 {len(mean_scenario_2050)} 个州")
    
    # 创建各州需求字典（平均情景，2050年）
    state_demand_dict = dict(zip(mean_scenario_2050['STATE'], mean_scenario_2050['MMBTU_kWh']))
    
    print(f"\n✅ 各州需求字典创建完成（平均情景，2050年）")
    print(f"   • 包含 {len(state_demand_dict)} 个州")
    print(f"\nSample state demands (kWh):")
    sample_states = list(state_demand_dict.keys())[:5]
    for state in sample_states:
        print(f"   • {state}: {state_demand_dict[state]:,.0f} kWh")
else:
    state_demand_dict = {}
    print("⚠️ 无法生成需求字典，将跳过需求约束功能")

import numpy as np
# 按'demand_by_name'生成的需求，为过滤后的full_df merge上需求
# 假定full_df的'国家/地区'信息与demand_by_name的'STATE'能一一对应可通过'full_df.State_name'和'demand_by_name.STATE'匹配
filtered_full_df = full_df.copy()
# 按'NAME'统计能源需求量
demand_by_name = merged_data_with_states[['STATE']].drop_duplicates().merge(
    mean_scenario_2050[['STATE', 'MMBTU_kWh']], on='STATE', how='left')
demand_by_name

# 将 State_name 列全部大写以确保匹配
# 将 State_name 列全部大写以确保正确匹配
filtered_full_df['State_name'] = filtered_full_df['State_name'].str.upper()
filtered_full_df = filtered_full_df.merge(
    demand_by_name.rename(columns={'STATE': 'State_name', 'MMBTU_kWh': 'Demand_MMBTU_kWh'}),
    on='State_name',
    how='left'
)
# 恢复 State_name 为首字母大写（title case）
filtered_full_df['State_name'] = filtered_full_df['State_name'].str.title()

# 将 filtered_full_df 中 State_name 为 'Overall' 的行的 Demand_MMBTU_kWh 设置为该列的总和（其他列不变）
if 'Overall' in filtered_full_df['State_name'].values:
    total_sum = filtered_full_df.loc[filtered_full_df['State_name'] != 'Overall', 'Demand_MMBTU_kWh'].sum()
    filtered_full_df.loc[filtered_full_df['State_name'] == 'Overall', 'Demand_MMBTU_kWh'] = total_sum
    display_df = filtered_full_df

# 优化 Demand_MMBTU_kWh 单位，将其从 kWh 换算成 TWh，便于阅读（1 TWh = 1e12 kWh）
filtered_full_df['Demand_TWh'] = filtered_full_df['Demand_MMBTU_kWh'] / 1e9

# 只提取 California, Indiana, Texas, New York, Georgia, 以及 Overall 行，按需显示
states_of_interest = ['California', 'Indiana', 'Texas', 'New York', 'Georgia', 'Overall']
display_df_subset = display_df[display_df['State_name'].isin(states_of_interest)].copy()
display_df_subset['Demand_TWh'] = (display_df_subset['Demand_MMBTU_kWh'] / 1e9).round(2)

# 只显示关心列，便于直观展示
columns_to_show = [
    'State_name', 
    'abandoned_land_kha', 
    'PV installed capacity_GW', 
    'Environmental_suitability_per_ha',
    'Emission_mitigation_ability_t_ha', 
    'Economic_viability_k_ha', 
    'CCD_Mean', 
    'Overall_improvement_mean', 
    'Demand_TWh'
]
display_columns = [col for col in columns_to_show if col in display_df_subset.columns]
display(display_df_subset[display_columns])


✅ 能源数据加载成功: 9,058,416 条记录
✅ 能源数据汇总完成
   • 场景数: 5
   • 年份范围: 2017-2050
✅ 平均情景生成完成
   • 平均情景包含 51 个州

✅ 各州需求字典创建完成（平均情景，2050年）
   • 包含 51 个州

Sample state demands (kWh):
   • ALABAMA: 107,952,555,453 kWh
   • ALASKA: 11,718,702,642 kWh
   • ARIZONA: 98,404,770,746 kWh
   • ARKANSAS: 65,305,206,393 kWh
   • CALIFORNIA: 529,721,168,133 kWh


Unnamed: 0,State_name,abandoned_land_kha,PV installed capacity_GW,Environmental_suitability_per_ha,Emission_mitigation_ability_t_ha,Economic_viability_k_ha,CCD_Mean,Overall_improvement_mean,Demand_TWh
0,Texas,515.96,877.14,0.72,3689.42,995.83,0.91,49.75,572.86
2,California,272.49,463.23,0.74,4167.68,1663.92,0.94,36.39,529.72
4,Georgia,244.59,415.8,0.99,3559.99,514.84,0.9,15.53,197.19
5,Indiana,223.56,380.06,0.95,3086.24,-440.81,0.79,4.23,131.56
28,New York,57.1,97.07,0.99,2973.04,-371.57,0.8,0.21,255.29
48,Overall,4703.36,7995.71,0.84,3446.23,72.21,0.84,38.36,5583.35


In [4]:
# ===== 步骤3: 计算需求约束截断点 =====

import numpy as np

print("步骤3: 计算各州的需求约束截断点...")

# 存储各州的截断点信息
state_cutoff_info = {}

if len(state_demand_dict) > 0:
    # 过滤有州标签的数据
    data_with_states = merged_data_with_states[merged_data_with_states['STATE'].notna()].copy()
    
    # 获取所有唯一的州
    unique_states = data_with_states['STATE'].unique()
    
    print(f"处理 {len(unique_states)} 个州...")
    
    # 统计满足需求的州
    states_meeting_demand = []
    states_not_meeting_demand = []
    
    for state in unique_states:
        # 获取该州的所有pixel
        state_data = data_with_states[data_with_states['STATE'] == state].copy()
        
        if len(state_data) == 0:
            continue
        
        # 获取该州的需求（kWh）
        state_demand_kwh = state_demand_dict.get(state, None)
        
        if state_demand_kwh is None:
            # 如果该州没有需求数据，跳过
            continue
        
        # 获取ccd_optimized和power_pixel
        ccd_optimized = state_data['ccd_optimized'].values
        power_pixel = state_data['power_pixel'].values
        
        # 按ccd_optimized降序排序
        ccd_optimized_indices = np.argsort(ccd_optimized)[::-1]
        
        # 计算累积发电量
        power_cumulative_benefits = np.cumsum(power_pixel[ccd_optimized_indices])
        
        # 找到满足该州需求的截断点
        cutoff_mask = power_cumulative_benefits >= state_demand_kwh
        if cutoff_mask.any():
            cutoff_idx = np.where(cutoff_mask)[0][0]
            cutoff_power = power_cumulative_benefits[cutoff_idx]
            
            # 记录截断点信息
            state_cutoff_info[state] = {
                'cutoff_idx': cutoff_idx,
                'cutoff_power': cutoff_power,
                'state_demand': state_demand_kwh,
                'total_pixels': len(state_data),
                'pixels_before_cutoff': cutoff_idx + 1,
                'pixels_after_cutoff': len(state_data) - (cutoff_idx + 1),
                'meets_demand': True  
            }
            states_meeting_demand.append(state)
        else:
            # 如果累积发电量无法满足需求，记录全部pixel
            state_cutoff_info[state] = {
                'cutoff_idx': len(state_data) - 1,
                'cutoff_power': power_cumulative_benefits[-1],
                'state_demand': state_demand_kwh,
                'total_pixels': len(state_data),
                'pixels_before_cutoff': len(state_data),
                'pixels_after_cutoff': 0,
                'warning': 'Demand not fully met',
                'meets_demand': False  
            }
            states_not_meeting_demand.append(state)
    
    print(f"✅ 截断点计算完成")
    print(f"   • 成功处理 {len(state_cutoff_info)} 个州")
    
    # 计算满足需求的州的比例
    total_states_processed = len(state_cutoff_info)
    states_meeting_count = len(states_meeting_demand)
    states_not_meeting_count = len(states_not_meeting_demand)
    
    if total_states_processed > 0:
        meeting_demand_ratio = states_meeting_count / total_states_processed * 100
        not_meeting_demand_ratio = states_not_meeting_count / total_states_processed * 100
        
        print(f"\n📊 需求满足情况统计:")
        print(f"   • 满足需求的州: {states_meeting_count} / {total_states_processed} ({meeting_demand_ratio:.1f}%)")
        print(f"   • 不满足需求的州: {states_not_meeting_count} / {total_states_processed} ({not_meeting_demand_ratio:.1f}%)")
        
        # 显示不满足需求的州列表（如果有）
        if states_not_meeting_demand:
            print(f"\n⚠️ 不满足需求的州列表 ({len(states_not_meeting_demand)} 个):")
            for state in states_not_meeting_demand[:16]:  # 只显示前10个
                info = state_cutoff_info[state]
                shortage = info['state_demand'] - info['cutoff_power']
                shortage_pct = (shortage / info['state_demand']) * 100
                print(f"     - {state}: 短缺 {shortage:,.0f} kWh ({shortage_pct:.1f}%)")

    
    # 为merged_data_with_states添加标记列，标识是否超出需求约束
    merged_data_with_states['exceeds_demand'] = False
    
    for state, cutoff_info in state_cutoff_info.items():
        state_data = merged_data_with_states[merged_data_with_states['STATE'] == state].copy()
        if len(state_data) == 0:
            continue
        
        # 获取该州的排序索引
        ccd_optimized = state_data['ccd_optimized'].values
        ccd_optimized_indices = np.argsort(ccd_optimized)[::-1]
        
        # 标记超出截断点的pixel
        cutoff_idx = cutoff_info['cutoff_idx']
        if cutoff_idx < len(ccd_optimized_indices) - 1:
            # 截断点之后的pixel标记为超出需求
            exceeded_indices = ccd_optimized_indices[cutoff_idx + 1:]
            state_indices = state_data.index[exceeded_indices]
            merged_data_with_states.loc[state_indices, 'exceeds_demand'] = True
    
    print(f"\n✅ 超出需求标记完成")
    print(f"   • 超出需求的pixel数: {merged_data_with_states['exceeds_demand'].sum()}")
    print(f"   • 满足需求的pixel数: {(~merged_data_with_states['exceeds_demand']).sum()}")
    
    # 显示一些统计信息
    print(f"\n各州截断点统计（前5个州）:")
    sample_states = list(state_cutoff_info.keys())[:5]
    for state in sample_states:
        info = state_cutoff_info[state]
        meets_status = "✅ 满足" if info.get('meets_demand', False) else "❌ 不满足"
        print(f"   • {state} ({meets_status}):")
        print(f"     - 总pixel数: {info['total_pixels']}")
        print(f"     - 截断点前: {info['pixels_before_cutoff']} pixels")
        print(f"     - 截断点后: {info['pixels_after_cutoff']} pixels")
        print(f"     - 累积发电量: {info['cutoff_power']:,.0f} kWh")
        print(f"     - 州需求: {info['state_demand']:,.0f} kWh")
        if not info.get('meets_demand', False):
            shortage = info['state_demand'] - info['cutoff_power']
            print(f"     - 短缺: {shortage:,.0f} kWh")
else:
    print("⚠️ 无需求数据，跳过截断点计算")
    state_cutoff_info = {}
    merged_data_with_states['exceeds_demand'] = False

步骤3: 计算各州的需求约束截断点...
处理 48 个州...
✅ 截断点计算完成
   • 成功处理 48 个州

📊 需求满足情况统计:
   • 满足需求的州: 33 / 48 (68.8%)
   • 不满足需求的州: 15 / 48 (31.2%)

⚠️ 不满足需求的州列表 (15 个):
     - ARIZONA: 短缺 10,788,743,039 kWh (11.0%)
     - TENNESSEE: 短缺 8,820,682,950 kWh (6.4%)
     - NEVADA: 短缺 2,559,123,034 kWh (6.1%)
     - VIRGINIA: 短缺 53,936,095,741 kWh (31.9%)
     - WEST VIRGINIA: 短缺 26,078,380,260 kWh (68.9%)
     - MARYLAND: 短缺 21,333,931,881 kWh (18.2%)
     - NEW JERSEY: 短缺 61,148,022,094 kWh (51.3%)
     - PENNSYLVANIA: 短缺 3,995,342,282 kWh (2.2%)
     - NEW YORK: 短缺 141,438,902,349 kWh (55.4%)
     - RHODE ISLAND: 短缺 13,023,095,678 kWh (98.0%)
     - MASSACHUSETTS: 短缺 83,587,542,514 kWh (91.2%)
     - CONNECTICUT: 短缺 48,265,159,403 kWh (97.3%)
     - NEW HAMPSHIRE: 短缺 16,723,236,159 kWh (84.9%)
     - MAINE: 短缺 8,267,066,692 kWh (39.2%)
     - VERMONT: 短缺 5,898,057,525 kWh (57.1%)

✅ 超出需求标记完成
   • 超出需求的pixel数: 37943
   • 满足需求的pixel数: 32394

各州截断点统计（前5个州）:
   • FLORIDA (✅ 满足):
     - 总pixel数: 2011
     - 截断

In [5]:
merged_data_with_states

Unnamed: 0,lat,lon,predicted_prob,gmm_density,sample_type,LNCS_expect,net_npv_usd,area_m2,E_yr_pixel,ccd_optimized,pv_potential_dens,power_generation_kwha,Expectation_net_benefit,NAME,STATE,power_pixel,exceeds_demand
0,25.295834,-80.287500,0.947286,2.806133e+19,prediction,811.854595,1.186559e+06,776295.361002,1.958127e+08,0.949997,4079.884063,7.567199e+07,3268.029468,Florida,FLORIDA,1.958127e+08,False
1,25.437500,-80.537500,0.999397,5.399806e+21,prediction,1348.189022,1.024540e+06,775385.854124,1.913273e+08,0.831374,3991.104240,7.402534e+07,2642.915217,Florida,FLORIDA,1.913273e+08,False
2,25.437500,-80.495834,0.963775,1.508036e+20,prediction,971.186274,1.012824e+06,775385.854124,1.910215e+08,0.890258,3984.725540,7.390703e+07,3013.539266,Florida,FLORIDA,1.910215e+08,False
3,25.445833,-80.454170,0.330575,1.800785e+16,prediction,972.811005,1.006151e+06,775332.207938,1.908320e+08,0.807544,3981.047631,7.383881e+07,3008.236626,Florida,FLORIDA,1.908320e+08,False
4,25.445833,-80.404170,0.999345,6.792866e+22,prediction,968.544647,1.034704e+06,775332.207938,1.915894e+08,0.893129,3996.846524,7.413184e+07,3028.301877,Florida,FLORIDA,1.915894e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70332,48.995834,-99.995834,0.989598,5.692829e+14,prediction,377.830800,-1.040777e+06,563356.382622,1.180457e+08,0.758396,3389.228440,6.286200e+07,3011.397640,North Dakota,NORTH DAKOTA,1.180457e+08,True
70333,48.995834,-99.987500,0.985191,1.413612e+14,prediction,307.516600,-1.044904e+06,563356.382622,1.178997e+08,0.763759,3385.037475,6.278426e+07,3077.520875,North Dakota,NORTH DAKOTA,1.178997e+08,True
70334,48.995834,-99.895836,0.965398,3.161397e+13,prediction,324.674320,-1.070065e+06,563356.382622,1.170071e+08,0.749458,3359.408269,6.230891e+07,3034.733949,North Dakota,NORTH DAKOTA,1.170071e+08,True
70335,48.995834,-99.887500,0.969823,6.767585e+13,prediction,323.374660,-1.067688e+06,563356.382622,1.170906e+08,0.751877,3361.806267,6.235338e+07,3038.431607,North Dakota,NORTH DAKOTA,1.170906e+08,True


In [6]:
# === 生成 Extended Data 用于 policy guide 的 Table，优化 filtered_full_df 处理逻辑 ===

# 首先，规范化 filtered_full_df，仅保留指定列
guide_cols = [
    'State_name', 
    'abandoned_land_kha', 
    'PV installed capacity_GW', 
    'Environmental_suitability_per_ha', 
    'Emission_mitigation_ability_t_ha', 
    'Economic_viability_k_ha',
    'Demand_TWh', 
    'CCD_Mean', 
    'Overall_improvement_mean'
]

filtered_full_df_guide = filtered_full_df.copy()

# 只保留需要的列（如果有可能有多余列）
guide_cols_present = [col for col in guide_cols if col in filtered_full_df_guide.columns]
filtered_full_df_guide = filtered_full_df_guide[guide_cols_present].copy()

# 合并 Overall_improvement_mean 信息到 CCD_Mean
def format_ccd_mean(ccd, imp):
    try:
        if pd.isnull(imp):
            return f"{ccd:.2f}"
        if imp >= 0:
            return f"{ccd:.2f} (+{imp:.2f})"
        else:
            return f"{ccd:.2f} ({imp:.2f})"
    except Exception:
        return f"{ccd}"

if 'CCD_Mean' in filtered_full_df_guide.columns and 'Overall_improvement_mean' in filtered_full_df_guide.columns:
    filtered_full_df_guide['CCD_Mean'] = [
        format_ccd_mean(ccd, imp) 
        for ccd, imp in zip(filtered_full_df_guide['CCD_Mean'], filtered_full_df_guide['Overall_improvement_mean'])
    ]
    filtered_full_df_guide = filtered_full_df_guide.drop(columns=['Overall_improvement_mean'])

# 标记 Demand_TWh 是否超出需求
if 'State_name' in filtered_full_df_guide.columns and 'Demand_TWh' in filtered_full_df_guide.columns:
    # 构建州-超额需求状态映射
    state_exceed_dict = {}
    if 'exceeds_demand' in merged_data_with_states.columns:
        for state in filtered_full_df_guide['State_name']:
            # 检查该州在 merged_data_with_states 中是否存在被标记为超出需求的像元
            exceeds = False
            # 将 state 转换为大写以匹配 STATE 列
            state_upper = state.upper() if isinstance(state, str) else state
            if state_upper in merged_data_with_states['STATE'].values:
                exceeds = merged_data_with_states.loc[
                    merged_data_with_states['STATE'] == state_upper, 'exceeds_demand'
                ].any()
            state_exceed_dict[state] = exceeds

        # Demand_TWh后加*
        def demand_with_flag(demand, state):
            if pd.isnull(demand):
                return demand
            if state_exceed_dict.get(state, False):
                return f"{demand:.2f}*"
            else:
                return f"{demand:.2f}"

        filtered_full_df_guide['Demand_TWh'] = [
            demand_with_flag(demand, state) 
            for demand, state in zip(filtered_full_df_guide['Demand_TWh'], filtered_full_df_guide['State_name'])
        ]

# 重新排列表头，输出到 Extended data 指针变量
Extended_data_policy_guide = filtered_full_df_guide.reset_index(drop=True)
# 导出为CSV，尽量保持格式完整性
csv_path = "data/Extended_data_policy_guide.csv"
Extended_data_policy_guide.to_csv(
    csv_path, 
    index=False, 
    encoding='utf-8-sig', # 支持中文
    float_format="%.2f",   # 保留两位小数（如需更改格式，请适配需求）
)
print(f"已导出至: {csv_path}")


已导出至: data/Extended_data_policy_guide.csv
