# 0 Load data

In [84]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd

# All the dataframes are clipped to the US states.
# Every path is relative to the notebook's working directory and uses forward
# slashes so the cell runs on POSIX systems as well as on Windows.

# Per-pixel analysis tables.
df_economic = pd.read_csv('data/US_data/df_economic.csv')
df_weight = pd.read_csv('data/US_data/df_weight.csv')
df_strategies = pd.read_csv('data/US_data/df_strategies.csv')
df_net_benefit = pd.read_csv('data/US_data/df_net_benefit.csv')
df_pv_npv = pd.read_csv('data/US_data/df_pv_npv.csv')
df_agricultural_npv = pd.read_csv('data/US_data/df_agricultural_npv.csv')
df_afforestation_npv = pd.read_csv('data/US_data/df_afforestation_npv.csv')
df_natural_npv = pd.read_csv('data/US_data/df_natural_npv.csv')
df_pixel_optimized_data = pd.read_csv('data/US_data/df_pixel_optimized_data.csv')
df_mlp_scores = pd.read_csv('data/US_data/df_mlp_scores.csv')

# Boundary shapefiles for the nation, the states and the counties.
# The nation and state paths were previously raw strings with backslash
# separators, which only resolve on Windows.
us_nation = gpd.read_file('data/US_data/cb_2018_us_nation_5m.shp')
us_states = gpd.read_file('data/cb_2018_us_state_500k.shp')
us_counties = gpd.read_file('data/cb_2018_us_county_500k.shp')

# Reproject every boundary layer to EPSG:4326.
us_nation_4326 = us_nation.to_crs('EPSG:4326')
us_states_4326 = us_states.to_crs('EPSG:4326')
us_counties_4326 = us_counties.to_crs('EPSG:4326')

# Generate a CSV summary for US states

Logic is: 

Generate a CSV with one row per US state, laid out like this:

| State name | abandoned land (ha) | pv installed | Environmental suitability | Emission mitigation / ha | Economic NPV /ha| Power generation  | CCD Mean improvement |
|------------|---------------------|--------------|--------------------|-----------------|--------------|--------------|--------------|
|   Row1     |   Row1              |   Row1       |   Row1             |   Row1          |   Row1       |   Row1       |   Row1       |
|   Row2     |   Row2              |   Row2       |   Row2             |   Row2          |   Row2       |   Row2       |   Row2       |


In [76]:
import numpy as np

# The information merged in this cell is deliberately comprehensive (initial
# version); columns not needed for later presentation can simply be dropped.

# Emission-intensity lookup table: one (year, CO2_g_per_kWh) record per row.
# Years are stored as strings here and cast to int where they are used.
emission_intensity = pd.DataFrame(
    [
        ('2018', 400.3),
        ('2030', 302.0),
        ('2035', 279.2),
        ('2040', 261.1),
    ],
    columns=['year', 'CO2_g_per_kWh'],
)



# LNCS_expect is the sum of three "final_* x weighted_density_*" products,
# one per land-cover class (forest, agricultural, vegetation).
lncs_forest = df_weight['final_forest'] * df_weight['weighted_density_Forest']
lncs_agro = df_weight['final_agro'] * df_weight['weighted_density_Agricultural']
# NOTE(review): the vegetation term reads 'final_veg' from df_strategies while
# the other two terms read from df_weight, so it relies on both frames sharing
# the same row index - confirm this is intended.
lncs_veg = df_strategies['final_veg'] * df_weight['weighted_density_Vegetation']

df_weight['LNCS_expect'] = lncs_forest + lncs_agro + lncs_veg

# Collect the per-pixel inputs, all keyed by (lat, lon):
#   power_data    - pixel area and yearly generation (area_m2, E_yr_pixel) from df_net_benefit
#   env_data      - environmental variables (predicted_prob, gmm_density, sample_type) from df_weight
#   emission_data - Expectation_net_benefit, pv_potential_dens and LNCS_expect from df_weight
#   avg_npv       - economic variable: mean net_npv_usd of the df_economic rows with analysis_year == 2050
pixel_keys = ['lat', 'lon']

power_data = df_net_benefit[pixel_keys + ['area_m2', 'E_yr_pixel']].copy()
env_data = df_weight[pixel_keys + ['predicted_prob', 'gmm_density', 'sample_type']].copy()
emission_data = df_weight[pixel_keys + ['Expectation_net_benefit', 'pv_potential_dens', 'LNCS_expect']].copy()

economic_2050 = df_economic.loc[df_economic['analysis_year'] == 2050]
avg_npv = economic_2050.groupby(pixel_keys, as_index=False)['net_npv_usd'].mean()

# Outer joins keep every (lat, lon) pair that appears in any of the inputs.
merged_data_for_analysis = env_data
for extra_columns in (emission_data, avg_npv, power_data):
    merged_data_for_analysis = merged_data_for_analysis.merge(
        extra_columns, on=pixel_keys, how='outer'
    )


# Rescale the 2020 PV emission factor (pv_transform_2020) to 2030, 2040 and
# 2050 in proportion to the emission_intensity trajectory.
pv_transform_2020 = 352
years_target = [2020, 2030, 2040, 2050]

# Emission intensity at the target years, linearly interpolated between the
# tabulated years.
# NOTE(review): np.interp does not extrapolate; 2050 is past the last tabulated
# year (2040), so it receives the 2040 value - confirm this is intended.
intensity_years = emission_intensity['year'].astype(int)
em_factors = np.interp(years_target, intensity_years, emission_intensity['CO2_g_per_kWh'])

# Scaling factors relative to 2020 (the first entry is therefore 1).
scaling_factors = em_factors / em_factors[0]

# pv_transfer: one row per target year with the rescaled factor.
pv_transfer = pd.DataFrame({
    'year': years_target,
    'pv_transform': pv_transform_2020 * scaling_factors,
})

# Accumulate the PV conversion capacity over each 10-year interval and then
# total the intervals (the end year of each interval is excluded, so the
# total covers 2020-2049).
years = [2020, 2030, 2040, 2050]
pv_by_year = pv_transfer.set_index('year')['pv_transform']
decades = list(zip(years[:-1], years[1:]))

for y_start, y_end in decades:
    pv_start, pv_end = pv_by_year.loc[y_start], pv_by_year.loc[y_end]
    span = y_end - y_start
    # For every year in [y_start, y_end): linearly interpolate pv_transform
    # between the interval's end points, scale the yearly pixel generation by
    # it, and divide by the pixel area in hectares (area_m2 / 10000).
    # NOTE(review): 0.27 is presumably the CO2 -> C mass ratio (12/44) - confirm.
    merged_data_for_analysis[f'pv_potential_{y_start}_{y_end-1}_sum'] = sum(
        merged_data_for_analysis['E_yr_pixel']
        * (pv_start + (pv_end - pv_start) * (y - y_start) / span)
        / 1000 / 1000 * 0.27
        / (merged_data_for_analysis['area_m2'] / 10000)
        for y in range(y_start, y_end)
    )

# Total over all intervals: cumulative PV mitigation per site (t C, per the
# original author's note).
decade_columns = [f'pv_potential_{y_start}_{y_end-1}_sum' for y_start, y_end in decades]
merged_data_for_analysis['pv_potential_total(t)'] = sum(
    merged_data_for_analysis[column] for column in decade_columns
)

# Generation per hectare; the factor 30 presumably matches the 30 years
# accumulated above - confirm.
merged_data_for_analysis['power_generation_kwha'] = (
    merged_data_for_analysis['E_yr_pixel'] * 30 / merged_data_for_analysis['area_m2'] * 10000
)

In [77]:
# Adjusted net benefit: cumulative PV mitigation minus the LNCS expectation.
merged_data_for_analysis['Expectation_net_benefit_adjust'] = (
    merged_data_for_analysis['pv_potential_total(t)'] - merged_data_for_analysis['LNCS_expect']
)

# Remove the per-decade intermediates now that the total is stored.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop raised
# KeyError on a second execution because the columns were already gone.
merged_data_for_analysis = merged_data_for_analysis.drop(
    columns=[
        'pv_potential_2040_2049_sum',
        'pv_potential_2030_2039_sum',
        'pv_potential_2020_2029_sum',
    ],
    errors='ignore',
)

In [78]:
# Persist the merged per-pixel table (row index not written).
merged_output_csv = 'data/US_data/df_merged_data_for_analysis.csv'
merged_data_for_analysis.to_csv(merged_output_csv, index=False)

In [85]:
# Preview the merged per-pixel table; the rendered output below shows only the
# first and last rows.
merged_data_for_analysis

Unnamed: 0,lat,lon,predicted_prob,gmm_density,sample_type,Expectation_net_benefit,pv_potential_dens,LNCS_expect,net_npv_usd,area_m2,E_yr_pixel,pv_potential_total(t),power_generation_kwha,Expectation_net_benefit_adjust
0,25.295834,-80.287500,0.000000,3.874315e+16,negative_sample,6382.303639,7191.865919,809.562299,407275.912577,776295.361002,1.958127e+08,5568.344207,7.567199e+07,4758.781908
1,25.437500,-80.537500,0.046746,2.054387e+18,prediction,5690.624063,7035.368191,1344.744161,283953.326746,775385.854124,1.913273e+08,5447.174926,7.402534e+07,4102.430766
2,25.437500,-80.495834,0.000000,4.247702e+15,negative_sample,6055.300591,7024.124059,968.823440,275019.106094,775385.854124,1.910215e+08,5438.469092,7.390703e+07,4469.645652
3,25.445833,-80.454170,0.000000,1.117299e+13,negative_sample,6048.645138,7017.640780,968.995616,269947.631894,775332.207938,1.908320e+08,5433.449375,7.383881e+07,4464.453759
4,25.445833,-80.404170,0.017465,4.463459e+17,prediction,6081.010322,7045.490473,964.480123,291620.314126,775332.207938,1.915894e+08,5455.012162,7.413184e+07,4490.532039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64977,48.995834,-111.970830,0.026153,1.493778e+17,prediction,6210.923349,6392.674772,181.751426,-220868.055076,563356.382622,1.263101e+08,4949.565792,6.726299e+07,4767.814366
64978,48.995834,-110.345830,0.068266,4.120084e+17,prediction,6206.053157,6395.853979,189.800818,-219393.096487,563356.382622,1.263729e+08,4952.027311,6.729644e+07,4762.226494
64979,48.995834,-106.020836,0.065097,3.232082e+17,prediction,6204.952188,6412.037900,207.085706,-205704.827949,563356.382622,1.266927e+08,4964.557807,6.746673e+07,4757.472100
64980,48.995834,-104.087500,0.109234,6.627032e+21,prediction,6077.421747,6292.888713,215.466966,-300314.323514,563356.382622,1.243385e+08,4872.305853,6.621305e+07,4656.838886


In [93]:
# Refresh two df_weight columns with the values recomputed in
# merged_data_for_analysis, matched on (lat, lon):
#   Expectation_net_benefit <- Expectation_net_benefit_adjust
#   pv_potential_dens       <- pv_potential_total(t)
#
# The stale columns are dropped *before* the merge, and only if present, so the
# cell gives the same result on a fresh top-to-bottom run and on a re-run.
# Previously the drop lived in a separate cell placed after the merge; run in
# order, it deleted the columns that had just been assigned.
refreshed_columns = {
    'Expectation_net_benefit_adjust': 'Expectation_net_benefit',
    'pv_potential_total(t)': 'pv_potential_dens',
}
adjusted_values = merged_data_for_analysis[['lat', 'lon', *refreshed_columns]]

df_weight = (
    df_weight
    .drop(columns=list(refreshed_columns.values()), errors='ignore')
    # validate='many_to_one' raises MergeError if a (lat, lon) pair is
    # duplicated on the right-hand side; without it such duplicates add rows
    # and the refreshed values end up on the wrong pixels.
    .merge(adjusted_values, on=['lat', 'lon'], how='left', validate='many_to_one')
    .rename(columns=refreshed_columns)
)


In [94]:
# Preview df_weight; the rendered output below ends with the LNCS_expect,
# Expectation_net_benefit and pv_potential_dens columns.
df_weight

Unnamed: 0,lat,lon,time,acc_forest,cap_forest,final_forest,weighted_density_Forest,weighted_density_Agricultural,weighted_density_Vegetation,acc_agro,...,env_scores,Revenue_ratio,gmm_score,gmm_density,predicted_label,predicted_prob,sample_type,LNCS_expect,Expectation_net_benefit,pv_potential_dens
0,25.295834,-80.287500,2020-01-01,809.56850,859.4,809.5685,0.998714,0.000367,0.000918,772.493469,...,0.272397,8.883647,0.051633,3.874315e+16,0,0.000000,negative_sample,809.562299,4758.781908,5568.344207
1,25.437500,-80.537500,2020-01-01,1389.71330,1436.6,1389.7133,0.829869,0.007830,0.162302,1102.152466,...,0.279062,5.231752,0.107510,2.054387e+18,0,0.046746,prediction,1344.744161,4102.430766,5447.174926
2,25.437500,-80.495834,2020-01-01,1026.14620,1072.6,1026.1462,0.771417,0.022229,0.206354,750.145691,...,0.272678,7.250159,0.033807,4.247702e+15,0,0.000000,negative_sample,968.823440,4469.645652,5438.469092
3,25.445833,-80.454170,2020-01-01,1026.14620,1073.7,1026.1462,0.770923,0.007369,0.221708,756.315613,...,0.309372,7.242180,0.010552,1.117299e+13,0,0.000000,negative_sample,968.995616,4464.453759,5433.449375
4,25.445833,-80.404170,2020-01-01,1051.13070,1093.3,1051.1307,0.714253,0.099801,0.185946,728.004333,...,0.307317,7.304962,0.081528,4.463459e+17,0,0.017465,prediction,964.480123,4490.532039,5455.012162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64977,48.995834,-111.970830,2020-01-01,201.90521,182.2,182.2000,0.952007,0.017433,0.030560,170.484009,...,0.251081,35.172626,0.066566,1.493778e+17,0,0.026153,prediction,181.751426,4767.814366,4949.565792
64978,48.995834,-110.345830,2020-01-01,221.94190,198.1,198.1000,0.626705,0.332696,0.040599,175.665878,...,0.296190,33.697715,0.080337,4.120084e+17,0,0.068266,prediction,189.800818,4762.226494,4952.027311
64979,48.995834,-106.020836,2020-01-01,232.56694,207.1,207.1000,0.889626,0.003256,0.107118,209.715591,...,0.177785,30.963208,0.076823,3.232082e+17,0,0.065097,prediction,207.085706,4757.472100,4964.557807
64980,48.995834,-104.087500,2020-01-01,249.97507,225.2,225.2000,0.456741,0.057912,0.485347,204.508926,...,0.284555,29.205817,0.377389,6.627032e+21,0,0.109234,prediction,215.466966,4656.838886,4872.305853


In [None]:

# df_weight.to_csv('data/US_data/df_weight.csv', index=False)
