In [None]:
# Comparing our national N surplus (tonnes) with previous literature

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Inputs expected from earlier cells: N_bud_filtered (our data), data_pathak, zhang_21,
# and the FAO table `data` from which the cropland-nitrogen rows are filtered further down.

# Yearly national N surplus: the per-row surplus is summed within each year.
# NOTE(review): the frame and its label say "average", but the aggregation is a sum — confirm intent.
national_avg = (
    N_bud_filtered
    .groupby('Year')['N_surplus_kg']
    .sum()
    .reset_index()
)
national_avg['State Name'] = 'National Average'

# kg -> tonnes, for the row-level data and for the yearly national figure
kg_per_tonne = 1000
N_bud_filtered['N_surplus_tonnes'] = N_bud_filtered['N_surplus_kg'] / kg_per_tonne
national_avg['N_surplus_tonnes'] = national_avg['N_surplus_kg'] / kg_per_tonne

# Row-wise mean / min / max across the 12 budget estimates, converted to tonnes
budget_columns = [f'N_budjet_{i}_kg' for i in range(1, 13)]
budget_estimates_kg = N_bud_filtered[budget_columns]
N_bud_filtered['mean_surplus'] = budget_estimates_kg.mean(axis=1) / kg_per_tonne
N_bud_filtered['min_surplus'] = budget_estimates_kg.min(axis=1) / kg_per_tonne
N_bud_filtered['max_surplus'] = budget_estimates_kg.max(axis=1) / kg_per_tonne

# Yearly totals of the three row-wise statistics
national_stats = (
    N_bud_filtered
    .groupby('Year')[['mean_surplus', 'min_surplus', 'max_surplus']]
    .sum()
    .reset_index()
)

# Select the "Cropland nitrogen" soil-nutrient-budget rows reported in tonnes
cropland_nitrogen_data = data[
    (data['Element'] == 'Cropland nitrogen')
    & (data['Unit'] == 'tonnes')
    & (data['Item'] == 'Soil nutrient budget')
]
if cropland_nitrogen_data.empty:
    # Fail with a readable message instead of an IndexError on index[0] below
    raise ValueError(
        "No row in `data` matches Element='Cropland nitrogen', Unit='tonnes', "
        "Item='Soil nutrient budget'"
    )

# Columns from position 8 onward are treated as the per-year columns
# (assumes the first 8 columns are metadata — TODO confirm against the loaded file).
year_columns = cropland_nitrogen_data.columns[8:]

# Keep only the digits of each column label. The pattern is a raw string because
# '\d' inside a normal string literal is an invalid escape sequence.
years = year_columns.str.extract(r'(\d+)').astype(int).iloc[:, 0]

# Only the first matching row is used. Convert it to a numeric array so plotting and
# the R² computation receive numbers; non-numeric cells raise here rather than later.
values = pd.to_numeric(
    cropland_nitrogen_data.loc[cropland_nitrogen_data.index[0], year_columns]
).to_numpy()

# Make 'Year' numeric in data_pathak; entries that cannot be parsed become NaN
data_pathak['Year'] = pd.to_numeric(data_pathak['Year'], errors='coerce')

# R² Calculation
def compute_r2(y_true, y_pred):
    """Return the coefficient of determination (R²) of ``y_pred`` against ``y_true``.

    Thin wrapper around ``sklearn.metrics.r2_score``: both arguments are passed
    through positionally and unchanged. R² is not symmetric in its two
    arguments, so the order in which callers pass the series matters.
    """
    score = r2_score(y_true, y_pred)
    return score

# Align each literature series to the years present in our national statistics.
target_years = national_stats['Year']

# 'Year' in data_pathak is coerced with errors='coerce' earlier in this cell, so it can
# hold NaN. Several NaN labels would make the index non-unique and reindex() would
# raise, so rows without a usable year are dropped before indexing.
pathak_by_year = data_pathak.dropna(subset=['Year']).set_index('Year')

# Drop a year only when the compared column itself is missing; a gap in some
# unrelated column must not remove that year from the R² sample.
aligned_pathak = pathak_by_year.reindex(target_years).dropna(subset=['N surplus (t)'])
aligned_zhang = (
    zhang_21.set_index('Year')
    .reindex(target_years)
    .dropna(subset=['Benchmark_median_N_surplus_ton'])
)
aligned_cropland = pd.Series(values, index=years).reindex(target_years).dropna()

# Ensure all aligned indices are integers
aligned_pathak.index = aligned_pathak.index.astype(int)
aligned_zhang.index = aligned_zhang.index.astype(int)
aligned_cropland.index = aligned_cropland.index.astype(int)

# From here on national_stats is indexed by Year (the plotting code relies on this)
national_stats = national_stats.set_index('Year')

# Debugging output to verify alignment
print(f"Aligned Pathak: {aligned_pathak.index}")
print(f"Aligned Zhang: {aligned_zhang.index}")
print(f"Aligned Cropland: {aligned_cropland.index}")
print(f"National Stats: {national_stats.index}")

# Compute R² values on the overlapping years.
# NOTE(review): our series is passed as y_true and the literature series as y_pred;
# r2_score is not symmetric, so confirm this is the intended reference.
r2_pathak = compute_r2(national_stats.loc[aligned_pathak.index, 'mean_surplus'], aligned_pathak['N surplus (t)'])
r2_zhang = compute_r2(national_stats.loc[aligned_zhang.index, 'mean_surplus'], aligned_zhang['Benchmark_median_N_surplus_ton'])
r2_cropland = compute_r2(national_stats.loc[aligned_cropland.index, 'mean_surplus'], aligned_cropland)

# Figure: national N surplus in tonnes — our estimate against the literature series
fig, ax = plt.subplots(figsize=(14, 8))

# Our data: min–max envelope with the mean line drawn on top
ax.fill_between(
    national_stats.index,
    national_stats['min_surplus'],
    national_stats['max_surplus'],
    color='gray',
    alpha=0.3,
    label='Uncertainty bound (our data)',
)
sns.lineplot(
    data=national_stats,
    x=national_stats.index,
    y='mean_surplus',
    color='black',
    linewidth=3,
    label='Cropland Nitrogen surplus(t) our data',
    ax=ax,
)

# Cropland nitrogen series extracted from the FAO-style table
ax.plot(
    years,
    values,
    marker='o',
    linestyle='-',
    color='green',
    label=f'Cropland Nitrogen surplus (t) Ludemann, Cameron I., et al.2024, R²={r2_cropland:.2f}',
)

# Zhang 2021 benchmark: min–max envelope plus the median line
ax.fill_between(
    zhang_21['Year'],
    zhang_21['Benchmark_min_N_surplus_ton'],
    zhang_21['Benchmark_max_N_surplus_ton'],
    color='red',
    alpha=0.3,
    label='Uncertainty bound (Zhang, Xin, et al.2021)',
)
sns.lineplot(
    data=zhang_21,
    x='Year',
    y='Benchmark_median_N_surplus_ton',
    color='red',
    linewidth=3,
    label=f'N surplus (t) Zhang, Xin, et al.2021, R²={r2_zhang:.2f}',
    ax=ax,
)

# Axis labels, ticks, grid and legend
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('N surplus (tonnes)', fontsize=14)
plt.xticks(rotation=45)
ax.grid(True)
ax.legend(loc='upper left')
fig.tight_layout()

# Write the figure to disk before showing it
fig.savefig(r'E:\nsurplus_paper\data\final_our_data\nitrogen tonne _comparison.png', dpi=600, bbox_inches='tight')

plt.show()


In [None]:
# Comparing N surplus with previous literature using kg/ha values

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

# Constants
# NOTE: despite its name, this value is the number of hectares in one square
# kilometre (ha per km²). Dividing a "per km²" quantity by it yields "per ha".
km2_per_ha = 100  # 1 square km is 100 hectares

# R² Calculation
def compute_r2(y_true, y_pred):
    """Return the coefficient of determination (R²) of ``y_pred`` against ``y_true``.

    Thin wrapper around ``sklearn.metrics.r2_score``: both arguments are passed
    through positionally and unchanged. R² is not symmetric in its two
    arguments, so the order in which callers pass the series matters.

    NOTE: a function with the same name and body is also defined in the
    tonnes-comparison cell earlier in this notebook.
    """
    score = r2_score(y_true, y_pred)
    return score

# Zhang 2015 series: kg N km-2 -> kg N ha-1 (divide by the 100 ha contained in one km²)
df_xin_zang['Nitrogen Surplus (unit: kg/ha)'] = df_xin_zang['Nitrogen Surplus (unit: kg N km-2)'] / km2_per_ha

# Ludemann table: keep the "Cropland nitrogen per unit area" rows expressed in kg/ha
is_per_unit_area = data_luedeman['Element'] == 'Cropland nitrogen per unit area'
is_kg_per_ha = data_luedeman['Unit'] == 'kg/ha'
cropland_nitrogen_per_area_luedeman = data_luedeman[is_per_unit_area & is_kg_per_ha]

# Columns from position 8 onward are taken as the per-year columns; only the first
# matching row is read (assumes 8 leading metadata columns — TODO confirm).
years_luedeman = cropland_nitrogen_per_area_luedeman.columns[8:]
first_row_label = cropland_nitrogen_per_area_luedeman.index[0]
values_luedeman = cropland_nitrogen_per_area_luedeman.loc[first_row_label, years_luedeman].values

# Year labels such as 'Y1961' become the integer 1961; labels without the prefix are parsed directly
years_luedeman_converted = [
    int(label[1:] if label.startswith('Y') else label)
    for label in years_luedeman
]

# Zhang 2015: integer years plus the matching kg/ha values
df_xin_zang['Year'] = df_xin_zang['Year'].astype(int)
years_xin_zang_updated = df_xin_zang['Year'].values
nitrogen_surplus_xin_zang = df_xin_zang['Nitrogen Surplus (unit: kg/ha)']

# Put both series in ascending-year order so they plot left to right
sorted_indices_xin_zang = np.argsort(years_xin_zang_updated)
years_xin_zang_updated = years_xin_zang_updated[sorted_indices_xin_zang]
nitrogen_surplus_xin_zang = nitrogen_surplus_xin_zang.iloc[sorted_indices_xin_zang]

years_luedeman_converted = np.array(years_luedeman_converted)
sorted_indices_luedeman = np.argsort(years_luedeman_converted)
years_luedeman_converted = years_luedeman_converted[sorted_indices_luedeman]
values_luedeman = values_luedeman[sorted_indices_luedeman]

# Restrict the Ludemann series to 1966–2017 (both ends inclusive)
luedeman_mask = np.logical_and(years_luedeman_converted >= 1966, years_luedeman_converted <= 2017)
years_luedeman_converted = years_luedeman_converted[luedeman_mask]
values_luedeman = values_luedeman[luedeman_mask]

# Restrict the Zhang 2021 benchmark table to 1966–2017 (both ends inclusive).
# .copy() makes `data` an independent frame, so the column assignments below do not
# write into a slice of data_1 (which triggers SettingWithCopyWarning and may not stick).
# NOTE(review): this rebinds the name `data`, which the tonnes-comparison cell reads
# as the table it filters by 'Element'/'Unit'/'Item'; re-running that cell after
# this one will therefore see this frame instead.
data = data_1[data_1['Year'].between(1966, 2017)].copy()

# Surplus per area in kg/ha. The factor 10 is 1000 kg per tonne divided by 100 ha per
# km², assuming the columns are in tonnes and km² as their names indicate.
# The upper bound pairs the largest surplus with the smallest area, the lower bound
# the smallest surplus with the largest area.
data['Max_N_surplus_kg_ha'] = (10 * data['Benchmark_max_N_surplus_ton']) / data['Benchmark_min_Area_km2']
data['Min_N_surplus_kg_ha'] = (10 * data['Benchmark_min_N_surplus_ton']) / data['Benchmark_max_Area_km2']
data['Median_N_surplus_kg_ha'] = (10 * data['Benchmark_median_N_surplus_ton']) / data['Benchmark_median_Area_km2']

# Our data restricted to 1966–2017 (both ends inclusive)
our_data_filtered = dk[dk['Year'].between(1966, 2017)]

# The 12 per-hectare budget estimates that make up each year's boxplot sample
ensemble_columns_kg_ha = ['N_budjet_1_kg_ha', 'N_budjet_2_kg_ha', 'N_budjet_3_kg_ha',
       'N_budjet_4_kg_ha', 'N_budjet_5_kg_ha', 'N_budjet_6_kg_ha',
       'N_budjet_7_kg_ha', 'N_budjet_8_kg_ha', 'N_budjet_9_kg_ha',
       'N_budjet_10_kg_ha', 'N_budjet_11_kg_ha', 'N_budjet_12_kg_ha']

# One flat array per year, in ascending-year order, holding every estimate of that year.
# NOTE(review): `years` is also assigned in the tonnes-comparison cell; this rebinds it.
years = sorted(our_data_filtered['Year'].unique())
boxplot_data = [
    our_data_filtered.loc[our_data_filtered['Year'] == year, ensemble_columns_kg_ha].values.flatten()
    for year in years
]

# Years shared between our data and each literature series (sorted, unique)
common_years_xz = np.intersect1d(our_data_filtered['Year'], years_xin_zang_updated)
common_years_lud = np.intersect1d(our_data_filtered['Year'], years_luedeman_converted)
common_years_z21 = np.intersect1d(our_data_filtered['Year'], data['Year'])

# The R² inputs are paired by position, so both sides of each pair must be in the
# same ascending-year order. The Zhang 2015 and Ludemann series are sorted by year
# where they are prepared; our data and the Zhang 2021 frame are only filtered, so
# they are sorted here to keep year i on one side matched with year i on the other.
our_sorted = our_data_filtered.sort_values('Year', kind='stable')
z21_sorted = data.sort_values('Year', kind='stable')


def _check_paired(ours, reference, name):
    """Raise a readable error when the two samples cannot be paired one-to-one by year."""
    if len(ours) != len(reference):
        raise ValueError(
            f"{name}: {len(ours)} rows of our data vs {len(reference)} reference values; "
            "expected exactly one value per common year on each side"
        )


aligned_our_data_xz = our_sorted[our_sorted['Year'].isin(common_years_xz)]
aligned_xz = nitrogen_surplus_xin_zang[np.isin(years_xin_zang_updated, common_years_xz)]
_check_paired(aligned_our_data_xz, aligned_xz, 'Zhang 2015')

aligned_our_data_lud = our_sorted[our_sorted['Year'].isin(common_years_lud)]
aligned_lud = values_luedeman[np.isin(years_luedeman_converted, common_years_lud)]
_check_paired(aligned_our_data_lud, aligned_lud, 'Ludemann 2024')

aligned_our_data_z21 = our_sorted[our_sorted['Year'].isin(common_years_z21)]
aligned_z21 = z21_sorted.loc[z21_sorted['Year'].isin(common_years_z21), 'Median_N_surplus_kg_ha']
_check_paired(aligned_our_data_z21, aligned_z21, 'Zhang 2021')

# R² of each literature series against our yearly mean.
# NOTE(review): our series is passed as y_true and the literature series as y_pred;
# r2_score is not symmetric, so confirm this is the intended reference.
r2_value_1 = compute_r2(aligned_our_data_xz['mean_N_budjet_kg_ha'], aligned_xz)
r2_value_2 = compute_r2(aligned_our_data_lud['mean_N_budjet_kg_ha'], aligned_lud)
r2_value_3 = compute_r2(aligned_our_data_z21['mean_N_budjet_kg_ha'], aligned_z21)

# Figure: N surplus in kg/ha — literature series with our per-year boxplots on top
fig, ax = plt.subplots(figsize=(18, 12))

# Zhang 2021 benchmark: max / min bounds, median line and the shaded range between the bounds
ax.plot(
    data['Year'], data['Max_N_surplus_kg_ha'],
    label='Max N Surplus (kg/ha) Zhang, Xin, et al. 2021',
    color='#1f77b4', linestyle='--', linewidth=2,
)
ax.plot(
    data['Year'], data['Min_N_surplus_kg_ha'],
    label='Min N Surplus (kg/ha) Zhang, Xin, et al. 2021',
    color='#ff7f0e', linestyle='--', linewidth=2,
)
ax.plot(
    data['Year'], data['Median_N_surplus_kg_ha'],
    label='Median N Surplus (kg/ha) Zhang, Xin, et al. 2021 (R² = {:.2f})'.format(r2_value_3),
    color='#2ca02c', linestyle='-.', linewidth=2, marker='o',
)
ax.fill_between(
    data['Year'], data['Min_N_surplus_kg_ha'], data['Max_N_surplus_kg_ha'],
    color='grey', alpha=0.2, label='Uncertainty Range Zhang, Xin, et al. 2021',
)

# Zhang 2015 series
ax.plot(
    years_xin_zang_updated, nitrogen_surplus_xin_zang,
    marker='o', linestyle='-', color='#d62728',
    label='Zhang, Xin, et al. 2015 (R² = {:.2f})'.format(r2_value_1), linewidth=2,
)

# Ludemann 2024 series
ax.plot(
    years_luedeman_converted, values_luedeman,
    marker='s', linestyle='-', color='#9467bd',
    label='Cameron I. Ludemann et al. 2024 (R² = {:.2f})'.format(r2_value_2), linewidth=2,
)

# Our study: one unfilled black boxplot per year, positioned at that year on the x-axis
box = ax.boxplot(
    boxplot_data,
    positions=years,
    widths=0.6,
    patch_artist=True,
    boxprops=dict(facecolor='none', color='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
    flierprops=dict(markerfacecolor='black', marker='o', markersize=5),
)

# Join the yearly medians of our samples with a solid black line
medians = [np.median(sample) for sample in boxplot_data]
ax.plot(years, medians, color='black', linestyle='-', linewidth=2, label='Our study Median')

# Work on the figure and axes that the preceding plotting code drew into
ax = plt.gca()
fig = plt.gcf()

# Tick styling
plt.yticks(fontsize=12, color='black')
plt.xticks(rotation=45, fontsize=12, color='black')

# Axis labels (the title is intentionally left disabled)
ax.set_xlabel('Year', fontsize=14, color='black')
ax.set_ylabel('N Surplus (kg/ha)', fontsize=14, color='black')
#plt.title('Comparison of Temporal Uncertainty of N Surplus', fontsize=18, color='black')

# Light dashed grid on the major ticks
ax.grid(True, which='major', linestyle='--', linewidth=0.5, color='grey')

# Legend: the automatically collected entries plus a manual square marker for the
# boxplots, which is shown under the label 'Our study'
handles, labels = ax.get_legend_handles_labels()
box_patch = plt.Line2D([0], [0], color='black', marker='s', markersize=10, markerfacecolor='none', label='Our study Boxplot')
handles = handles + [box_patch]
labels = labels + ['Our study']
ax.legend(handles, labels, fontsize=12, loc='lower right', frameon=False)

# Fit everything drawn so far inside the figure area
fig.tight_layout()

# Thick black frame on all four sides
for side in ('bottom', 'left', 'top', 'right'):
    ax.spines[side].set_color('black')
    ax.spines[side].set_linewidth(2)

# Extra black lines along the current lower y-limit and left x-limit to emphasise the axes
ax.axhline(y=ax.get_ylim()[0], color='black', linewidth=2)
ax.axvline(x=ax.get_xlim()[0], color='black', linewidth=2)

# Transparent axes and figure background
ax.patch.set_color('none')
fig.patch.set_color('none')

#plt.savefig(r'E:\nsurplus_paper\data\final_our_data\temporal_total_nitrogen_surplus_kg_ha_comparison_12.png', dpi=600, bbox_inches='tight')

plt.show()
