# GDW Dams Processing for Africa: Spatial Join, Validation, and Filtering
This notebook processes GDW dam data for Africa by:
1. Loading and spatially joining dam and arid region data
2. Validating ISO matching and saving arid SSA dams
3. Filtering for irrigation dams (`MAIN_USE` contains "Irrigation") with `DAM_HGT_M` > 15 and saving them
4. Summarizing cumulative dam counts at 5-year intervals (1980–2015)

In [None]:
# Import required libraries
import geopandas as gpd
import sys
import os

# Make the repository root importable. The root is taken to be two directory
# levels above the current working directory, so this notebook must be run
# from its own folder for the Code.utils import below to resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# This import has to stay below the sys.path adjustment above.
from Code.utils.utility import load_config, resolve_path, africa_iso_countries_filtered

# Project configuration; later cells index it by key to obtain dataset paths.
config = load_config()

## 1. Load and spatially join dam and arid region data

In [2]:
# Resolve both input paths from the project configuration.
ssa_arid_path = resolve_path(config['SSA_Arid_by_Country_shp_path'])
gdw_dams_path = resolve_path(config['GDW_barrier_shp_path'])

# Read the GDW dam layer first: its CRS is the reference for the join.
gdf_dams = gpd.read_file(gdw_dams_path)

# Read the SSA arid-by-country layer and reproject it onto the dam layer's CRS
# so that the spatial predicate compares geometries in the same coordinates.
gdf_ssa_arid = gpd.read_file(ssa_arid_path).to_crs(gdf_dams.crs)

# Keep only the columns the join needs from each side.
dam_columns = gdf_dams[['geometry', 'COUNTRY', 'GDW_ID']]
arid_columns = gdf_ssa_arid[['geometry', 'ISO', 'NAME_0']]

# Inner join: a dam is kept only if it intersects an arid-region geometry, and
# it picks up that geometry's ISO and NAME_0 attributes.
gdf_dams_spatial = dam_columns.sjoin(
    arid_columns,
    how="inner",
    predicate="intersects",
    rsuffix="_bounds",  # suffix for right-hand column names that would clash
)

# Invert the utility mapping (ISO -> iterable of country names) into a
# country-name -> ISO lookup, then derive an ISO code from the dam layer's own
# 'COUNTRY' attribute. Names absent from the lookup map to NaN.
country_to_iso = {
    country_name: iso_code
    for iso_code, country_names in africa_iso_countries_filtered.items()
    for country_name in country_names
}
gdf_dams_spatial['ISO_from_country'] = gdf_dams_spatial['COUNTRY'].map(country_to_iso)

# Full-attribute dam records for every dam that survived the spatial join.
is_in_arid_ssa = gdf_dams['GDW_ID'].isin(gdf_dams_spatial['GDW_ID'])
dams_in_arid_ssa = gdf_dams.loc[is_in_arid_ssa].copy()

## 2. Validate ISO matching and save arid SSA dams

In [3]:
# Flag, row by row, whether the ISO taken from the arid-region geometry agrees
# with the ISO derived from the dam's own COUNTRY attribute. A missing value on
# either side compares unequal and is therefore reported as a mismatch.
gdf_dams_spatial['ISO_match'] = gdf_dams_spatial['ISO'].eq(gdf_dams_spatial['ISO_from_country'])

# Summary counts for the validation report.
num_total_dams = len(gdf_dams_spatial)
num_matching_isos = gdf_dams_spatial['ISO_match'].sum()
num_mismatched_isos = num_total_dams - num_matching_isos

print(f"Total dams spatially matched: {num_total_dams}")
print(f"Number of dams with matching ISOs: {num_matching_isos}")
print(f"Number of dams with mismatched ISOs: {num_mismatched_isos}")

# List the disagreeing records so they can be inspected by hand.
if num_mismatched_isos == 0:
    print("All dams have matching ISOs.")
else:
    report_columns = ['GDW_ID', 'COUNTRY', 'ISO', 'ISO_from_country']
    mismatched_rows = gdf_dams_spatial.loc[~gdf_dams_spatial['ISO_match'], report_columns]
    print("The following dams have mismatched ISOs:")
    print(mismatched_rows.drop_duplicates())

# Attach the COUNTRY-derived ISO (not the spatially joined one) to the
# full-attribute dam table, keyed by GDW_ID.
iso_by_gdw_id = gdf_dams_spatial.set_index('GDW_ID')['ISO_from_country']
iso_mapping = iso_by_gdw_id.to_dict()
dams_in_arid_ssa['ISO'] = dams_in_arid_ssa['GDW_ID'].map(iso_mapping)

# Persist the arid-SSA dam table as an ESRI Shapefile at the configured path.
output_path = resolve_path(config['GDW_Arid_SSA_Final_shp_path'])
dams_in_arid_ssa.to_file(output_path, driver="ESRI Shapefile")

print(f"Finalized dataset saved to {output_path}")

Total dams spatially matched: 2764
Number of dams with matching ISOs: 2758
Number of dams with mismatched ISOs: 6
The following dams have mismatched ISOs:
       GDW_ID       COUNTRY  ISO ISO_from_country
62         63      Zimbabwe  ZMB              ZWE
6664     6665      Zimbabwe  ZAF              ZWE
40788   40789      Botswana  ZAF              BWA
40812   40813      Botswana  ZAF              BWA
40833   40834  South Africa  NAM              ZAF
40836   40837  South Africa  NAM              ZAF
Finalized dataset saved to /home/waves/data/Africa_Irrigation/Data/Processed/GDW_Arid_SSA_Final-shp/GDW_Arid_SSA_Final.shp


## 3. Filter for irrigation dams and save

In [4]:

# Reload the arid-SSA dams from disk so this cell does not depend on the
# in-memory table built by the earlier cells.
dams_in_arid_ssa = gpd.read_file(resolve_path(config['GDW_Arid_SSA_Final_shp_path']))

# Step 1: keep dams whose MAIN_USE mentions "Irrigation" (case-insensitive);
# records with a missing MAIN_USE are treated as non-matching.
is_irrigation = dams_in_arid_ssa['MAIN_USE'].str.contains('Irrigation', case=False, na=False)
gdf_dams_irrigation = dams_in_arid_ssa.loc[is_irrigation].copy()

# Step 2: of those, keep dams with DAM_HGT_M strictly greater than 15
# (presumably metres, going by the column suffix — confirm against the GDW docs).
is_above_height_cutoff = gdf_dams_irrigation['DAM_HGT_M'].gt(15)
gdf_dams_filtered = gdf_dams_irrigation.loc[is_above_height_cutoff].copy()

# Report how many records remain after each step.
print(f"Number of dams before filtering by MAIN_USE: {len(dams_in_arid_ssa)}")
print(f"Number of dams after filtering by MAIN_USE: {len(gdf_dams_irrigation)}")
print(f"Number of dams after filtering by DAM_HGT_M: {len(gdf_dams_filtered)}")

# Persist the filtered irrigation dams as an ESRI Shapefile at the configured path.
output_path = resolve_path(config['GDW_Arid_SSA_Final_Irr_shp_path'])
gdf_dams_filtered.to_file(output_path, driver="ESRI Shapefile")

print(f"Finalized dataset saved to {output_path}")

Number of dams before filtering by MAIN_USE: 2764
Number of dams after filtering by MAIN_USE: 270
Number of dams after filtering by DAM_HGT_M: 171
Finalized dataset saved to /home/waves/data/Africa_Irrigation/Data/Processed/GDW_Arid_SSA_Final-shp/GDW_Arid_SSA_Final_Irr.shp


## 4. Summarize dam counts by year

In [5]:
# Load the finalized irrigation dataset written by the previous section.
gdf_dams_irrigation = gpd.read_file(resolve_path(config['GDW_Arid_SSA_Final_Irr_shp_path']))

# Reporting years: every 5 years from 1980 up to and including 2015.
years = list(range(1980, 2020, 5))

# A plain `YEAR_DAM <= year` test would count any record whose year is a
# negative "no data" sentinel as existing in every interval.
# NOTE(review): GDW/GRanD-style attribute tables are understood to code a
# missing year as -99 — confirm against the GDW technical documentation.
# Only strictly positive years are treated as known; NaN also fails this test.
year_built = gdf_dams_irrigation['YEAR_DAM']
has_known_year = year_built > 0
num_unknown_year = int((~has_known_year).sum())

# Cumulative count of dams with a known construction year at or before each
# reporting year.
dam_counts = {
    year: int((has_known_year & (year_built <= year)).sum())
    for year in years
}

# Print the counts
for year, count in dam_counts.items():
    print(f"Dams existing by {year}: {count}")

# Surface the excluded records so the totals above can be reconciled with the
# size of the dataset; nothing extra is printed when every year is known.
if num_unknown_year > 0:
    print(f"Dams with unknown construction year (excluded from counts): {num_unknown_year}")

Dams existing by 1980: 114
Dams existing by 1985: 137
Dams existing by 1990: 153
Dams existing by 1995: 159
Dams existing by 2000: 161
Dams existing by 2005: 162
Dams existing by 2010: 164
Dams existing by 2015: 168
