# DHS Boundary Web Scraper

This notebook contains a web scraper that will download the DHS boundary data.  
It cycles through the URLs that point to each country's boundary data, and then through the rows of the table shown for that country on the DHS boundary site. Not all countries have boundary data.

It then unzips the files and saves them in the appropriate folder.  

This notebook does take some time to run because it has to load the webpage for each country separately.

In [12]:
# import packages
import pandas as pd
import numpy as np
import os
import time
import zipfile
import shutil
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [13]:
# Load the DHS country-code lookup table.
# This CSV was written earlier, when the API was queried for the
# download of the DHS survey data.
country_codes = pd.read_csv('DHS_files/country_codes.csv')

# Preview the first 20 rows of the table
country_codes.head(20)

Unnamed: 0.1,Unnamed: 0,DHS_CountryCode,CountryName,ISO3_CountryCode
0,0,AF,Afghanistan,AFG
1,1,AL,Albania,ALB
2,2,AO,Angola,AGO
3,3,AM,Armenia,ARM
4,4,AZ,Azerbaijan,AZE
5,5,BD,Bangladesh,BGD
6,6,BJ,Benin,BEN
7,7,BO,Bolivia,BOL
8,8,BT,Botswana,BWA
9,9,BR,Brazil,BRA


In [14]:
# Scrape the DHS spatial-data site for every country listed in `country_codes`:
# open the country's boundary table, click the download button of each survey
# row, wait for the zip to land in a temporary download folder, and unpack it
# into <base_download_dir>/<ISO3>_<Country_Name>/01_Boundary_data/<survey_name>/.
# While the scrape runs, every print() goes to the log file instead of the
# notebook output.

# Make sure the log directory exists before opening the log file
os.makedirs("DHS_Error_logs", exist_ok=True)

# Open a log file in write mode
log_file = open("DHS_Error_logs/scraping_log1.txt", "w")

# Remember the stream that is active right now so it can be restored later.
# (In a notebook kernel sys.stdout has been replaced, so sys.__stdout__ is
# not the stream the notebook prints to.)
original_stdout = sys.stdout

# Redirect standard output to the log file
sys.stdout = log_file

driver = None
try:
    # Base directory for saving downloads
    base_download_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "Individual_country_data"))

    # Set up Selenium WebDriver with default download path
    options = webdriver.ChromeOptions()
    default_download_path = os.path.join(os.getcwd(), "temp_downloads")
    os.makedirs(default_download_path, exist_ok=True)

    # Add options to run in headless mode
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    options.add_experimental_option("prefs", {
        "download.default_directory": default_download_path,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "profile.default_content_setting_values.automatic_downloads": 1  # Allow multiple downloads
    })

    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 10)

    # Iterate over each country in the DataFrame
    for _, row in country_codes.iterrows():
        country = row["CountryName"]
        iso2_code = row["DHS_CountryCode"]
        iso3_code = row["ISO3_CountryCode"]

        print(f"Processing {country} ({iso2_code} -> {iso3_code})...")

        # Construct URL and navigate to it
        url = f"https://spatialdata.dhsprogram.com/boundaries/#countryId={iso2_code}&view=table"
        driver.get(url)
        time.sleep(3)  # Wait for page to load
        driver.refresh()  # Force a full refresh
        time.sleep(5)  # Give time for elements to reload

        # Define the country-specific folder
        country_folder = os.path.join(base_download_dir, f"{iso3_code}_{'_'.join(country.split())}", "01_Boundary_data")
        os.makedirs(country_folder, exist_ok=True)

        try:
            # Wait for the table to appear (raises if it never shows up,
            # e.g. for countries without boundary data)
            wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='boundaries-table']")))

            # Get all survey rows
            survey_rows = driver.find_elements(By.XPATH, "//table[@id='boundaries-table']/tbody/tr")

            for survey_row in survey_rows:
                try:
                    driver.execute_script("arguments[0].scrollIntoView(true);", survey_row)
                    time.sleep(0.5)  # Allow page to settle

                    survey_text = survey_row.find_element(By.XPATH, ".//td[2]").text.strip().replace("View Boundaries", "").strip()
                    parts = survey_text.split()
                    if not parts:
                        # Nothing to build a folder name from; skip instead of
                        # failing later with an opaque IndexError
                        print(f"Error processing row for {country}: empty survey description")
                        continue

                    # First all-digit token is taken as the year, last token as the survey type
                    year = next((part for part in parts if part.isdigit()), "Unknown")
                    survey_type = parts[-1]
                    survey_name = f"{iso3_code}{year}{survey_type}"

                    survey_folder = os.path.join(country_folder, survey_name)
                    # Remove the existing survey folder if it exists
                    if os.path.exists(survey_folder):
                        shutil.rmtree(survey_folder)  # Delete all existing files
                    os.makedirs(survey_folder, exist_ok=True)

                    # Clear leftovers from earlier rows (e.g. a timed-out
                    # .crdownload or a late-finishing zip) so they can neither
                    # block completion detection nor be extracted into this
                    # survey's folder
                    for stale_name in os.listdir(default_download_path):
                        stale_path = os.path.join(default_download_path, stale_name)
                        if os.path.isfile(stale_path):
                            try:
                                os.remove(stale_path)
                            except OSError as e:
                                print(f"Could not remove stale download {stale_name}: {e}")

                    download_button = survey_row.find_element(By.XPATH, ".//td[3]/button")
                    driver.execute_script("arguments[0].click();", download_button)
                    time.sleep(5)  # Wait for download

                    # Wait for download completion: a zip must be present and
                    # no partial (.crdownload) file may remain
                    max_wait = 30
                    download_complete = False

                    for _ in range(max_wait):
                        files = os.listdir(default_download_path)
                        has_zip = any(f.endswith('.zip') for f in files)
                        has_partial = any(f.endswith('.crdownload') for f in files)
                        if has_zip and not has_partial:
                            download_complete = True
                            break
                        time.sleep(1)

                    # Extract zip file
                    if download_complete:
                        for filename in os.listdir(default_download_path):
                            file_path = os.path.join(default_download_path, filename)
                            if os.path.isfile(file_path) and filename.endswith('.zip'):
                                try:
                                    # Unpack into a scratch folder first, then move
                                    # the contents up into the survey folder
                                    temp_extract_dir = os.path.join(survey_folder, "temp_extract")
                                    os.makedirs(temp_extract_dir, exist_ok=True)

                                    with zipfile.ZipFile(file_path, 'r') as zip_ref:
                                        zip_ref.extractall(temp_extract_dir)

                                    for extracted_file in os.listdir(temp_extract_dir):
                                        src = os.path.join(temp_extract_dir, extracted_file)
                                        dst = os.path.join(survey_folder, extracted_file)

                                        if os.path.exists(dst):
                                            os.remove(dst)  # Remove the existing file to prevent conflicts
                                        shutil.move(src, dst)  # Move the new file

                                    os.rmdir(temp_extract_dir)
                                    os.remove(file_path)

                                    print(f"Downloaded and extracted data for {survey_text}")

                                except Exception as e:
                                    print(f"Error extracting files for {survey_text}: {e}")
                                    # Keep the raw zip next to the survey so it can be inspected by hand
                                    shutil.move(file_path, os.path.join(survey_folder, filename))
                    else:
                        print(f"Download timeout for {survey_text}")

                except Exception as e:
                    # Best effort: log the failure and carry on with the next survey row
                    print(f"Error processing row for {country}: {e}")

        except Exception as e:
            # Best effort: log the failure and carry on with the next country
            print(f"Failed to process {country}: {e}")

    print("Scraping complete.")

finally:
    # Always release the browser, restore stdout and close the log,
    # even if the scrape is interrupted or fails part-way through
    if driver is not None:
        driver.quit()

    sys.stdout = original_stdout
    log_file.close()

print("Scraping complete. Check scraping_log1.txt for details.")
