# Code to regularly update most current OSM data for 02_Settlement_Data

### Run this one!

I suggest running this script about once per month so that the load it places on the API stays low. I have also built in safeguards to work around rate-limiting when re-downloading data, so the script should run without any issues.

Note: the `population` column often stores data in inconsistent formats, which causes this script to emit warnings. As far as I can tell, these warnings are not errors and do not point to any long-term issue. Also, some countries have no population data available for individual settlements. This can cause a warning when the shapefiles are created, but I do not believe it damages them.

In [2]:
# import required packages

import os
import pandas as pd
import numpy as np
import requests
import time
import datetime
import geopandas
from shapely.geometry import Point

# read in country metadata
# print the working directory first, because the workbook path below is relative to it
print(os.getcwd())

# NOTE(review): pandas' default NA parsing treats the literal string "NA" as missing, which would
# blank out Namibia's ISO2 code if it is in this workbook - confirm whether keep_default_na=False is needed
metadata = pd.read_excel('../Plan-EO_Country_meta-data.xlsx')

/sfs/ceph/standard/Plan-EO_Storage/Capstone-25


In [7]:
# remove unused metadata columns and relabel country name to match storage folder format
# .copy() makes the subset an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning
metadata = metadata[['Name', 'ISO2', 'ISO3']].copy()
# bracket assignment (not attribute assignment) and a literal, non-regex replace of spaces
metadata['Name'] = metadata['Name'].str.replace(' ', '_', regex=False)
metadata.head()

Unnamed: 0,Name,ISO2,ISO3
0,Afghanistan,AF,AFG
1,Algeria,DZ,DZA
2,Angola,AO,AGO
3,Argentina,AR,ARG
4,Armenia,AM,ARM


In [8]:
# This is a special Error class I have created to handle errors when the Overpass API rate-limits us.
# It does not need to be edited.

class APIRateLimitError(Exception):
    """Error raised when a request to the Overpass API does not succeed.

    Having a dedicated type lets calling code tell an API failure apart from
    other exceptions and react to it (for example by waiting and retrying).
    """

    def __init__(self, message):
        """Store ``message`` as the exception text."""
        super().__init__(message)

In [9]:
# we need this convert_to_csv() function in both scripts, it handles converting json to pandas DataFrame that can be saved as CSV

def convert_to_csv(settlements_json):
    """Convert a parsed Overpass API response into a pandas DataFrame.

    Parameters
    ----------
    settlements_json : dict
        Parsed JSON response; its 'elements' list is read.

    Returns
    -------
    pandas.DataFrame or None
        One row per element with the columns osm_id, name, place, lat, lon
        and population. Returns None when the response has no elements, so the
        caller can tell that nothing needs updating.
    """
    # for updating: if there are no new settlements for this country, return None
    elements = settlements_json.get('elements')
    if not elements:
        return None

    settlements = []
    for element in elements:
        # tolerate an element without a 'tags' object instead of raising KeyError
        tags = element.get('tags') or {}

        # Nodes carry lat/lon at the top level. Ways and relations requested
        # with "out center" carry them inside a nested 'center' object, so fall
        # back to that; otherwise those settlements would have no coordinates.
        center = element.get('center') or {}
        lat = element.get('lat')
        if lat is None:
            lat = center.get('lat')
        lon = element.get('lon')
        if lon is None:
            lon = center.get('lon')

        settlements.append({
            'osm_id': element['id'],
            'name': tags.get('name'),
            'place': tags.get('place'),
            'lat': lat,
            'lon': lon,
            'population': tags.get('population'),
        })

    # this return will be concatenated with the old CSV to create a fully updated CSV
    return pd.DataFrame(
        settlements,
        columns=['osm_id', 'name', 'place', 'lat', 'lon', 'population'],
    )


# This is the function that updates the CSVs every month.
# The "cushion" argument sets how many extra days before the file's last-modified date we search for edits,
# so it can be adjusted if it has been longer or shorter between runs.
# Once we determine the schedule we will hard-code it ourselves (probably with a buffer of a few days; I'm thinking 5-10 extra).

def update_settlements(country_code, csv_path, cushion=5):
    """Fetch recently edited OSM places for a country and merge them into its CSV.

    Parameters
    ----------
    country_code : str
        Value matched against the "ISO3166-1" tag in the Overpass area query.
    csv_path : str
        Path of the existing settlement CSV; it is read, merged and overwritten.
    cushion : int, default 5
        Number of days subtracted from the CSV's last-modified time, so that
        consecutive updates overlap.

    Returns
    -------
    pandas.DataFrame or None
        The merged DataFrame that was written to ``csv_path``, or None when
        ``csv_path`` does not exist (a message is printed in that case).

    Raises
    ------
    APIRateLimitError
        If the API responds with any status code other than 200.
    Exception
        If the API returned no changed elements (nothing to update).
    """
    # Check that the CSV exists BEFORE reading its modification time. Otherwise
    # os.path.getmtime() raises FileNotFoundError and this message is never
    # shown. It also avoids spending an API call on a country we cannot update.
    if not os.path.exists(csv_path):
        print(f'{country_code} seems to be missing a CSV file in our system. Please initialize a fresh CSV before running the update code.')
        return None

    # When the file was last modified, as a UTC datetime: the query below labels
    # the timestamp with "Z", so it has to be expressed in UTC, not local time.
    last_mod = os.path.getmtime(csv_path)
    dt_last_mod = datetime.datetime.fromtimestamp(last_mod, tz=datetime.timezone.utc)

    # this is how far back we'll be searching for edits - default is 5 day overlap between updates
    since = dt_last_mod - datetime.timedelta(days=cushion)

    # Overpass API query (note the 300 second timeout period, and how "since" is incorporated).
    # It returns every "place" element changed since that date.
    query = f"""
    [out:json][timeout:300];
        area["ISO3166-1"="{country_code}"]->.country;
    (
      node["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
      way["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
      relation["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
    );
    out center;
    """

    # headers to announce ourselves to the API
    headers = {
        'User-Agent': 'Plan-EO_Pipeline/1.1 (cwp5xyj@virginia.edu)'
    }

    # Overpass API URL
    url = 'http://overpass-api.de/api/interpreter'

    # send request
    response = requests.get(url, params={'data': query}, headers=headers)

    # Any non-200 response is reported through the custom error so the caller
    # can wait and retry.
    if response.status_code != 200:
        raise APIRateLimitError(f'Error {response.status_code}: {response.text}')

    # parse the JSON payload and turn it into a DataFrame
    new_df = convert_to_csv(response.json())

    # convert_to_csv returns None when there is nothing to update; signal that to the caller
    if new_df is None:
        raise Exception(f'No updates needed to {csv_path}')

    existing_df = pd.read_csv(csv_path)

    # drop the old version of every row that was re-downloaded, then append the fresh rows
    unchanged = existing_df[~existing_df['osm_id'].isin(new_df['osm_id'])]
    updated_df = pd.concat([unchanged, new_df], ignore_index=True)

    # save back to CSV
    updated_df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f'CSV updated successfully: {csv_path}, rows updated = {len(new_df)}') # statement to show success
    return updated_df

def make_shapefile(iso3, country, new_df, output_root='output_dir_test'):
    """Write the settlements in ``new_df`` to a point shapefile.

    Parameters
    ----------
    iso3 : str
        Country code used in the folder and file name.
    country : str
        Country name used in the folder name.
    new_df : pandas.DataFrame
        Settlement table containing 'lat' and 'lon' columns.
    output_root : str, default 'output_dir_test'
        Root directory under which the ``{iso3}_{country}/02_Settlement_data``
        folder is expected. Exposed as a parameter so the shapefile can be
        written next to the CSVs instead of the hard-coded test directory.
    """
    # get rid of all settlements without lat/lon data (no way to convert)
    to_convert = new_df.dropna(axis=0, subset=['lat', 'lon'])

    # build one point per row; x is the longitude and y is the latitude
    geometry = [Point(lon, lat) for lon, lat in zip(to_convert['lon'], to_convert['lat'])]
    gdf = geopandas.GeoDataFrame(to_convert, geometry=geometry, crs="EPSG:4326")

    # create shapefile path, then write the spatial dataframe to it
    shp_path = f'{output_root}/{iso3}_{country}/02_Settlement_data/{iso3}_populated_places_points.shp'
    gdf.to_file(shp_path, driver='ESRI Shapefile')
    print(f'Shapefile updated successfully: {shp_path}')

In [12]:
%%time

# run the below loop to update all settlement data.

for row in metadata.itertuples(index=False):

    # extract necessary metadata for this country
    iso2, iso3, country = row.ISO2, row.ISO3, row.Name

    # create path
    path = f'../Individual_country_data/{iso3}_{country}/02_Settlement_data/{iso3}_populated_places_points.csv'

    # one normal attempt, plus a single retry if the API rate-limits us
    for attempt in (1, 2):
        try:
            # first, update_settlements() creates and saves an updated dataframe
            new_df = update_settlements(iso2, path)
            # it returns None when the CSV is missing (it prints its own message),
            # in which case there is nothing to turn into a shapefile
            if new_df is not None:
                make_shapefile(iso3, country, new_df)

        # APIRateLimitError is a subclass of Exception, so it MUST be handled
        # before the generic handler; otherwise this branch can never run.
        except APIRateLimitError as e:
            if attempt == 1:
                # explain it to the user, wait, then try this country again
                print('Initiating 5 minute buffer time to prevent rate limiting... updates will resume shortly.')
                time.sleep(300) # 5 minute buffer
                continue
            # still failing after the wait: report it and move on rather than aborting the whole run
            print(f'{iso3}: request failed again after waiting, skipping this country. {e}')

        # any other exception, including "no settlements to update for this country":
        except Exception as e:
            print(e) # print the message so the user knows, then move on to the next country

        break

    # A short pause between countries was found to be unnecessary.
    # If this loop breaks often, add a time.sleep(5) here.
    

CSV updated successfully: output_dir_test/AFG_Afghanistan/02_Settlement_data/AFG_populated_places_points.csv, rows updated = 4
Shapefile updated successfully: output_dir_test/AFG_Afghanistan/02_Settlement_data/AFG_populated_places_points.shp
CSV updated successfully: output_dir_test/DZA_Algeria/02_Settlement_data/DZA_populated_places_points.csv, rows updated = 25
Shapefile updated successfully: output_dir_test/DZA_Algeria/02_Settlement_data/DZA_populated_places_points.shp
CSV updated successfully: output_dir_test/AGO_Angola/02_Settlement_data/AGO_populated_places_points.csv, rows updated = 243
Shapefile updated successfully: output_dir_test/AGO_Angola/02_Settlement_data/AGO_populated_places_points.shp


  existing_df = pd.read_csv(csv_path)


CSV updated successfully: output_dir_test/ARG_Argentina/02_Settlement_data/ARG_populated_places_points.csv, rows updated = 70
Shapefile updated successfully: output_dir_test/ARG_Argentina/02_Settlement_data/ARG_populated_places_points.shp
CSV updated successfully: output_dir_test/ARM_Armenia/02_Settlement_data/ARM_populated_places_points.csv, rows updated = 17
Shapefile updated successfully: output_dir_test/ARM_Armenia/02_Settlement_data/ARM_populated_places_points.shp
CSV updated successfully: output_dir_test/AZE_Azerbaijan/02_Settlement_data/AZE_populated_places_points.csv, rows updated = 131
Shapefile updated successfully: output_dir_test/AZE_Azerbaijan/02_Settlement_data/AZE_populated_places_points.shp
CSV updated successfully: output_dir_test/BGD_Bangladesh/02_Settlement_data/BGD_populated_places_points.csv, rows updated = 12
Shapefile updated successfully: output_dir_test/BGD_Bangladesh/02_Settlement_data/BGD_populated_places_points.shp
No updates needed to output_dir_test/BLZ_Be