In [2]:
# import required packages

import os
import pandas as pd
import numpy as np
import requests
import time
import datetime
import geopandas
from shapely.geometry import Point

# read in country metadata
print(os.getcwd())

metadata = pd.read_excel('../Plan-EO_Country_meta-data.xlsx')

/sfs/ceph/standard/Plan-EO_Storage/Capstone-25


In [7]:
metadata = metadata[['Name', 'ISO2', 'ISO3']]
metadata.Name = metadata.Name.str.replace(' ', '_')
metadata.head()

Unnamed: 0,Name,ISO2,ISO3
0,Afghanistan,AF,AFG
1,Algeria,DZ,DZA
2,Angola,AO,AGO
3,Argentina,AR,ARG
4,Armenia,AM,ARM


In [8]:
class APIRateLimitError(Exception):
    def __init__(self, message):
        super().__init__(message)

In [9]:
# we need this convert_to_csv() function in both scripts, it handles converting json to pandas DataFrame that can be saved as CSV

def convert_to_csv(settlements_json):
    settlements = []
    
    if not settlements_json['elements']:
        return None
    for element in settlements_json['elements']:
        osm_id = element['id']
        name = element['tags'].get('name')
        place_type = element['tags'].get('place')
        lat = element.get('lat')
        lon = element.get('lon')
        population = element['tags'].get('population')
        
        settlements.append({'osm_id': osm_id, 'name': name, 'place': place_type, 'latitude': lat, 'longitude': lon, 'population': population})
    
    # Convert to DataFrame
    df = pd.DataFrame(settlements)
    df.columns = ['osm_id', 'name', 'place', 'lat', 'lon', 'population']
    
    return df



# this is the function to update the CSVs every month
# this allows to variably set days (if it's been longer or shorter)
# but once we determine the schedule we will code it ourselves (probably with a buffer of a couple days, I'm thinking 10-15 extra)

def update_settlements(country_code, csv_path, cushion=5):
    
    # get time length that we want
    
    # check when file was last modified, convert to datetime
    last_mod = os.path.getmtime(csv_path)
    dt_last_mod = datetime.datetime.fromtimestamp(last_mod)
    
    # this is how far back we'll be searching for edits - default is 5 day overlap between updates, but can be changed
    since = dt_last_mod - datetime.timedelta(days=cushion)
    
    # write Overpass API query
    query = f"""
    [out:json][timeout:300];
        area["ISO3166-1"="{country_code}"]->.country;
    (
      node["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
      way["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
      relation["place"](area.country)(newer:"{since:%Y-%m-%d}T00:00:00Z");
    );
    out center;
    """
    
    # headers (to announce ourselves to the API as friends) but they track the IP address anyways
    
    headers = {
        'User-Agent': 'Plan-EO_Pipeline/1.1 (cwp5xyj@virginia.edu)'
    }
    
    
    # Overpass API URL
    url = 'http://overpass-api.de/api/interpreter'
    
    # send request
    response = requests.get(url, params={'data': query}, headers = headers)
    
    # check for errors
    if response.status_code != 200:
        raise APIRateLimitError(f'Error {response.status_code}: {response.text}')
        
    data_to_update = response.json() 
    
    new_df = convert_to_csv(data_to_update)
    
    if new_df is None:
        raise Exception(f'No updates needed to {csv_path}')
    
    # call in path
        
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
    else:
        print(f'{country_code} seems to be missing a CSV file in our system. Please initialize a fresh CSV before running the update code.')
        return None

    updated_df = pd.concat([existing_df[~existing_df['osm_id'].isin(new_df['osm_id'])], new_df], ignore_index=True) # bit slow but we will see

    # Save back to CSV
    updated_df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f'CSV updated successfully: {csv_path}, rows updated = {len(new_df)}')
    return updated_df

def make_shapefile(iso3, country, new_df):
    
    to_convert = new_df.dropna(axis=0, subset = ['lat', 'lon'])
    
    geometry = [Point(xy) for xy in zip(to_convert['lon'], to_convert['lat'])]
    gdf = geopandas.GeoDataFrame(to_convert, geometry=geometry, crs="EPSG:4326")
    
    shp_path = f'output_dir_test/{iso3}_{country}/02_Settlement_data/{iso3}_populated_places_points.shp'
    gdf.to_file(shp_path, driver='ESRI Shapefile')
    print(f'Shapefile updated successfully: {shp_path}')

In [12]:
%%time
# for i in range(len(metadata)):
for i in range(10):    
    
    # extract necessary metadata - index is clean, so "i" works fine
    iso2 = metadata.loc[i, 'ISO2']
    iso3 = metadata.loc[i,'ISO3']
    country = metadata.loc[i,'Name']

    # create path
    path = f'../Individual_country_data/{iso3}_{country}/02_Settlement_data/{iso3}_populated_places_points.csv'
    
    try:
        new_df = update_settlements(iso2, path)
        # print(new_df)
        make_shapefile(iso3, country, new_df)
    except Exception as e:
        print(e)
        pass
    except APIRateLimitError as e:
        print('Initiating 5 minute buffer time to prevent rate limiting... updates will resume shortly.')
        time.sleep(300)
        new_df = update_settlements(iso2, path)
        make_shapefile(iso3, country, new_df)
    
    # should probably introduce a sleep() function here because this will still be expensive
    # time.sleep(5)
    

CSV updated successfully: output_dir_test/AFG_Afghanistan/02_Settlement_data/AFG_populated_places_points.csv, rows updated = 4
Shapefile updated successfully: output_dir_test/AFG_Afghanistan/02_Settlement_data/AFG_populated_places_points.shp
CSV updated successfully: output_dir_test/DZA_Algeria/02_Settlement_data/DZA_populated_places_points.csv, rows updated = 25
Shapefile updated successfully: output_dir_test/DZA_Algeria/02_Settlement_data/DZA_populated_places_points.shp
CSV updated successfully: output_dir_test/AGO_Angola/02_Settlement_data/AGO_populated_places_points.csv, rows updated = 243
Shapefile updated successfully: output_dir_test/AGO_Angola/02_Settlement_data/AGO_populated_places_points.shp


  existing_df = pd.read_csv(csv_path)


CSV updated successfully: output_dir_test/ARG_Argentina/02_Settlement_data/ARG_populated_places_points.csv, rows updated = 70
Shapefile updated successfully: output_dir_test/ARG_Argentina/02_Settlement_data/ARG_populated_places_points.shp
CSV updated successfully: output_dir_test/ARM_Armenia/02_Settlement_data/ARM_populated_places_points.csv, rows updated = 17
Shapefile updated successfully: output_dir_test/ARM_Armenia/02_Settlement_data/ARM_populated_places_points.shp
CSV updated successfully: output_dir_test/AZE_Azerbaijan/02_Settlement_data/AZE_populated_places_points.csv, rows updated = 131
Shapefile updated successfully: output_dir_test/AZE_Azerbaijan/02_Settlement_data/AZE_populated_places_points.shp
CSV updated successfully: output_dir_test/BGD_Bangladesh/02_Settlement_data/BGD_populated_places_points.csv, rows updated = 12
Shapefile updated successfully: output_dir_test/BGD_Bangladesh/02_Settlement_data/BGD_populated_places_points.shp
No updates needed to output_dir_test/BLZ_Be