# Initializing 02_Settlement Data

### Important: this should not need to be run again!

See `02_Settlements_OSM_update` for the code to update the settlement lists

In [1]:
# import required packages

import os
import pandas as pd
import numpy as np
import requests
import time
import datetime

# read in country metadata
print(os.getcwd())

metadata = pd.read_excel('../Plan-EO_Country_meta-data.xlsx', keep_default_na=False)

/sfs/ceph/standard/Plan-EO_Storage/Capstone-25


In [2]:
metadata = metadata[['Name', 'ISO2', 'ISO3']] # extract useful metadata fields

In [3]:
metadata.head() # inspect metadata

Unnamed: 0,Name,ISO2,ISO3
0,Afghanistan,AF,AFG
1,Algeria,DZ,DZA
2,Angola,AO,AGO
3,Argentina,AR,ARG
4,Armenia,AM,ARM


In [4]:
# this is the function to set up the CSV files for OSM data

def get_settlements_origin(country_code):
    # Overpass API query (note long timeout to prevent rate-limiting)
    query = f"""
    [out:json][timeout:600];
        area["ISO3166-1"="{country_code}"]->.country;
    (
      node["place"](area.country);
      way["place"](area.country);
      relation["place"](area.country);
    );
    out center;
    """
    
    # headers
    
    headers = {
        'User-Agent': 'Plan-EO_Pipeline/1.0 (cwp5xyj@virginia.edu)'
    }
    
    
    # Overpass API URL
    url = 'http://overpass-api.de/api/interpreter'
    
    # Send request
    response = requests.get(url, params={'data': query}, headers = headers)
    
    # Check for errors
    if response.status_code != 200:
        raise Exception(f'Error {response.status_code}: {response.text}')
        
    return response.json()

# this is the function to convert these initial calls into CSVs

def convert_to_csv(settlements_json):
    settlements = [] # initialize empty list
    for element in settlements_json['elements']:
        
        # extract necessary data (mostly straightforward)
        osm_id = element['id'] # unique OSM id
        name = element['tags'].get('name') # place name
        place_type = element['tags'].get('place') # place type
        lat = element.get('lat') # latitude
        lon = element.get('lon') # longitude
        population = element['tags'].get('population') # total population
        
        # fill empty list with the above extracted data
        settlements.append({'osm_id': osm_id, 'name': name, 'place': place_type, 'latitude': lat, 'longitude': lon, 'population': population})
    
    # Convert to DataFrame
    df = pd.DataFrame(settlements)
    df.columns = ['osm_id', 'name', 'place', 'lat', 'lon', 'population']
    
    return df



In [5]:
metadata.iloc[:20]

Unnamed: 0,Name,ISO2,ISO3
0,Afghanistan,AF,AFG
1,Algeria,DZ,DZA
2,Angola,AO,AGO
3,Argentina,AR,ARG
4,Armenia,AM,ARM
5,Azerbaijan,AZ,AZE
6,Bangladesh,BD,BGD
7,Belize,BZ,BLZ
8,Benin,BJ,BEN
9,Bhutan,BT,BTN


In [6]:
os.getcwd()

'/sfs/ceph/standard/Plan-EO_Storage/Capstone-25'

In [7]:
metadata[metadata['Name'] == 'Namibia']

Unnamed: 0,Name,ISO2,ISO3
79,Namibia,,NAM


In [8]:
metadata[metadata['Name'] == 'Syria']

Unnamed: 0,Name,ISO2,ISO3
106,Syria,SY,SYR


In [9]:
%%time
# To rerun this entire loop, uncomment the below comment and delete the line directly below it

# for i in range(len(metadata)):
for i in range(0,8):
    
    # extract necessary metadata - index is clean, so "i" works fine
    iso2 = metadata.loc[i, 'ISO2']
    iso3 = metadata.loc[i,'ISO3']
    country = metadata.loc[i,'Name']
    country = country.replace(' ', '_') # so that they match the Individual_country_data country folders
    
    json_looped = get_settlements_origin(iso2) # pass iso2 to API call function
    df_looped = convert_to_csv(json_looped) # pass json API response to csv creation
    
    # create path
    path = f'../Individual_country_data/{iso3}_{country}/02_Settlement_data/{iso3}_populated_places_points.csv'
    
    df_looped.to_csv(path, index = False)
    if i+1 % 5 != 0:
        time.sleep(60)
    else:
        time.sleep(300)

CPU times: user 2.09 s, sys: 326 ms, total: 2.42 s
Wall time: 10min 23s


Because Overpass has a limit on the amount of requests I can put in, it will take a long time to populate all of these with CSVs. The initialization will take quite a long time, but it is 100% worth it.

Fun little thing about the country metadata: Namibia's ISO2, "NA", gets turned into a null when read in by Pandas by default.