In [None]:
%%html
<script>
    // AUTORUN ALL CELLS ON NOTEBOOK-LOAD!
    // Uses the classic Notebook front-end modules loaded through require();
    // the handler below fires on the "kernel_ready.Kernel" event.
    require(
        ['base/js/namespace', 'jquery'], 
        function(jupyter, $) {
            $(jupyter.events).on("kernel_ready.Kernel", function () {
                console.log("Auto-running all cells-below...");
                // Run the cells below, scroll back to the top, then save the notebook.
                jupyter.actions.call('jupyter-notebook:run-all-cells-below');
                jupyter.notebook.scroll_to_top();
                jupyter.actions.call('jupyter-notebook:save-notebook');                
                
            });
        });
        
        // NOTE(review): this `$` sits outside the require() callback, so it relies on a
        // page-level jQuery global being present -- confirm for the target front-end.
        $( document ).ready(function(){
        // Assigned without var/let (implicit global); it is not read anywhere in this cell.
        code_shown=false;
        // Hide every code-input area once the document is ready.
        $('div.input').hide()});
    
    
</script>


Note: Above this cell is a hidden cell that hides and runs all code in the file. This is intended for those who do not want to see or interact with the code. It can be revealed by converting the cell to markdown (see the toolbar above) and then back to code.

# Plot and conversion of demographic data

The following file uses the downloaded demographic data from [worldpop.org](https://www.worldpop.org/), converts it into discrete (integer) numbers based on the user's desired accuracy, and plots a demographic bar chart.

This file uses the .h5 file produced by *Density Exploration and Conversion* to reference the area that was down-selected. 

The output from this file is then used by the *syntheticpopulation_starter.py* file to build a demographically accurate ABM. 

Due to the potential size of the population files, syntheticpopulation uses the HDF5 file format (saved as .h5) to keep the data on disk rather than in RAM. **Please be aware that, depending on the country, this may take up substantial disk space.** As an example, a country like Niger, based on its geographic size, takes up approximately 70 gigabytes. 


## 0: Import the Dependencies

In [None]:
from toggle_code import toggle_code as hide_code
from toggle_code import run_code as run_code
import os
import numpy as np
import pandas as pd
import json
import glob 
import rasterio
import re
import datetime
from collections import OrderedDict
import ipywidgets as widgets
from ipywidgets import interact, Layout
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors
from bokeh.palettes import RdYlGn, Spectral4
from bokeh.models import Legend, BoxAnnotation, Toggle, CustomJS,ColumnDataSource,LinearColorMapper, ColorBar, BasicTicker,\
                          PrintfTickFormatter, BasicTickFormatter, FactorRange
from bokeh.layouts import layout, gridplot, column
from bokeh.transform import transform, dodge
from bokeh.models.tools import *
# Background map tiles used by the "Selected Area" heatmap in step 5.
tile_provider = get_provider('STAMEN_TERRAIN')
# Create pyproj transformer to convert from lat/long (EPSG:4326) to web mercator (EPSG:3857).
# NOTE(review): step 5 feeds this transformer (latitude, longitude) pairs; that relies on
# pyproj's default axis handling for EPSG:4326 -- confirm before changing the point order.
from pyproj import Transformer
transformer = Transformer.from_crs('epsg:4326','epsg:3857')
import tables #To ensure binder build called through Pandas
# Route Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()
import warnings
# Silence only the warning whose message matches the text below.
warnings.filterwarnings("ignore", message="Cannot find a last shown plot to update.")

## 1: Select desired accuracy and if desired select country from saved files
    
If you only want specific files, type in WorldPop's three-letter country code for those files. 

You may also leave this blank and all files will appear in step 2. 

6 decimal places for the coordinates represents an accuracy of ~0.11 meters at the equator.

**You cannot increase in accuracy (e.g. 4 decimals to 6 decimals) if you lowered the accuracy in the *Density Exploration and Conversion* file.**

In [None]:
hide_code()
run_code()


def update(acc):
    """Identity callback for `interact`: hand back the selected accuracy label."""
    return acc


print("Selected desired accuracy")
# Coordinate-precision choices; the labels are mapped to a number of decimals in step 6.
accuracy = widgets.Dropdown(
    description="Accuracy",
    value="4 decimals (~11 meters)",
    options=[
        "6 decimals (~0.11 meters)",
        "5 decimals (~1.1 meters)",
        "4 decimals (~11 meters)",
        "3 decimals (~110 meters)",
        "2 decimals (~1.1 kilometers)",
    ],
    disabled=False,
)

acc_select = interact(update, acc=accuracy)


def country_select(cntry):
    """Identity callback for `interact`: hand back the typed country code."""
    return cntry


print("If you want a specific country enter the three letter code in the file name")
# Free-text filter; an empty string means "show every demographic file" in step 2.
country = widgets.Text(
    description='Country:',
    placeholder='Input 3 letter country identifier',
    value='',
    disabled=False,
)

country_selected = interact(country_select, cntry=country)

## 2: Select Data Files
    
The code will grab and process all files within this folder. Ensure that only the demographic files you want to work with are
stored at this location.

In [None]:
hide_code()
run_code()

# Optional country filter typed in step 1 ('' means "no filter").
val = country.value.strip()
# "All" is a sentinel entry; step 3 expands it to every discovered file.
pot_list = ["All"]
filepath = r"./data/*"

# glob returns paths in arbitrary, OS-dependent order. Step 3 uses the FIRST selected
# file to build the coordinate grid and the no-data mask, so sort for reproducibility.
for candidate in sorted(glob.glob(filepath)):
    # Match against the file name only, so the "./data/" prefix can never satisfy the
    # filter, and ignore case so "NGA" finds "nga_f_0_2020.tif".
    file_name = os.path.basename(candidate)
    is_demographic = "_f_" in file_name or "_m_" in file_name
    if is_demographic and (val == '' or val.lower() in file_name.lower()):
        pot_list.append(candidate)

pop_file = widgets.SelectMultiple(options=pot_list, value=[],
                                  description="File: ",
                                  disabled=False,
                                  layout=Layout(width="50%", height="260px"))


def update(file):
    """Identity callback for `interact`: hand back the current file selection."""
    return file


pop_file_select = interact(update, file=pop_file)

## 3. Convert demographic files into table

The following code converts the downloaded worldpop file into a table of latitudes, longitudes and number of people.

The code gets the decimal individuals, determines their aggregate number and then redistributes them back to the population. 

This approach requires the whole population to be factored in and **may result in very large files (e.g. tens to hundreds of gigabytes)** depending on the geographic area of the population.



In [None]:
# UI helpers imported from the project's toggle_code module (its source is not visible here);
# presumably they hide this cell's input and add a re-run control -- TODO confirm.
hide_code()
run_code()

def _demographic_column(filename):
    """Return the '<sex>_<age>' key encoded in a WorldPop file name.

    Example: './data/nga_f_0_2020.tif' -> 'f_0'.

    Only the file name is inspected (both '/' and '\\' are treated as separators),
    so underscores or dots in the directory part cannot shift the result.

    Raises:
        ValueError: if the name holds no '_f_<digits>' / '_m_<digits>' token.
    """
    base = re.split(r"[\\/]", filename)[-1]
    found = re.search(r"(?:^|_)([fm])_(\d+)(?=[_.]|$)", base)
    if found is None:
        raise ValueError(
            "Cannot find a '<sex>_<age>' token (e.g. '_f_0_') in file name: " + filename
        )
    return found.group(1) + "_" + found.group(2)


def get_array(filename, data_store, res):
    """Build the longitude/latitude/population table for one raster and store it.

    Intended for the FIRST file of a run: it also derives the no-data positions
    that `get_pop` re-uses for every following file.

    Args:
        filename: path of the raster; the '<sex>_<age>' token in its name becomes
            the table key and the population column name.
        data_store: mapping-like store (an open pandas HDFStore in this notebook)
            that receives the table under that key.
        res: the int placeholder (0) on the first call, in which case the no-data
            positions are computed here; anything else is used as the positions.

    Returns:
        (col, total_peeps, res): table key, sum of the kept population values, and
        the positions removed from the flattened raster (`np.where` result).
    """
    col = _demographic_column(filename)

    with rasterio.open(filename) as src:
        # Only the first band is used, so do not load the others.
        band = src.read(1)
        rows, cols = band.shape
        left, _bottom, _right, top = src.bounds
        x_res, y_res = src.res

        # One coordinate per pixel column/row, starting at the upper-left corner.
        # Deriving them from the pixel counts always yields exactly `cols`/`rows`
        # values (no float-rounding length mismatch) and uses the matching
        # resolution for each axis.
        x = left + x_res * np.arange(cols)
        y = top - y_res * np.arange(rows)

        # Turn into flat arrays holding the longitude/latitude of every pixel.
        lon, lat = np.meshgrid(x, y)
        lon_flat = lon.ravel()
        lat_flat = lat.ravel()

        # Get rid of buffer locations (-99999. is the no-data marker this code expects).
        pop = band.ravel()
        if isinstance(res, int):
            res = np.where(pop == -99999.)
        pop = np.delete(pop, res)
        lon_flat = np.delete(lon_flat, res)
        lat_flat = np.delete(lat_flat, res)

    # Table of structure ["longitude", "latitude", <col>].
    pop_table = pd.DataFrame.from_dict(
        {"longitude": lon_flat, "latitude": lat_flat, col: pop}
    )
    total_peeps = np.sum(pop)
    data_store[col] = pop_table
    return col, total_peeps, res


def get_pop(filename, data_store, res):
    """Store the population column of a follow-up raster.

    The raster is flattened and the positions in `res` are dropped, so its rows
    line up with the table written by `get_array` for the first file.

    Args:
        filename: path of the raster; the '<sex>_<age>' token in its name becomes
            the table key and column name.
        data_store: mapping-like store that receives the one-column table.
        res: positions to drop, as returned by `get_array`.

    Returns:
        (col, total_peeps): table key and the sum of the kept population values.
    """
    col = _demographic_column(filename)

    with rasterio.open(filename) as src:
        # Only the first band is used.
        pop = np.delete(src.read(1).ravel(), res)

    total_peeps = np.sum(pop)
    data_store[col] = pd.DataFrame(data=pop, columns=[col])
    return col, total_peeps
    
    
    
if len(pop_file.value) == 0:
    print("waiting for inputs.")
else:
    # "All" expands to every discovered file (pot_list[0] is the "All" sentinel itself).
    if "All" in pop_file.value:
        data_selection = pot_list[1:]
    else:
        data_selection = list(pop_file.value)

    # To track totals and file names: one [column_key, total] entry per processed file.
    data_from_demo_files = []

    # Placeholder for the no-data positions; get_array replaces it on the first file.
    res = 0

    # HDF5 file keeps the tables on disk to prevent memory errors. The path uses forward
    # slashes so it is the same file that step 6 opens on every operating system, and
    # the context manager closes (flushes) the store even if a raster fails to load.
    with pd.HDFStore("./data/demographics.h5") as data_store:
        for position, file in enumerate(data_selection):
            print("Creating demographic tables of " + file)
            if position == 0:
                # First file: also builds the coordinates and the no-data positions.
                col, total, res = get_array(file, data_store, res)
            else:
                # Later files: population column only, aligned through `res`.
                col, total = get_pop(file, data_store, res)
            data_from_demo_files.append([col, total])

    print("There are {} demographic tables.".format(len(data_from_demo_files)))

## 4: Plot the data

The next cell will plot the data as a demographic bar chart so we can visualize our data and also get an understanding of how the data pipeline affects the data. 

In [None]:
# UI helpers imported from the project's toggle_code module (its source is not visible here);
# presumably they hide this cell's input and add a re-run control -- TODO confirm.
hide_code()
run_code()
# File names are split to track which age/sex goes with which population total

# Maps the age token of a file name (the start of the age band, as a string) to the
# label shown on the chart's x axis.
age_lookup = {'0': '0-1', '1': '1-5'}
# Regular five-year bands: '5' -> '5-10' ... '75' -> '75-80'.
age_lookup.update(
    (str(band_start), '{}-{}'.format(band_start, band_start + 5))
    for band_start in range(5, 80, 5)
)
age_lookup['80'] = '80 and Older'

total = 0

def totals(data):
    """Split (key, population) records by sex and add up every population.

    Args:
        data: iterable of records whose first item is a '<sex>_<age>' key and whose
            second item is the population for that key.

    Returns:
        ([female_by_age, male_by_age], total): two OrderedDicts keyed by the integer
        age token (any sex token other than 'f' lands in the second one), followed
        by the sum of all populations.
    """
    female_by_age = OrderedDict()
    male_by_age = OrderedDict()
    running_total = 0

    for record in data:
        tokens = record[0].split("_")
        bucket = female_by_age if tokens[0] == 'f' else male_by_age
        bucket[int(tokens[1])] = record[1]
        running_total += record[1]

    return [female_by_age, male_by_age], running_total


if len(pop_file.value)== 0:
    print("Waiting for inputs.")
else:
    # Per-sex {age_start: population} maps plus the grand total of the raw rasters.
    consistency_check, total = totals(data_from_demo_files)
    print("Total population is {}.".format(total))

    # Ensure sorted by age -- relies on the int conversion done in totals().
    plotter_female = OrderedDict(sorted(consistency_check[0].items()))
    plotter_male = OrderedDict(sorted(consistency_check[1].items()))

    # Build the x axis from every age band present for EITHER sex and fill gaps with 0,
    # so all ColumnDataSource columns have the same length even when only one sex
    # (or an uneven set of files) was selected.
    age_starts = sorted(set(plotter_female) | set(plotter_male))
    categories = [age_lookup[str(age_start)] for age_start in age_starts]
    male = [plotter_male.get(age_start, 0) for age_start in age_starts]
    female = [plotter_female.get(age_start, 0) for age_start in age_starts]
    sex = ['male', 'female']

    # Data is set up for Bokeh
    source = ColumnDataSource(data=dict(
        x=categories,
        male=male,
        female=female,
    ))

    output_notebook()

    # Graph presentation variables
    p = figure(
        title='Demographics',
        x_axis_label='Age',
        y_axis_label='Population',
        x_range=FactorRange(*categories),
        plot_width=800,
        plot_height=600,
    )
    # Rotate x axis labels and set font sizes/styles
    p.xaxis.major_label_orientation = 'vertical'
    p.xaxis.axis_label_text_font_size = '20px'
    p.xaxis.axis_label_text_font_style = 'bold'
    p.yaxis.axis_label_text_font_size = '20px'
    p.yaxis.axis_label_text_font_style = 'bold'
    p.yaxis.major_label_text_font_size = "12px"
    p.xaxis.major_label_text_font_size = "12px"
    p.title.text_font_size = '24px'

    # One bar per sex and age band, nudged right (male) / left (female) of the category.
    p.vbar(x=dodge('x', 0.17, range=p.x_range),
           top='male',
           width=0.3,
           alpha=0.8,
           color='blue',
           source=source
    )

    p.vbar(x=dodge('x', -0.17, range=p.x_range),
           top='female',
           width=0.3,
           alpha=0.8,
           color='pink',
           source=source
    )
    # Removing scientific notation from y-axis tick marks
    p.yaxis[0].formatter = BasicTickFormatter(use_scientific=False)

    # Displaying graph
    show(p)

## 5. See the visualized area

This code plots the selected area from the *Density Exploration and Conversion* file. In step 6, the code will reduce the data to the size of the desired area. 

In [None]:
hide_code()
run_code()


if len(pop_file.value)== 0:
    print("Waiting for inputs.")
else:
    # Read the table saved under "visuals" by the Density Exploration and Conversion
    # notebook. Forward slashes keep the path valid on every operating system,
    # read-only mode avoids creating an empty store when the file is missing, and the
    # context manager closes the store even if the key cannot be read.
    with pd.HDFStore("./data/density.h5", mode="r") as data_store:
        viz_table = data_store["visuals"]

    lat_min = viz_table["latitude"].min()
    lat_max = viz_table["latitude"].max()
    lon_min = viz_table["longitude"].min()
    lon_max = viz_table["longitude"].max()

    # Corner points as (latitude, longitude); step 6 reuses this list as its bounding box.
    min_max_pts = [(lat_min, lon_min), (lat_max, lon_max)]
    # The same two corners converted by the module-level `transformer`.
    bbox2 = list(transformer.itransform(min_max_pts))

    # Reversed RdYlGn palette (8 colours).
    colors = list(RdYlGn[8])
    colors.reverse()
    #Is there a better color mapper?
    mapper = LinearColorMapper(palette=colors, low=viz_table.Population.min(),
                               high=viz_table.Population.max())

    color_bar = ColorBar(color_mapper=mapper, location=(0, 0),
                         ticker=BasicTicker(desired_num_ticks=len(colors)))

    def heatmap():
        """Draw the selected area: map tiles underneath, population cells on top."""
        p2 = figure(plot_width=600, plot_height=600, title="Selected Area",
                    x_range=(bbox2[0][0], bbox2[1][0]), y_range=(bbox2[0][1], bbox2[1][1]),
                    x_axis_type="mercator", y_axis_type="mercator")
        p2.title.text_font_size = '20pt'
        map_base = p2.add_tile(tile_provider)
        map_base.level = 'underlay'
        # The table's 'web_lon' / 'web_lat' / 'Population' columns drive the rectangles.
        source = ColumnDataSource(viz_table)

        p2.rect(x='web_lon', y='web_lat', width=3500, height=3500, source=source,
                line_color=None, fill_color=transform("Population", mapper), alpha=0.07)

        p2.add_layout(color_bar, 'right')

        show(p2)

    heatmap_out = interact(heatmap)

## 6: Smooth the population

There are many decimal people, which cannot exist. So based on the desired latitude/longitude accuracy the goal is to get close to the total population. The challenge is how?

The following approach is based on Pareto distributions of populations or a rich get richer approach. In essence, if there is a high population density, then that area gets more people.

(This is obviously somewhat coarse and we welcome contributions.)

In [None]:
# UI helpers imported from the project's toggle_code module (its source is not visible here);
# presumably they hide this cell's input and add a re-run control -- TODO confirm.
hide_code()
run_code()


def create_base(rd, file):
    """Round the coordinates of one stored table to `rd` decimals, in place on disk.

    Args:
        rd: number of decimals to keep for "longitude" and "latitude".
        file: key of the table inside ./data/demographics.h5.

    Returns:
        The index of the rounded table.
    """
    # The context manager closes the store even if the key is missing or a column
    # cannot be rounded, so later cells never meet a store that was left open.
    with pd.HDFStore('./data/demographics.h5') as data_store:
        print("Adjusting the accuracy...")
        grouped = data_store[file]

        # Round to desired accuracy
        grouped["longitude"] = grouped["longitude"].round(rd)
        grouped["latitude"] = grouped["latitude"].round(rd)

        # Write the rounded table back under the same key.
        data_store[file] = grouped

    return grouped.index
 
def expand_base(total, file, index, idx, bds):
    """Cut one stored table down to the selected area and convert it to whole people.

    Steps:
      1. Restrict the rows. For the first table (idx == 0) keep rows whose
         latitude/longitude fall inside `bds`; for every other table take the row
         positions in `index`.
      2. Add up the "small" cells (0 < value < 0.4), which would all round to 0, and
         give one person to each of the int(sum) largest of them.
      3. Round every cell to the nearest integer and write the table back.

    Unlike the previous implementation, small cells that are NOT selected in step 2
    are left untouched before rounding (they used to be added to themselves, which
    turned values of roughly 0.25-0.4 into one person each and inflated the total).

    NOTE(review): the table is overwritten with the reduced rows, so running this a
    second time on the same store operates on already-reduced data.

    Args:
        total: unused; kept for call compatibility.
        file: key of the table in ./data/demographics.h5, also its population column.
        index: row positions to keep when idx != 0 (ignored when idx == 0).
        idx: position of this table in the processing order.
        bds: [(lat_min, lon_min), (lat_max, lon_max)] bounding box, used when idx == 0.

    Returns:
        (new_total, index): the population after rounding, and the index of the kept
        rows (freshly derived when idx == 0, otherwise the one passed in).
    """
    # The context manager closes the store even when a step below raises.
    with pd.HDFStore('./data/demographics.h5') as data_store:
        grouped = data_store[file]

        if idx == 0:
            in_area = ((grouped["latitude"] >= bds[0][0]) & (grouped["latitude"] <= bds[1][0]) &
                       (grouped["longitude"] >= bds[0][1]) & (grouped["longitude"] <= bds[1][1]))
            # Copy so the assignment below works on an independent frame.
            grouped = grouped[in_area].copy()
            index = grouped.index
        else:
            # Reduce file to desired area
            grouped = grouped.iloc[index, :].copy()

        goal = round(grouped[file].sum())
        print("The goal population of {} is {}.".format(file, goal))
        print("Calculating......")

        values = grouped[file]
        # Fractions that would otherwise all be rounded away.
        smalls = values.where((values > 0.0) & (values < 0.4), 0)
        mod_sum = int(smalls.sum())
        # The largest small cells each receive one whole person.
        bumped = grouped.index.isin(smalls.nlargest(mod_sum).index)

        grouped[file] = np.rint(values + bumped.astype(values.dtype))

        new_total = np.sum(grouped[file])

        print("The aggregated total population is: " + str(new_total))
        # Guard against an empty area (goal == 0) instead of dividing by zero.
        share = round(new_total / goal * 100, 2) if goal else float("nan")
        print("The new aggregated total accounts for: " + str(share) + "% of the population.")

        data_store[file] = grouped

    return new_total, index

# ---------------------------------------------------------------------
# Main code: snap the coordinates of the first table to the chosen accuracy,
# then cut every table to the selected area and convert it to whole people.
# ---------------------------------------------------------------------
# Every dropdown label starts with its number of decimals.
acc_dict = {
    label: int(label.split()[0])
    for label in (
        "6 decimals (~0.11 meters)",
        "5 decimals (~1.1 meters)",
        "4 decimals (~11 meters)",
        "3 decimals (~110 meters)",
        "2 decimals (~1.1 kilometers)",
    )
}
rd = acc_dict[accuracy.value]

new_totals = []

zeros = []

if pop_file.value:
    # Table keys and raw totals recorded while the tables were built in step 3.
    sub_files = [entry[0] for entry in data_from_demo_files]
    sub_pops = [entry[1] for entry in data_from_demo_files]

    # Round lat/long of the first table and get its base index.
    index = create_base(rd, sub_files[0])

    # `min_max_pts` is the (lat, lon) bounding box computed in step 5.
    for idx, (table_key, raw_total) in enumerate(zip(sub_files, sub_pops)):
        sub_pop, index = expand_base(raw_total, table_key, index, idx, min_max_pts)
        new_totals.append(sub_pop)

    print("Population smoothing complete.")

    sub_dict = {"columns": sub_files}

    with open("./data/demo_ref.json", 'w') as f:
        json.dump(sub_dict, f)

    print("Reference columns saved as demo_ref.json")
else:
    print("Waiting for inputs.")

In [None]:
# UI helpers imported from the project's toggle_code module (its source is not visible here);
# presumably they hide this cell's input and add a re-run control -- TODO confirm.
hide_code()
run_code()
# File names are split to track which age/sex goes with which population total

def totals(data):
    """Group populations by sex and age token and compute their overall sum.

    Args:
        data: iterable of records; item 0 is a '<sex>_<age>' key, item 1 the
            population belonging to it.

    Returns:
        (consistency_check, total): `consistency_check` is a two-element list of
        OrderedDicts keyed by integer age -- slot 0 for keys starting with 'f',
        slot 1 for everything else -- and `total` is the sum of all populations.
    """
    consistency_check = [OrderedDict() for _ in range(2)]
    total = 0

    for key, population in ((rec[0], rec[1]) for rec in data):
        sex_token, age_token = key.split("_")[0], key.split("_")[1]
        slot = 0 if sex_token == 'f' else 1
        consistency_check[slot][int(age_token)] = population
        total += population

    return consistency_check, total


if len(pop_file.value)== 0:
    print("Waiting for inputs.")
else:
    # Per-sex {age_start: population} maps built from the smoothed totals of step 6.
    consistency_check, total = totals(zip(sub_files, new_totals))
    print("Total population is {}.".format(total))

    # Ensure sorted by age -- relies on the int conversion done in totals().
    plotter_female = OrderedDict(sorted(consistency_check[0].items()))
    plotter_male = OrderedDict(sorted(consistency_check[1].items()))

    # Build the x axis from every age band present for EITHER sex and fill gaps with 0,
    # so all ColumnDataSource columns have the same length even when only one sex
    # (or an uneven set of files) was selected.
    age_starts = sorted(set(plotter_female) | set(plotter_male))
    categories = [age_lookup[str(age_start)] for age_start in age_starts]
    male = [plotter_male.get(age_start, 0) for age_start in age_starts]
    female = [plotter_female.get(age_start, 0) for age_start in age_starts]
    sex = ['male', 'female']

    # Data is set up for Bokeh
    source = ColumnDataSource(data=dict(
        x=categories,
        male=male,
        female=female,
    ))

    output_notebook()

    # Graph presentation variables
    p = figure(
        title='Demographics',
        x_axis_label='Age',
        y_axis_label='Population',
        x_range=FactorRange(*categories),
        plot_width=800,
        plot_height=600,
    )
    # Rotate x axis labels and set font sizes/styles
    p.xaxis.major_label_orientation = 'vertical'
    p.xaxis.axis_label_text_font_size = '20px'
    p.xaxis.axis_label_text_font_style = 'bold'
    p.yaxis.axis_label_text_font_size = '20px'
    p.yaxis.axis_label_text_font_style = 'bold'
    p.yaxis.major_label_text_font_size = "12px"
    p.xaxis.major_label_text_font_size = "12px"
    p.title.text_font_size = '24px'

    # One bar per sex and age band, nudged right (male) / left (female) of the category.
    p.vbar(x=dodge('x', 0.17, range=p.x_range),
           top='male',
           width=0.3,
           alpha=0.8,
           color='blue',
           source=source
    )

    p.vbar(x=dodge('x', -0.17, range=p.x_range),
           top='female',
           width=0.3,
           alpha=0.8,
           color='pink',
           source=source
    )
    # Removing scientific notation from y-axis tick marks
    p.yaxis[0].formatter = BasicTickFormatter(use_scientific=False)

    # Displaying graph
    show(p)