In [1]:
%%html
<script>
    // AUTORUN ALL CELLS ON NOTEBOOK-LOAD!
    // Loads the notebook namespace and jQuery through RequireJS, then waits
    // for the kernel to report that it is ready before touching any cell.
    require(
        ['base/js/namespace', 'jquery'], 
        function(jupyter, $) {
            // Runs each time the "kernel_ready.Kernel" event fires.
            $(jupyter.events).on("kernel_ready.Kernel", function () {
                console.log("Auto-running all cells-below...");
                // Execute every cell below the current one.
                jupyter.actions.call('jupyter-notebook:run-all-cells-below');
                // Return the view to the top of the notebook.
                jupyter.notebook.scroll_to_top();
                // Trigger a notebook save right after queuing the run.
                jupyter.actions.call('jupyter-notebook:save-notebook');                
                
            });
        });
        
        // Once the page is ready, hide every code-input area (div.input).
        // NOTE(review): `$` here is the page-level jQuery, not the one injected
        // into the require() callback above - confirm it is always defined.
        $( document ).ready(function(){
        // Assigned without var/let, so this becomes a global flag.
        code_shown=false;
        $('div.input').hide()});
    
    
</script>


Note: Above this cell is a hidden cell that hides and runs all of the code in this file. This is intended for those who do not want to see or interact with the code. The hidden cell can be revealed by converting it to markdown (see the toolbar above) and then back to code.

# Plot and conversion of demographic data

The following file uses the downloaded demographic data from [worldpop.org](https://www.worldpop.org/) and converts it into discrete (integer) numbers based on the user's desired accuracy, plots a demographic bar chart, and saves the resulting information into JSON files. 

The output from this file is then intended to be combined with the population density output file. These combined files can then create a strong foundation for an agent population. 

## 0: Import the Dependencies

In [1]:
from toggle_code import toggle_code as hide_code
from toggle_code import run_code as run_code
import os
import numpy as np
import pandas as pd
import bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.models import BasicTickFormatter
import glob 
import rasterio
import re
import datetime
from collections import OrderedDict
import ipywidgets as widgets
from ipywidgets import interact, Layout
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.tile_providers import get_provider, Vendors
from bokeh.palettes import Spectral4
from bokeh.models import Legend, BoxAnnotation, Toggle
from bokeh.layouts import layout, gridplot, column
# Look up the 'STAMEN_TERRAIN' tile provider for use as a map background.
tile_provider = get_provider('STAMEN_TERRAIN')
# Create a pyproj transformer to convert from lat/long (EPSG:4326) to web mercator (EPSG:3857)
from pyproj import Transformer
transformer = Transformer.from_crs('epsg:4326','epsg:3857')
#from IPython.display import HTML
# Route bokeh output to the notebook.
output_notebook()
import warnings
# Suppress the warning whose message starts with the text below
# (presumably emitted by bokeh when there is no plot to update yet - confirm).
warnings.filterwarnings("ignore", message="Cannot find a last shown plot to update.")

## 1: Insert full path to data
    
The code will grab and process all files within this folder. Ensure that only the demographic files you want to work with are
stored at this location.

In [2]:
hide_code()
run_code()


# Candidate demographic rasters: every entry under ./data whose path carries
# a "_f_" (female) or "_m_" (male) marker. "All" is always the first option.
# glob returns matches in arbitrary, platform-dependent order, so the list is
# sorted to keep the widget options (and anything indexed from them) stable
# between runs and machines.
filepath = r"./data/*"
pot_list = ["All"] + [
    path for path in sorted(glob.glob(filepath))
    if "_f_" in path or "_m_" in path
]

# Multi-select list of the discovered files; nothing is selected initially.
pop_file = widgets.SelectMultiple(
    description="File: ",
    options=pot_list,
    value=[],
    disabled=False,
    layout=Layout(height="260px", width="50%"),
)


def update(file):
    """Return the current selection unchanged (callback handed to `interact`)."""
    return file


pop_file_select = interact(update, file=pop_file)

#call the function 


#pop_table

interactive(children=(SelectMultiple(description='File: ', layout=Layout(height='260px', width='50%'), options…

## 2. Convert file into table

The following code converts the downloaded worldpop file into a table of latitudes, longitudes and number of people.

Six decimal places in the coordinates represent an accuracy of ~0.11 meters at the equator.


In [3]:
hide_code()
run_code()

def get_array(filename):
    """Load the first band of a WorldPop raster as a table of cell populations.

    Each raster cell becomes one row holding the longitude and latitude of the
    cell's upper-left corner (the grid starts at the dataset's left/top bounds)
    and the population value stored in that cell.

    Parameters
    ----------
    filename : str
        Path (or URL) of a raster that `rasterio.open` can read.

    Returns
    -------
    pop_table : pandas.DataFrame
        Columns "longitude", "latitude" and "Population"; no-data cells are
        removed. The index holds each cell's position in the flattened raster.
    total_peeps : float
        Sum of the "Population" column, accumulated in float64.
    """
    with rasterio.open(filename) as src:
        # Only the first band is used; reading it directly avoids loading the rest.
        band = src.read(1)
        left, _bottom, _right, top = src.bounds
        # src.res is (pixel width, pixel height): use each on its own axis.
        x_res, y_res = src.res
        nodata = src.nodata

    rows, cols = band.shape
    # Build exactly one coordinate per column/row. Deriving the axes from the
    # raster shape (instead of stepping from bound to bound) guarantees they
    # match the image even when floating-point steps over- or under-shoot.
    x = left + x_res * np.arange(cols)
    y = top - y_res * np.arange(rows)
    lon, lat = np.meshgrid(x, y)

    population = band.ravel()
    # Drop the sentinel used for missing cells, plus whatever no-data value
    # the file itself declares (if any).
    valid = population != -99999.0
    if nodata is not None:
        if np.isnan(nodata):
            valid &= ~np.isnan(population)
        else:
            valid &= population != nodata

    pop_table = pd.DataFrame(
        {
            "longitude": lon.ravel()[valid],
            "latitude": lat.ravel()[valid],
            "Population": population[valid],
        },
        index=np.flatnonzero(valid),
    )
    # Vectorised float64 sum: fast on large rasters and avoids float32 drift.
    total_peeps = float(pop_table["Population"].astype("float64").sum())
    return pop_table, total_peeps
    

if not pop_file.value:
    print("waiting for inputs.")
else:
    # "All" in first position means every discovered file; otherwise use
    # exactly the files that were picked.
    data_selection = pot_list[1:] if pop_file.value[0] == "All" else list(pop_file.value)

    # One [table, total] pair per selected file, in selection order.
    data_from_demo_files = []
    for file in data_selection:
        print("Creating demographic tables of " + file)
        pop_table, total = get_array(file)
        data_from_demo_files.append([pop_table, total])

    print("There are {} demographic tables.".format(len(data_from_demo_files)))

waiting for inputs.


## 3. Select the level of accuracy needed

Next we can select the level of accuracy we want for our tables. 

In [4]:
hide_code()
run_code()

# Each label pairs a coordinate precision with its approximate ground distance.
accuracy = widgets.Dropdown(
    description="Accuracy",
    options=[
        "6 decimals (~0.11 meters)",
        "5 decimals (~1.1 meters)",
        "4 decimals (~11 meters)",
        "3 decimals (~110 meters)",
        "2 decimals (~1.1 kilometers)",
    ],
    value="4 decimals (~11 meters)",
    disabled=False,
)


def update(acc):
    """Return the chosen accuracy label unchanged (callback handed to `interact`)."""
    return acc


acc_select = interact(update, acc=accuracy)

interactive(children=(Dropdown(description='Accuracy', index=2, options=('6 decimals (~0.11 meters)', '5 decim…

## 4: Smooth the population

The raw data contains many fractional ("decimal") people, which cannot exist. So, based on the desired latitude/longitude accuracy, the goal is to produce whole numbers that get close to the total population. The following code is based on Pareto distributions of populations, or a "rich get richer" approach. In essence, if an area has a high population density, then that area gets more people.

(This is obviously somewhat coarse and we welcome contributions.)

In [5]:
hide_code()
run_code()
def round_long_lat(pop_table, total, accuracy, name):
    """Aggregate a population table to a coordinate precision and make it whole.

    Coordinates are rounded to the number of decimals selected in `accuracy`,
    rows sharing a rounded coordinate are summed, and the summed populations
    are rounded to whole numbers. Whatever is still missing from the goal is
    then handed out in proportion to each cell's rounded population (cells
    that already hold more people receive more) and rounded again, so the
    final total approaches, but does not necessarily equal, the goal.

    Parameters
    ----------
    pop_table : pandas.DataFrame
        Needs "longitude", "latitude" and "Population" columns. It is not
        modified; the work is done on a copy.
    total : number
        Population total of `pop_table`; rounded to give the goal.
    accuracy : object
        Anything with a `.value` attribute holding one of the accuracy labels
        listed in `acc_dict` (the accuracy dropdown in this notebook).
    name : str
        Label used in the progress messages.

    Returns
    -------
    grouped_poptable : pandas.DataFrame
        One row per rounded coordinate with the whole-number "Population" and
        a "Percent" column holding the amount that was added to each cell.
    new_total : int
        Sum of the final "Population" column.

    Raises
    ------
    KeyError
        If `accuracy.value` is not one of the known labels.
    """
    acc_dict = {"6 decimals (~0.11 meters)":6,
                  "5 decimals (~1.1 meters)":5,
                  "4 decimals (~11 meters)":4,
                  "3 decimals (~110 meters)":3,
                  "2 decimals (~1.1 kilometers)":2}
    rd = acc_dict[accuracy.value]

    goal = round(total)
    print("The goal population of {} is {}.".format(name, goal))
    print("Calculating......")

    # Round on a copy so the caller's table keeps its original coordinates
    # (this also avoids pandas' SettingWithCopyWarning on filtered frames).
    rounded = pop_table.copy()
    rounded["longitude"] = rounded["longitude"].round(rd)
    rounded["latitude"] = rounded["latitude"].round(rd)

    # Merge all cells that now share a coordinate.
    grouped_poptable = rounded.groupby(['longitude','latitude'], as_index=False).sum()

    # Whole people per aggregated cell.
    grouped_poptable["Population"] = np.rint(grouped_poptable["Population"])
    rounded_total = round(float(grouped_poptable["Population"].astype("float64").sum()))

    # People still missing (or surplus, if negative) after rounding.
    diff = goal - rounded_total

    if rounded_total != 0:
        # Share of the shortfall for each cell, proportional to its population.
        adjustment = np.rint(grouped_poptable["Population"] / rounded_total * diff)
    else:
        # Every cell rounded to zero (or the table is empty): there are no
        # weights to distribute by, and dividing would yield NaN and crash.
        adjustment = grouped_poptable["Population"] * 0.0
    grouped_poptable["Percent"] = adjustment
    grouped_poptable["Population"] = grouped_poptable["Percent"] + grouped_poptable["Population"]

    new_total = round(float(grouped_poptable["Population"].astype("float64").sum()))

    # Guard the coverage figure against a zero goal.
    coverage = round(new_total/goal*100,2) if goal else 0.0

    print("The aggregated total population is: " + str(new_total))
    print("The new aggregated total accounts for: " + str(coverage)+"% of the population.")
    print()
    return grouped_poptable, new_total

# Smooth every loaded table with the accuracy currently chosen in the dropdown.
# Each [table, total] entry is overwritten with its smoothed result, so
# re-running this cell smooths tables that were already smoothed.
if pop_file.value:
    for idx in range(len(data_from_demo_files)):
        smoothed = round_long_lat(
            *data_from_demo_files[idx][:2],
            accuracy,
            data_selection[idx],
        )
        data_from_demo_files[idx][0], data_from_demo_files[idx][1] = smoothed

    print("Population smoothing complete.")
else:
    print("Waiting for inputs.")

Waiting for inputs.


## 5: Data from processed files is formatted and graphed using Bokeh.

In [6]:
hide_code()
run_code()
# File names are split to track which age/sex goes with which population total,
# then the totals are charted as stacked bars per age.
if len(pop_file.value)== 0:
    print("Waiting for inputs.")
else:
    # For a name shaped like <prefix>_<sex>_<age>_<next>.<ext>, keep the three
    # tokens after the prefix: [sex, age, next]. Only the file name is parsed
    # (not the directory part), so the result does not depend on where the
    # data folder lives. The pattern is a raw string so "\." is a real regex
    # escape rather than an invalid string escape.
    file_list = []
    for file in data_selection:
        test = re.split(r'_|\.', os.path.basename(file))
        file_list.append(test[1:4])

    # Reformats data so bokeh can plot the demographic data.
    # consistency_check[0] maps age -> female total and
    # consistency_check[1] maps age -> male total.
    consistency_check = [OrderedDict(),OrderedDict()]
    for data, (_table, band_total) in zip(file_list, data_from_demo_files):
        if data[0] == 'f':
            consistency_check[0][int(data[1])] = band_total
        else:
            consistency_check[1][int(data[1])] = band_total

    # Necessary to verify all categories are present: an age seen for only
    # one sex gets a zero for the other so both series line up.
    for k in list(consistency_check[0].keys()):
        consistency_check[1].setdefault(k, 0)
    for k in list(consistency_check[1].keys()):
        consistency_check[0].setdefault(k, 0)

    # Ensure sorted by age
    plotter_female = OrderedDict((key, consistency_check[0][key]) for key in sorted(consistency_check[0].keys()))
    plotter_male = OrderedDict((key, consistency_check[1][key]) for key in sorted(consistency_check[1].keys()))

    # Bokeh's categorical axis needs string factors.
    categories = [str(ele) for ele in plotter_female.keys()]

    male = list(plotter_male.values())
    female = list(plotter_female.values())

    # Stack order; colours below are given in the same order.
    sex = ['male', 'female']

    # Data is set up for Bokeh
    source = ColumnDataSource(data=dict(
        x=categories,
        male=male,
        female=female,
    ))

    output_notebook()

    # Graph presentation variables
    p = figure(
        title='Stacked Demographics',
        x_axis_label='Age',
        y_axis_label='Population',
        x_range=FactorRange(*categories),
        plot_width=800,
        plot_height=600,
        tooltips='@$name{0.0}'
    )

    # Graph variables
    p.vbar_stack(
        sex,
        x='x',
        width=0.4,
        alpha=0.5,
        color=['blue','pink'],
        source=source
    )

    # Removing scientific notation from y-axis tick marks
    p.yaxis[0].formatter = BasicTickFormatter(use_scientific=False)

    # Displaying graph
    show(p)

Waiting for inputs.


## 6: Save files

Due to the size and likely number of the demographic files, and the use of the population density files as the main reference file, the following code makes two size-saving decisions. 

1. It saves each file as JSON instead of .csv
2. It drops the population rows with zero people

In [7]:
hide_code()
run_code()

if len(pop_file.value)== 0:
    print("Waiting for inputs.")
else:
    # The output prefix is the part of the first selected file's name that
    # precedes the first underscore. Only the file name is inspected, so the
    # directory part of the path cannot interfere.
    country_name = os.path.basename(data_selection[0]).split("_")[0]

    def save_file(name, df, col):
        """Write one smoothed table to the data folder as JSON.

        The "Population" column is renamed to `col` (so tables can be merged
        later without clashing) and rows where it is zero are dropped before
        writing. The destination path is printed once the file is written.
        """
        df = df.rename(columns={"Population":col})
        # Built from separate components so the path is valid on every
        # platform and points at the same folder the inputs were read from.
        filepath = os.path.join(".", "data", name)
        df = df[df[col]!=0]
        df.to_json(filepath)
        print("{} has been saved.".format(filepath))

    for ele in range(len(data_from_demo_files)):
        # give new column name for later merging
        sex_token, age_token = file_list[ele][0], file_list[ele][1]
        col = "pop_" + sex_token + "_" + age_token
        name = country_name + "_" + sex_token + "_" + age_token + ".json"
        save_file(name, data_from_demo_files[ele][0], col)

Waiting for inputs.
