#Final Project - Building an Interactive Graph

#Download sample data from bokeh  

We will work with the Gapminder data set, specifically with its historical data about the fertility, life expectancy and population of nations. The data set is already available within the Bokeh sample data.

*Gapminder is a non-profit venture promoting sustainable global development and achievement of the United Nations Millennium Development Goals by increased use and understanding of statistics and other information about social, economic and environmental development at local, national and global levels.*


In [1]:
# Import the submodule explicitly: a bare ``import bokeh`` only guarantees the
# top-level package, so ``bokeh.sampledata`` may not be loaded as an attribute.
import bokeh.sampledata

# Download the Bokeh sample data sets (the log below lists the fetched files).
bokeh.sampledata.download()

Using data directory: /Users/svenballentin/.bokeh/data
Downloading: CGM.csv (1589982 bytes)
   1589982 [100.00%]
Downloading: US_Counties.zip (3182088 bytes)
   3182088 [100.00%]
Unpacking: US_Counties.csv
Downloading: us_cities.json (713565 bytes)
    713565 [100.00%]
Downloading: unemployment09.csv (253301 bytes)
    253301 [100.00%]
Downloading: AAPL.csv (166698 bytes)
    166698 [100.00%]
Downloading: FB.csv (9706 bytes)
      9706 [100.00%]
Downloading: GOOG.csv (113894 bytes)
    113894 [100.00%]
Downloading: IBM.csv (165625 bytes)
    165625 [100.00%]
Downloading: MSFT.csv (161614 bytes)
    161614 [100.00%]
Downloading: WPP2012_SA_DB03_POPULATION_QUINQUENNIAL.zip (5148539 bytes)
   5148539 [100.00%]
Unpacking: WPP2012_SA_DB03_POPULATION_QUINQUENNIAL.csv
Downloading: gapminder_fertility.csv (64346 bytes)
     64346 [100.00%]
Downloading: gapminder_population.csv (94509 bytes)
     94509 [100.00%]
Downloading: gapminder_life_expectancy.csv (73243 bytes)
     73243 [100.00%]
Downl

#Define some basic functions to process the data  

We define three functions that will help us to process the Gapminder sample data into a format that can be processed more easily for our plot.

1. `def _process_gapminder_data():`
    + Convert the year column names from strings to ints
    + Turn population into bubble sizes
    + Use pandas categories and map to colors
    
2. `def get_gapminder_1964_data():`
    + Get a dataframe consisting of data from 1964
    
3. `def get_gapminder_1964_scatter_data():`
    + Get the (fertility, life expectancy) value pairs for 1964

In [2]:
from collections import OrderedDict
import pandas as pd
import numpy as np

from bokeh.models import (
    ColumnDataSource, Plot, Circle, Range1d,
    LinearAxis, HoverTool, Text,
    SingleIntervalTicker, Slider, CustomJS
)

def _process_gapminder_data():
    """Load the Gapminder sample data and prepare it for plotting.

    Returns:
        A 6-tuple ``(fertility, life_expectancy, population_size, regions,
        years, regions_list)``:

        * ``fertility``, ``life_expectancy``: the sample-data frames with
          their year column labels converted from strings to ints.
        * ``population_size``: bubble sizes derived from the population
          frame, never smaller than 3.
        * ``regions``: the regions frame with ``Group`` as a pandas category
          and an added ``region_color`` column.
        * ``years``: the int year labels, in fertility column order, that are
          present in all three indicator frames.
        * ``regions_list``: the region categories; the position of a region
          in this list is its index into the ``Spectral6`` palette.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported the palette.
    from bokeh.palettes import Spectral6
    from bokeh.sampledata.gapminder import fertility, life_expectancy, population, regions

    def year_labels_to_int(frame):
        # Convert each all-digit column label (e.g. '1964') to an int and
        # leave every other label untouched. Returns a new frame.
        return frame.rename(
            columns=lambda label: int(label) if str(label).isdigit() else label
        )

    # Every year column is converted from its own label, so the last year is
    # no longer dropped (the old range(first, last) excluded it) and labels
    # are never assigned by position.
    fertility = year_labels_to_int(fertility)
    life_expectancy = year_labels_to_int(life_expectancy)
    population = year_labels_to_int(population)
    regions = year_labels_to_int(regions)

    # Only offer years that can be looked up in every indicator frame.
    years = [
        label for label in fertility.columns
        if str(label).isdigit()
        and label in life_expectancy.columns
        and label in population.columns
    ]

    # Turn population into bubble sizes. Use min_size and scale_factor to tweak.
    scale_factor = 200
    min_size = 3
    population_size = np.sqrt(population / np.pi) / scale_factor
    # Values below min_size (and missing values) are raised to min_size.
    population_size = population_size.where(population_size >= min_size).fillna(min_size)

    # Categorize the regions and give each category one palette colour.
    regions['Group'] = regions['Group'].astype('category')
    regions_list = list(regions['Group'].cat.categories)
    color_by_region = {
        region: Spectral6[position] for position, region in enumerate(regions_list)
    }
    regions['region_color'] = [color_by_region[group] for group in regions['Group']]

    return fertility, life_expectancy, population_size, regions, years, regions_list


def get_gapminder_1964_data():
    """Return a DataFrame holding the 1964 slice of the processed data.

    The frame has the columns ``fertility``, ``life``, ``population`` (the
    bubble size) and ``region_color``, in that order.
    """
    (fertility_by_year, life_by_year, size_by_year,
     region_info, _years, _region_names) = _process_gapminder_data()

    snapshot_year = 1964
    # Column label -> series; concat takes the labels from ``keys``.
    columns = [
        ('fertility', fertility_by_year[snapshot_year]),
        ('life', life_by_year[snapshot_year]),
        ('population', size_by_year[snapshot_year]),
        ('region_color', region_info['region_color']),
    ]
    return pd.concat(
        [series for _label, series in columns],
        axis=1,
        keys=[label for label, _series in columns],
    )


def get_gapminder_1964_scatter_data():
    """Return the 1964 (fertility, life expectancy) pairs for a scatter plot.

    Returns:
        An ``OrderedDict`` with the single key ``'1964'`` mapped to a list of
        ``(fertility, life_expectancy)`` tuples, one per row that has both
        values.
    """
    fertility_df, life_expectancy_df, _sizes, _regions_df, _years, _regions = _process_gapminder_data()

    # Align the two series on their index and drop incomplete rows jointly.
    # Dropping NaNs from each series separately and zipping by position could
    # pair a fertility value with the life expectancy of a different row.
    pairs = pd.concat(
        [fertility_df[1964], life_expectancy_df[1964]],
        axis=1,
        keys=['fertility', 'life'],
    ).dropna()

    xyvalues = OrderedDict()
    xyvalues['1964'] = list(zip(pairs['fertility'].values, pairs['life'].values))
    return xyvalues

In [4]:
from bokeh.io import output_notebook, show
# Set up Bokeh's notebook output before any show() call below
# (presumably so plots render inline in the notebook -- confirm against the Bokeh docs).
output_notebook()

#We define a function that creates an empty responsive plot without any glyphs

In [7]:
from bokeh.models import Range1d, Plot

def get_plot():
    """Create the empty 800x400 responsive plot, without axes or glyphs.

    The x range is fixed to 1..9 and the y range to 20..100.
    """
    canvas_options = dict(
        title="",
        plot_width=800,
        plot_height=400,
        outline_line_color=None,
        toolbar_location=None,
        responsive=True,
    )
    return Plot(x_range=Range1d(1, 9), y_range=Range1d(20, 100), **canvas_options)


show(get_plot())

#We define some axis formats that are later passed to the xaxis and yaxis of the plot

In [8]:
# Keyword arguments shared by the x- and y-axis of the plot.
AXIS_FORMATS = {
    # Tick geometry
    "minor_tick_in": None,
    "minor_tick_out": None,
    "major_tick_in": None,
    # Label fonts
    "major_label_text_font_size": "10pt",
    "major_label_text_font_style": "normal",
    "axis_label_text_font_size": "10pt",
    # Colours
    "axis_line_color": '#AAAAAA',
    "major_tick_line_color": '#AAAAAA',
    "major_label_text_color": '#666666',
    # Line caps and widths
    "major_tick_line_cap": "round",
    "axis_line_cap": "round",
    "axis_line_width": 1,
    "major_tick_line_width": 1,
}

#We define a function that adds a y- and x-axis to the plot

In [9]:
from bokeh.models import LinearAxis, SingleIntervalTicker

def add_axes(plot):
    """Add the fertility x-axis (below) and life-expectancy y-axis (left).

    Both axes are styled with ``AXIS_FORMATS``. Returns the same ``plot``.
    """
    axis_specs = (
        ("Children per woman (total fertility)", 'below'),
        ("Life expectancy at birth (years)", 'left'),
    )
    for label, side in axis_specs:
        plot.add_layout(LinearAxis(axis_label=label, **AXIS_FORMATS), side)
    return plot


show(add_axes(get_plot()))

In [None]:
#We define a function that adds the background text to the plot

In [10]:
from bokeh.models import ColumnDataSource, Text

# Data source behind the large background year label; the slider callback
# defined later replaces its data.
text_source = ColumnDataSource(data={'year': ['1964']})


def add_text(plot):
    """Add the axes and then the large background year label to ``plot``.

    The label is added before any circles so that it is drawn underneath
    them. Returns the same ``plot``.
    """
    plot = add_axes(plot)
    label_style = dict(text_font_size='150pt', text_color='#EEEEEE')
    year_label = Text(x=2, y=35, text='year', **label_style)
    plot.add_glyph(text_source, year_label)
    return plot


show(add_text(get_plot()))

In [None]:
#We define a function that adds the circles to the plot

In [11]:
from bokeh.models import Circle, HoverTool
from bokeh.palettes import Spectral6

# Data source driving the bubbles; it starts with the 1964 data.
renderer_source = ColumnDataSource(get_gapminder_1964_data())


def add_circles(plot):
    """Add axes, background text, the bubbles and a hover tool to ``plot``.

    Bubbles are positioned by ``fertility``/``life``, sized by ``population``
    and coloured by ``region_color`` from ``renderer_source``. Returns the
    same ``plot``.
    """
    plot = add_text(plot)

    outline = dict(line_color='#7c7e71', line_width=0.5, line_alpha=0.5)
    bubbles = Circle(
        x='fertility',
        y='life',
        size='population',
        fill_color='region_color',
        fill_alpha=0.8,
        **outline
    )
    bubble_renderer = plot.add_glyph(renderer_source, bubbles)

    # Restrict hovering to the bubbles so other glyphs show no tooltip.
    hover = HoverTool(tooltips="@index", renderers=[bubble_renderer])
    plot.add_tools(hover)
    return plot


show(add_circles(get_plot()))


#We get the dataframes from the gapminder dataset

In [12]:
# Module-level copies of the processed data, used by the cells below.
(
    fertility_df,
    life_expectancy_df,
    population_df_size,
    regions_df,
    years,
    regions,
) = _process_gapminder_data()

#We create a function that adds the legend to the plot

In [17]:
def add_legend(plot):
    """Add everything from ``add_circles`` plus a hand-built region legend.

    One text label and one coloured dot are drawn per entry of the
    module-level ``regions`` list, stacked downwards from y=95 in steps of 5.
    Returns the same ``plot``.
    """
    plot = add_circles(plot)

    legend_x = 7
    top_y = 95
    row_step = 5
    for row, region_name in enumerate(regions):
        row_y = top_y - row_step * row
        label = Text(x=legend_x, y=row_y, text=[region_name],
                     text_font_size='10pt', text_color='#666666')
        plot.add_glyph(label)
        # The dot sits slightly left of and above the label's anchor point.
        swatch = Circle(x=legend_x - 0.1, y=row_y + 2, fill_color=Spectral6[row],
                        size=10, line_color=None, fill_alpha=0.8)
        plot.add_glyph(swatch)
    return plot


show(add_legend(get_plot()))

#We create a function that builds the interactive plot

In [21]:
import pandas as pd
from bokeh.models import CustomJS, Slider
from bokeh.plotting import vplot, output_file
# Per-year data sources, filled by make_interactive() and keyed '_<year>'.
sources2 = {}

region_color = regions_df['region_color']
region_color.name = 'region_color'


def make_interactive(plot):
    """Add the legend layers to ``plot`` and wire it to a year slider.

    Builds one ColumnDataSource per entry of the module-level ``years`` (stored
    in ``sources2``) and a CustomJS callback that, when the slider moves,
    copies the selected year's data into ``renderer_source`` and updates the
    background label through ``text_source``.

    Returns:
        The layout produced by ``vplot(plot, slider)``.
    """
    plot = add_legend(plot)

    # One data source per year. concat() names the columns through ``keys``,
    # so the Series taken from the shared frames are not modified.
    for year in years:
        year_frame = pd.concat(
            [fertility_df[year], life_expectancy_df[year], population_df_size[year], region_color],
            axis=1,
            keys=['fertility', 'life', 'population', 'region_color'],
        )
        sources2['_' + str(year)] = ColumnDataSource(year_frame)

    # JavaScript object literal mapping each year to the matching callback
    # argument name, e.g. {1964: _1964, 1965: _1965}.
    js_source_array = '{' + ', '.join('%s: _%s' % (year, year) for year in years) + '}'

    # We manipulate the data that is loaded into the plot by referencing another source
    # and we change the text_source for the background text
    code = """
        var year = slider.get('value'),
            sources = %s,
            new_source_data = sources[year].get('data');
        renderer_source.set('data', new_source_data);
        renderer_source.trigger('change');
        text_source.set('data', {'year': [String(year)]});
        text_source.trigger('change');
    """ % js_source_array

    # Pass a copy so the callback's extra arguments never end up in sources2.
    callback_args = dict(sources2, renderer_source=renderer_source, text_source=text_source)
    callback = CustomJS(args=callback_args, code=code)
    # Start the slider on the first available year; the previous value of 1
    # was outside the [start, end] range.
    slider = Slider(start=years[0], end=years[-1], value=years[0], step=1, title="Year", callback=callback)
    # The slider can only be registered after it exists (it needs the callback first).
    callback.args["slider"] = slider

    return vplot(plot, slider)


output_file("figures/bokeh_final_project.html")
# Build the layout once; the earlier extra make_interactive() call was discarded.
show(make_interactive(get_plot()))

#Everything together

#Setting up the data
The plot animates with the slider, showing the data over time from 1964 to 2013. We can think of each year as a separate static plot, and when the slider moves, we use the callback to change the data source that is driving the plot.

We could use bokeh-server to drive this change, but as the data is not too big we can also pass all the datasets to the javascript at once and switch between them on the client side.

This means that we need to build one data source for each year that we have data for and that we are going to switch between using the slider. We build them and add them to a dictionary `sources` that holds each one under a key that is the year prefixed with an underscore (`_`).

In [None]:
# Load the processed Gapminder frames, the list of years and the region names.
fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions = _process_gapminder_data()

# Maps '_<year>' to the ColumnDataSource holding that year's data.
sources = {}

# The per-country colour column is the same for every year.
region_color = regions_df['region_color']
region_color.name = 'region_color'

for year in years:
    # Name each Series so that concat() uses the names as column labels.
    fertility = fertility_df[year]
    fertility.name = 'fertility'
    life = life_expectancy_df[year]
    life.name = 'life'
    population = population_df_size[year]
    population.name = 'population'
    # One frame per year with the columns fertility, life, population, region_color.
    new_df = pd.concat([fertility, life, population, region_color], axis=1)
    sources['_' + str(year)] = ColumnDataSource(new_df)

In [None]:

def get_gapminder_plot():
    """Build the complete interactive Gapminder bubble chart in one call.

    Creates one data source per available year, the plot with both axes, the
    background year label, the bubbles with a hover tool, a hand-built region
    legend, and a year slider whose CustomJS callback swaps the bubble data
    and the label text on the client side.

    Returns:
        The layout produced by ``vplot(plot, slider)``.
    """
    fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions = _process_gapminder_data()

    region_color = regions_df['region_color']

    # One ColumnDataSource per year, keyed '_<year>'. concat() names the
    # columns through ``keys``, so the Series taken from the frames are not
    # modified.
    sources = {}
    for year in years:
        year_frame = pd.concat(
            [fertility_df[year], life_expectancy_df[year], population_df_size[year], region_color],
            axis=1,
            keys=['fertility', 'life', 'population', 'region_color'],
        )
        sources['_' + str(year)] = ColumnDataSource(year_frame)

    # JavaScript object literal mapping each year to the matching callback
    # argument name, e.g. {1964: _1964, 1965: _1965}.
    js_source_array = '{' + ', '.join('%s: _%s' % (year, year) for year in years) + '}'

    plot = Plot(
        x_range=Range1d(1, 9),
        y_range=Range1d(20, 100),
        title="",
        plot_width=800,
        plot_height=400,
        outline_line_color=None,
        toolbar_location=None,
        responsive=True,
    )

    # Styling shared by both axes.
    axis_formats = dict(
        minor_tick_in=None,
        minor_tick_out=None,
        major_tick_in=None,
        major_label_text_font_size="10pt",
        major_label_text_font_style="normal",
        axis_label_text_font_size="10pt",

        axis_line_color='#AAAAAA',
        major_tick_line_color='#AAAAAA',
        major_label_text_color='#666666',

        major_tick_line_cap="round",
        axis_line_cap="round",
        axis_line_width=1,
        major_tick_line_width=1,
    )

    # The ticker is passed by keyword; the axis exposes it as the ``ticker``
    # property rather than as a positional constructor argument.
    xaxis = LinearAxis(
        ticker=SingleIntervalTicker(interval=1),
        axis_label="Children per woman (total fertility)",
        **axis_formats
    )
    yaxis = LinearAxis(
        ticker=SingleIntervalTicker(interval=20),
        axis_label="Life expectancy at birth (years)",
        **axis_formats
    )
    plot.add_layout(xaxis, 'below')
    plot.add_layout(yaxis, 'left')

    # Background year text: added first so it is below all the other glyphs.
    text_source = ColumnDataSource({'year': ['%s' % years[0]]})
    text = Text(x=2, y=35, text='year', text_font_size='150pt', text_color='#EEEEEE')
    plot.add_glyph(text_source, text)

    # Bubbles, initially showing the first year's data.
    renderer_source = sources['_%s' % years[0]]
    circle_glyph = Circle(
        x='fertility', y='life', size='population',
        fill_color='region_color', fill_alpha=0.8,
        line_color='#7c7e71', line_width=0.5, line_alpha=0.5)
    circle_renderer = plot.add_glyph(renderer_source, circle_glyph)

    # Hover only against the bubbles and not other plot elements.
    tooltips = "@index"
    plot.add_tools(HoverTool(tooltips=tooltips, renderers=[circle_renderer]))

    # Legend: one label and one coloured dot per region, stacked downwards.
    text_x = 7
    text_y = 95
    for i, region in enumerate(regions):
        plot.add_glyph(Text(x=text_x, y=text_y, text=[region], text_font_size='10pt', text_color='#666666'))
        plot.add_glyph(Circle(x=text_x - 0.1, y=text_y + 2, fill_color=Spectral6[i], size=10, line_color=None, fill_alpha=0.8))
        text_y = text_y - 5

    # Slider callback: look up the selected year's source and copy its data
    # into the bubble source, then update the background label.
    code = """
        var year = slider.get('value'),
            sources = %s,
            new_source_data = sources[year].get('data');
        renderer_source.set('data', new_source_data);
        text_source.set('data', {'year': [String(year)]});
    """ % js_source_array

    callback = CustomJS(args=sources, code=code)
    # Start on the first year so the slider matches the data initially shown;
    # the previous value of 1 was outside the [start, end] range.
    slider = Slider(start=years[0], end=years[-1], value=years[0], step=1, title="Year", callback=callback, name='testy')
    callback.args["renderer_source"] = renderer_source
    callback.args["slider"] = slider
    callback.args["text_source"] = text_source

    # Lay it out
    return vplot(plot, slider)