# Bayesian Database Search API Tutorial

## Preamble
Import the libraries we need.

In [None]:
import requests
import math
import numpy
import pandas
import plotly.figure_factory as figure_factory
import plotly.graph_objs as graph_objs
import plotly.offline as offline
import plotly.plotly as py

Configure plotly to work in offline mode.

In [None]:
# Initialise plotly's notebook integration before any figure is drawn with offline.iplot.
# NOTE(review): connected=False presumably keeps the notebook independent of a
# CDN-hosted plotly.js — confirm against the plotly.offline documentation.
offline.init_notebook_mode(connected=False)

Set HTTP headers to be used across all requests.

In [None]:
# Shared request headers, reused via `headers=headers` by POST requests further down.
headers = {'content-type': 'application/json'}

Define helper functions to render bar charts (with long column names), choropleths, and scatterplots.

In [None]:
def bar_chart(columns, title="", x_axis=""):
    """Build a horizontal bar chart of scores, one bar per column name.

    Column names longer than 40 characters are cut down to their first 40
    characters and suffixed with an ellipsis so the tick labels stay readable.

    Args:
        columns: Sequence of dictionaries that have the keys 'column' and
            'score'. The value for 'column' is a string with the name of the
            column; the value for 'score' is the number plotted for it.
        title: Title of the chart.
        x_axis: Title of the x axis (the axis the scores are plotted along).

    Returns:
        A ``graph_objs.Figure`` containing a single horizontal ``Bar`` trace.
    """
    max_length = 40
    # The ellipsis decision must be made on the ORIGINAL name length: testing
    # the already-truncated name (which can never exceed the limit) would mean
    # the ellipsis is never appended.
    truncated_names = [
        '{}…'.format(name[:max_length]) if len(name) > max_length else name
        for name in (column['column'] for column in columns)
    ]
    scores = [column['score'] for column in columns]

    data = [graph_objs.Bar(
        x=scores,
        y=truncated_names,
        orientation='h'
    )]

    layout = graph_objs.Layout(
        margin=graph_objs.layout.Margin(),
        title=title,
        yaxis=dict(
            tickfont=dict(
                size=9,
                color='rgb(107, 107, 107)'
            ),
            tickangle=30,
            # Let plotly grow the left margin to fit the (long) tick labels.
            automargin=True
        ),
        xaxis=dict(title=x_axis)
    )

    return graph_objs.Figure(data=data, layout=layout)

def choropleth(fips=[], values=[], title='', legend_title=None, color_scale=None):
    """Build a US county choropleth via ``figure_factory.create_choropleth``.

    Args:
        fips: County FIPS codes, one per entry of ``values``.
        values: Value used to colour each county; must be as long as ``fips``.
        title: Title of the figure.
        legend_title: Title of the legend. The legend is shown whenever this
            is not None; the title itself is only forwarded when it is truthy.
        color_scale: List of colours, one per bin. When omitted (or empty) a
            17-step blue scale ordered from dark to light is used.

    Returns:
        The figure produced by ``figure_factory.create_choropleth``.

    Raises:
        AssertionError: If ``fips`` and ``values`` differ in length.
    """
    n_fips, n_values = len(fips), len(values)
    assert n_fips == n_values, \
        'Length of fips ({}) and length of values ({}) do not match.'.format(n_fips, n_values)

    if not color_scale:
        light_to_dark = (
            "#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9",
            "#9ecae1", "#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be",
            "#2171b5", "#1361a9", "#08519c", "#0b4083", "#08306b",
        )
        # The default scale is applied darkest colour first.
        color_scale = list(reversed(light_to_dark))

    # Evenly spaced bin boundaries spanning the observed values; one fewer
    # boundary than there are colours.
    lowest, highest = min(values), max(values)
    binning_endpoints = [*numpy.linspace(lowest, highest, len(color_scale) - 1)]

    # Only forward a legend title when one was actually supplied.
    legend_options = {}
    if legend_title:
        legend_options['legend_title'] = legend_title

    outline = {'color': 'rgb(15, 15, 55)', 'width': 0.5}

    return figure_factory.create_choropleth(
        fips=fips,
        values=values,
        scope=['usa'],
        colorscale=color_scale,
        binning_endpoints=binning_endpoints,
        county_outline=outline,
        show_hover=True,
        centroid_marker={'opacity': 0},
        asp=2.9,
        title=title,
        showlegend=legend_title is not None,
        **legend_options
    )

def scatterplot(xs=[], ys=[], text=[], title='', x_axis='', y_axis=''):
    """Build a marker-only scatter plot figure.

    Args:
        xs: X coordinates of the points.
        ys: Y coordinates of the points; same length as ``xs``.
        text: Text attached to each point; same length as ``xs``.
        title: Title of the figure.
        x_axis: Title of the x axis.
        y_axis: Title of the y axis.

    Returns:
        A ``graph_objs.Figure`` with one ``Scatter`` trace and no legend.

    Raises:
        AssertionError: If ``xs``, ``ys`` and ``text`` differ in length.
    """
    n_xs, n_ys, n_text = len(xs), len(ys), len(text)
    assert n_xs == n_ys == n_text, \
        'Length of xs ({}) and ys ({}) and text ({}) must match.'.format(n_xs, n_ys, n_text)

    points = graph_objs.Scatter(x=xs, y=ys, text=text, mode='markers')

    figure_layout = graph_objs.Layout(
        title=title,
        hovermode='closest',
        xaxis={'title': x_axis},
        yaxis={'title': y_axis},
        showlegend=False,
    )

    return graph_objs.Figure(data=[points], layout=figure_layout)

## Visualize the data table

In [None]:
# Fetch the whole data table from the service. The endpoint is called without
# query parameters, so no `params` argument is passed.
http_response = requests.get('http://bayesrest:5000/table-data')
assert http_response.status_code == 200

# The JSON body carries the rows under 'data' and the column names under 'columns'.
response_json = http_response.json()
data = response_json['data']

df = pandas.DataFrame(
    data=data,
    columns=response_json['columns']
)
# Index the table by its 'rowid' column; the column itself is kept as well.
df.index = df['rowid']

def with_columns(rdf, columns=None):
    """Left-join selected columns of the global table ``df`` onto ``rdf``.

    Args:
        rdf: DataFrame with a 'rowid' column identifying rows of ``df``.
        columns: Names of the ``df`` columns to attach. 'rowid' is always
            included as the join key. Defaults to no extra columns.

    Returns:
        A new DataFrame holding every row of ``rdf`` together with the
        requested ``df`` columns for the matching 'rowid'.
    """
    wanted = ['rowid'] + ([] if columns is None else list(columns))
    # ``df`` is indexed by its own 'rowid' column, so the label 'rowid' names
    # both an index level and a column. Merging on such a label is ambiguous
    # and rejected with a ValueError by current pandas versions; dropping the
    # index leaves the column as the only 'rowid'.
    lookup = df.loc[:, wanted].reset_index(drop=True)
    return rdf.merge(lookup, on='rowid', how='left')

# Preview the first five rows of the table.
df.iloc[:5]

## Columns most predictive of `Opioid_Deaths`

#### Fetch

In [None]:
# Query the find-associated-columns endpoint for the Opioid_Deaths column.
fac_payload = {'column': 'Opioid_Deaths'}
fac_response = requests.post(
    url='http://bayesrest:5000/find-associated-columns',
    headers=headers,
    json=fac_payload,
)
assert fac_response.status_code == 200

#### Bar chart

In [None]:
# Chart only the first 25 entries of the response.
associated_columns = fac_response.json()
fac_bar_chart = bar_chart(
    columns=associated_columns[:25],
    x_axis='Relevance to Opioid_Deaths',
    title='Columns most predictive of Opioid_Deaths',
)
offline.iplot(fac_bar_chart)

## Counties with unlikely numbers of opioid deaths

### Unlikely counties without context

#### Fetch

In [None]:
# Anomaly search on Opioid_Deaths with an empty list of context columns.
fa_payload_0 = {
    'target-column': 'Opioid_Deaths',
    'context-columns': []
}
# Use the shared `headers` like the other POST requests in this notebook,
# rather than repeating the header dictionary inline.
fa_response_0 = requests.post(
    'http://bayesrest:5000/find-anomalies',
    json=fa_payload_0,
    headers=headers
)
assert fa_response_0.status_code == 200

In [None]:
# Load the response into a frame with 'rowid' and 'probability' columns, keep
# only rows that have a probability, then attach the FIPS code, the
# Opioid_Deaths value and the location from the main table.
fa_df_0 = with_columns(
    pandas.DataFrame(
        fa_response_0.json(),
        columns=['rowid', 'probability']
    ).loc[lambda frame: frame['probability'].notnull()],
    ['state_county_fips', 'Opioid_Deaths', 'Location']
)
# Preview the first five rows.
fa_df_0.iloc[:5]

#### Choropleth

In [None]:
# Colour each county by the natural logarithm of its probability.
fa_choropleth_0 = choropleth(
    fips=fa_df_0['state_county_fips'],
    values=fa_df_0['probability'].transform(math.log),
    title='Counties with unlikely values for Opioid_Deaths',
)
offline.iplot(fa_choropleth_0)

#### Scatterplot

In [None]:
# Plot probability against the Opioid_Deaths value, labelling points by location.
fa_scatter_0 = scatterplot(
    x_axis='Opioid_Deaths',
    y_axis='Probability',
    xs=fa_df_0['Opioid_Deaths'],
    ys=fa_df_0['probability'],
    text=fa_df_0['Location'],
)
offline.iplot(fa_scatter_0)

### Unlikely counties in the context of predictively relevant columns

#### Fetch

In [None]:
# Same anomaly search as before, but now with three context columns. The
# column names are sent wrapped in double quotes, exactly as written here.
fa_payload_1 = {
    'target-column': 'Opioid_Deaths',
    'context-columns': [
        '"Trump 2016"',
        '"Total Population: Foreign Born: Not a Citizen"',
        '"Families: Income in  below poverty level: Married Couple Family: with Related Child Living  Bellow Poverty Level"',
    ],
}
fa_response_1 = requests.post(
    url='http://bayesrest:5000/find-anomalies',
    headers=headers,
    json=fa_payload_1,
)
assert fa_response_1.status_code == 200

In [None]:
# Load the response into a frame with 'rowid' and 'probability' columns, keep
# only rows that have a probability, then attach the FIPS code, the
# Opioid_Deaths value and the location from the main table.
fa_df_1 = with_columns(
    pandas.DataFrame(
        fa_response_1.json(),
        columns=['rowid', 'probability']
    ).loc[lambda frame: frame['probability'].notnull()],
    ['state_county_fips', 'Opioid_Deaths', 'Location']
)
# Preview the first five rows.
fa_df_1.iloc[:5]

#### Choropleth

In [None]:
# Colour each county by the natural logarithm of its probability. The title
# keeps its embedded line break and indentation ahead of the <br> tag.
fa_choropleth_1 = choropleth(
    fips=fa_df_1['state_county_fips'],
    values=fa_df_1['probability'].transform(math.log),
    title=(
        'Anomalous counties in terms of opioid deaths, in the context of support for Trump, \n'
        '    <br>density of immigrants, and poverty'
    ),
)
offline.iplot(fa_choropleth_1)

#### Scatterplot

In [None]:
# Plot probability against the Opioid_Deaths value, labelling points by
# location. The title keeps its embedded line break and indentation.
fa_scatter_1 = scatterplot(
    xs=fa_df_1['Opioid_Deaths'],
    ys=fa_df_1['probability'],
    text=fa_df_1['Location'],
    title=(
        'Anomalous counties in terms of opioid deaths, in the context of support for Trump, \n'
        '    <br> density of immigrants, and poverty'
    ),
    x_axis='Opioid_Deaths',
    y_axis='Anomalousness Score',
)
offline.iplot(fa_scatter_1)

## Counties similar to a county with a low number of opioid deaths

Here we'll focus on counties similar to a county with a low number of opioid deaths: **Fillmore County, Minnesota**.

#### Fetch

In [None]:
# Peer search for row 27 in the context of Opioid_Deaths.
# NOTE(review): row 27 is presumably Fillmore County, Minnesota, the county
# named in the surrounding text — verify against the table.
fp_payload = {'target-row': 27, 'context-column': 'Opioid_Deaths'}

fp_response = requests.post(
    url='http://bayesrest:5000/find-peers',
    headers=headers,
    json=fp_payload,
)
assert fp_response.status_code == 200

In [None]:
# Load the response into a frame with 'rowid' and 'similarity' columns, keep
# only rows that have a similarity, then attach the FIPS code, the
# Opioid_Deaths value and the location from the main table.
fp_df = with_columns(
    pandas.DataFrame(
        fp_response.json(),
        columns=['rowid', 'similarity']
    ).loc[lambda frame: frame['similarity'].notnull()],
    ['state_county_fips', 'Opioid_Deaths', 'Location']
)
# Preview the first five rows.
fp_df.iloc[:5]

#### Choropleth

In [None]:
# Colour each county by its similarity score, using a custom 11-colour scale
# and a titled legend.
fp_choropleth = choropleth(
    title='Counties Similar to Fillmore County, Minnesota with respect to opioid deaths',
    legend_title='Similarity Score',
    fips=fp_df['state_county_fips'],
    values=fp_df['similarity'],
    color_scale=[
        "#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9",
        "#9ecae1", "#85bcdb", "#6baed6", "#57a0ce", "#4292c6",
    ],
)
offline.iplot(fp_choropleth)

#### Scatterplot

In [None]:
# Plot similarity against the Opioid_Deaths value, labelling points by location.
fp_scatter = scatterplot(
    title='Counties Similar to Fillmore County, Minnesota with respect to opioid deaths',
    xs=fp_df['Opioid_Deaths'],
    ys=fp_df['similarity'],
    text=fp_df['Location'],
    x_axis='Opioid_Deaths',
    y_axis='Similarity Score',
)
offline.iplot(fp_scatter)