### Plot percent of arrestees who are black, by offense (2013 UCR)

In [1]:
import pandas as pd
import requests, zipfile, io
from bokeh.charts import Bar, Scatter, output_notebook, show, output_file
from bokeh.charts.attributes import CatAttr, color
from bokeh.models import HoverTool, Range1d, Span, LabelSet, ColumnDataSource, Title
from bokeh.models.widgets import Panel, Tabs
from bokeh.models.glyphs import Text
from bokeh.plotting import figure
from bokeh.palettes import PuOr5, RdYlBu10
import statsmodels.api as sm
lowess = sm.nonparametric.lowess



### Read the data file

In [8]:
# Load the data into `df` from a local CSV. The path is relative, so it resolves
# against the notebook's working directory.
#
# Disabled alternatives, kept for reference: reading the Stata (.dta) files directly.
# df = pd.read_stata('../../Data/ucr13_offenses/36122-0001-Data.dta')
# df = pd.read_stata('../../Data/ucr13_arrests/arrest_dems13_clean.dta')
df = pd.read_csv('../../Data/ucr13_arrests_offenses.csv')
# Disabled alternative: download the zipped file and read it from memory.
# NOTE(review): the last line hands a .dta archive member to pd.read_csv; a Stata
# file presumably needs pd.read_stata instead -- confirm before re-enabling.
# r = requests.get('http://crimedata.io/data/36122-0001-Data.dta.zip')
# z = zipfile.ZipFile(io.BytesIO(r.content))
# df = pd.read_csv(z.open('36122-0001-Data.dta'))

### See what the data look like

In [9]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0,ori,black_arrests_012,black_arrests_080,black_arrests_090,black_arrests_100,black_arrests_120,black_arrests_130,black_arrests_140,black_arrests_150,black_arrests_160,...,cleared_jun_murder,cleared_jun_manslaugher,murder_offenses_total,rape_offenses_total,robbery_offenses_total,aggravated_assault_offenses_total,burglary_offenses_total,larceny_offenses_total,motor_vehicle_theft_offenses_total,total_offenses_offenses_total
0,AK00101,1.0,325,1.0,5,11,4.0,70,14,83,...,1,0,29,500,688,8540,1460,12112,1140,24473
1,AK00102,,37,,0,1,,8,4,0,...,0,0,0,47,47,1229,164,1409,133,3029


In [5]:
# Number of rows in the loaded table.
len(df)

22202

#### Define the offenses beforehand

In [30]:
# Offense keys, in the order they will appear in the summary table and chart.
# Each key is spliced into column names of `df` further down
# ('<offense>_arrests_total' and 'black_arrests_<offense>').
offenses = [
    'murder',
    'rape',
    'sex_offense',
    'robbery',
    'aggravated_assault',
    'burglary',
    'motor_vehicle_theft',
    'fraud',
    'larceny',
    'drunkenness',
]

In [34]:
# Build `percent_black`: one row per offense with the share of arrestees who
# are black. For each offense, the summed 'black_arrests_<offense>' column of
# `df` is divided by the summed '<offense>_arrests_total' column.
#
# Rows are collected in a plain list and converted to a DataFrame once.
# Growing the frame with DataFrame.append copied it on every iteration, and
# that method was removed in pandas 2.0, so the row-by-row form no longer runs.
rows = []
for offense in offenses:
    total_arrests = df[offense + '_arrests_total'].sum()
    black_arrests = df['black_arrests_' + offense].sum()
    percent_arrests_black = black_arrests / total_arrests

    rows.append({'offense': offense, 'percent_black': percent_arrests_black})

# Pin the column order explicitly rather than relying on dict ordering.
percent_black = pd.DataFrame(rows, columns=['offense', 'percent_black'])

# Display formatting: 'motor_vehicle_theft' -> 'Motor Vehicle Theft'.
percent_black['offense'] = percent_black['offense'].str.replace('_', ' ')
percent_black['offense'] = percent_black['offense'].str.title()
# Convert the fraction to a percentage rounded to one decimal place.
percent_black['percent_black'] = (percent_black['percent_black'] * 100).round(1)

In [38]:
# Bar chart with one bar per offense; bar height is the 'percent_black' column.
# sort=False is passed to CatAttr so the offense categories are not re-sorted
# (presumably keeping the row order of `percent_black` -- confirm on render).
p = Bar(
    percent_black,
    label=CatAttr(columns=['offense'], sort=False),
    values='percent_black',
    title="Percent of Arrestees Black by Offense",
    xlabel="Offense",
    ylabel="Percent Black",
    width=800,
    height=550,
    legend=False,
    tools="resize,reset,previewsave",
)

# Dashed gray horizontal reference line at 13, the percentage of the US
# population that is black (the sample is 13.0% black).
hline = Span(
    location=13,
    dimension='width',
    line_color='gray',
    line_width=2,
    line_dash=[4, 4],
)
p.renderers.extend([hline])

# Small left-aligned source note placed underneath the plot.
msg = """Note: Data are from 2013 Uniform Crime Reports (UCR). Dashed gray line shows percentage of US population that is black (13)."""
caption = Title(text=msg, align='left', text_font_size='8pt')
p.add_layout(caption, 'below')

# Write the chart to a standalone HTML file and display it.
output_file("output/ucr_percent_black.html")
show(p)

In [39]:
percent_black

Unnamed: 0,offense,percent_black
0,Murder,51.8
1,Rape,30.6
2,Sex Offense,24.2
3,Robbery,52.5
4,Aggravated Assault,31.8
5,Burglary,28.3
6,Motor Vehicle Theft,27.4
7,Fraud,31.5
8,Larceny,27.2
9,Drunkenness,14.8
