### Plot provider cost predictions and residuals

In [1]:
import pandas as pd
import zipfile
from bokeh.charts import Bar, Scatter, output_notebook, show, output_file
from bokeh.charts.attributes import CatAttr, color
from bokeh.models import HoverTool, Range1d, Span, LabelSet, ColumnDataSource
from bokeh.models.widgets import Panel, Tabs
from bokeh.models.glyphs import Text
from bokeh.plotting import figure
from bokeh.palettes import PuOr5
import statsmodels.api as sm
lowess = sm.nonparametric.lowess



### Read the zipped file

In [2]:
#df = pd.read_csv('../../Data/offenders_2013_caplg_ucr_clean2.csv')
zf = zipfile.ZipFile('http://crimedata.io/data/offenders_lemas_acs_2013_clean2.csv.zip')
df = pd.read_csv(zf.open('offenders_2013_caplg_ucr_clean2.csv'))
# Exclude non-Black/White offenders
df = df[df['black_not_white'].notnull()]

  interactivity=interactivity, compiler=compiler, result=result)


#### Define the offenses beforehand

In [3]:
offenses = ['robbery', 'aggravated_assault', 'simple_assault', 'intimidation', 'weapon',
            'shoplifting', 'vandalism', 'drugs_narcotics', 'drug_equipment']

#### Set some constants for plotting

In [4]:
black_color = PuOr5[-1]
white_color = PuOr5[0]
TOOLS = "pan,wheel_zoom,box_zoom,reset,previewsave"

### See what the data look like

In [5]:
df.tail(2)

Unnamed: 0,offense_prostitution_purchase,com_ptnr,victim3_ucr_offense_code2,b_residents_percent,arrest_statutory_rape,total_residents,offense_auto_parts_theft,offense_operating_gambling,o_residents,arrest_operating_gambling,...,agency_offenders_count,agency_arrestees_count,county_offenders_count,county_arrestees_count,county_bdgt,agency_bdgt_per_offender,agency_bdgt_per_arrestee,county_offenders_per_resident,county_arrestees_per_resident,county_bdgt_per_resident
3983540,,,-8,0.986241,,8.213,,,0,,...,85,21,85,21,,,,0.010349,0.002557,
3983541,,,-8,0.986241,,8.213,,,0,,...,85,21,85,21,,,,0.010349,0.002557,


##### There are about 4 million rows, each representing a person reported to police as a criminal offender

In [6]:
df[5:10][['black_not_white', 'ori', 'arrested', 'offense_robbery', 'w_officers_percent']]

Unnamed: 0,black_not_white,ori,arrested,offense_robbery,w_officers_percent
5,0,AL0011200,1,,89.240506
6,0,AL0011200,1,,89.240506
7,0,AL0011200,0,,89.240506
8,0,AL0011200,0,,89.240506
9,1,AL0011200,1,,89.240506


#### How many offenders are black and white?

In [7]:
df['black_not_white'].value_counts()

0    2215013
1    1129855
Name: black_not_white, dtype: int64

In [8]:
df['black_not_white'].value_counts(normalize = True)

0    0.662212
1    0.337788
Name: black_not_white, dtype: float64

##### About 34% of offenders are black.  Keep in mind that all offenders are male.

### What percentage of offenders are black versus white?  How does this vary by offense?

In [9]:
percent_black = pd.DataFrame()
for offense in offenses:
    subset = df[df['offense_' + offense] == 1]
    arrested_subset = subset[subset['arrested'] == 1]
    offenders_black = subset['black_not_white'].mean()
    offenders_black_byagency = subset.groupby('ori')['black_not_white'].mean().reset_index().mean()[0]
    arrested_black = arrested_subset['black_not_white'].mean()
    arrested_black_byagency = arrested_subset.groupby('ori')['black_not_white'].mean().reset_index().mean()[0]
    row1 = {'offense': offense, 'level': 'Offender', 'offenders_black': offenders_black, 'arrested_black': arrested_black}
    row2 = {'offense': offense, 'level': 'Agency', 'offenders_black': offenders_black_byagency,
            'arrested_black': arrested_black_byagency}
    percent_black = percent_black.append(row1, ignore_index = True)
    percent_black = percent_black.append(row2, ignore_index = True)

percent_black['offense'] = percent_black['offense'].str.replace('_', ' ')
percent_black['offense'] = percent_black['offense'].str.title()
percent_black['offenders_black'] = (percent_black['offenders_black'] * 100).round(1)
percent_black['arrested_black'] = (percent_black['arrested_black'] * 100).round(1)

In [10]:
output_notebook()

In [11]:
p1 = Bar(percent_black, label=CatAttr(columns=['offense'], sort=False),
        values='offenders_black', group='level', legend = "top_right", tools=TOOLS,
        title="Percent Offenders Black by Race and Offense", xlabel="Offense", ylabel="Percent Offenders Black")
p1.y_range = Range1d(0, 85)
# Fix bar width issue
for r in p1.renderers:
    try:
        r.glyph.width = 0.3
    except AttributeError:
        pass

# Horizontal line showing % of US population that is black (13%)
hline = Span(location=13, dimension='width', line_color='gray', line_width=2, line_dash=[4,4])
p1.renderers.extend([hline])

source_offenders = ColumnDataSource(percent_black[percent_black['level'] == 'Offender'])
source_agencies = ColumnDataSource(percent_black[percent_black['level'] == 'Agency'])
offender_labels = LabelSet(x="offense", y="offenders_black", text="offenders_black", y_offset=8, x_offset=-11,
                  text_font_size="8pt", text_color="#555555", source=source_offenders, text_align='center')
agency_labels = LabelSet(x="offense", y="offenders_black", text="offenders_black", y_offset=8, x_offset=11,
                  text_font_size="8pt", text_color="#555555", source=source_agencies, text_align='center')
p1.add_layout(offender_labels)
p1.add_layout(agency_labels)


p2 = Bar(percent_black, label=CatAttr(columns=['offense'], sort=False),
        values='arrested_black', group='level', legend = "top_right", tools=TOOLS,
        title="Percent Arrested Offenders Black by Race and Offense",
         xlabel="Offense", ylabel="Percent Arrested Offenders Black")
p2.y_range = Range1d(0, 85)
# Fix bar width issue
for r in p2.renderers:
    try:
        r.glyph.width = 0.3
    except AttributeError:
        pass

# Horizontal line showing % of US population that is black (13%)
hline = Span(location=13, dimension='width', line_color='gray', line_width=2, line_dash=[4,4])
p2.renderers.extend([hline])

source_offenders_arrested = ColumnDataSource(percent_black[percent_black['level'] == 'Offender'])
source_agencies_arrested = ColumnDataSource(percent_black[percent_black['level'] == 'Agency'])
offender_arrested_labels = LabelSet(x="offense", y="arrested_black", text="arrested_black", y_offset=8, x_offset=-11,
                  text_font_size="8pt", text_color="#555555", source=source_offenders_arrested, text_align='center')
agency_arrested_labels = LabelSet(x="offense", y="arrested_black", text="arrested_black", y_offset=8, x_offset=11,
                  text_font_size="8pt", text_color="#555555", source=source_agencies_arrested, text_align='center')
p2.add_layout(offender_arrested_labels)
p2.add_layout(agency_arrested_labels)


tab1 = Panel(child=p1, title="All Offenders")
tab2 = Panel(child=p2, title="Arrested Offenders")
tabs_percent_black = Tabs(tabs=[ tab1, tab2 ])

output_file("output/percent_black.html")
show(tabs_percent_black)

### What percentage of black and white offenders are arrested?

##### Create dataset with % black and white male offenders arrested by offense type

In [12]:
arrest_rates = pd.DataFrame()
for offense in offenses:
    subset = df[df['offense_' + offense] == 1]
    
    arrested_byagency = subset.groupby(['ori', 'black_not_white'])['arrested'].mean().reset_index()
    arrested_byagency = arrested_byagency.groupby('black_not_white')['arrested'].mean().reset_index()
    arrested_byagency = arrested_byagency.rename(columns = {'arrested': 'arrested_byagency'})
    
    arrested = subset.groupby('black_not_white')['arrested'].mean().reset_index()
    arrested = arrested.merge(arrested_byagency, on = ['black_not_white'])
    arrested['offense'] = offense

    arrest_rates = arrest_rates.append(arrested, ignore_index = True)
    
arrest_rates.loc[arrest_rates['black_not_white'] == 1, 'black_not_white_text'] = 'Black'
arrest_rates.loc[arrest_rates['black_not_white'] == 0, 'black_not_white_text'] = 'White'

arrest_rates['offense'] = arrest_rates['offense'].str.replace('_', ' ')
arrest_rates['offense'] = arrest_rates['offense'].str.title()

arrest_rates['arrested'] = (arrest_rates['arrested'] * 100).round(1)
arrest_rates['arrested_byagency'] = (arrest_rates['arrested_byagency'] * 100).round(1)

In [13]:
output_notebook()

In [14]:
p1 = Bar(arrest_rates, label=CatAttr(columns=['offense'], sort=False),
         values='arrested', group='black_not_white_text', legend='top_left', tools=TOOLS,
         title="Percent Offenders Arrested by Race and Offense", xlabel="Offense", ylabel="Percent Arrested",
         color = color(columns='black_not_white', palette=[black_color, white_color]))
p1.y_range = Range1d(0, 95)
hover1 = p1.select(dict(type=HoverTool))
hover1.tooltips = """<div>Offense: @offense</div>
<div>Race: @black_not_white</div>
<div>Percent arrested: @y{1.1}</div>
"""
# Fix bar width issue
for r in p1.renderers:
    try:
        r.glyph.width = 0.3
    except AttributeError:
        pass

p2 = Bar(arrest_rates, label=CatAttr(columns=['offense'], sort=False),
         values='arrested_byagency', group='black_not_white_text', legend='top_left',
         title="Agencies' Percent Offenders Arrested by Race and Offense", xlabel="Offense", ylabel="Agencies' Percent Arrested",
         tools="hover,pan,wheel_zoom,reset", color = color(columns='black_not_white', palette=[black_color, white_color]))
p2.y_range = Range1d(0, 95)
hover2 = p2.select(dict(type=HoverTool))
hover2.tooltips = """<div>Offense: @offense</div>
<div>Race: @black_not_white</div>
<div>Percent arrested_byagency: @y{1.1}</div>
"""
# Fix bar width issue
for r in p2.renderers:
    try:
        r.glyph.width = 0.3
    except AttributeError:
        pass

source_arrests_black = ColumnDataSource(arrest_rates[arrest_rates['black_not_white_text'] == 'Black'])
source_arrests_white = ColumnDataSource(arrest_rates[arrest_rates['black_not_white_text'] == 'White'])

offender_labels_white = LabelSet(x="offense", y="arrested", text="arrested", y_offset=8, x_offset=-11,
                  text_font_size="8pt", text_color="#555555", source=source_arrests_white, text_align='center')
offender_labels_black = LabelSet(x="offense", y="arrested", text="arrested", y_offset=8, x_offset=11,
                  text_font_size="8pt", text_color="#555555", source=source_arrests_black, text_align='center')
p1.add_layout(offender_labels_white)
p1.add_layout(offender_labels_black)

agency_labels_white = LabelSet(x="offense", y="arrested_byagency", text="arrested_byagency", y_offset=8, x_offset=-11,
                  text_font_size="8pt", text_color="#555555", source=source_arrests_white, text_align='center')
agency_labels_black = LabelSet(x="offense", y="arrested_byagency", text="arrested_byagency", y_offset=8, x_offset=11,
                  text_font_size="8pt", text_color="#555555", source=source_arrests_black, text_align='center')
p2.add_layout(agency_labels_white)
p2.add_layout(agency_labels_black)

tab1_arrested = Panel(child=p1, title="Offender")
tab2_arrested = Panel(child=p2, title="Agency")
tabs_arrested = Tabs(tabs=[ tab1_arrested, tab2_arrested ])

output_file("output/percent_arrested.html")
show(tabs_arrested)

### What percentage of offenders are arrested within each agency?

In [15]:
output_notebook()

In [17]:
tabs_list = []
for offense in offenses:
    offense_title = offense.replace('_', ' ').title()
    subset = df[df['offense_' + offense] == 1]
    subset['arrested'] = subset['arrested'] * 100
    subset_black = subset[subset['black_not_white'] == 1]
    subset_white = subset[subset['black_not_white'] == 0]
    agg_data = subset.groupby(['ori', 'black_not_white'])[['arrested', 'w_officers_percent']].mean().reset_index()
    black_data = agg_data[agg_data['black_not_white'] == 1]
    white_data = agg_data[agg_data['black_not_white'] == 0]

    predictions_black = pd.DataFrame(lowess(black_data['arrested'], black_data['w_officers_percent']))
    predictions_white = pd.DataFrame(lowess(white_data['arrested'], white_data['w_officers_percent']))
    
    p = figure(tools=TOOLS, title="Percent of Offenders Arrested by Race and Percent Officers White")
    p.scatter(black_data['w_officers_percent'], black_data['arrested'], fill_color=black_color, line_color=None, legend="Black")
    p.scatter(white_data['w_officers_percent'], white_data['arrested'], fill_color=white_color, line_color=None, legend="White")
    p.line(x = predictions_black[0], y = predictions_black[1], color = black_color, line_width=2)
    p.line(x = predictions_white[0], y = predictions_white[1], color = white_color, line_width=2)
    p.xaxis.axis_label = "Percent Officers White"
    p.yaxis.axis_label = "Percent of Offenders Arrested"
    p.y_range = Range1d(-1, 101)
    p.x_range = Range1d(-1, 101)
    p.legend[0].location = "top_left"
    tab = Panel(child=p, title=offense_title)
    tabs_list.append(tab)

tabs_object = Tabs(tabs=tabs_list)
output_file("output/arrested_wofficers_byagency.html")
show(tabs_object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
