### Plot provider cost predictions and residuals

In [1]:
import pandas as pd
import requests, zipfile, io
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
from bokeh.charts import Bar, Scatter, output_notebook, show, output_file
from bokeh.charts.attributes import CatAttr, color
from bokeh.models import HoverTool, Range1d, Span, LabelSet, ColumnDataSource, Title, NumeralTickFormatter
from bokeh.models.widgets import Panel, Tabs
from bokeh.models.glyphs import Text
from bokeh.plotting import figure
from bokeh.palettes import PuOr5, RdYlBu10



### Read the zipped file

In [2]:
df = pd.read_csv('../../Data/offenders_2013_caplg_ucr_clean2.csv')
# r = requests.get('http://crimedata.io/data/offenders_2013_caplg_ucr_clean2.csv.zip')
# z = zipfile.ZipFile(io.BytesIO(r.content))
# df = pd.read_csv(z.open('offenders_2013_caplg_ucr_clean2.csv'))
# Exclude non-Black/White offenders
df = df[df['black_not_white'].notnull()]
# Fix an issue in the data (wofficers_divres was missing)
df['wofficers_divres'] = df['w_officers_percent'] / df['w_residents_percent']
# Only include men
df = df[df['sex'] == 'male']

  interactivity=interactivity, compiler=compiler, result=result)


#### Define the offenses beforehand

In [3]:
offenses = ['robbery', 'aggravated_assault', 'simple_assault', 'intimidation', 'weapon',
            'shoplifting', 'vandalism', 'drugs_narcotics', 'drug_equipment']

In [4]:
# Create city, state abbreviation column
states_dict = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
               'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
               'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
               'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
               'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
               'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
               'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
               'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA',
               'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
df['state'] = df['state_name'].map(states_dict)
df['city_state'] = df['city'] + ', ' + df['state']
df.loc[df['city_state'].isnull(), 'city_state'] = ''

### Why do agencies with fewer offenders arrest relatively fewer black offenders?  Are smaller agencies faced with similar proportions of black and white offenders originally?

In [5]:
continuous_variables = {'male_officers_percent': 'Percent of Officers Male', 'b_officers_percent': 'Percent of Officers Black',
                        'bofficers_divres': 'Percent Officers / Residents Black', 'bdgt_ttl': 'Total Agency Budget',
           'agency_bdgt_per_offender': 'Agency Budget per Offender', 'bdgt_per_ftoff': 'Agency Budget per Full-Time Officer',
                        'ftsworn': 'Full-Time Sworn Officers', 'mean_inc': 'County Mean Income',
                        'w_residents_percent': 'Percent of Residents White', 'total_residents': 'Total County Population'}

In [6]:
black_color = PuOr5[-1]
white_color = PuOr5[0]

for variable, variable_nice in continuous_variables.items():
    tabs_list = []
    for offense in offenses:
        offense_title = offense.replace('_', ' ').title()
        subset = df[df['offense_' + offense] == 1]
        subset['arrested'] = subset['arrested'] * 100
        subset_black = subset[subset['black_not_white'] == 1]
        subset_white = subset[subset['black_not_white'] == 0]
        agg_data = subset.groupby(['ori', 'black_not_white'])[['arrested', variable]].mean().reset_index()
        city_data = subset.groupby('ori')['city_state'].max().reset_index()
        agg_data = agg_data.merge(city_data, on = ['ori'], how = 'inner')
        
        black_data = agg_data[agg_data['black_not_white'] == 1]
        white_data = agg_data[agg_data['black_not_white'] == 0]

        predictions_black = pd.DataFrame(lowess(black_data['arrested'], black_data[variable]))
        predictions_white = pd.DataFrame(lowess(white_data['arrested'], white_data[variable]))

        p = figure(tools="previewsave", title="Percent of Offenders Arrested", height=600, width=600)
        p.scatter(black_data[variable], black_data['arrested'], fill_color=black_color, line_color=None, legend="Black")
        p.scatter(white_data[variable], white_data['arrested'], fill_color=white_color, line_color=None, legend="White")
        p.line(x = predictions_black[0], y = predictions_black[1], color = black_color, line_width=2)
        p.line(x = predictions_white[0], y = predictions_white[1], color = white_color, line_width=2)
        p.xaxis.axis_label = variable_nice
        p.yaxis.axis_label = "Percent of Offenders Arrested"
        p.y_range = Range1d(-1, 101)
#         p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
        p.legend[0].location = "top_right"
        
        tab = Panel(child=p, title=offense_title)
        tabs_list.append(tab)
        msg = """Note: Data are from 2013 NIBRS and LEMAS, and only male offenders are included."""
        caption = Title(text=msg, align='left', text_font_size='8pt')
        p.add_layout(caption, 'below')

    tabs_object = Tabs(tabs=tabs_list)
    output_file("output/bw_" + variable + ".html")
    show(tabs_object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
binary_variables = ['tech_typ_vpat', 'com_mis', 'min_hiring_educ_gths']