In [None]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
from geopandas import GeoDataFrame

from shapely.geometry import Point
import pandas as pd, numpy as np 
import shutil, os, ast, folium
import geopandas as gpd

# Initialise plotly's offline notebook mode once, before any iplot() call below,
# so that figures render inline in the notebook output.
init_notebook_mode(connected=True)

def _read_shape_gdf(_base_dir, selected_dept):
    """Load the district shapefile stored under a department directory.

    The file is read from ``<_base_dir>/shapefiles/department.shp`` with
    geopandas. ``selected_dept`` is accepted for call compatibility but is
    not used to build the path.
    """
    return gpd.read_file(_base_dir + "/shapefiles/department.shp")

def _get_latlong_point(point):
    _ll = str(point)
    _ll = _ll.replace("POINT (","").replace(")", "")
    _ll = list(reversed([float(_) for _ in _ll.split()]))
    return _ll

# Per-department settings, keyed by the department folder name.
#   '_rowid' : name of the column in that department's shapefile that holds the
#              district id; _get_dfs() renames it to `_identifier`.
#   'ct_num' : two-digit string code; not read anywhere in the visible code.
#              NOTE(review): looks like a state FIPS code - confirm.
depts_config = {
    'Dept_23-00089' : {'_rowid' : "DISTRICT", "ct_num" : "18"},  
    'Dept_49-00035' : {'_rowid' : "pol_dist", "ct_num" : "06"},  
    'Dept_24-00013' : {'_rowid' : "PRECINCT", "ct_num" : "27"},  
    'Dept_24-00098' : {'_rowid' : "gridnum",  "ct_num" : "27"},   
    'Dept_49-00033' : {'_rowid' : "number",   "ct_num" : "06"},    
    'Dept_11-00091' : {'_rowid' : "ID",       "ct_num" : "25"},         
    'Dept_49-00081' : {'_rowid' : "company",  "ct_num" : "06"},   
    'Dept_37-00049' : {'_rowid' : "Name",     "ct_num" : "48"},      
    'Dept_37-00027' : {'_rowid' : "CODE",     "ct_num" : "48"},     
    'Dept_49-00009' : {'_rowid' : "objectid", "ct_num" : "53"}, 
}

# Common district-id column name used to join the shape, enriched and police frames.
_identifier = "LOCATION_DISTRICT"

def _agebin(x):
    if str(x).lower() == "nan":
        return None
    
    ranges = [20, 24, 34, 44, 54, 59, 100]
    tags = ["<20", "20-24", "25-34", "35-44", "45-54", "55-59", "60+"]
    for i, rng in enumerate(ranges):
        if int(x) <= rng:
            return tags[i]

def _get_dfs(_dept):
    """Load every processed dataset of one department.

    Reads ``enriched_df.csv``, ``police_df.csv``, the district shapefile and
    ``events/events_df.csv`` from the department's directory under CPE_ROOT.

    Returns the tuple
    ``(enriched_df, police_df, shape_gdf, _base_dir, events_df)`` where
    ``police_df`` has the district ``geometry`` merged in and ``events_df``
    has an ``agebin`` column when it contains ``SUBJECT_AGE``.
    """
    _base_dir = "../input/3-example-runs-of-automation-pipeline/CPE_ROOT/" + _dept

    enriched_df = pd.read_csv(_base_dir + "/enriched_df.csv")
    police_df = pd.read_csv(_base_dir + "/police_df.csv")
    shape_gdf = _read_shape_gdf(_base_dir, _dept)

    # Every police_df column except the district id is parsed from its
    # Python-literal text form back into an object.
    literal_columns = [col for col in police_df.columns if col != _identifier]
    for col in literal_columns:
        police_df[col] = police_df[col].apply(ast.literal_eval)

    # Give the shapefile's district column the common name, then make the id
    # a string in all three frames so they can be joined on it.
    district_column = depts_config[_dept]['_rowid']
    shape_gdf = shape_gdf.rename(columns={district_column: _identifier})
    for frame in (shape_gdf, enriched_df, police_df):
        frame[_identifier] = frame[_identifier].astype(str)
    police_df = police_df.merge(shape_gdf[[_identifier, "geometry"]], on=_identifier)

    events_df = pd.read_csv(
        _base_dir + "/events/events_df.csv",
        low_memory=False,
        parse_dates=["INCIDENT_DATE"],
    )
    # The first data row is dropped.
    # NOTE(review): presumably a secondary header row in the raw file - confirm.
    events_df = events_df[1:]

    if "SUBJECT_AGE" in events_df.columns:
        events_df["agebin"] = events_df["SUBJECT_AGE"].apply(_agebin)

    return enriched_df, police_df, shape_gdf, _base_dir, events_df

# <font color="#703bdb">Part 4.2 Analysis Report - Los Angeles Department (49-00033)</font> <hr>

<a href="http://policingequity.org/">Center for Policing Equity</a> (CPE) is a research and action think tank that works collaboratively with law enforcement, communities, and political stakeholders to identify ways to strengthen relationships with the communities they serve. CPE is also the home of the nation's first and largest <a href="http://policingequity.org/national-justice-database/">database</a> tracking national statistics on police behavior. 

The main aim of CPE is to bridge the divide created by communication problems, suffering and generational mistrust, and forge a path towards public safety, community trust, and racial equity. This kernel series is my contribution to the <a href="https://www.kaggle.com/center-for-policing-equity/data-science-for-good">Data Science for Good: Center for Policing Equity</a>. The contribution is focused on providing a generic, robust, and automated approach to integrate, standardize the data and further diagnose disparities in policing, shed light on police behavior, and provide actionable recommendations. 

Following are parts of Kernels Submissions in order:  

<ul>
    <li><a href="https://www.kaggle.com/shivamb/1-solution-workflow-science-of-policing-equity/">Part 1: Solution Workflow - The Science of Policing Equity </a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/2-automation-pipeline-integration-processing">Part 2: Data Integration and Processing : Automation Pipeline</a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/3-example-runs-of-automation-pipeline">Part 3: Example Runs of Automation Pipeline </a>  </li> 
    <li><a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013">Part 4.1: Analysis Report - Measuring Equity - Minneapolis Police Department </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033">Part 4.2: Analysis Report - Los Angeles Police Department (49-00033) </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis">Part 4.3: Analysis Report - Indianapolis Officer Level Analysis (23-00089) </a>   </li></ul>

The complete overview of the solution is shared in the *first kernel*. It explains the process and flow of automation, standardization, processing, and analysis of data. In the *second kernel*, the first component of the solution pipeline : data integration and processing is implemented. It processes both core level data as well as department level data. In the *third kernel*, this pipeline is executed and run for several departments. After all the standardized and clean data is produced, it is analysed with different formats of the Analysis Framework in 4.1, 4.2 and 4.3 kernels. In *kernel 4.1*, core analysis is done along with link with crime rate and poverty data. In *kernel 4.2*, core analysis is done along with statistical analysis. In *kernel 4.3*, officer level analysis is done. 

<hr>

This kernel is the fourth of the series. It shares the analysis report for the Los Angeles Police Department. Other analysis reports: <br>

<a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013"> 4.1  Analysis Report :  24-00013  + <b>(What can account for Racial Disparities ?)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033"> 4.2 Analysis Report : 49-00033 + <b>(Statistical Analysis : Regression + Correlations)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis"> 4.3 Analysis Report : 23-00089 + <b>(Officer Level Analysis)</b> </a>  

<br>

<b>Contents of this kernel:</b> 

<ul>
    <li><a href="#1">1. Key Highlights </a>  </li>
    <li><a href="#2">2. Deep Exploration of Police Activity </a>  </li>
    <ul>
        <li><a href="#2.1">2.1 Overview : Police Arrests </a>  </li>
        <li><a href="#2.2">2.2 Are there Racial Disparities ? </a>  </li>
        <li><a href="#2.3">2.3 Does disparity exist in different reasons for police arrests ? </a>  </li>
        <li><a href="#2.4">2.4 Not all age groups are targeted similarly ! </a>  </li>
        <li><a href="#2.5">2.5 Does Vehicle Stops tells a different story ? </a>  </li>
        <li><a href="#2.6">2.6 Vehicle Stops Proportion by Race in Different Districts </a>  </li>
        <li><a href="#2.7">2.7 Vehicle Stops - Blacks Vs Whites as a proportion of Population ? </a>  </li>
        <li><a href="#2.8">2.8 A look at other socio-economic factors that can depict Racial Bias </a></li></ul>
    <li><a href="#3">3. Statistical Analysis </a>  </li>
    <ul>
        <li><a href="#3.1">3.1 Measuring Correlations</a>  </li>
        <li><a href="#3.2">3.2 Regression Analysis </a>  </li></ul>
</ul>

<a id="1"></a>
## <font color="#703bdb">1. Key Highlights <hr></font>

All these insights come from the analysis framework; the detailed plots and explanations are provided in section 2. 

> - The maximum number of police arrests was observed in districts 1, 6, and 14, where approximately 13K, 12K, and 11K arrests were made. The fewest arrests occurred in district 8, with only 2300 arrests.  
> - The data suggests that about half of the total police arrests were of the Hispanic population. Blacks were the next most arrested race, followed by Whites (on average 28% for Blacks and 18% for Whites).  
> - About half of the police arrests were made on Hispanics, but it is interesting to note that their population proportion is not very far away; it is about 44%. About 30% of the police incidents are targeted at Blacks (population : 8%) while 20% of incidents are targeted at Whites (population : 47%).  
> - Though the aggregated Black population is only 9% across all the districts, on average about 30% of the arrests by police are of Blacks.  
> - There are a couple of districts in which the Hispanic population is stopped more often than any other population. For example, in districts 2, 4, 10, 11, 13, and 20, a much higher share of the Hispanic population is the target of vehicle stops.  



<a id="2"></a>
## <font color="#703bdb">2. Deep Exploration of Police Activity : Police Arrests <hr></font>

All the analysis is first done at high level and then by controlling the socio-econometric or demographic factors of the area which the department serves. The main idea of this analysis is to measure following points : 

- Are there racial disparities in police activities ?   
- To what extent the racial disparities exist in the department ?  
- What are the key factors that can explain racial disparities ?  

As the first step, we set the department and load the processed datasets that were produced from the data processing and analysis pipeline. Additionally, for the analysis of this department, we also used external dataset about vehicle stops, so we will also load that. For every department, we have stored the processed data which was the result of component A of the pipeline. Let's look at the glimpse of the two main datasets that will be used to analyse the racial bias in this department.

- Enriched DF : The dataframe containing estimated demographics information  
- Police DF : The dataframe containing the police incidents aggregated by demographics variables  

In [None]:
# Department analysed in this notebook and its processed datasets.
_dept = "Dept_49-00033"
enriched_df, police_df, shape_gdf, _base_dir, events_df = _get_dfs(_dept)

# External dataset: LA vehicle and pedestrian stops. The stop date is parsed
# so that year and month columns can be derived from it.
vstops_df = pd.read_csv(
    "../input/external-datasets-cpe/la_stops/vehicle-and-pedestrian-stop-data-2010-to-present.csv",
    low_memory=False,
)
vstops_df["Stop Date"] = pd.to_datetime(vstops_df["Stop Date"])
vstops_df["Year"] = vstops_df["Stop Date"].dt.year
vstops_df["month"] = vstops_df["Stop Date"].dt.month

# Keep only the vehicle stops ("VEH") made in 2015.
_in_2015 = vstops_df['Year'] == 2015
_is_vehicle_stop = vstops_df['Stop Type'] == "VEH"
vstops = vstops_df[_in_2015 & _is_vehicle_stop]

enriched_df.head()

In [None]:
# Show the first rows of the per-district police dataframe.
police_df.head()

<a id="2.1"></a>
## <font color="#703bdb">2.1 Overview : Police Arrests <hr></font>

First, let's look at an overall high level summary of the police activity : Arrests in the different districts covered by this department. The following plot shows the aggregated counts of police activity according to the districts of the department. In the following graph, following representations are used: 

- Department Districts :  <font color="blue">blue polygons</font>  
- Aggregated Total incidents : <font color="green">green circles</font>  
- 2016 Use-of-Force incidents : <font color="red">red points</font>

In [None]:
"""
About the Code in this Cell 

This code produces the folium map as the base map, on which we plot three items : district polygons, aggregated number of total incidents by district, recent incidents
"""

# Per-district totals: sum the values of the per-sex count dictionaries.
for _source_col, _total_col in (("arrest_sex", "total_incidents"), ("vstops_sex", "total_vstops")):
    police_df[_total_col] = police_df[_source_col].apply(lambda counts: sum(counts.values()))

# Base map centred on the centroid of the first district, with all district
# polygons drawn in blue.
center_pt = police_df.geometry[0].centroid
center_ll = _get_latlong_point(center_pt)
mapa = folium.Map(center_ll, zoom_start=10, tiles='CartoDB dark_matter')
folium.GeoJson(
    shape_gdf,
    style_function=lambda feature: dict(fillColor="blue", color="blue", weight=1, fillOpacity=0.2),
).add_to(mapa)

# Individual incidents for two selected incident reasons, as small red points.
_point_reasons = ["Disturbing the Peace", "Non-Criminal Detention"]
_point_events = events_df[events_df["INCIDENT_REASON"].isin(_point_reasons)]
for i, row in _point_events.iterrows():
    _event_ll = [float(row["LOCATION_LATITUDE"]), float(row["LOCATION_LONGITUDE"])]
    folium.CircleMarker(_event_ll, radius=1, color='red').add_to(mapa)

# One green circle per district, scaled by its total incident count.
for i, row in police_df.iterrows():
    dist_ll = _get_latlong_point(row["geometry"].centroid)
    _popup_html = "<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>UseOfForce:</b> "+str(row["total_incidents"])
    folium.CircleMarker(
        dist_ll,
        popup=_popup_html,
        radius=float(row["total_incidents"])*0.003,
        color='green',
        fill=True,
    ).add_to(mapa)

mapa

**Graph Interpretation** 

<i>The size of the green circles represents the count of police activity. The larger the circle, the higher the number of police activity incidents in that district, and vice versa. One can click on the green circles to view the aggregated count of incidents by district.  </i>

**Inferences**
> - Different districts have witnessed **different numbers of police arrest incidents**. The possible cause of this distribution may be one or more of the following factors : **high crime rates, police characteristics, and community and police relationships**.   
> - **The maximum number of police arrests was observed in districts 1, 6, and 14**, where approximately 13K, 12K, and 11K arrests were made. The fewest arrests occurred in district 8, with only 2300 arrests. 

<a id="2.2"></a>
## <font color="#703bdb">2.2 Are there Racial Disparities ? <hr></font>

Now we try to answer the most important question that we are looking for : Are there Racial Disparities in the police activities ? We will go deeper into these incidents and identify citizens of which race are targeted the most. Note that to establish credible results, a controlled analysis is required, hence this analysis is broken into two parts : 

A - First we will visualize the **breakdown of police activity by Race**  
B - Next we also analyze the **actual population proportion of each race in every district** to get a clear picture.  

We now plot the police activity in different quarters: 


In [None]:
def month_to_quarter(x):
    """Return the quarter label ("Q1".."Q4") for a month number.

    Months up to 3 map to "Q1", up to 6 to "Q2", up to 9 to "Q3"; any other
    value (including one that fails every comparison) maps to "Q4".
    """
    for last_month, label in ((3, "Q1"), (6, "Q2"), (9, "Q3")):
        if x <= last_month:
            return label
    return "Q4"

# Derive the month and the quarter of every incident.
events_df["INCIDENT_MONTH"] = events_df["INCIDENT_DATE"].dt.month
events_df["INCIDENT_QUARTER"] = events_df["INCIDENT_MONTH"].apply(month_to_quarter)

# For every quarter, compute the percentage share of each SUBJECT_RACE value
# among that quarter's incidents (rounded to 2 decimals).
reasons = ["Q1", "Q2", "Q3", "Q4"]
year_dic = {}
for reason in reasons:
    yeardf = events_df[events_df["INCIDENT_QUARTER"] == reason]
    t1 = dict(yeardf["SUBJECT_RACE"].value_counts())
    # Total is computed once per quarter instead of once per race.
    quarter_total = sum(t1.values())
    t2 = {race: round(100*float(cnt) / quarter_total, 2) for race, cnt in t1.items()}
    year_dic[reason] = t2

## Generate Bar Plot
data = []
races = ["Black", "White", "Hispanic", "Other"]
colors = ["Black", "White", "orange", "Green"]
for i, rac in enumerate(races):
    # .get(rac, 0): a race with no incidents in a quarter is drawn as 0%
    # instead of raising KeyError.
    trace1 = go.Bar(x=list(year_dic.keys()), y=[year_dic[_].get(rac, 0) for _ in year_dic], name=rac, 
                    marker=dict(color=colors[i], opacity=1.0, line=dict(color='black',width=1)))
    data.append(trace1)     
layout = go.Layout(title='Police Arrests by Race : 2015', height=350, 
                   yaxis=dict(range=(0,100)), legend=dict(orientation="h", x=0.1, y=1.2))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='style-bar')

**Graph Interpretation** 

*Every bar represents the percentage of police arrests by a particular race in different Quarters of 2015* 

**Inferences**  
> - The plot shows clear **disproportionate numbers about police activity** among different races for different reasons of arrests.  
> - This data suggests that about **half of the total police arrests were of the Hispanic population**. Blacks were the next most arrested race, followed by Whites (on average 28% for Blacks and 18% for Whites).   

B - Currently, this plot shows only the absolute shares; we first need to control these numbers according to the population proportion. So, next we plot the comparative charts of police activity and population proportion. 

In [None]:
# Per-race arrest counts and arrest percentages on police_df, and per-race
# population percentages on enriched_df.
races = ["Hispanic", "Black", "White"]
colors = ["orange", "black", "white"]
for c in races:
    _prefix = c.lower()
    police_df[_prefix + "_cnt"] = police_df["arrest_race"].apply(lambda x : x[c])
    police_df[_prefix + "_per"] = police_df["arrest_race"].apply(lambda x : 100*float(x[c]) / sum(x.values()))
    enriched_df[_prefix + "_per"] = 100*enriched_df[_prefix + "_pop"] / enriched_df["total_pop"]

# Department-wide population totals of the three races ...
wp, bp, hp = (sum(enriched_df[_col]) for _col in ("white_pop", "black_pop", "hispanic_pop"))
tp = wp + hp + bp

# ... and the matching department-wide arrest totals.
wp1, bp1, hp1 = (sum(police_df[_col]) for _col in ("white_cnt", "black_cnt", "hispanic_cnt"))
tp1 = wp1 + hp1 + bp1

# One stacked horizontal bar per race: (legend name, arrests, population, colour, opacity).
_race_bars = [
    ("Whites", wp1, wp, "white", 1),
    ("Blacks", bp1, bp, "black", 1.0),
    ("Hispanic", hp1, hp, "orange", 0.5),
]
data = [
    go.Bar(x=[100*float(_arrests)/tp1, 100*float(_population)/tp], y=["Arrests", "Population"], name=_name,
           marker=dict(color=_colour, opacity=_alpha, line=dict(color='black',width=1)), orientation='h')
    for _name, _arrests, _population, _colour, _alpha in _race_bars
]

layout = go.Layout(barmode='stack',height=400, title='Police Arrests and Population Proportion : By Race', 
                   legend = dict(orientation="h", x=0.1, y=1.15) , showlegend=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


# Next part: the same comparison broken down by district, side by side.
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=["Population Breakdown", "Arrests BreakDown"])
for i, rac in enumerate(races):
    _per_col = rac.lower() + "_per"
    _marker = dict(color=colors[i], opacity=1.0, line=dict(color='black',width=1))

    trace1 = go.Bar(y = "D"+enriched_df[_identifier], x = enriched_df[_per_col], orientation = "h", name=rac, 
                    marker=_marker)
    fig.append_trace(trace1, 1, 1)

    trace2 = go.Bar(y = "D"+police_df[_identifier], x = police_df[_per_col], orientation = "h", name=rac, 
                    marker=dict(_marker))
    fig.append_trace(trace2, 1, 2)

fig["layout"].update(barmode='stack', showlegend = False, 
                     hovermode='closest', title="Race Distribution in different Districts - Population and Use-of-Force", 
                     height=900)
iplot(fig)

**Graph Interpretations**

*The first plot shows the proportion of arrests vs population of different races : whites, blacks and hispanics. The next two plots represents the same information but sliced by different police department's districts.* 

**Inferences**

> - These plots indicate a very strong level of racial disparity among the police activity.  
> - About half of the police arrests were made on Hispanics, but it is interesting to note that their population proportion is not very far away; it is about 44%. **About 30% of the police incidents are targeted at Blacks (population : 8%) while 20% of incidents are targeted at Whites (population : 47%)**  
> - Among the 21 districts served by this police department, the **maximum disparity exists in Districts 12 and 18**, with the difference between the Black population proportion and the police arrests on them close to 60%. Similarly, in Districts 1 and 3, arrests of Blacks are much higher than their population proportion.  
> - The **least disparity among Blacks is observed in district 4**; however, there exists maximum disparity in terms of the Hispanic population in this district.  

<a id="2.3"></a>
## <font color="#703bdb">2.3 Does disparity exist in different reasons for police arrests  ?  <hr></font>

Another way to slice the data is to observe the different reasons of police arrests 

In [None]:
# Department-wide population share (in %) of each race.
wp = 100 * sum(enriched_df["white_pop"]) / sum(enriched_df["total_pop"])
bp = 100 * sum(enriched_df["black_pop"]) /  sum(enriched_df["total_pop"])
hp = 100 * sum(enriched_df["hispanic_pop"]) /  sum(enriched_df["total_pop"])
# "Other" is what remains after whites and blacks only; the hispanic share is
# not subtracted, so the four bubbles do not add up to 100.
op = 100 - wp - bp
xx = ["Blacks (" + str(int(bp)) + "%)", "Whites (" + str(int(wp)) + "%)", "Other (" + str(int(op)) + "%)" , "Hispanic (" + str(int(hp)) + "%)"]
        
data = [go.Scatter(x = xx, y=["","","",""], mode='markers', name="",  
                   marker=dict(color=["black","white","green" ,"orange"], opacity=1.0, size= [bp, wp, op, hp]))]
layout = go.Layout(barmode='stack', height=300, margin=dict(l=100), title='Population Distribution by Race', 
                   legend = dict(orientation="h", x=0.1, y=1.15),plot_bgcolor='#d7e9f7', paper_bgcolor='#d7e9f7', showlegend=False)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Police Behaviour Graph by different categories:
# one bubble trace per incident reason (20 most frequent reasons, "0" skipped),
# sized by the share of Blacks / Whites / Hispanics among those three races.
race_keys = ["Black", "White", "Hispanic"]
traces = []
titles = []
for vi in events_df["INCIDENT_REASON"].value_counts().index[:20]:
    if vi == "0":
        continue
    tempdf = events_df[events_df['INCIDENT_REASON'] == vi]
    race_counts = tempdf["SUBJECT_RACE"].value_counts()
    # Look the three races up by name with a default of 0. Taking the top-3
    # rows and indexing them by name raises KeyError whenever another race
    # value is among the three most frequent for a reason.
    counts = [float(race_counts.get(race, 0)) for race in race_keys]
    denom = sum(counts)
    # Guard against a reason with none of the three races recorded.
    ss = [100*cnt / denom if denom else 0.0 for cnt in counts]

    yy = ["", "", ""]
    xx = ["Blacks (" + str(int(ss[0])) + "%)", "Whites(" + str(int(ss[1])) + "%)", "Hispanic (" + str(int(ss[2])) + "%)"]
    trace0 = go.Scatter(x=xx, y=yy, mode='markers', name="",  marker=dict(color=["black", "white", "orange"], opacity=1.0, size=ss))
    traces.append(trace0)
    titles.append(vi)

# Lay the first 15 traces out on a 5 x 3 grid, filling row by row.
fig = tools.make_subplots(rows=5, cols=3, print_grid=False, subplot_titles = titles[:15])

r, c = 1, 1
for trace in traces[:15]:
    fig.append_trace(trace, r, c)
    c += 1 
    if c == 4:
        r += 1
        c = 1

fig["layout"].update(showlegend = False, height = 1000, plot_bgcolor='#d7e9f7', paper_bgcolor='#d7e9f7',
                     title = "Different Police Arrests Reasons",
                     xaxis=dict(showgrid=False))
iplot(fig, filename='bubblechart-color')  
# TODO: add hover text
# Add hover text

**Graph Interpretations**

*Every subplot represents a reason for police arrest and its distribution by race. The size of the bubbles represents the percentage of the respective race which was targeted.* 

[NOTE] - The 4 bubbles in first chart do not sum to 100, because there are mainly two groups first : hispanics and non-hispanics, And then non-hispanics are divided as whites, blacks, others. 

**Inferences**

> - The above plot shows an interesting angle: compared with the proportion of the Black population, the share of arrests of Blacks is always higher than their population proportion, whatever the arrest reason. Though the aggregated **Black population is only 9% across all the districts, on average about 30% of the arrests by police are of Blacks**. 
> - Hispanics have the highest arrests and half of the population is hispanic.

<a id="2.4"></a>
## <font color="#703bdb">2.4 Not all Age Groups are targeted similarly !  <hr></font>

Let's look at how these incidents are distributed by different age-groups and by race. 

In [None]:
def cbp(df, col1, col2, aggcol, func, title, cs, bottom_margin=None):
    """Draw a bubble plot of an aggregate over two categorical columns.

    ``df`` is grouped by ``col1`` and ``col2`` and ``aggcol`` is aggregated
    with ``func``. Groups are ranked by the aggregate (largest first) and cut
    into 21 equal-length runs that get marker sizes 30 down to 10; rows left
    over after the last run get size 9. The marker size also drives the
    colour, through colorscale ``cs``. The figure is shown with ``iplot``.

    A truthy ``bottom_margin`` adds a bottom margin of 150 to the layout.
    """
    grouped = df.groupby([col1, col2]).agg({aggcol : func}).reset_index()
    grouped[aggcol] = grouped[aggcol].apply(lambda value : int(value))
    grouped = grouped.sort_values(aggcol, ascending=False)

    n_rows = len(grouped)
    run_length = int(n_rows / 21)      # 21 = number of sizes in 30..10
    marker_sizes = [9] * n_rows
    for rank, size in enumerate(range(30, 9, -1)):
        start = rank * run_length
        marker_sizes[start:start + run_length] = [size] * run_length
    grouped['size_n'] = marker_sizes

    bubbles = go.Scatter(
        x=grouped[col1],
        y=grouped[col2],
        mode='markers',
        text=grouped[aggcol],
        marker=dict(size=grouped.size_n, color=list(grouped['size_n']), colorscale=cs),
    )

    layout_options = dict(title=title)
    if bottom_margin:
        layout_options["margin"] = dict(b=150)
    fig = go.Figure(data=[bubbles], layout=go.Layout(**layout_options))
    iplot(fig)
    
    
### Incident counts per age bin, without the open-ended "<20" and "60+" bins

_age_counts = events_df.groupby("agebin").agg({'SUBJECT_GENDER' : "count"}).reset_index()
_age_counts = _age_counts.rename(columns={"SUBJECT_GENDER" : "count"})
tmp = _age_counts[~_age_counts["agebin"].isin(["60+", "<20"])]
_incident_total = sum(tmp["count"])
tmp["percentage"] = tmp["count"].apply(lambda x : 100*float(x) / _incident_total)

# Population per age bin, summed over all districts from the "<lo>_<hi>_pop" columns.
keys = ["20_24_pop", "25_34_pop", "35_44_pop", "45_54_pop", "55_59_pop"]
tmp2 = pd.DataFrame()
tmp2["agebin"] = [x.replace("_","-").replace("-pop", "") for x in keys]
tmp2["cnt"] = [sum(enriched_df[k]) for k in keys]
_population_total = sum(tmp2["cnt"])
tmp2["percentage"] = [100*y/_population_total for y in tmp2["cnt"]]

# Left panel: population share per age bin; right panel: incident share per age bin.
trace1 = go.Bar(x = tmp2["agebin"], y = tmp2["percentage"], marker=dict(color="purple", opacity=0.6))
trace2 = go.Bar(x = tmp["agebin"], y = tmp["percentage"], marker=dict(color="orange", opacity=0.6))

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=["Population % Breakdown by Age", "Use-of-force % BreakDown by Age"])
for _panel, _trace in enumerate((trace1, trace2), start=1):
    fig.append_trace(_trace, 1, _panel)
fig["layout"].update(barmode='group', showlegend = False, 
                     hovermode='closest', title="", 
                     height=450)
iplot(fig)

### Bubble plot of incident counts by race and age bin (rows with race "-" excluded)

_known_race = ~events_df["SUBJECT_RACE"].isin(["-"])
cbp(events_df[_known_race], 'SUBJECT_RACE', 'agebin', 'SUBJECT_GENDER', 'count', "Use of Force breakdown by Age and Race", 'Jet')

**Graph Interpretation** 

*Size of the bubble represents the respective count of a race and an age group*

**Inferences** 

> - By observing population % breakdown and use-of-force % breakdown, it is noted that **police activity aligns with the population percentage** of respective group. However, when we look at the breakdown of police activity by different age groups, a form of racial bias appears. 
 > - **Hispanic population** which belongs to **25-34 age group** is mostly arrested by police, followed by Blacks in the same age group, and by 20-24 among the age groups. 
 > - **Agegroup 25-34 are more likely to be arrested by police** than any other agegroup irrespective of their race. 
 
 <a id="2.5"></a>
## <font color="#703bdb">2.5 Does Vehicle Stops tells a different story ?   <hr></font>

Till now, we only looked at arrest data, Let's now use Vehicle Stops data for the analysis purposes that may help to provide a more clearer picture. 

First, let's view an overview of vehicle stops by police. In the following map, the base districts are represented as blue polygons, the orange points represent the recent vehicle stops, and the white circles represent the aggregated number of vehicle stops by district. 

In [None]:
# Second base map, reusing the centre computed for the arrests map, with the
# district polygons drawn in blue.
mapb = folium.Map(center_ll, zoom_start=10, tiles='CartoDB dark_matter')
folium.GeoJson(
    shape_gdf,
    style_function=lambda feature: dict(fillColor="blue", color="blue", weight=1, fillOpacity=0.5),
).add_to(mapb)

# Individual "Moving Traffic Violations" incidents from months after July,
# drawn as small orange points.
_is_traffic = events_df["INCIDENT_REASON"].isin(["Moving Traffic Violations"])
_after_july = events_df["INCIDENT_MONTH"] > 7
small_df = events_df[_is_traffic & _after_july]
for i, row in small_df.iterrows():
    _stop_ll = [float(row["LOCATION_LATITUDE"]), float(row["LOCATION_LONGITUDE"])]
    folium.CircleMarker(_stop_ll, radius=1, color='orange').add_to(mapb)

# One white circle per district, scaled by its total number of vehicle stops.
for i, row in police_df.iterrows():
    dist_ll = _get_latlong_point(row["geometry"].centroid)
    _popup_html = "<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>Vehicle Stops:</b> "+str(row["total_vstops"])
    folium.CircleMarker(
        dist_ll,
        popup=_popup_html,
        radius=float(row["total_vstops"])*0.001,
        color='white',
        fill=True,
    ).add_to(mapb)
mapb

<a id="2.6"></a>
## <font color="#703bdb">2.6. Vehicle Stops Proportion by Race in Different Districts   <hr></font>

It will be interesting to see how vehicle stops are distributed by Race (especially Blacks and Whites) across different districts of this department. 

In [None]:
# Per-race vehicle-stop counts and percentages. The "vstops_race" dictionaries
# are keyed by the first letter of the race name ("B", "W", "H"); percentages
# are relative to the sum of those three keys.
races = ["Blacks", "Whites", "Hispanic"]
for c in races:
    police_df[c.lower() + "_vcnt"] = police_df["vstops_race"].apply(lambda x : x[c[0]])
    police_df[c.lower() + "_vper"] = police_df["vstops_race"].apply(lambda x : 100*float(x[c[0]]) / (x["B"] + x["W"] + x["H"]))

plt.figure(figsize=(16,16))

# One pie per district id 1..21 on a 5 x 5 grid.
for i in range(1, 22):
    tmp = police_df[police_df[_identifier] == str(i)]
    if tmp.empty:
        # District id not present in police_df: leave its grid slot blank
        # instead of failing with IndexError.
        continue
    plt.subplot(5, 5, i)
    # .iloc[0] picks the district's (first) row; the previous .iloc(0)[0]
    # went through the callable form of the indexer.
    docs = [tmp["blacks_vper"].iloc[0], tmp["whites_vper"].iloc[0], tmp["hispanic_vper"].iloc[0]]
    plt.pie(docs, labels=["Blacks", "Whites", "Hispanic"],
            colors=["black", "white", "green"],wedgeprops={"edgecolor":"orange",'linewidth': 1})
    plt.title("Dist: " + str(i))
    
    
# NOTE(review): these rcParams are set after the axes above were created, so
# they presumably only affect figures created or saved later - confirm intent.
plt.rcParams['axes.facecolor']='red'
plt.rcParams['savefig.facecolor']='red'

**Inferences**

> - The pie charts indicate that **a higher percentage of the Black population is subject to Vehicle Stops** in districts 1, 3, 12, and 18. So there exists some level of racial disparity, because the Black population percentage is very low in these districts (17%, 32%, 40%, and 27% Black population only).  
> - And there are a couple of districts in which **Hispanic population is stopped more often** than any other population. For example, in district 2, 4, 10, 11, 13, and 20, A much higher hispanic population is the target of vehicle stops.  

<a id="2.7"></a>
 ## <font color="#703bdb">2.7. Vehicle Stops - Blacks Vs Whites as a proportion of Population    <hr></font>

Let's compare the vehicle-stop percentages of Blacks and Whites with their population proportions at the overall department level. 

In [None]:
# Join the per-district population percentages with the vehicle-stop percentages.
t1 = enriched_df[[_identifier, 'black_per', 'white_per']].merge(
    police_df[[_identifier, 'blacks_vper', 'whites_vper']], on=_identifier)

# One radar chart per race:
# (population column, stops column, population trace name, stops trace name,
#  fill mode of the stops trace, chart title)
_radar_specs = [
    ('black_per', 'blacks_vper', 'Blacks Population Proportion', 'Blacks Vehicle Stops Proportion',
     'tonext', "Vehicle Stops and Population Proportion of Blacks"),
    ('white_per', 'whites_vper', 'Whites Population Proportion', 'Whites Vechile Stops Proportion',
     'toself', "Vehicle Stops and Population Proportion of Whites"),
]

for _pop_col, _stops_col, _pop_name, _stops_name, _stops_fill, _title in _radar_specs:
    data_radar = [
        # Population share per district ("D<id>" labels around the circle).
        go.Scatterpolar(
            r=list(t1[_pop_col].values),
            theta=["D"+ str(_) for _ in t1[_identifier].values],
            fill='toself',
            name=_pop_name,
        ),
        # Vehicle-stop share per district, drawn over the population trace.
        go.Scatterpolar(
            r=list(t1[_stops_col].values),
            theta=["D"+ str(_) for _ in t1[_identifier].values],
            fill=_stops_fill,
            name=_stops_name,
        ),
    ]

    layout = go.Layout(
        margin=dict(l=120),
        width=800,
        title=_title,
        legend=dict(orientation="h"),
        polar=dict(radialaxis=dict(visible=False)),
    )

    fig = go.Figure(data=data_radar, layout=layout)
    iplot(fig)

**Graph Interpretation**  

*In these radial charts, One color (BLUE) represents the amount population proportion in 5 districts, other (ORANGE) represents the number of vehicle stops by districts. Basically, these two information - population and vehicle stops are overlaped over one another in order to measure the extent of biasness.* 

**Inferences**

> We can observe that in all the 21 districts served by this department, the population **proportion of blacks is much lower (blue area)** while their **vehicle stops proportion is higher (orange area)**. However, this is not the case with the white population. A reverse scenario can be observed: **the population proportion (represented as blue) of whites** is much higher and **the vehicle stops proportion (represented as orange) is much lower**. The gap between the white population proportion and their vehicle stops proportion is also quite large. 

These insights suggest a level of racial disparity in these districts.  

 <a id="2.8"></a>
## <font color="#703bdb">2.8 A look at other socio-economic factors that can depict Racial Bias     <hr></font>

There are many other fields that are processed by the data processing and integration pipeline : 

- Median Income  - By Race  
- Employment to Population Ratio - By Race  

In [None]:
# Factor that turns a raw vehicle-stop count into a marker size for the bubbles.
_BUBBLE_SCALE = 0.003

# Join the per-district police counts with the race-specific median income and
# employment-to-population columns once; both bubble charts read from this frame.
tempdf = police_df[[_identifier, "total_vstops", "total_incidents"]].merge(
    enriched_df[[_identifier, "blacks_income", "whites_income",
                 "blacks_ep_ratio", "whites_ep_ratio"]],
    on=_identifier,
)
bubble_size = tempdf["total_vstops"] * _BUBBLE_SCALE

# Chart 1: median income of blacks (x) vs whites (y), bubble size = vehicle stops.
trace0 = go.Scatter(
    x=tempdf["blacks_income"],
    y=tempdf["whites_income"],
    mode='markers',
    marker=dict(size=bubble_size, color="green", opacity=0.7),
)
data = [trace0]
layout = go.Layout(
    title="Vehicle Stops by Median Income",
    xaxis=dict(title="Median Income of Blacks"),
    yaxis=dict(title="Median Income of Whites"),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='bubblechart-color')

# Chart 2: employment-to-population ratio of blacks (x) vs whites (y),
# bubble size = vehicle stops.
trace0 = go.Scatter(
    x=tempdf["blacks_ep_ratio"],
    y=tempdf["whites_ep_ratio"],
    mode='markers',
    marker=dict(opacity=1, size=bubble_size, color="pink"),
)
data = [trace0]
layout = go.Layout(
    title="Vehicle Stops by Employment to Population Ratio",
    xaxis=dict(title="Blacks : Employment to Population Ratio"),
    yaxis=dict(title="Whites: Employment to Population Ratio"),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='bubblechart-color')

**Inferences**

> - In these plots, the bubble with largest size means that maximum number of vehicle stops were made in that area. 
> - It is interesting to note that, in a particular area where vehicle stops were maximum, the **median income of whites is almost four times that of blacks**, i.e. the median income ratio of whites to blacks is 60,000 USD : 16,000 USD. This may indicate that police make more vehicle stops in the area where the black population earns less.    
> - No strong insight can be established with the employment to population ratio.  

<a id="3"></a>
## <font color="#703bdb">3. Statistical Analysis <hr></font>

In the following section, we explore the statistical associations between the factors and police behaviours. First we explore the correlations and then we model the police activity using regression analysis. 

<a id="3.1"></a>
## <font color="#703bdb">3.1 Correlations <hr></font>

 In this section, we explore the statistical inferences to measure the possible control variable's effect on the racial bias as target. 

In [None]:
import plotly.figure_factory as ff

# Socio-economic columns taken from the enriched (census) frame.
enriched_cols = ['LOCATION_DISTRICT', 'whites_income',
       'blacks_income', 'hispanic_income', 'below_pov_pop',
       'whites_ep_ratio', 'blacks_ep_ratio', 'whites_unemp_ratio',
       'blacks_unemp_ratio', 'hispanic_per', 'black_per', 'white_per']

# Police-activity columns taken from the police frame.
police_cols = ['LOCATION_DISTRICT', 'total_incidents' , 'total_vstops']

merged_df = police_df[police_cols].merge(enriched_df[enriched_cols], on=_identifier)

# Human-readable names so the heatmap axes are self-explanatory.
merged_df = merged_df.rename(columns={"whites_income" : "Median Income : Whites", 
                                      "blacks_income" : "Median Income : Blacks",
                                      "hispanic_income" : "Median Income : Hispanic",
                                      "below_pov_pop" : "Below Poverty Population",
                                      "whites_ep_ratio" : "Employment : Whites",
                                      "blacks_ep_ratio" : "Employment : Blacks",
                                      "whites_unemp_ratio" : "Unemployment : Whites",
                                      "blacks_unemp_ratio" : "Unemployment : Blacks",
                                      "hispanic_per" : "Hispanic Population %",
                                      "black_per" : "Black Population %",
                                      "white_per" : "White Population %",
                                      "total_vstops" : "Total Vehicle Stops", 
                                      "total_incidents" : "Total Arrests"
                                     })

# Drop the district key explicitly before correlating, and take the axis
# labels from the correlation matrix itself. Slicing merged_df.columns[1:]
# only lines up with the matrix when .corr() happens to discard the key column
# on its own, which depends on that column's dtype and on the pandas version.
corr = merged_df.drop(columns=[_identifier]).corr(method='pearson').round(2)
xcols = list(corr.columns)
ycols = list(corr.columns)

layout = dict(
    title = 'Ordinal feature correlations',
    width = 900,
    height = 900,
    margin=go.layout.Margin(l=200, r=50, b=50, t=250, pad=4),
)
fig = ff.create_annotated_heatmap(
    z=corr.values,
    x=xcols,
    y=ycols,
    colorscale='Reds',
    reversescale=False,
    showscale=True,
    )
fig['layout'].update(layout)
iplot(fig, filename='OrdinalCorrelations')

**Inferences** 

> - Some of the interesting correlations of **total arrests** are moderate negative correlations with **Median Income of Blacks and Hispanics**  
> - Some of the interesting correlations of **vehicle stops** are moderate negative correlations with **Median Income of Blacks and Hispanics**  and moderate positive correlation with **Population of Blacks**.

This suggests that in areas where the median income of blacks and hispanics is higher, total arrests and vehicle stops are expected to be lower. Additionally, in areas where the black population is higher, vehicle stops are also higher. This is another sign of racial disparity in police activity. 


<a id="3.2"></a>
## <font color="#703bdb">3.2 Regression Analysis <hr></font>
 
 In this section, we try to find the most important and significant variables that can explain the police behaviour. For this purpose we fit a linear model on the data and evaluate its results. 


In [None]:
import statsmodels.api as sm_api
from sklearn import preprocessing

# Join the police counts with every enriched column; everything except the
# district key and the arrest count is a candidate model variable.
merged_df = police_df[police_cols].merge(enriched_df, on=_identifier)
excluded = ("LOCATION_DISTRICT", "total_incidents")
cols = [name for name in merged_df.columns if name not in excluded]

# Standardise every candidate column with sklearn's StandardScaler.
scaler = preprocessing.StandardScaler()
t4 = pd.DataFrame(scaler.fit_transform(merged_df[cols].values), columns=cols)

# NOTE(review): this ratio is taken between the already-standardised columns,
# not between the raw population counts - confirm that this is intended.
t4["whites_to_blacks_ratio"] = t4["white_pop"] / t4["black_pop"]
cols.append("whites_to_blacks_ratio")

target = "total_vstops"

# Keep a column as a feature only if its name does not contain the target,
# "per" or "pop" (the last two case-insensitively) and it is not
# whites_ep_ratio.
features = [
    name for name in cols
    if target not in name
    and "per" not in name.lower()
    and "pop" not in name.lower()
    and name != "whites_ep_ratio"
]

X, Y = t4[features], t4[target]

# Ordinary least squares of the target on the selected features (no constant
# term is added to X).
model = sm_api.OLS(Y, X)
results = model.fit()
results.summary()

**Inferences** 

> - In the above model, we estimate the relationship of control variables with the target, which in this case is the number of vehicle stops. The idea is to understand which are the **most significant variables** for estimating the vehicle stops by police in an area (after controlling for the socio-economic factors).  
> - From the model summary, we can observe that there are two significant variables: **blacks_income** and **whites_to_blacks_ratio**. These variables have low p-values (less than 0.05).  
> - The coefficients of both these variables have a **negative sign**, which indicates that in areas where blacks' income is higher, vehicle stops tend to decrease. Similarly, in areas where the ratio of white to black population is higher, the number of vehicle stops tends to decrease.  
> - These effects suggest that there exists racial bias in police activity (vehicle stops), as the occurrence is somewhat affected by the demographics of the population of an area. 

### <font color="#703bdb">End Notes</font>

In this report, we saw that racial bias exists in the department's police activity and that there is a need to rework the policies used by police. The Hispanic population, followed by the black population, is more likely to be the victim of use of force by the police. 

In the next [kernel](https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis), I have shared another analysis template built on top of this pipeline which explains officer level analysis. 
