In [None]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
from geopandas import GeoDataFrame

from shapely.geometry import Point
import pandas as pd, numpy as np 
import shutil, os, ast, folium
import geopandas as gpd

# Initialise Plotly's offline notebook mode so the iplot() calls in the cells
# below can render their figures inline.
init_notebook_mode(connected=True)

def _read_shape_gdf(_base_dir, selected_dept):
    """Load the department shapefile found under ``_base_dir``.

    The file read is ``<_base_dir>/shapefiles/department.shp``.
    ``selected_dept`` is accepted for call compatibility but is not used.

    Returns whatever ``geopandas.read_file`` yields for that path.
    """
    return gpd.read_file(_base_dir + "/shapefiles/department.shp")

def _get_latlong_point(point):
    _ll = str(point)
    _ll = _ll.replace("POINT (","").replace(")", "")
    _ll = list(reversed([float(_) for _ in _ll.split()]))
    return _ll

def _agebin(x):
    if str(x).lower() == "nan":
        return None
    
    ranges = [20, 24, 34, 44, 54, 59, 100]
    tags = ["<20", "20-24", "25-34", "35-44", "45-54", "55-59", "60+"]
    for i, rng in enumerate(ranges):
        if int(x) <= rng:
            return tags[i]


# Name of the district-id column; used below as the key when merging and
# filtering the department's data frames.
_identifier = "LOCATION_DISTRICT"

def _get_dfs(_dept):
    """Load every dataset of one department from the pipeline output folder.

    Parameters
    ----------
    _dept : str
        Department folder name, appended to the fixed ``CPE_ROOT`` input path.

    Returns
    -------
    tuple
        ``(enriched_df, police_df, shape_gdf, _base_dir, events_df)`` where
        ``police_df`` has had its literal-encoded columns parsed and the
        district ``geometry`` merged in, and ``events_df`` carries an
        ``agebin`` column when ``SUBJECT_AGE`` is present.
    """
    _base_dir = "../input/3-example-runs-of-automation-pipeline/CPE_ROOT/" + _dept

    def _under_base(*parts):
        # Build "<_base_dir>/part/..." with forward slashes.
        return "/".join((_base_dir,) + parts)

    enriched_df = pd.read_csv(_under_base("enriched_df.csv"))
    police_df = pd.read_csv(_under_base("police_df.csv"))
    shape_gdf = _read_shape_gdf(_base_dir, _dept)

    # Every police_df column except the district id is stored as the text of
    # a Python literal; parse it back into an object.
    literal_columns = [name for name in police_df.columns if name != _identifier]
    for name in literal_columns:
        police_df[name] = police_df[name].apply(ast.literal_eval)

    # Use one string-typed district id column across all three frames so the
    # merge below matches.
    shape_gdf = shape_gdf.rename(columns={"PRECINCT": _identifier})
    for frame in (shape_gdf, enriched_df, police_df):
        frame[_identifier] = frame[_identifier].astype(str)
    police_df = police_df.merge(shape_gdf[[_identifier, "geometry"]], on=_identifier)

    events_df = pd.read_csv(
        _under_base("events", "events_df.csv"),
        low_memory=False,
        parse_dates=["INCIDENT_DATE"],
    )
    # The first data row is dropped.
    # NOTE(review): presumably a secondary header/description row; confirm.
    events_df = events_df[1:]

    if "SUBJECT_AGE" in events_df.columns:
        events_df["agebin"] = events_df["SUBJECT_AGE"].apply(_agebin)

    return enriched_df, police_df, shape_gdf, _base_dir, events_df

# <font color="#703bdb">4.1 Analysis Report - Minneapolis Police Department (24-00013)</font><hr>

<a href="http://policingequity.org/">Center for Policing Equity</a> is a research and action think tank that works collaboratively with law enforcement, communities, and political stakeholders to identify ways to strengthen relationships with the communities they serve. CPE is also the home of the nation's first and largest <a href="http://policingequity.org/national-justice-database/">database</a> tracking national statistics on police behavior. 

The main aim of CPE is to bridge the divide created by communication problems, suffering and generational mistrust, and forge a path towards public safety, community trust, and racial equity. This kernel series is my contribution to the <a href="https://www.kaggle.com/center-for-policing-equity/data-science-for-good">Data Science for Good: Center for Policing Equity</a>. The contribution is focused on providing a generic, robust, and automated approach to integrate, standardize the data and further diagnose disparities in policing, shed light on police behavior, and provide actionable recommendations. 

Following are parts of Kernels Submissions in order:  

<ul>
    <li><a href="https://www.kaggle.com/shivamb/1-solution-workflow-science-of-policing-equity/">Part 1: Solution Workflow - The Science of Policing Equity </a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/2-automation-pipeline-integration-processing">Part 2: Data Integration and Processing : Automation Pipeline</a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/3-example-runs-of-automation-pipeline">Part 3: Example Runs of Automation Pipeline </a>  </li> 
    <li><a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013">Part 4.1: Analysis Report - Minneapolis Police Department (24-00013) </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033">Part 4.2: Analysis Report - Los Angeles Police Department (49-00033) </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis">Part 4.3: Analysis Report - Indianapolis Officer Level Analysis (23-00089) </a>   </li></ul>

The complete overview of the solution is shared in the *first kernel*. It explains the process and flow of automation, standardization, processing, and analysis of data. In the *second kernel*, the first component of the solution pipeline : data integration and processing is implemented. It processes both core level data as well as department level data. In the *third kernel*, this pipeline is executed and run for several departments. After all the standardized and clean data is produced, it is analysed with different formats of the Analysis Framework in 4.1, 4.2 and 4.3 kernels. In *kernel 4.1*, core analysis is done along with link with crime rate and poverty data. In *kernel 4.2*, core analysis is done along with statistical analysis. In *kernel 4.3*, officer level analysis is done. 

<hr>

This kernel is the fourth of the series. It presents the analysis report for the Minneapolis Police Department. Other Analysis Reports : <br>

<a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013"> 4.1  Analysis Report :  24-00013  + <b>(What can account for Racial Disparities ?)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033"> 4.2 Analysis Report : 49-00033 + <b>(Statistical Analysis : Regression + Correlations)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis"> 4.3 Analysis Report : 23-00089 + <b>(Officer Level Analysis)</b> </a>  

<br>

<b>Contents of this kernel:</b> 

<ul>
    <li><a href="#1">1. Key Highlights </a>  </li>
    <li><a href="#2">2. Deep Exploration of Police Activity </a>  </li>
    <ul>
        <li><a href="#2.1">2.1 Overview : Use-of-Force </a>  </li>
        <li><a href="#2.2">2.2 Are there Racial Disparities ? </a>  </li>
        <li><a href="#2.3">2.3 Racial Bias in Subject Injuries ? </a>  </li>
        <li><a href="#2.4">2.4 Does disparity exist in different types of Use of Force ? </a>  </li>
        <li><a href="#2.5">2.5 Not all age groups are targeted similarly ! </a>  </li>
        <li><a href="#2.6">2.6 Vehicle Stops : Do they tell the same story ? </a>  </li>
        <li><a href="#2.7">2.7. Vehicle Stops : Is there any Racial Bias ? </a>  </li>
        <li><a href="#2.8">2.8 Vehicle Stops : Blacks Vs Whites as a proportion of Population ? </a>  </li>
        <li><a href="#2.9">2.9 A look at other socio-economic factors that can depict Racial Bias </a></li></ul>
    <li><a href="#3">3. What can account for Racial Disparity ? </a>  </li>
    <ul>
        <li><a href="#3.1">3.1 High Crime Rate </a>  </li>
        <li><a href="#3.2">3.2 Low Poverty </a>  </li></ul>
</ul>

<a id="1"></a>
## <font color="#703bdb">1. Key Highlights <hr></font>


- Different districts have witnessed different numbers of use-of-force incidents by police. Use-of-force incidents are most concentrated in district 1, where approximately 4,000 incidents occurred in the last 5 years, while in District 2 and District 3 only about 1000-1200 incidents occurred.  
- The data suggests that use of force on Blacks was about 3 times that on Whites. The percentage of Blacks being targeted is about 60% on average, which is considerably higher than for other races, especially Whites.  
- About 71% of the police incidents are targeted at Blacks (population : 20%), while only 27% of incidents are targeted at Whites (population : 72%). Similarly, the Asian population proportion is very low (about 1.5%) in all districts, but the use-of-force on them is higher (6.5% of incidents).  
- In three districts (1, 3, and 4), the proportion of injured subjects who are black is very high (about 50% in Dist 1 and 3, and about 80% in Dist 4). However, their population proportion in these districts is only 26%, 17%, and 40% respectively.  
- Though the aggregated black population is only 18% across all the districts, on average about 60% of the different types of use-of-force by police is directed at blacks.  
- In districts 2 and 5, surprisingly, a higher percentage of the white population is stopped for vehicle checks. This also aligns with the subject-injury insights, which were higher among whites in districts 2 and 5.  


All insights are obtained from the analysis framework, and the detailed explanations are provided in section 2. 

<a id="2"></a>
## <font color="#703bdb">2. Deep Exploration of Police Activity : Use of Force <hr></font>

All the analysis is first done at high level and then by controlling the socio-econometric or demographic factors of the area which the department serves. The main idea of this analysis is to measure following points : 

- Are there racial disparities ?   
- To what extent the racial disparities exist in the department ?  
- What are the key factors that can explain racial disparities ?  

As the first step, we select the department and load the relevant datasets (that were produced from the data processing and analysis pipeline). There are two main datasets : Enriched district level data and processed Police Incidents Data. 

In [None]:
## Department under analysis and its pipeline outputs
_dept = "Dept_24-00013"
enriched_df, police_df, shape_gdf, _base_dir, events_df = _get_dfs(_dept)

## External dataset: vehicle stops, with year / month parts of the response
## timestamp added as separate columns
external_data_path = "../input/external-datasets-cpe/minneapolis_stops/Minneapolis_Stops.csv"
vstops = pd.read_csv(external_data_path, low_memory=False)
vstops["responseDate"] = pd.to_datetime(vstops["responseDate"])
for _part in ("year", "month"):
    vstops[_part] = getattr(vstops["responseDate"].dt, _part)

enriched_df.head()

<a id="2.1"></a>
## <font color="#703bdb">2.1 Overview : Use-of-Force (2012-2015) <hr></font>

Let's look at a high-level overview of the police activity (use of force) in different districts. The following plot shows the aggregated police activity in the different districts served by the department. The legend of the graph is as follows:  

- Department Districts :  <font color="blue">blue polygons</font>  
- Aggregated Total incidents : <font color="green">green circles</font>  
- 2015 Use-of-Force incidents : <font color="red">red points</font>

In [None]:
"""
About the Code in this Cell 

This code produces the folium map as the base map, on which we plot three items : district polygons, aggregated number of total incidents by district, recent incidents
"""

## Per-district totals: sum the per-sex count dictionaries
for _target, _source in (("total_incidents", "uof_sex"), ("total_vstops", "vstops_sex")):
    police_df[_target] = police_df[_source].apply(lambda counts : sum(counts.values()))

## Base map centred on the centroid of the first district, with the
## district polygons drawn in blue
center_pt = police_df.geometry[0].centroid
center_ll = _get_latlong_point(center_pt)
mapa = folium.Map(center_ll, zoom_start=10.5, tiles='CartoDB dark_matter')


def _district_style(feature):
    return { 'fillColor': "blue", 'color' : "blue", 'weight' : 1, 'fillOpacity' : 0.2}


folium.GeoJson(shape_gdf, style_function = _district_style).add_to(mapa)

## One small red marker per 2015 incident
recent_events = events_df[events_df["INCIDENT_YEAR"] == 2015]
for i, row in recent_events.iterrows():
    incident_ll = [float(row["LOCATION_LATITUDE"]), float(row["LOCATION_LONGITUDE"])]
    folium.CircleMarker(incident_ll, radius=1, color='red').add_to(mapa)

## One green circle per district, radius scaled by the total incident count
for i, row in police_df.iterrows():
    dist_ll = _get_latlong_point(row["geometry"].centroid)
    popup_html = "<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>UseOfForce:</b> "+str(row["total_incidents"])
    circle_radius = float(row["total_incidents"])*0.01
    folium.CircleMarker(dist_ll, popup=popup_html, radius=circle_radius,
                        color='green', fill=True).add_to(mapa)

mapa

**Graph Interpretation** 

<i>The size of green circles represents the total count of police activity in a district. Higher the size, higher is the police activity incidents in that district and vice versa. Hover on the circles bubbles to view the actual numbers. </i>

**Inferences**
> - The Minneapolis police department serves a total of five districts in the city. A first look at the police activity shows that different districts have witnessed **different numbers of use-of-force incidents**.   
> - In the years 2012-2017, the maximum number of use-of-force incidents occurred in **district 1, where approximately 4,000 incidents** occurred. Districts 4 and 3 witnessed about 3500 and 2500 police incidents in the last 5 years, while only about 1100 - 1300 incidents occurred in districts 2 and 5.   
> - The possible cause of this distribution may be one or more of the following factors : **high crime rates, police characteristics, and community and police relationships**. In the later parts of this report, analysis is performed that attempts to uncover these insights and the possible causes.    

**Among these incidents:**  
  > - Maximum use of force was used by police in the month of January followed by June with about 1500 police incidents each.   
  > - The most used type of force was "Body Weight to Pin", with about 3700 occurrences.  
  > - Most common reason stated by police to use force was "Tensed Subject" with about 3900 incidents.  

<a id="2.2"></a>
## <font color="#703bdb">2.2 Are there Racial Disparities? (Controlled Analysis) <hr></font>

Now we try to answer the most important question that we are looking at : Are there Racial Disparities in the police activities ? We will go deeper into these incidents and identify citizens of which race are targeted the most. Note that to establish credible results a controlled analysis is required, hence this analysis is broken into two parts to get a clear view:  

A - First analyse the **breakdown of police activity by different race by district**  
B - Next analyse the **actual population proportion of race by district**  

In [None]:
## Share (in %) of use-of-force incidents per subject race, for each year.
## year_dic maps year -> {race: percentage rounded to 2 decimals}.
years = [2012, 2013, 2014, 2015, 2016, 2017]
year_dic = {}
for year in years:
    yeardf = events_df[events_df["INCIDENT_YEAR"] == year]
    t1 = dict(yeardf["SUBJECT_RACE"].value_counts())
    # Computed once per year instead of once per race. A year without events
    # yields an empty mapping (no division takes place).
    year_total = sum(t1.values())
    year_dic[year] = {k: round(100*float(v) / year_total, 2) for k, v in t1.items()}

## Generate Bar Plot: one trace per race, one bar per year
data = []
races = ["White","Black", "Native American", "Other", "Asian"]
colors = ["White","Black", "#6599ed", "Green", "Orange"]
for i, rac in enumerate(races):
    # .get(rac, 0): a race with no incidents in a given year is drawn as 0%
    # instead of raising KeyError.
    trace1 = go.Bar(x=list(year_dic.keys()), y=[year_dic[_].get(rac, 0) for _ in year_dic], name=rac, 
                    marker=dict(color=colors[i], opacity=1.0, line=dict(color='black',width=1)))
    data.append(trace1)     
layout = go.Layout(title='Use-of-Force by Race : 2012 - 2017', height=350, 
                   yaxis=dict(range=(0,100), title="Use of Force %"), legend=dict(orientation="h", x=0.1, y=1.2))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='style-bar')

**Graph Interpretation** 

*Every bar represents the percentage of use-of-force used on a particular race in a particular year.* 

**Inferences**  
> - The plot shows clear **disproportionate numbers related to police activity** among different races.  About 60% of the times, use of force was used on Black individuals.   
> - In comparison with White Subjects, the use of force on Blacks was **3 times that of Whites**.  

**Controlled Analysis:** 

Currently, these plots only show the absolute numbers; we first need to control these numbers for the population proportion. So we plot comparative charts of police activity and population proportion. Due to data availability, we focus on three main races of the district : blacks, whites, and asians. 

In [None]:
## Derived columns: per-race use-of-force counts / shares on police_df and
## per-race population shares on enriched_df
races = ["Black", "White", "Asian"]
colors = ["black", "white", "orange"]
police_df["total"] = police_df["uof_race"].apply(lambda x : sum(x.values()))
for c in races:
    prefix = c.lower()
    police_df[prefix + "_cnt"] = police_df["uof_race"].apply(lambda x : x[c])
    police_df[prefix + "_per"] = police_df["uof_race"].apply(lambda x : 100*float(x[c]) / sum(x.values()))
    enriched_df[prefix + "_per"] = 100*enriched_df[prefix + "_pop"] / enriched_df["total_pop"]

enriched_df["other_per"] = 100 - enriched_df["white_per"] - enriched_df["black_per"] - enriched_df["asian_per"]


def _pct_of_total(frame, part_col, total_col):
    # Percentage that the summed part_col makes up of the summed total_col.
    return 100 * sum(frame[part_col]) / sum(frame[total_col])


## Department-wide population shares (hp holds the Asian share)
wp = _pct_of_total(enriched_df, "white_pop", "total_pop")
bp = _pct_of_total(enriched_df, "black_pop", "total_pop")
hp = _pct_of_total(enriched_df, "asian_pop", "total_pop")
op = 100 - wp - bp - hp

## Department-wide use-of-force shares
wp1 = _pct_of_total(police_df, "white_cnt", "total")
bp1 = _pct_of_total(police_df, "black_cnt", "total")
hp1 = _pct_of_total(police_df, "asian_cnt", "total")
op1 = 100 - wp1 - hp1 - bp1 

## Two stacked horizontal bars: use-of-force share vs population share
bar_specs = [
    ("Whites", [wp1, wp], "white", 1),
    ("Blacks", [bp1, bp], "black", 1.0),
    ("Asian", [hp1, hp], "orange", 0.5),
    ("Others", [op1, op], "purple", 0.5),
]
data = [go.Bar(x=shares, y=["UseOfForce", "Population"], name=label,
               marker=dict(color=shade, opacity=alpha, line=dict(color='black',width=1)), orientation='h')
        for label, shares, shade, alpha in bar_specs]

layout = go.Layout(barmode='stack',height=400, title='Use-of-Force and Population Proportion : By Race', 
                   legend = dict(orientation="h", x=0.1, y=1.15), xaxis=dict(title="Use of Force %"), showlegend=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


########### Next Part : Breakdown by Districts 

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=["Population Breakdown", "Use Of Force BreakDown"])
for i, rac in enumerate(races):
    share_col = rac.lower() + "_per"
    bar_style = dict(color=colors[i], opacity=1.0, line=dict(color='black',width=1))

    # left panel: population share per district
    trace1 = go.Bar(y = "D"+enriched_df[_identifier], x = enriched_df[share_col], orientation = "h", name=rac, 
                    marker=dict(bar_style))
    fig.append_trace(trace1, 1, 1)

    # right panel: use-of-force share per district
    trace2 = go.Bar(y = "D"+police_df[_identifier], x = police_df[share_col], orientation = "h", name=rac, 
                    marker=dict(bar_style))
    fig.append_trace(trace2, 1, 2)

fig["layout"].update(barmode='stack', showlegend = False, 
                     hovermode='closest', title="Use-of-Force and Population Proportion : By Race, District", 
                     height=600)
iplot(fig)

**Graph Interpretations**

*The first plot shows the proportion of use-of-force vs population of different races : whites, blacks and asian. The next two plots represents the same information but sliced by different police department's districts.* 

**Inferences**

> - These plots indicate a very strong level of racial disparity in the police activity of the Minneapolis police department.  
> - In the first chart, we can see that the **population proportions of blacks is only ~20%** which is considerably lesser than whites whose population is 72%, However the proportion of **police activity on blacks is much higher**         
> - In total, **About 71% of the use of force incidents are targeted on Blacks (population : 20%)** only, 27% incidents are targeted on Whites (population : 72%)      
> - Among the 5 districts served by the police department, **Maximum disparity exists in district 1 followed by district 4** and least in district 2.   
> - Of the total 4140 incidents in the years 2012-2017 in district 1, Black subjects were involved in about 2700, while only about 950 involved whites. Of about 1100 incidents that occurred in district 2, about 550 involved white subjects and about 450 involved black subjects.  

<a id="2.3"></a>
## <font color="#703bdb">2.3 Racial Bias in Subject Injuries ?<hr></font>

Next we look at a portion of the data and focus only on cases in which subjects were injured by police. This helps us answer the question : is there any racial bias in police activity that harms the subjects? 

In [None]:
## Normalise the district id to a plain string: drop a trailing ".0" left by
## float parsing (e.g. "1.0" -> "1"). Only a trailing ".0" is removed, so an
## id that merely contains ".0" elsewhere is left intact.
def _clean_district_id(value):
    text = str(value)
    return text[:-2] if text.endswith(".0") else text


events_df[_identifier] = events_df[_identifier].apply(_clean_district_id)

## Events whose SUBJECT_INJURY flag equals -1.
## NOTE(review): -1 is taken here as the "subject injured" code - confirm
## against the pipeline's encoding.
tempdf = events_df[events_df["SUBJECT_INJURY"] == -1]

## One subplot per district, in police_df order, so every title matches the
## bars drawn beneath it (titles used to be hard-coded as 1..5).
district_ids = list(police_df[_identifier].values)
titles = ["Dist:" + str(idf) for idf in district_ids]
fig = tools.make_subplots(rows=1, cols=len(district_ids), print_grid=False, subplot_titles=titles)

for cnt, idf in enumerate(district_ids, start=1):
    sdf = dict(tempdf[tempdf[_identifier] == idf]["SUBJECT_RACE"].value_counts())
    injured_total = sum(sdf.values())
    if injured_total:
        # .get(): a race absent from this district counts as 0 instead of
        # raising KeyError.
        black_share = 100*float(sdf.get("Black", 0)) / injured_total
        white_share = 100*float(sdf.get("White", 0)) / injured_total
        other_share = 100 - black_share - white_share
    else:
        # no injured subjects recorded for this district
        black_share = white_share = other_share = 0.0

    _perdic2 = {"Black" : black_share, "White" : white_share, "Others" : other_share}
    tr = go.Bar(x = list(_perdic2.keys()), y = list(_perdic2.values()), 
                name="Dist: "+str(idf), marker=dict(color=["Black", "White", "Orange"], 
                                                   line=dict(color='black',width=1)))
    fig.append_trace(tr, 1, cnt)

fig["layout"].update(barmode='stack', showlegend = False, 
                     hovermode='closest',
                     title="% of Subjects Injured by Race in " + str(len(district_ids)) + " Districts",
                     height=300)
iplot(fig)

**Inferences**

> - Similar kinds of distributions are observed when subjects were injured by police. One can observe **disproportionate numbers** among the different races.   
> - In three districts (1, 3, and 4), the proportion of **blacks being injured is very high** (about 50% in Dist 1 and 3, and about 80% in Dist 4). It is interesting to note that their population proportion in these districts is only 26%, 17%, and 40% respectively. These numbers are signs of racial disparity shown by the police.  
> - In District 2 and 5, the percentage of white being injured is higher as compared to blacks. Though in district 5, more black people are the victim of use of force.  In all of the districts, "BodilyForceType" was the most common force used by the police. 

<a id="2.4"></a>
## <font color="#703bdb">2.4 Does disparity exist in different types of Use of Force ?  <hr></font>

Another way to slice the data is to observe the different types of use of force by police.

In [None]:
## Department-wide population totals, summed once
white_total = sum(enriched_df["white_pop"])
black_total = sum(enriched_df["black_pop"])
asian_total = sum(enriched_df["asian_pop"])
total_pop = sum(enriched_df["total_pop"])

# Fractions (0-1). hp / tp are not read by the charts in this cell; they are
# still assigned because they are notebook globals another cell may use.
hp = asian_total / total_pop
tp = white_total / total_pop + hp + black_total / total_pop

# Percentages used for the reference chart below
wp = 100 * white_total / total_pop
bp = 100 * black_total / total_pop
op = 100 - wp - bp
xx = ["Blacks (" + str(int(bp)) + "%)", "Whites (" + str(int(wp)) + "%)", "Others (" + str(int(op)) + "%)"]
        
## Reference chart: bubble size = population share of each group
data = [go.Scatter(x = xx, y=["","",""], mode='markers', name="",  
                   marker=dict(color=["black","white", "orange"], opacity=1.0, size= [bp, wp, op]))]
layout = go.Layout(barmode='stack', height=300, margin=dict(l=100), title='Population Distribution by Race', 
                   legend = dict(orientation="h", x=0.1, y=1.15),plot_bgcolor='#d7e9f7', paper_bgcolor='#d7e9f7', showlegend=False)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


## Police Behaviour Graph by different categories:
## one bubble trace per weapon / tool, bubble size = share of each race
traces = []
titles = []
for vi in events_df["WEAPON_OR_TOOL_USED"].value_counts().index[:21]:
    if vi == "0":
        continue
    tempdf = events_df[events_df['WEAPON_OR_TOOL_USED'] == vi]
    tdoc = dict(tempdf["SUBJECT_RACE"].value_counts())
    if "Black" in tdoc:
        force_total = sum(tdoc.values())
        black_n = tdoc["Black"]
        # "White" can be absent for a rarely used force type; count it as 0
        # rather than raising KeyError.
        white_n = tdoc.get("White", 0)
        pdoc = {"B" : 100*float(black_n) / force_total, 
                "W" : 100*float(white_n) / force_total,
                "O" : 100*float(force_total - white_n - black_n) / force_total}
        
        yy = ["", "", ""]
        ss = [pdoc["B"], pdoc["W"], pdoc["O"]]
        xx = ["Blacks (" + str(int(ss[0])) + "%)", "Whites(" + str(int(ss[1])) + "%)", "Others (" + str(int(ss[2])) + "%)"]
        trace0 = go.Scatter(x=xx, y=yy, mode='markers', name="",  marker=dict(color=["black","white", "orange"], opacity=1.0, size=ss))
        traces.append(trace0)
        titles.append(vi)

## Lay the first 15 traces out on a 5 x 3 grid, filled row by row
fig = tools.make_subplots(rows=5, cols=3, print_grid=False, subplot_titles = titles[:15])
for slot, trace in enumerate(traces[:15]):
    grid_row, grid_col = divmod(slot, 3)
    fig.append_trace(trace, grid_row + 1, grid_col + 1)

fig["layout"].update(showlegend = False, height = 1000, plot_bgcolor='#d7e9f7', paper_bgcolor='#d7e9f7',
                     title = "Different Use-of-Force used by Police",
                    xaxis=dict(showgrid=False))
iplot(fig, filename='bubblechart-color')  
## TODO: add hover text

**Graph Interpretations**

*Every subplot represents the distribution of one type of use-of-force by race. The size of every bubble represents the percentage of the respective race on which the force was used.* 

**Inferences**

> - The above plot also shows that, compared with the proportion of the black population, police activity has always been higher on blacks, irrespective of the type of use of force. Though the **total black population is only about 18%**, the use-of-force by police on blacks is always much higher than on any other race. 
> - On average, black subjects are the victims of force about 60% of the time.    
> -  Crowd Control Mace, Compliance, and Conscious Neck Restraint are the force types in which blacks are targeted the highest.  

<a id="2.5"></a>
## <font color="#703bdb">2.5 Not all age groups are targeted similarly !  <hr></font>

Let's look at how these incidents are distributed by different age-groups and by race. 

In [None]:
def cbp(df, col1, col2, aggcol, func, title, cs, bottom_margin=None):
    """Draw a bubble chart of ``aggcol`` aggregated over ``(col1, col2)``.

    ``df`` is grouped by ``col1`` and ``col2`` and ``aggcol`` is aggregated
    with ``func``; the result is cast to int and sorted in descending order.
    Bubble sizes are assigned by rank rather than by value: the sorted rows
    are cut into 21 equally sized runs that receive sizes 30 down to 10, and
    any rows left over keep size 9. The same numbers drive the marker colour
    through the colour scale ``cs``.

    Parameters
    ----------
    df : DataFrame holding ``col1``, ``col2`` and ``aggcol``.
    col1, col2 : column names shown on the x and y axes.
    aggcol : column to aggregate; its value is also the hover text.
    func : aggregation passed to ``agg`` (e.g. ``"count"``).
    title : figure title.
    cs : Plotly colour scale for the markers.
    bottom_margin : when truthy, a bottom margin of 150 is applied.

    Returns nothing; the figure is displayed through ``iplot``.
    """
    grouped = df.groupby([col1, col2]).agg({aggcol : func}).reset_index()
    grouped[aggcol] = grouped[aggcol].apply(int)
    grouped = grouped.sort_values(aggcol, ascending=False)

    # Rank-based sizing: 30, 29, ..., 10 for consecutive runs of rows.
    sizes = list(range(30, 9, -1))
    run_length = int(len(grouped) / len(sizes))
    size_array = [9] * len(grouped)
    for rank, size in enumerate(sizes):
        start = rank * run_length
        size_array[start:start + run_length] = [size] * run_length
    grouped['size_n'] = size_array

    marker_colors = list(grouped['size_n'])
    bubbles = go.Scatter(x=grouped[col1], y=grouped[col2], mode='markers', text=grouped[aggcol],
                         marker=dict(size=grouped.size_n, color=marker_colors, colorscale=cs))

    layout_options = dict(title=title)
    if bottom_margin:
        layout_options["margin"] = dict(b=150)

    figure = go.Figure(data=[bubbles], layout=go.Layout(**layout_options))
    iplot(figure)
    
    
###

## Use-of-force events per age bin (rows with a non-null SUBJECT_GENDER are
## counted), restricted to the bins that also have a population column
tmp = events_df.groupby("agebin").agg({'SUBJECT_GENDER' : "count"}).reset_index().rename(columns={"SUBJECT_GENDER" : "count"})
# .copy(): the new column is added to an independent frame rather than to a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
tmp = tmp[~tmp["agebin"].isin(["60+", "<20"])].copy()
# The denominator is computed once instead of once per row.
tmp["percentage"] = 100 * tmp["count"].astype(float) / tmp["count"].sum()

## Population per age bin, summed over the districts in enriched_df
keys = ["20_24_pop", "25_34_pop", "35_44_pop", "45_54_pop", "55_59_pop"]
tmp2 = pd.DataFrame()
# "20_24_pop" -> "20-24", so the labels line up with the agebin tags
tmp2["agebin"] = [x.replace("_","-").replace("-pop", "") for x in keys]
tmp2["cnt"] = [sum(enriched_df[k]) for k in keys]
tmp2["percentage"] = 100 * tmp2["cnt"] / tmp2["cnt"].sum()

trace1 = go.Bar(x = tmp2["agebin"], y = tmp2["percentage"], marker=dict(color="purple", opacity=0.6))
trace2 = go.Bar(x = tmp["agebin"], y = tmp["percentage"], marker=dict(color="orange", opacity=0.6))

## Side-by-side panels sharing the same 0-50 y-range for easy comparison
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=["Population % Breakdown by Age", "Use-of-force % BreakDown by Age"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig["layout"].update(barmode='group', showlegend = False, 
                     hovermode='closest', title="", 
                     height=400)
fig["layout"].yaxis1.range = (0, 50)
fig["layout"].yaxis2.range = (0, 50)
iplot(fig)

###

cbp(events_df, 'SUBJECT_RACE', 'agebin', 'SUBJECT_GENDER', 'count', "Use of Force breakdown by Age and Race", 'Jet')

**Graph Interpretation** 

*Size of the bubble represents the respective count of a race and an age group*

**Inferences** 

> - By observing population % breakdown and use-of-force % breakdown, it is noted that **police activity aligns with the population percentage** of respective group. However, when we look at the breakdown of police activity by different age groups, a form of racial bias appears. 
 > - **Black population** which belongs to **25-34 age group** is the victim of most use of force by police, followed by whites in the same age group, and by 20-24 among the age groups. 
 > - **Young subjects are more likely to be victim of police use of force** than the adults or old people irrespective of their race. 
 
 <a id="2.6"></a>
## <font color="#703bdb">2.6 Vehicle Stops : Do they tell the same story ?   <hr></font>

Till now, we only looked at use-of-force data, Let's now use Vehicle Stops data for the analysis purposes that may help to provide a more clearer picture. 

First, let's view an overview of 2017 vehicle stops by police. In the following map, the base districts are represented as blue polygons, the white points represent the recent vehicle stops, and the orange circles represent the aggregated number of vehicle stops by district. 

In [None]:
## Stops to plot individually: January 2017, race recorded, traffic enforcement
in_jan_2017 = (vstops["year"] == 2017) & (vstops["month"] == 1)
race_recorded = ~vstops["race"].isin(["Unknown", "not recorded"])
is_traffic_stop = vstops["problem"] == "Traffic Law Enforcement (P)"
_vs = vstops[in_jan_2017 & race_recorded & is_traffic_stop]

## Base map with the district polygons


def _vstop_district_style(feature):
    return { 'fillColor': "blue", 'color' : "blue", 'weight' : 1, 'fillOpacity' : 0.5}


mapb = folium.Map(center_ll, zoom_start=10.5, tiles='CartoDB dark_matter')
folium.GeoJson(shape_gdf, style_function = _vstop_district_style).add_to(mapb)

## One small white marker per selected stop
for i, row in _vs.iterrows():
    stop_ll = [float(row["lat"]), float(row["long"])]
    folium.CircleMarker(stop_ll, radius=1, color='white').add_to(mapb)

## One orange circle per district, radius scaled by its total vehicle stops
for i, row in police_df.iterrows():    
    dist_ll = _get_latlong_point(row["geometry"].centroid)
    popup_html = "<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>Vehicle Stops:</b> "+str(row["total_vstops"])
    circle_radius = float(row["total_vstops"])*0.02
    folium.CircleMarker(dist_ll, popup=popup_html, radius=circle_radius,
                        color='orange', fill=True).add_to(mapb)

mapb

**Interpretations** 

> - Unlike the trends obtained about use-of-force incidents, vehicle stops are highest in district 5 (equal to 1743 in the year 2017). District 1 which had the maximum use of force incidents in 2012 - 2017 had lower vehicle stops incidents equal to 1098 in the year 2017. 

<a id="2.7"></a>
## <font color="#703bdb">2.7. Vehicle Stops : Is there any Racial Bias ? <hr></font>

It will be interesting to see how vehicle stops are distributed by Race (especially Blacks and Whites) across different districts of this department. 

In [None]:
## Per-district vehicle-stop counts and shares for the three races compared.
## Shares are relative to the Black + White + Latino stops only.
races = ["Black", "White", "Latino"]
for c in races:
    police_df[c.lower() + "_vcnt"] = police_df["vstops_race"].apply(lambda x : x[c])
    police_df[c.lower() + "_vper"] = police_df["vstops_race"].apply(
        lambda x : 100*float(x[c]) / (x["Black"] + x["White"] + x["Latino"]))

## One pie per district id "1".."5"
fig = plt.figure(figsize=(16,3))
for i in range(1, 6):
    tmp = police_df[police_df[_identifier] == str(i)]
    if tmp.empty:
        # district id not present in police_df: leave its slot blank instead
        # of failing on the positional lookup below
        continue
    plt.subplot(1, 5, i)
    # .iloc[0] selects the first (only) row positionally; the previous
    # .iloc(0)[0] spelling called the indexer before subscripting it.
    docs = [tmp["black_vper"].iloc[0], tmp["white_vper"].iloc[0], tmp["latino_vper"].iloc[0]]
    plt.pie(docs, labels=["Black", "Whites", "Latino"],
            colors=["black", "white", "green"],
           wedgeprops={"edgecolor":"orange",'linewidth': 1})
    plt.title("Dist: " + str(i)) 
    
# NOTE(review): these rcParams are set after the figure above has been built,
# so they presumably only affect figures created later - confirm the intent.
plt.rcParams['axes.facecolor']='red'
plt.rcParams['savefig.facecolor']='red'

**Inferences**

> - These pie charts indicate that **a higher percentage of the black population is the victim of Vehicle Stops** in districts 1, 3, and 4. This is another sign of the racial bias that exists in the police department, because the black **population percentage is very low** in these districts (26%, 17%, and 40% black population only).   
> - While in districts 2 and 5, higher percentage of white population is stopped for vehicle checks. This also aligns with the insights obtained from the analysis of Subject Injuries. The number was more for white population and less for black population.  
> - Least disparity is in district 5 and maximum is in district 4. 

 <a id="2.8"></a>
## <font color="#703bdb">2.8. Vehicle Stops - Blacks Vs Whites as a proportion of Population    <hr></font>

Let's compare the percentage of blacks and whites as a proportion of their population at the overall department level. 

In [None]:
def _population_vs_stops_radar(df, population_col, stops_col, group, stops_fill):
    """Build a polar chart overlaying population share and vehicle-stop share.

    Parameters
    ----------
    df : DataFrame with one row per district; must contain the `_identifier`
        column plus `population_col` and `stops_col`.
    population_col : column plotted as the first trace.
    stops_col : column plotted as the second trace.
    group : label used in the legend entries and the chart title
        (e.g. "Blacks").
    stops_fill : plotly `fill` mode of the second trace.

    Returns
    -------
    plotly.graph_objs.Figure ready to be passed to `iplot`.
    """
    ## angular axis: one spoke per district, labelled "D<id>"
    theta = ["D" + str(_) for _ in df[_identifier].values]
    traces = [
        go.Scatterpolar(
            r = list(df[population_col].values),
            theta = theta,
            fill = 'toself',
            name = group + ' Population Proportion'
        ),
        go.Scatterpolar(
            r = list(df[stops_col].values),
            theta = theta,
            fill = stops_fill,
            name = group + ' Vehicle Stops Proportion'
        ),
    ]
    layout = go.Layout(margin=dict(l=120), width=800,
                       title="Vehicle Stops and Population Proportion of " + group,
                       legend=dict(orientation="h"),
                       polar=dict(radialaxis=dict(visible=False)))
    return go.Figure(data=traces, layout=layout)


## Population shares come from enriched_df, vehicle-stop shares from police_df;
## join them on the district identifier (made explicit instead of relying on
## the implicit common-column merge).
t1 = enriched_df[[_identifier, 'black_per', 'white_per']]
t1 = t1.merge(police_df[[_identifier, 'black_vper', 'white_vper']], on=_identifier)

## The two charts keep their original fill modes ('tonext' vs 'toself').
fig = _population_vs_stops_radar(t1, 'black_per', 'black_vper', "Blacks", stops_fill='tonext')
iplot(fig)

fig = _population_vs_stops_radar(t1, 'white_per', 'white_vper', "Whites", stops_fill='toself')
iplot(fig)

**Graph Interpretation**  

*In these radial charts, one color (BLUE) represents the population proportion in the 5 districts, and the other (ORANGE) represents the proportion of vehicle stops by district. These two pieces of information - population and vehicle stops - are overlapped on one another in order to measure the extent of bias. The 5 lines that represent the radii of these radial charts account for 100%, so the blue / orange points on these lines are less than 100%.*

**Inferences**

> - We can observe that in all five districts served by this department, the population **proportion of blacks is much lower (blue area)** while their **vehicle stops proportion is higher (orange area)**. 
> - In the second plot, A reverse scenario can be observed in case of whites, the population proportion (represented as blue) is much higher and the vehicle stops proportion (represented as orange) is much lower. 
> - Again, this confirms that there exist some level of racial disparity.  

 <a id="2.9"></a>
## <font color="#703bdb">2.9 A look at other socio-economic factors that can depict Racial Bias     <hr></font>

There are many other fields that are processed by the data processing and integration pipeline : 

- Median Income  - By Race  
- Employment to Population Ratio - By Race  

In [None]:
## Bubble charts: each district is one bubble whose size is its vehicle-stop count.
police_cols = ["LOCATION_DISTRICT", "total_vstops", "total_incidents"]

## --- Chart 1: median income of blacks (x) against median income of whites (y)
income_cols = ['LOCATION_DISTRICT', 'blacks_income', 'whites_income']
tempdf = police_df[police_cols].merge(enriched_df[income_cols], on="LOCATION_DISTRICT")

income_marker = dict(size=tempdf["total_vstops"]*0.05, color="green", opacity=0.7)
income_bubbles = go.Scatter(x=tempdf["blacks_income"], y=tempdf["whites_income"],
                            mode='markers', marker=income_marker)
income_layout = go.Layout(
    title="Vehicle Stops by Median Income",
    xaxis=dict(title="Median Income of Blacks"),
    yaxis=dict(title="Median Income of Whites"),
)
fig = go.Figure(data=[income_bubbles], layout=income_layout)
iplot(fig, filename='bubblechart-color')


## --- Chart 2: employment-to-population ratio of blacks (x) against whites (y)
ratio_cols = ['LOCATION_DISTRICT', 'blacks_ep_ratio', 'whites_ep_ratio']
tempdf = police_df[police_cols].merge(enriched_df[ratio_cols], on="LOCATION_DISTRICT")

ratio_marker = dict(opacity=1, size=tempdf["total_vstops"]*0.05, color="pink")
ratio_bubbles = go.Scatter(x=tempdf["blacks_ep_ratio"], y=tempdf["whites_ep_ratio"],
                           mode='markers', marker=ratio_marker)
ratio_layout = go.Layout(
    title="Vehicle Stops by Employment to Population Ratio",
    xaxis=dict(title="Blacks : Employment to Population Percent"),
    yaxis=dict(title="Whites: Employment to Population Percent"),
)
fig = go.Figure(data=[ratio_bubbles], layout=ratio_layout)
iplot(fig, filename='bubblechart-color')

**Inferences**

>- In the first plot, the bubble with largest size means that maximum number of vehicle stops were made in that area. 
> - It is interesting to note that, in that particular area (district 4), the **median income of whites is almost three times that of blacks**, i.e. the median income ratio of whites to blacks is 75,000 USD : 25,000 USD. This may indicate that police make more vehicle stops in the area where the black population earns less.   
> - Similarly, in the second plot we can observe that the maximum vehicle stops (largest bubble) were observed in district 5 in which **Employment to Population Ratio of Blacks is about 60%**, while it is about 80% for whites.  

 <a id="3"></a>
## <font color="#703bdb">3. What can account for Racial Disparities ? <hr></font>

In this section, we explore what can account for the racial disparities observed in different districts. For this purpose, We will explore two main metrics - 

1. High Crime Rates 
2. Low Poverty Ratios

<a id="3.1"></a>
### <font color="#703bdb">3.1 High Crime Rates</font>

Key questions to ponder are: 

- Can police behaviour be explained by high crime rates ?  
- Are police more active in areas where crime is actually high ? 
- Or, more precisely, are police more active in areas where the black population is higher ?   

Let's explore the crime data of the Minneapolis districts in order to get these answers. We will plot the crime incidents that occurred in Minneapolis city and its 5 districts along with the police activities. 

In the following graph:

- The <font color="blue">"blue"</font> polygons represents the police districts.   
- The <font color="red">"red"</font> circles represents the aggregated crime statistics.  
- The <font color="gray">"gray"</font> circles represents the "Use-of-Force" by police  
- The <font color="green">"green"</font> circles represents the "vehicle stops" by police.  


In [None]:
## Load an external dataset about crime in minneapolis
crime = pd.read_csv("../input/external-datasets-cpe/minneapolis_Police_Incidents_2016.csv")

## Incident count per precinct, keeping the 5 precincts with the most incidents.
## rename_axis / reset_index(name=...) names both columns explicitly, so the
## result does not depend on how the installed pandas version labels the
## output of value_counts() (the labelling changed in pandas 2.0).
cr = (crime["Precinct"].value_counts()
      .rename_axis("LOCATION_DISTRICT")
      .reset_index(name="CrimeCount")
      .head(5))
cr["LOCATION_DISTRICT"] = cr["LOCATION_DISTRICT"].astype(str)
## NOTE(review): police_df is reassigned here, so re-running this cell merges
## CrimeCount a second time - confirm the cell is only run once.
police_df = police_df.merge(cr, on="LOCATION_DISTRICT")

mapc = folium.Map(center_ll, zoom_start=10.5, tiles='CartoDB dark_matter')
## blue polygons: police districts
folium.GeoJson(shape_gdf, style_function = lambda feature: { 'fillColor': "blue", 'color' : "blue", 'weight' : 1, 'fillOpacity' : 0.2}).add_to(mapc)
## small white dots: individual incidents with this exact Description value
for i, row in crime[crime['Description'] == "Asslt W/dngrs Weapon"].iterrows():
    folium.CircleMarker([float(row["Lat"]), float(row["Long"])], radius=1, color='white').add_to(mapc)

## Three concentric circles per district, drawn at the district centroid;
## each radius is the raw count scaled by a fixed factor.
for i, row in police_df.iterrows():
    dist_ll = _get_latlong_point(row["geometry"].centroid)
    ## green: vehicle stops
    folium.CircleMarker(dist_ll, popup="<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>Vehicle Stops:</b> "+str(row["total_vstops"]),
                       radius=float(row["total_vstops"])*0.01, color='green', fill=True).add_to(mapc)

    ## gray: use-of-force incidents
    folium.CircleMarker(dist_ll, popup="<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>Use of Force:</b> "+str(row["total_incidents"]),
                       radius=float(row["total_incidents"])*0.01, color='gray', fill=True).add_to(mapc)

    ## red: aggregated crime count from the external dataset
    folium.CircleMarker(dist_ll, popup="<b>District ID:</b>" + row["LOCATION_DISTRICT"] +"<br> <b>Total Crime: </b> "+str(row["CrimeCount"]),
                       radius=float(row["CrimeCount"])*0.005, color='red', fill=True).add_to(mapc)

mapc

**Inferences**

> **Ideal Scenario :** More Crime Means More Vehicle Stops and More Use of Force   
> ie. Radius :  RedCircle (crime) < GreenCircle (VehicleStops), GrayCircle (Use of Force) 

However,  From this graph, we can note following points: 

> - In District 1 and District 4, the use-of-force incidents are much higher than the aggregated crime in the districts, though the vehicle stops are fewer.  
> - Districts 2, 3, and 5 tell a different story: the relative proportion of crime incidents is very high, however the use-of-force and vehicle stops are much lower in these districts. Also, we established earlier that in district 3 there exists racial disparity. 

#### Black Population Proportions in respective districts

Let's add another important information in this graph : the proportion of black population. Here, we want to check if the crime rate is higher where black population is higher. 

In [None]:
## Attach each district's black population so it can be drawn next to the crime count.
black_pop_cols = [_identifier, "black_pop"]
police_df = police_df.merge(enriched_df[black_pop_cols], on=_identifier)

mapd = folium.Map(center_ll, zoom_start=10.5, tiles='CartoDB dark_matter')

## blue polygons: police districts
district_style = {'fillColor': "blue", 'color': "blue", 'weight': 1, 'fillOpacity': 0.2}
folium.GeoJson(shape_gdf, style_function=lambda feature: dict(district_style)).add_to(mapd)

## small gray dots: individual incidents with this exact Description value
assaults = crime[crime['Description'] == "Asslt W/dngrs Weapon"]
for _, incident in assaults.iterrows():
    incident_ll = [float(incident["Lat"]), float(incident["Long"])]
    folium.CircleMarker(incident_ll, radius=1, color='gray').add_to(mapd)

## Two circles per district, both centred on the district centroid.
for _, district in police_df.iterrows():
    centre_ll = _get_latlong_point(district["geometry"].centroid)
    district_tag = "<b>District ID:</b>" + district["LOCATION_DISTRICT"]

    ## red: aggregated crime count
    crime_popup = district_tag + "<br> <b>Total Crime: </b> " + str(district["CrimeCount"])
    crime_circle = folium.CircleMarker(centre_ll, popup=crime_popup,
                                       radius=float(district["CrimeCount"])*0.005,
                                       color='red', fill=True)
    crime_circle.add_to(mapd)

    ## white: black population of the district
    pop_popup = district_tag + "<br> <b>Total Black Population: </b> " + str(int(district["black_pop"]))
    pop_circle = folium.CircleMarker(centre_ll, popup=pop_popup,
                                     radius=float(district["black_pop"])*0.0005,
                                     color='white', fill=True)
    pop_circle.add_to(mapd)

mapd

**Inferences** 

> - All the districts have a relatively low black population proportion, and yet we saw that the number of crimes in those areas is higher. And we also saw that police actions are higher in those areas as well. 
> - District 1, where the highest levels of disparity existed, actually has a lower percentage of blacks residing there.  
> - The above plot indicates that **it is not necessarily true that more crime exists in areas where there is more black population**, and most importantly, **Crime Rates cannot be attributed to racial behavior**


<a id="3.2"></a>
### <font color="#703bdb">3.2 Poverty Levels</font>

Apart from Crime rate, we will also explore if there is any relationship between low poverty levels of an area, the crime levels / crime rates and the police activity. 

In [None]:
## Attach each district's below-poverty population for the comparison map.
poverty_cols = [_identifier, "below_pov_pop"]
police_df = police_df.merge(enriched_df[poverty_cols], on=_identifier)

mape = folium.Map(center_ll, zoom_start=10.5, tiles='CartoDB dark_matter')

## blue polygons: police districts
district_style = {'fillColor': "blue", 'color': "blue", 'weight': 1, 'fillOpacity': 0.2}
folium.GeoJson(shape_gdf, style_function=lambda feature: dict(district_style)).add_to(mape)

## Three circles per district, all centred on the district centroid.
for _, district in police_df.iterrows():
    centre_ll = _get_latlong_point(district["geometry"].centroid)
    district_tag = "<b>District ID:</b>" + district["LOCATION_DISTRICT"]

    ## red: aggregated crime count
    crime_popup = district_tag + "<br> <b>Total Crime: </b> " + str(district["CrimeCount"])
    crime_circle = folium.CircleMarker(centre_ll, popup=crime_popup,
                                       radius=float(district["CrimeCount"])*0.005,
                                       color='red', fill=True)
    crime_circle.add_to(mape)

    ## yellow: population below the poverty line
    poverty_popup = district_tag + "<br> <b>Total Below Poverty Population: </b> " + str(int(district["below_pov_pop"]))
    poverty_circle = folium.CircleMarker(centre_ll, popup=poverty_popup,
                                         radius=float(district["below_pov_pop"])*0.0005,
                                         color='yellow', fill=True)
    poverty_circle.add_to(mape)

    ## white: use-of-force incidents
    force_popup = district_tag + "<br> <b>Use of Force:</b> " + str(district["total_incidents"])
    force_circle = folium.CircleMarker(centre_ll, popup=force_popup,
                                       radius=float(district["total_incidents"])*0.01,
                                       color='white', fill=True)
    force_circle.add_to(mape)

mape

**Inferences** 

> - In this plot, yellow circles represent the **below poverty population**, and we can observe that it is not true that crime is higher when the below poverty population is higher. However, police activity (use of force in this case) is still higher. So low poverty levels cannot be taken as the reason for police behaviours.

### <font color="#703bdb">End Note</font>

In the end, we saw that a controlled exploratory analysis suggests that there exists some level of racial disparity. We also saw the extent of racial disparity in different districts, along with factors such as high crime or low poverty, to which police activity cannot be attributed.

In the <a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033">Next Kernel </a>, I have shared the analysis report for Los Angeles police department with a different analysis template than this one. 