In [None]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
from geopandas import GeoDataFrame

from shapely.geometry import Point
import pandas as pd, numpy as np 
import shutil, os, ast, folium
import geopandas as gpd

# Switch Plotly into offline notebook mode so the `iplot` calls below render inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook - confirm against the Plotly documentation.
init_notebook_mode(connected=True)

def _read_shape_gdf(_base_dir, selected_dept):
    """Read the department shapefile stored under ``_base_dir``.

    ``selected_dept`` is accepted so existing call sites keep working, but it
    is not used. Returns the result of ``gpd.read_file`` for
    ``<_base_dir>/shapefiles/department.shp``.
    """
    return gpd.read_file(_base_dir + "/shapefiles/department.shp")

def _get_latlong_point(point):
    _ll = str(point)
    _ll = _ll.replace("POINT (","").replace(")", "")
    _ll = list(reversed([float(_) for _ in _ll.split()]))
    return _ll

def _agebin(x):
    if str(x).lower() == "nan":
        return None
    
    ranges = [20, 24, 34, 44, 54, 59, 100]
    tags = ["<20", "20-24", "25-34", "35-44", "45-54", "55-59", "60+"]
    for i, rng in enumerate(ranges):
        if int(x) <= rng:
            return tags[i]

# Per-department settings, keyed by the department folder name.
#   _rowid : name of the column in that department's shapefile; `_get_dfs`
#            renames it to `_identifier` so the shapes can be joined with the
#            CSV data.
#   ct_num : two-character code that is not read by any code visible in this
#            notebook. NOTE(review): looks like a state FIPS code - confirm.
depts_config = {
    'Dept_23-00089' : {'_rowid' : "DISTRICT", "ct_num" : "18"},  
    'Dept_49-00035' : {'_rowid' : "pol_dist", "ct_num" : "06"},  
    'Dept_24-00013' : {'_rowid' : "PRECINCT", "ct_num" : "27"},  
    'Dept_24-00098' : {'_rowid' : "gridnum",  "ct_num" : "27"},   
    'Dept_49-00033' : {'_rowid' : "number",   "ct_num" : "06"},    
    'Dept_11-00091' : {'_rowid' : "ID",       "ct_num" : "25"},         
    'Dept_49-00081' : {'_rowid' : "company",  "ct_num" : "06"},   
    'Dept_37-00049' : {'_rowid' : "Name",     "ct_num" : "48"},      
    'Dept_37-00027' : {'_rowid' : "CODE",     "ct_num" : "48"},     
    'Dept_49-00009' : {'_rowid' : "objectid", "ct_num" : "53"}, 
}

# Name of the district key column used to align the police, enriched, events
# and shape frames.
_identifier = "LOCATION_DISTRICT"

def _get_dfs(_dept):
    """Load and align the pipeline outputs of one department.

    Reads the enriched, police and events CSV files and the shapefile from the
    department's folder, casts the district key (``_identifier``) to ``str`` in
    the shape, enriched and police frames, joins the shapefile geometry onto
    the police frame and, when ``SUBJECT_AGE`` is present, adds an ``agebin``
    column to the events frame.

    Returns ``(enriched_df, police_df, shape_gdf, _base_dir, events_df)``.
    """
    _base_dir = "../input/3-example-runs-of-automation-pipeline/CPE_ROOT/" + _dept

    enriched_df = pd.read_csv(_base_dir + "/enriched_df.csv")
    police_df = pd.read_csv(_base_dir + "/police_df.csv")
    shape_gdf = _read_shape_gdf(_base_dir, _dept)

    ## Convert Dictionary Columns
    # Every police column except the district key is parsed with literal_eval.
    literal_columns = [name for name in police_df.columns if name != _identifier]
    for name in literal_columns:
        police_df[name] = police_df[name].apply(ast.literal_eval)

    events_path = _base_dir + "/events/events_df.csv"
    raw_events = pd.read_csv(events_path, low_memory=False, parse_dates=["INCIDENT_DATE"])
    # The first data row is skipped.
    # NOTE(review): presumably a secondary header/description row - confirm.
    events_df = raw_events[1:]

    ## custom code - only for _dept : Dept_23-00089
    if _dept == "Dept_23-00089":
        def _strip_district(value):
            return value.replace(" District", "")

        for frame in (police_df, events_df):
            frame[_identifier] = frame[_identifier].fillna("")
            frame[_identifier] = frame[_identifier].apply(_strip_district)

    # Give the shapefile's row-id column the common key name, then make the
    # key a string everywhere so the merge compares like with like.
    rowid_column = depts_config[_dept]['_rowid']
    shape_gdf = shape_gdf.rename(columns={rowid_column: _identifier})
    for frame in (shape_gdf, enriched_df, police_df):
        frame[_identifier] = frame[_identifier].astype(str)

    geometry_lookup = shape_gdf[[_identifier, "geometry"]]
    police_df = police_df.merge(geometry_lookup, on=_identifier)

    if "SUBJECT_AGE" in events_df.columns:
        events_df["agebin"] = events_df["SUBJECT_AGE"].apply(_agebin)

    return enriched_df, police_df, shape_gdf, _base_dir, events_df

# <font color="#703bdb">Part 4.3 Analysis Report - Indianapolis Department (23-00089)</font><hr>

<a href="http://policingequity.org/">Center for Policing Equity</a> is a research and action think tank that works collaboratively with law enforcement, communities, and political stakeholders to identify ways to strengthen relationships with the communities they serve. CPE is also the home of the nation's first and largest <a href="http://policingequity.org/national-justice-database/">database</a> tracking national statistics on police behavior. 

The main aim of CPE is to bridge the divide created by communication problems, suffering and generational mistrust, and forge a path towards public safety, community trust, and racial equity. This kernel series is my contribution to the <a href="https://www.kaggle.com/center-for-policing-equity/data-science-for-good">Data Science for Good: Center for Policing Equity</a>. The contribution is focused on providing a generic, robust, and automated approach to integrate, standardize the data and further diagnose disparities in policing, shed light on police behavior, and provide actionable recommendations. 

Following are parts of Kernels Submissions in order:  

<ul>
    <li><a href="https://www.kaggle.com/shivamb/1-solution-workflow-science-of-policing-equity/">Part 1: Solution Workflow - The Science of Policing Equity </a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/2-automation-pipeline-integration-processing">Part 2: Data Integration and Processing : Automation Pipeline</a>  </li>
    <li><a href="https://www.kaggle.com/shivamb/3-example-runs-of-automation-pipeline">Part 3: Example Runs of Automation Pipeline </a>  </li> 
    <li><a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013">Part 4.1: Analysis Report - Measuring Equity - Minneapolis Police Department </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033">Part 4.2: Analysis Report - Measuring Equity - Los Angeles Police Department (49-00033) </a>   </li>
    <li><a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis">Part 4.3: Analysis Report - Officer Level Analysis - Indianapolis Police Department (23-00089) </a>   </li></ul>

The complete overview of the solution is shared in the *first kernel*. It explains the process and flow of automation, standardization, processing, and analysis of data. In the *second kernel*, the first component of the solution pipeline : data integration and processing is implemented. It processes both core level data as well as department level data. In the *third kernel*, this pipeline is executed and run for several departments. After all the standardized and clean data is produced, it is analysed with different formats of the Analysis Framework in 4.1, 4.2 and 4.3 kernels. In *kernel 4.1*, core analysis is done along with link with crime rate and poverty data. In *kernel 4.2*, core analysis is done along with statistical analysis. In *kernel 4.3*, officer level analysis is done. 

<hr>

This kernel is the fourth part of the series (Part 4.3). It presents the analysis report for the Indianapolis Police Department. Its analysis template differs from the previous two: the main focus is on officer level analysis, i.e. the analysis makes use of officer attributes in measuring racial bias. <br>

<a href="https://www.kaggle.com/shivamb/4-1-analysis-report-minneapolis-24-00013"> 4.1  Analysis Report :  24-00013  + <b>(What can account for Racial Disparities ?)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-2-analysis-report-lapd-49-00033"> 4.2 Analysis Report : 49-00033 + <b>(Statistical Analysis : Regression + Correlations)</b> </a>  
<a href="https://www.kaggle.com/shivamb/4-3-analysis-report-officer-level-analysis"> 4.3 Analysis Report : 23-00089 + <b>(Officer Level Analysis)</b> </a>  <br>

<br>
<b>Contents of this kernel:</b> 

<ul>
    <li><a href="#1">1. Key Highlights </a>  </li>
    <li><a href="#2">2. Racial Bias : Officer Level Analysis </a>  </li>
    <ul>
        <li><a href="#2.1">2.1 Racial Disparity - Force Used by Officer Age </a>  </li>
        <li><a href="#2.2">2.2 Experienced Vs Junior Officers : Use of Force</a>  </li>
        <li><a href="#2.3">2.3 Does Officer Race impacts racial bias ? </a>  </li>
        <li><a href="#2.4">2.4 Officer Profiling </a>  </li>
        <li><a href="#2.5">2.5 Department Level : Extent of Racial Disparity  </a>  </li>
        <li><a href="#2.6">2.6 Department Level : Key Statistics to Measure Racial Disparity  </a>  </li>
        <li><a href="#2.7">2.7 Disparity Score : A Naive Model (Example) </a>  </li>
        <li><a href="#2.8">2.8 Measuring Racial Bias : Use of Force Severity </a>  </li>
        <li><a href="#2.9">2.9 Most significant variables : Use of Force's Severity</a>  </li></ul>
</ul>

<a id="1"></a>
## <font color="#703bdb">1. Key Highlights <hr></font>

Officer level data is not shared by every department, but if shared it can be used to analyse the racial bias from a different perspective - officer attributes. In this kernel, I have shared the analysis done from the officer level attributes. Following are the key highlights of the analysis: 

- Officers having age less than 30 have used force more on blacks than whites among the total subjects which they encountered.  
- Recently recruited officers tend to show some level of racial bias, as their use-of-force on blacks is much higher than that of whites.  
- More experienced officers have used force more on whites than blacks, indicating that this group does not show a very high racial bias specifically against the black population.  
- Use of force severity is highest on hispanics, followed by blacks and then whites.  
- The most significant factors that affect the severity of use of force are the race of the subject and the years of experience of the officer.  



<a id="2"></a>
## <font color="#703bdb">2. Racial Bias : Officer Level Analysis <hr></font>

All the analysis is first done at high level and then by controlling the socio-econometric or demographic factors of the area which the department serves. As the first step, we set the department and load the processed datasets that were produced from the data processing and analysis pipeline. 

In [None]:
# Department analysed in this report, and the frames the pipeline produced for it.
_dept = "Dept_23-00089"
enriched_df, police_df, shape_gdf, _base_dir, events_df = _get_dfs(_dept)

# Use-of-force events split by the race of the subject.
white_subjects = events_df.loc[events_df["SUBJECT_RACE"] == "White"]
black_subjects = events_df.loc[events_df["SUBJECT_RACE"] == "Black"]

<a id="2.1"></a>
## <font color="#703bdb">2.1 Force Used by Officer Age <hr></font>

First, we look at the proportion of force used by police on black subjects and white subjects, distributed by officer age. We try to find whether there is some level of racial disparity in the force used by police officers of different age groups. This will help to identify which age groups show high racial bias characteristics. 

In [None]:
races = ["Black", "White"]
colors = ["black", "white"]

# Per-district percentage of each listed race; the remainder is booked as "other".
for race_name in races:
    prefix = race_name.lower()
    enriched_df[prefix + "_per"] = 100 * enriched_df[prefix + "_pop"] / enriched_df["total_pop"]
enriched_df["other_per"] = 100 - enriched_df["white_per"] - enriched_df["black_per"]

# Population shares over all districts combined.
total_population = sum(enriched_df["total_pop"])
wp = 100 * sum(enriched_df["white_pop"]) / total_population
bp = 100 * sum(enriched_df["black_pop"]) / total_population
op = 100 - wp - bp

# One stacked horizontal bar: (legend label, share, fill colour, opacity).
population_segments = [
    ("Whites", wp, "white", 1),
    ("Blacks", bp, "black", 0.5),
    ("Others", op, "orange", 0.5),
]
data = [
    go.Bar(x=[share], y=["Population"], name=label, orientation='h',
           marker=dict(color=fill, opacity=alpha, line=dict(color='black', width=1)))
    for label, share, fill, alpha in population_segments
]

layout = go.Layout(
    barmode='stack',
    height=250,
    title='Population Proportion : By Race',
    legend=dict(orientation="h", x=0.1, y=1.35),
    xaxis=dict(title="Population Proportion %"),
    showlegend=True,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


# Number of events per officer age, separately for white and black subjects.
race_doc_w = dict(white_subjects["OFFICER_AGE"].value_counts())
race_doc_b = dict(black_subjects["OFFICER_AGE"].value_counts())

# Convert the counts to percentages of each group's own total.
total_w = sum(race_doc_w.values())
total_b = sum(race_doc_b.values())
race_per_w = {age: round(100 * float(count) / total_w, 2) for age, count in race_doc_w.items()}
race_per_b = {age: round(100 * float(count) / total_b, 2) for age, count in race_doc_b.items()}


## plot
outline = dict(width=1, color="black")
trace1 = go.Bar(
    x=list(race_per_w.keys()),
    y=list(race_per_w.values()),
    name="Use of Force on Whites",
    marker=dict(color="white", line=outline),
)
trace2 = go.Bar(
    x=list(race_per_b.keys()),
    y=list(race_per_b.values()),
    name="Use of Force on Blacks",
    marker=dict(color="black", opacity=0.5, line=dict(width=1, color="black")),
)

layout = go.Layout(
    title='Use-of-Force on Blacks and Whites by Officer Age',
    height=450,
    xaxis=dict(range=(21, 52.5), title="Officer Age"),
    yaxis=dict(title="Percentage of Subjects", range=(0, 10)),
    legend=dict(orientation="h", x=0.1, y=1.17),
)
fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig, filename='style-bar')

**Graph Interpretation**

*The first plot indicates that population proportion of black people is about 28% while it is about 61% for whites. The second graph represents the percentage of subjects who received force from the police officers of different age. The Y axis represents the percentage of subjects, X axis represents the officer age. Additionally, we observe the percentage of two groups of subjects - blacks and whites*  

**Inference**

> - Though there is a big difference in the population of blacks and whites, the force used by police officer of different age groups shows different distributions. 
> - We can observe that when **officer age is less than 30, the height of the black bars is more than that of the white bars**. This indicates that officers aged less than 30 have used force more on blacks than whites among the total subjects they encountered. This reflects **a form of racial disparity.**  
> - Also, we can observe that officers aged more than 30 have higher white bars than black bars (almost all of them). This indicates that **officers aged 30+ tend to use force more on whites than blacks**. This also aligns with the population proportions of whites and blacks in the areas, thus reflecting a very low or no racial disparity.  

<a id="2.2"></a>
## <font color="#703bdb">2.2 Experienced Vs Junior Officers : Use of Force <hr></font>

Next, we analyse how much disparity exists when the subjects encountered by officers are separated by the officers' number of years in service. 

In [None]:
# Number of events per officer tenure (years on force), by subject race.
race_doc_w = dict(white_subjects["OFFICER_YEARS_ON_FORCE"].value_counts())
race_doc_b = dict(black_subjects["OFFICER_YEARS_ON_FORCE"].value_counts())

# Convert the counts to percentages of each group's own total.
total_w = sum(race_doc_w.values())
total_b = sum(race_doc_b.values())
race_per_w = {years: round(100 * float(count) / total_w, 2) for years, count in race_doc_w.items()}
race_per_b = {years: round(100 * float(count) / total_b, 2) for years, count in race_doc_b.items()}


## plot
trace1 = go.Bar(
    x=list(race_per_w.keys()),
    y=list(race_per_w.values()),
    name="Use of Force on Whites",
    marker=dict(color="white", line=dict(width=1, color="black")),
)
trace2 = go.Bar(
    x=list(race_per_b.keys()),
    y=list(race_per_b.values()),
    name="Use of Force on Blacks",
    marker=dict(color="black", opacity=0.5, line=dict(width=1, color="black")),
)

layout = go.Layout(
    title='Use-of-Force on Blacks and Whites by Officer Years on Force',
    height=450,
    xaxis=dict(title="Officer Years on Force", range=(0, 20.5)),
    yaxis=dict(title="Percentage of Subjects", range=(0, 20)),
    legend=dict(orientation="h", x=0.1, y=1.17),
)
fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig, filename='style-bar')

**Inferences**  
> - **Recently recruited officers tend to show some level of racial bias**, as their **use-of-force on blacks is much higher than that on whites**. This can be observed from the above plot: when the officer's years on force is less than 5-6, the height of the black bars is more than that of the white bars.   
> - More **experienced officers** have used force **more on whites than blacks**, indicating that this group does not show a very high racial bias specifically against the black population.  

<a id="2.3"></a>
## <font color="#703bdb">2.3 Does Officer Race impacts racial bias ? <hr></font>

Next we look at whether there is any racial bias that can be explained by officer race. We will plot a co-occurrence matrix in order to visualize it. 

In [None]:
# Keep only events where both the subject and the officer belong to one of
# the three compared races.
compared_races = ["Black", "White", "Hispanic"]
subject_ok = events_df["SUBJECT_RACE"].isin(compared_races)
officer_ok = events_df["OFFICER_RACE"].isin(compared_races)
small_df = events_df[subject_ok & officer_ok]

cols = ['SUBJECT_RACE', 'OFFICER_RACE']
colmap = sns.light_palette("gray", as_cmap=True)


def _row_percent(counts_row):
    """Express one crosstab row as percentages of its own total (2 decimals)."""
    return round(100 * counts_row / counts_row.sum(), 2)


# Rows: officer race, columns: subject race.
pair_counts = pd.crosstab(small_df[cols[1]], small_df[cols[0]])
tb = pair_counts.apply(_row_percent, axis=1)
tb = tb.style.background_gradient(cmap=colmap)
tb

**Inference**  

> - Though the percentage of force used on black subjects is much higher than on hispanics and whites, no clear evidence can be established that officers of the same race do not show racial bias, and vice versa. 

<a id="2.4"></a>
## <font color="#703bdb">2.4 Officer Profiling  <hr></font>

One officer may have handled more than one subject, so it is better to profile the officers and derive their characteristics using aggregations. These profiles can be used to evaluate some factors and their role in racial disparity. 

In [None]:
from collections import Counter 
def find_percentage(row):
    """Return each race's share (in %, 2 decimals) of the officer's force count.

    ``row["SUBJECT_RACE"]`` is a mapping of race -> number of events and
    ``row["uof_count"]`` is the denominator used for every share.
    """
    return {
        race: round(100 * float(cnt) / row["uof_count"], 2)
        for race, cnt in row["SUBJECT_RACE"].items()
    }


def _count_races(race_values):
    """Tally how often each race label occurs among one officer's events."""
    return Counter("|".join(race_values).split("|"))


# One row per officer: a race tally plus the number of events with a recorded
# subject gender, which serves as the officer's use-of-force count.
officer_df = (
    events_df.groupby('OFFICER_ID')
    .agg({"SUBJECT_RACE": _count_races, "SUBJECT_GENDER": "count"})
    .reset_index()
    .rename(columns={"SUBJECT_GENDER": "uof_count"})
    .sort_values("uof_count", ascending=False)
)
# Only officers with at least three recorded events are profiled.
officer_df = officer_df[officer_df["uof_count"] >= 3]

officer_df["race_percent"] = officer_df.apply(find_percentage, axis=1)
for race_label in ("Black", "White", "Hispanic"):
    share_column = race_label.lower() + "_subjects_per"
    officer_df[share_column] = officer_df["race_percent"].apply(
        lambda shares, key=race_label: shares.get(key, 0.0)
    )
officer_df["difference_b_w"] = officer_df["black_subjects_per"] - officer_df["white_subjects_per"]
officer_df = officer_df.drop(["SUBJECT_RACE", "race_percent"], axis=1)
officer_df.head()

> - The above dataframe is the officer profile database. In this table every row is the profile of one officer, showing the aggregated analysis of the officer and the subjects handled by them in the past.  
> - The main columns are "uof_count" : Total number of times force used, "black/white/hispanic_subjects_per" : percentage of force used on black/white/hispanic subjects, and "difference_b_w" : Difference between the force used percentages of black subjects and white subjects.
> - Officer profiling will help us to evaluate the department. 

<a id="2.5"></a>
## <font color="#703bdb">2.5 Department Level : Extent of Racial Disparity <hr></font>

In order to understand the extent of racial disparity that exists in a department, we analyse the number of officers having more black subject encounters than white ones. If this number turns out to be very high, then the levels of racial disparity are also higher. We now analyse the number of black vs white subjects encountered by number of officers. 

In [None]:
# Distribution, over officers, of the share of their force used on black and
# on white subjects.
black_marker = dict(color="black", opacity=0.5)
white_marker = dict(color="white", opacity=1.0, line=dict(width=1, color="black"))

trace1 = go.Histogram(
    x=officer_df["black_subjects_per"],
    name="Blacks - Count of Force used",
    marker=black_marker,
)
trace2 = go.Histogram(
    x=officer_df["white_subjects_per"],
    name="Whites - Count of Force used",
    marker=white_marker,
)

layout = go.Layout(
    title='',
    height=400,
    xaxis=dict(title="Officer's percentage of Force Used on Black / White ", range=(2, 98)),
    yaxis=dict(title="Number of Officers", range=(0, 130)),
    legend=dict(orientation="h", x=0.1, y=1.17),
)
fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig, filename='style-bar')

**Graph Interpretation** 

> - The above plot shows the glimpse of the entire department. The x-axis represents percentage of force used on blacks or whites by the officers. The y axis represents the number of officers.  
> - An example - The two bars at x-axis = 10 indicates that among all the police officers in the department, the ones who have used about 10% force on either blacks or whites, there are only 14 officers who have used force on blacks and 27 police officers who have used force on whites.  
> - Another example : The two bars at x-axis = 70 indicate that among all the police officers in the department, of the ones who have used about 70% force on either blacks or whites, there are 27 who have used force on blacks and only 18 who have used force on whites. 
> - What is racial disparity : black_height > white_height on the right side of the graph, i.e. there is a large majority of officers who have used force more on blacks than on whites. 

**Inferences** 
> - The left side of the plot shows the officers who have used force on the subjects a smaller share of the time. If we observe the **left side of the plot** (before 50), we can see that **more police officers have handled more white subjects** than black subjects. 
> - The right side of the plot shows the officers who have used force on the subjects a larger share of the time. If we observe the **right side of the plot** (after 50), we can see that there are **more police officers who have handled more blacks** than whites. 
> - These insights suggest that police officer shows a level of disparity in using force on blacks.  

<a id="2.6"></a>
## <font color="#703bdb">2.6 Department Level : Key Statistics to Measure Racial Disparity  <hr></font>

Now, let's find some numbers to identify the extent of racial bias. 

1. Percentage of Officers in the department who have **used force more on blacks than whites**   
    > Lower the better  

2. Percentage of Officers in the department who have **used force only on black subjects**  
    > Lower the better  
 
3. Percentage of Officers in the department who have **used force only on white subjects**  
    > Higher the better  

4. Percentage of Officers in the department having **large difference in force used on blacks and whites**
     > Lower the better 

In [None]:
# `display` and `HTML` are part of the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML("<h3>Key Statistics : Extent of Racial Disparity</h3>"))

# Each entry pairs a boolean officer filter with the caption printed after the
# percentage of officers that satisfy it.
# NOTE(review): the filters are threshold proxies for the captions - a >= 40%
# share on black subjects stands in for "more on blacks than whites", a >= 90%
# share stands in for "only on", and the last filter is >= 50 while its caption
# says ">50%". Confirm these thresholds are intended.
key_stats = [
    (officer_df["black_subjects_per"] >= 40,
     "% </font> percentage of officers who have <b>used force more on blacks than whites</b>"),
    (officer_df["black_subjects_per"] >= 90,
     "% </font> percentage of officers who have <b>used force more only on Blacks</b>"),
    (officer_df["white_subjects_per"] >= 90,
     "% </font> percentage of officers who have <b>used force more only on Whites</b>"),
    (officer_df["difference_b_w"] >= 50,
     "% </font> officers with <b>large difference (>50%) in force used on blacks and whites</b>"),
]

for selected, caption in key_stats:
    # Share of profiled officers matching the filter, rounded to a whole percent.
    val = round(100 * officer_df[selected].shape[0] / officer_df.shape[0])
    html = "<font size=7>" + str(val) + caption
    display(HTML(html))

<a id="2.7"></a>
## <font color="#703bdb">2.7 Disparity Score : A Naive Model (Example) <hr></font>

CPE can in fact use these statistics at the first level to come up with a racial disparity score for every department. **Of course, this is calculated from the officer level attributes, but the idea is that such statistics can be calculated from other attributes as well.** For example, Police Activity Statistical Measures : total count of use of force, use of force on blacks, difference in blacks and whites etc.   

A simple naive model can be built by obtaining the normalized score of these statistical measures. For example, the first statistics : percentage of UOF on blacks is more than whites. I created a following table (which can be fine tuned with iterative runs and domain knowledge) which can be used to normalize the statistical scores, for example in this case, the percentage of officers whose UOF on blacks is more than whites is 62%, so it turns out to be 8.5

<table>
    <tr><td><b>Statistical Measure</b> </td> <td> <b>Range</b> </td> <td> <b>Normalized Score</b> </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 0 - 10 </td> <td> 1 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 10 - 20 </td> <td> 3 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 20 - 30 </td> <td> 5 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 30 - 40 </td> <td> 6 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 40 - 50 </td> <td> 7 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 50 - 60 </td> <td> 8.0 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 60 - 70 </td> <td> 8.5 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 70 - 80 </td> <td> 9 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 80 - 90 </td> <td> 9.5 </td>  </tr>
    <tr><td>UOF_officers : blacks > whites </td> <td> 90 - 100 </td> <td> 10 </td>  </tr>
</table>

Similar tables can be developed and improved for other statistics as well. And finally, weights can be decided for every statistics. For example - 

<table>
    <tr><td><b>Statistical Measure</b></td> <td> <b>Weight</b> </td> </tr>
    <tr><td> UOF_officers : blacks > whites </td> <td> 0.40 </td> </tr>
    <tr><td>UOF_officers : only_on_blacks </td> <td> 0.30 </td> </tr>
    <tr><td>UOF_officers : whites > blacks </td> <td> 0.20 </td> </tr>
    <tr><td>UOF_officers : large_difference </td> <td> 0.10 </td> </tr>
</table>

So in this case, the normalized racial score for Indianapolis department can be calculated as following: 

normalized scores : 8.5 for UOF_officers : blacks > whites, 6 for UOF_officers : only_on_blacks, 7 for UOF_officers : whites > blacks, and 4 for UOF_officers : large_difference

8.5 ( 0.40 ) + 6 ( 0.30 ) + 7 ( 0.20 ) + 4 ( 0.10 ) = 7.0 

## Racial Disparity Score (Indianapolis) = 7.0 / 10 

A similar score can be calculated for other departments and can be shared in their reports which help the departments understand where and till what level's they need to improve. 

<a id="2.8"></a>
## <font color="#703bdb">2.8 Measuring Racial Bias : Use of Force Severity <hr></font>

One way to measure racial bias in the department is to look at the severity of the force used and check which race experiences the highest severity on average. However, this variable is not directly available, so we can quantify it using the following weighting scheme. I referred to one of the CPE reports to come up with this type of weighting. 

<table>
    <tr> <td><b>Use of Force Type</b> </td> <td> <b>Severity Score</b> </td> </tr>
    <tr> <td>Physical Use of Force</td> <td> 1 </td> </tr>
    <tr> <td>Canine</td> <td> 2 </td> </tr>
    <tr> <td>Use of CS/OC </td> <td> 3.5 </td> </tr>
    <tr> <td>Less Lethal </td> <td> 5 </td> </tr>
    <tr> <td>Lethal Handgun </td> <td> 8 </td> </tr>
    <tr> <td>Lethal Vehicle </td> <td> 10 </td> </tr>
</table>

Based on this weighing, we can plot the average severity of use of force by different race of the subjects. 


In [None]:
def _force_severity(force_type):
    """Return the severity weight of a TYPE_OF_FORCE_USED label.

    Returns ``None`` when the label matches none of the known categories.
    The order of the checks matters: "Lethal-Vehicle" must be tested before
    the generic "Lethal" prefix.
    """
    if force_type.startswith("Physical"):
        return 1
    if force_type.startswith("Canine"):
        return 2
    if "CS/OC" in force_type:
        # NOTE(review): the severity table in the report text lists 3.5 for
        # CS/OC, while this analysis has always used 4.5 - confirm which one
        # is intended. The value is left unchanged here.
        return 4.5
    if force_type.startswith("Less Lethal"):
        return 5
    if force_type == "Lethal-Vehicle":
        return 10
    if force_type.startswith("Lethal"):
        return 8
    return None


# Severity weight for every force type that occurs in the data and is recognised.
severity_score = {}
for x in events_df["TYPE_OF_FORCE_USED"].value_counts().index:
    weight = _force_severity(x)
    if weight is not None:
        severity_score[x] = weight

# Work on an explicit copy so that adding a column does not write into a
# slice of the original frame (avoids pandas' SettingWithCopyWarning).
events_df = events_df[~events_df["TYPE_OF_FORCE_USED"].isna()].copy()

# `map` leaves unrecognised force types as NaN instead of failing with a bare
# KeyError; NaN rows are ignored by the mean below, so report them explicitly.
events_df["uof_severity"] = events_df["TYPE_OF_FORCE_USED"].map(severity_score)
unmapped = int(events_df["uof_severity"].isna().sum())
if unmapped:
    print("Warning: %d events have a force type without a severity weight" % unmapped)

# Average severity per subject race, restricted to incidents from 2015.
t1 = events_df[events_df['INCIDENT_YEAR'] == 2015].groupby("SUBJECT_RACE").agg({"uof_severity" : "mean" }).reset_index()

# The first two race groups (in groupby order) are left out of the chart.
# NOTE(review): positional slice - confirm which categories are being skipped.
trace1 = go.Bar(x = t1.SUBJECT_RACE[2:], y=t1.uof_severity[2:], name="Average Severity", 
                marker=dict(color="gray", opacity = 0.4))

layout = go.Layout(title='Use-of-Force Average Severity by Race of the Subject', height=400,
                   legend=dict(orientation="h", x=0.1, y=1.17), 
                  yaxis=dict(range=(0.85, 2.3)))
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig, filename='style-bar')

> - Use of force severity is highest on hispanics, followed by blacks and then whites.  

<a id="2.9"></a>
## <font color="#703bdb">2.9 Most significant variables : Use of Force's Severity <hr></font>

Now, we can use officer level attributes, subject level attributes and socio-economic factors of an area in order to identify which variables matter the most. 


In [None]:
import statsmodels.api as sm_api
from sklearn import preprocessing

# Attach the district-level socio-economic features to every event.
merged_df = enriched_df.drop_duplicates().merge(events_df, on = "LOCATION_DISTRICT")

# 0/1 indicator features derived from subject and officer race.
merged_df['is_same_race'] = (merged_df["SUBJECT_RACE"] == merged_df["OFFICER_RACE"]).astype(int)
merged_df['is_black_subj'] = (merged_df["SUBJECT_RACE"] == "Black").astype(int)
merged_df['is_white_subj'] = (merged_df["SUBJECT_RACE"] == "White").astype(int)
merged_df['is_hispanic_subj'] = (merged_df["SUBJECT_RACE"] == "Hispanic").astype(int)

# Missing officer attributes are replaced with fixed fallback values.
merged_df["OFFICER_AGE"] = merged_df["OFFICER_AGE"].fillna(30.0)
merged_df["OFFICER_YEARS_ON_FORCE"] = merged_df["OFFICER_YEARS_ON_FORCE"].fillna(5.0)

# District population shares (as fractions) and the white-to-black ratio.
merged_df["white_per"] = merged_df["white_pop"] / merged_df["total_pop"]
merged_df["black_per"] = merged_df["black_pop"] / merged_df["total_pop"]
merged_df["hispanic_per"] = merged_df["hispanic_pop"] / merged_df["total_pop"]
merged_df["w2b"] = merged_df["white_pop"] / merged_df["black_pop"]


cols = ["is_black_subj", "is_white_subj", "is_hispanic_subj", "w2b", "white_per", "black_per", "hispanic_per", 'whites_income',
       'blacks_income', 'below_pov_pop',
       'whites_ep_ratio', 'blacks_ep_ratio', 'whites_unemp_ratio',
       'blacks_unemp_ratio', "OFFICER_AGE", "OFFICER_YEARS_ON_FORCE", "is_same_race"]

# Standardise every feature (zero mean, unit variance).
x = merged_df[cols].values
scaler = preprocessing.StandardScaler()
t4 = scaler.fit_transform(x)
t4 = pd.DataFrame(t4, columns = cols)

target = "uof_severity"
# statsmodels' OLS does not add an intercept on its own. Without one the
# regression is forced through the origin although the log-severity target
# has a non-zero mean, which distorts the standard errors and R-squared.
X = sm_api.add_constant(t4[cols])
Y = np.log1p(merged_df[target])
model = sm_api.OLS(Y, X)
results = model.fit()
results.summary()

**Inferences**  

> - The most significant variables that affect the severity of use of force are : 

        - is_black_subj : If the subject is Black  
        - is_hispanic_subj : If the subject is Hispanic  
        - OFFICER_YEARS_ON_FORCE : Number of years an officer has served in the department  

Though the model may be suffering from multicollinearity problems, if we concentrate only on the significant variables, it can be stated that being black or hispanic increases the chances that officers will use force, and more severe force at that. 


### End Notes

Thanks for viewing my submission. I hope it will be useful to CPE. Please do share the valuable comments and feedback. 