In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Remove the row limit so pandas renders every row of a displayed frame/series.
pd.set_option("display.max_rows", None)

# Importing and Inspecting the dataset

In [None]:
# Load the fatal police shootings dataset; the "date" column is parsed as datetimes.
df = pd.read_csv(
    "/kaggle/input/data-police-shootings/fatal-police-shootings-data.csv",
    parse_dates=["date"],
)

In [None]:
# Display the loaded frame as the cell output.
# NOTE(review): display.max_rows is set to None earlier in this notebook, so
# this renders every row rather than a truncated preview.
df

In [None]:
# Print the frame's column-level summary right after loading.
df.info()

In [None]:
# Summary (count / unique / top / freq) restricted to object-dtype columns.
df.describe(include=[object])

In [None]:
# Summary statistics using describe()'s defaults (no include/exclude filter).
df.describe()

# Visualising the given data

## Number of Police Shootings every Month Over the Years

We will try to determine whether the number of police shootings has increased over the years.

In [None]:
!pip install calmap
import calmap
daywise_incidents = df.groupby(df["date"])["id"].count()
plot, axis = calmap.calendarplot(daywise_incidents, monthticks=2, daylabels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], fillcolor="grey", linewidth=1, fig_kws=dict(figsize=(20,20)))
plot.colorbar(axis[0].get_children()[1], ax=axis, cmap=plt.cm.get_cmap("Blues",9), orientation="horizontal", label = "Number of incidents per day")
plt.show()

In [None]:
# Make sure the "date" column holds datetime values.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Re-print the column summary after the "date" conversion in the previous cell.
df.info()

In [None]:
# Monthly period (year-month) of each incident, used for per-month counts.
df["month_year"] = df["date"].dt.to_period(freq="M")

In [None]:
# Preview the first five rows (head's default) to check the new column.
df.head()

In [None]:
# Incidents per month, ordered from the busiest month downwards.
month_year_vc = df["month_year"].value_counts()
month_year_vc

The highest number of shootings took place in May 2020.

In [None]:
# Line chart of the monthly incident counts.
plt.figure(figsize=(18, 10))
sns.set(font_scale=0.7, palette="viridis")

# Period labels are converted to strings so they can be used as x values.
month_labels = month_year_vc.index.astype(str)
ax = sns.lineplot(x=month_labels, y=month_year_vc.values)

ax.set_title("Number of shootings over the years", fontsize=20)
ax.set_xlabel("Month and year", fontsize=10)
ax.set_ylabel("Number of shootings", fontsize=10)
plt.xticks(rotation=45)

# Fixed y-range with a tick every 2 incidents.
ax.set_ylim(60, 120)
ax.set_yticks(np.arange(60, 120, 2))
plt.show()

Although there is a minor increase in the number of shootings over the years, the above lineplot does not provide any conclusive evidence about the increase in the number of shootings over the years since the change is not significant.

## Manner of Death

Now, we will try to determine how the victims were killed and the portion of victims killed by each method.

In [None]:
# Distinct values recorded in the manner_of_death column.
df["manner_of_death"].unique()

The victims were either shot or shot and tasered.

In [None]:
# Pie chart of the share of victims per manner of death.
plt.figure(figsize=(12,8))
labels = ['shot', 'shot and tasered']

# Derive the shares from the data itself instead of dividing by a hard-coded
# row count, so the chart stays correct if the dataset grows or is filtered.
manner_counts = df["manner_of_death"].value_counts()
count_shot = manner_counts.get("shot", 0)
percentages = [
    manner_counts.get(label, 0) / manner_counts.sum() * 100 for label in labels
]
percentage_shot, percentage_shot_tased = percentages

explode=(0.1,0)  # pull the first slice ("shot") slightly out of the pie
plt.rcParams['font.size'] = 15  # note: changes the global matplotlib font size
plt.pie(percentages, explode=explode, labels=labels, autopct='%1.0f%%', shadow=False, startangle=0, pctdistance=1.2, labeldistance=1.4)
plt.axis('equal')
plt.title("Manner of death", fontsize=20)
plt.legend(frameon=False, bbox_to_anchor=(1.5,0.8), fontsize=15)
plt.show()

## Armed

Now, let's begin analyzing the kinds of weapons the victims of police shootings were carrying.

(Misc includes the following weapons: 

machete                               43
Taser                                 26
ax                                    24
sword                                 22
baseball bat                          18
gun and knife                         17
hammer                                16
sharp object                          14
metal pipe                            13
screwdriver                           13
box cutter                            12
gun and car                           11
hatchet                               11
gun and vehicle                       10
crossbow                               9
scissors                               7
pipe                                   6
rock                                   6
shovel                                 6
crowbar                                5
BB gun                                 5
piece of wood                          5
baton                                  5
meat cleaver                           5
blunt object                           5
metal object                           4
vehicle and gun                        4
chair                                  4
straight edge razor                    4
pick-axe                               4
samurai sword                          3
metal pole                             3
glass shard                            3
chain                                  3
pellet gun                             3
guns and explosives                    3
beer bottle                            3
metal stick                            3
brick                                  2
chain saw                              2
pole and knife                         2
lawn mower blade                       2
flashlight                             2
hatchet and gun                        2
pitchfork                              2
incendiary device                      2
garden tool                            2
pole                                   2
barstool                               1
baseball bat and bottle                1
ice pick                               1
stapler                                1
oar                                    1
baseball bat and fireplace poker       1
hand torch                             1
tire iron                              1
claimed to be armed                    1
Airsoft pistol                         1
fireworks                              1
pen                                    1
air pistol                             1
metal hand tool                        1
walking stick                          1
vehicle and machete                    1
motorcycle                             1
baseball bat and knife                 1
chainsaw                               1
air conditioner                        1
metal rake                             1
machete and gun                        1
car, knife and mace                    1
nail gun                               1
pepper spray                           1
cordless drill                         1
bayonet                                1
gun and sword                          1
spear                                  1
wasp spray                             1
wrench                                 1
bow and arrow                          1
flagpole                               1
carjack                                1
BB gun and vehicle                     1
contractor's level                     1
grenade                                1)


In [None]:
# Bar chart of the weapons victims were carrying; weapons seen fewer than
# MISC_THRESHOLD times are pooled into a single "misc" bar.
MISC_THRESHOLD = 80

weapon_counts = df["armed"].value_counts()
is_rare = weapon_counts < MISC_THRESHOLD
misc_total = weapon_counts[is_rare].sum()
print(misc_total)

# Build the plotted series from the common weapons first and append "misc"
# afterwards, so the pooled row can never be filtered out by the threshold.
armed_vc = weapon_counts[~is_rare].copy()
armed_vc.loc["misc"] = misc_total
print(armed_vc)

plt.figure(figsize=(12,8))
sns.set(font_scale=1, palette="viridis")
# Only the aggregated vectors are passed: combining `data=df` with vectors of a
# different length is rejected by current seaborn versions.
sns.barplot(x=armed_vc.index, y=armed_vc.values)
# Rotate the tick labels after plotting so the setting applies to the category labels.
plt.xticks(rotation=45, fontsize=12)
plt.show()

It is clear from the above bargraph that most of the victims were carrying guns.

## Age, Gender and Signs of Mental Illness

First, let us find out what percentage of the victims were male and what percentage were female.

In [None]:
# Count victims recorded as female / male and express each as a share of the
# two groups combined (rows with any other or missing gender are not counted).
f = df.loc[df["gender"] == "F", "id"].count()
m = df.loc[df["gender"] == "M", "id"].count()
perc_f = (f / (f + m)) * 100
perc_m = 100 - perc_f
print("The percentage of female victims are: ", perc_f, "%")
print("The percentage of male victims are: ", perc_m, "%")

Now, we will find out the percentage of victims showing signs of mental illness.

In [None]:
# Share of victims with / without recorded signs of mental illness.
ill = df[df.signs_of_mental_illness==True].id.count()
not_ill = df[df.signs_of_mental_illness==False].id.count()
perc_ill = (ill/(ill + not_ill))*100
perc_not_ill = 100 - perc_ill
print("Percentage of victims showing signs of mental illness:",perc_ill,"%")
# Report the complementary share here (previously perc_ill was printed twice).
print("Percentage of victims not showing any signs of mental illness:",perc_not_ill,"%")

The following is a visual representation of the portion of males and females who did or did not show signs of mental illness.

In [None]:
# Victim counts per gender, split by whether signs of mental illness were recorded.
plt.figure(figsize=(5, 8))
sns.set(font_scale=1, palette="viridis")
sns.countplot(x="gender", hue="signs_of_mental_illness", data=df)
plt.xticks(fontsize=10)
plt.show()

Here, we will try to determine how the male and female victims who either showed signs of mental illness or didn't are distributed across the age range.

In [None]:
# Age distribution by mental-illness flag; each violin is split by gender and
# shows the quartiles as inner lines.
plt.figure(figsize=(12, 12))
sns.violinplot(
    data=df,
    x="signs_of_mental_illness",
    y="age",
    hue="gender",
    split=True,
    inner="quartile",
)
plt.yticks(np.arange(0, 100, 2))  # a tick every two years of age
plt.show()

It is clear from the above violin plot that the median age of the men showing signs of mental illness was higher than that of the women. That said, the median age of both men and women who showed signs of mental illness is in the late forties.

Another interesting observation is that the interquartile range of men showing signs of mental illness is lower than that of the women. In layman's terms, the ages of women showing signs of mental illness were more spread out from the median than those of the men.

The first quartile is the median of the lower half of the data and the third quartile is the median of the upper half of the data. Therefore, about 50% of the women who showed signs of mental illness were between 29 and 50 years of age, and about 50% of the men who showed signs of mental illness were between 31 and 49 years of age.

## Flee v/s Manner of Death

With the following bar graph we will try to determine the fraction of people who did not try to flee from the police and were shot or shot and tasered.

In [None]:
# Victim counts per manner of death, split by how (or whether) the victim fled.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1, palette="viridis")
sns.countplot(x="manner_of_death", hue="flee", data=df)
plt.xticks(fontsize=10)
plt.show()

It is evident from the above bar graph that most of the individuals shot or shot and tasered were not fleeing from the police.

## Race

In [None]:
# Number of victims per recorded race code.
df["race"].value_counts()

For better understanding, here is a pie chart for comparison.

In [None]:
# Pie chart of the victims' race, including a slice for rows with no race recorded.
plt.figure(figsize=(12,10))
labels = ["White", "Black", "Asian" , "Native American", "Hispanic", "Other", "unknown"]
race_codes = ["W", "B", "A", "N", "H", "O"]  # same order as the first six labels

# Use the real row count rather than a hard-coded 5416.
total_rows = len(df)
race_counts = df["race"].value_counts()
percentages = [race_counts.get(code, 0) / total_rows * 100 for code in race_codes]

# Missing races must be detected with isna(): comparing a column to None with
# `==` is False for every row, which made the "unknown" slice always 0%.
percentages.append(df["race"].isna().sum() / total_rows * 100)

plt.rcParams['font.size'] = 15  # note: changes the global matplotlib font size
plt.pie(percentages, autopct="%1.0f%%", shadow=False, startangle=0, pctdistance=1.06, labeldistance=1.1,rotatelabels=True)
plt.axis('equal')
plt.title("Race of Victims", fontsize=20)
plt.legend(frameon=False, bbox_to_anchor=(1.5,0.8), fontsize=15, labels=labels)
plt.show()

White victims formed the largest group, followed by Black victims.

## Body Camera

In [None]:
# Number of incidents with the body camera recorded as on vs. off.
df["body_camera"].value_counts()

In most of the instances the body camera was off. In my opinion, it is bound to arouse suspicion.

In [None]:
# Percentage of all incidents falling in each (body_camera, race) combination.
# The denominator is the frame's actual row count rather than a hard-coded 5416.
# Rows with a missing race form no group, so the values need not sum to 100.
(df.groupby(["body_camera", "race"])["id"].count() / len(df)) * 100

To put the above numbers in perspective, following is a graph showing the percentages of victims of each race being shot when the body camera was on and when the body camera was off.

In [None]:
# Bar chart: percentage of all incidents per race, grouped by body-camera status.
plt.figure(figsize=(12,9))

# Divide by the frame's actual row count instead of a hard-coded 5416.
camera_race_pct = (
    df.groupby(["body_camera", "race"])["id"]
    .count()
    .div(len(df))
    .mul(100)
    .rename("percentage")
    .reset_index()
)
sns.barplot(data=camera_race_pct, x="body_camera", y="percentage", hue="race")
plt.yticks(np.arange(0,44,2))
plt.show()


The largest number of instances involved white individuals being shot while the body camera was off, followed by black individuals being shot while the camera was off.

In [None]:
# Incident counts for each (body_camera, threat_level) combination.
df.groupby(["body_camera", "threat_level"])["id"].count()

In the majority of the shootings the threat_level was "attack" but there isn't evidence to confirm this since in the majority of the cases the body cam was off.

To put the above numbers into perspective, following is a graph indicating the percentages of victims that were a potential threat or not.

In [None]:
# Bar chart: percentage of all incidents per threat level, grouped by body-camera status.
plt.figure(figsize=(12,9))

# Divide by the frame's actual row count instead of a hard-coded 5416.
camera_threat_pct = (
    df.groupby(["body_camera", "threat_level"])["id"]
    .count()
    .div(len(df))
    .mul(100)
    .rename("percentage")
    .reset_index()
)
sns.barplot(data=camera_threat_pct, x="body_camera", y="percentage", hue="threat_level")
plt.yticks(np.arange(0,60,2))
plt.show()


In [None]:
# Number of incidents per state, most affected first.
df["state"].value_counts()

## States and Cities

Following is a graph representing the shootings by state.

In [None]:
# Incident count per state.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1, palette="viridis")
sns.countplot(x="state", data=df)
# Final tick styling: rotated 45 degrees at font size 9.
plt.xticks(rotation=45, fontsize=9)
plt.show()

The 3 states with the highest number of shootings are - California, Texas and Florida.

Following are the city-wise shootings for the 3 states with the most number of shootings.

California

In [None]:
# Incidents per city within California.
df.loc[df["state"] == "CA", "city"].value_counts()

Texas

In [None]:
# Incidents per city within Texas.
df.loc[df["state"] == "TX", "city"].value_counts()

Florida

In [None]:
# Incidents per city within Florida.
df.loc[df["state"] == "FL", "city"].value_counts()

The following are state-wise boxplots which briefly describe the age distribution of the white, black and hispanic victims who showed signs of mental illness.

In [None]:
# Per-state age box plots for White, Black and Hispanic victims.
# Only rows flagged with signs of mental illness are included.
races_of_interest = ['W', 'B', 'H']
shows_illness = df["signs_of_mental_illness"] == True
data = df[shows_illness & df["race"].isin(races_of_interest)]

fig, ax = plt.subplots(figsize=(40,10))
sns.boxplot(data=data, x="state", y="age", hue='race')

# Re-apply the automatic x label at a larger size and name the y axis.
ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Age', fontsize=20)

# Show the y ticks as whole numbers.
ylabels = [format(tick, ',.0f') for tick in ax.get_yticks()]
ax.set_yticklabels(ylabels, fontsize=12)
ax.legend(title='Races', fontsize=14)
plt.show()

From the above graph, it can be interpreted that for the state of California, the median age and the range of ages of the white victims were higher than those of both black and hispanic victims. The same reading can be done for each of the other states.

In [None]:
# Correlation heat map between the one-hot encoded threat level,
# mental-illness flag and flee status.
threat_level = pd.get_dummies(df.threat_level, prefix='threat_level:')
mental_illness = pd.get_dummies(df.signs_of_mental_illness, prefix='mental_illness:')
flee = pd.get_dummies(df.flee, prefix='flee:')
df_sub = pd.concat([threat_level, mental_illness, flee], axis=1)
sns.set(style="white")
corr = df_sub.corr()

# Hide the upper triangle (and diagonal) of the symmetric matrix.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

figure, ax = plt.subplots(figsize=(15,15 ))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Label both axes with the dummy-column names; the final rotation (45 degrees)
# is set by the plt.xticks / plt.yticks calls below.
xlabels = [x for x in df_sub.columns]
ax.set_xticklabels(xlabels, rotation=90, fontsize=12)
ylabels= [x for x in df_sub.columns]
ax.set_yticklabels(ylabels,fontsize=12)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

From the above graph it is observed that victims who are mentally ill have a positive correlation with not fleeing whereas victims who aren't mentally ill have a strong negative correlation with not fleeing and a positive correlation with fleeing by car and fleeing by foot.

In layman's terms, victims who were mentally ill and tried to flee are considerably less than the number of victims who weren't mentally ill and tried to flee.

In [None]:
# Correlation heat map between the one-hot encoded race, an armed/unarmed flag
# and the mental-illness flag.

# armed_tf is False only where `armed` is exactly "unarmed"; every other row,
# including rows where `armed` is missing, is treated as armed.
df["armed_tf"] = df["armed"].ne("unarmed")
armed = pd.get_dummies(df["armed_tf"], prefix='armed:')
mental_illness = pd.get_dummies(df.signs_of_mental_illness, prefix='mental_illness:')
race = pd.get_dummies(df.race, prefix='race:')
df_sub = pd.concat([ race, armed, mental_illness], axis=1)
sns.set(style="white")
corr = df_sub.corr()

# Hide the upper triangle (and diagonal) of the symmetric matrix.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

figure, ax = plt.subplots(figsize=(15,15 ))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Label both axes with the dummy-column names; the final rotation (45 degrees)
# is set by the plt.xticks / plt.yticks calls below.
xlabels = [x for x in df_sub.columns]
ax.set_xticklabels(xlabels, rotation=90, fontsize=12)
ylabels= [x for x in df_sub.columns]
ax.set_yticklabels(ylabels,fontsize=12)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

As per the heat map above, it can be observed that there is a positive correlation between white victims and mental illness and, on the contrary, a negative correlation between mental illness and black and hispanic victims. That is, more white victims were showing signs of mental illness than victims of any of the other races.

Also, a negative correlation between armed=True and race_B suggests that there were fewer black victims who were armed.