# Library

In [None]:
import datetime

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter

# Import Data

In [None]:
# Plot styling used by the charts that follow.
sns.set(style="darkgrid")

df = pd.read_csv('../input/data-police-shootings/fatal-police-shootings-data.csv')

# Parse the date column once and derive both calendar helpers from it.
parsed_dates = pd.to_datetime(df['date'])
df['month_year'] = parsed_dates.dt.to_period('M')
df['year'] = parsed_dates.dt.year

# Anything other than the literal 'unarmed' is flagged as armed; a missing
# 'armed' value therefore also evaluates to True.
df['armed_gp'] = df['armed'] != 'unarmed'
df.columns

# 1. Univariate Analysis
First, let's look into each column of our raw data.

In [None]:
def uni_pic(df, column, order = True, x_rotation = 0):
    """Draw a count plot of one column of ``df`` and display it.

    Parameters
    ----------
    df : DataFrame
        Data to plot.
    column : str
        Name of the column whose values are counted.
    order : bool
        When true, bars are sorted by descending frequency (the order of
        ``value_counts()``); otherwise seaborn's default order is used.
    x_rotation : int or float
        Rotation, in degrees, of the x tick labels.

    Returns
    -------
    Whatever ``plt.show()`` returns (the figure is displayed as a side effect).
    """
    fig, ax = plt.subplots(figsize=(20, 5))

    # ``order=None`` is seaborn's default, so one call covers both modes.
    bar_order = df[column].value_counts().index if order else None
    sns.countplot(x=column, data=df, order=bar_order, ax=ax)

    ax.set_xlabel(ax.get_xlabel(), fontsize=20)
    ax.set_ylabel("No. of Death", fontsize=20)

    # Style the ticks through tick_params / a formatter rather than replacing
    # the label strings: fixed labels set without fixed tick positions can end
    # up attached to the wrong ticks.
    ax.tick_params(axis='x', labelrotation=x_rotation, labelsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%d'))

    return plt.show()

In [None]:
# Deaths by manner of death, most frequent category first.
uni_pic(df, column='manner_of_death', order=True, x_rotation=0)

In [None]:
# Deaths by weapon category, most frequent first; labels rotated to fit.
uni_pic(df, column='armed', order=True, x_rotation=90)

In [None]:
# Deaths by age, not sorted by frequency; labels rotated to fit.
uni_pic(df, column='age', order=False, x_rotation=90)

In [None]:
# Deaths by gender, most frequent category first.
uni_pic(df, column='gender', order=True, x_rotation=0)

In [None]:
# Deaths by race, most frequent category first.
uni_pic(df, column='race', order=True, x_rotation=0)

In [None]:
# Deaths by state, most frequent first; labels rotated to fit.
uni_pic(df, column='state', order=True, x_rotation=90)

In [None]:
# Deaths by the signs-of-mental-illness flag, most frequent value first.
uni_pic(df, column='signs_of_mental_illness', order=True, x_rotation=0)

In [None]:
# Deaths by threat level, most frequent category first.
uni_pic(df, column='threat_level', order=True, x_rotation=0)

In [None]:
# Deaths by fleeing behaviour, most frequent category first.
uni_pic(df, column='flee', order=True, x_rotation=0)

In [None]:
# Deaths by the body-camera flag, most frequent value first.
uni_pic(df, column='body_camera', order=True, x_rotation=0)

Please keep in mind that 2020 data is not complete.

In [None]:
# Number of records per calendar year.
df_dt = df.groupby('year')['id'].count().reset_index()

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_dt, x='year', y='id', ax=ax)

ax.set_xlabel('Year', fontsize=20)
ax.set_ylabel('No. of Death', fontsize=20)

# Pin one tick per observed year and let formatters render the labels, so the
# text always matches the real tick positions (overwriting labels with fixed
# strings does not guarantee that).
ax.set_xticks(df_dt['year'].tolist())
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
ax.tick_params(axis='both', labelsize=12)

plt.show()

# 2. Multivariate Analysis

To understand whether racial disparity is involved in fatal shootings by police, the raw death count is not an objective measure, since a larger overall population contributes to a larger number of deaths.

Therefore, in this section, we will compare the percentage values of interest within each race group. Considering the race-wise distribution, we will focus on the three largest race groups in the US: White, Black, and Hispanic.

# 2.1 Who was shot by police when a body camera was on?

It surprises me that the use of body cameras is so uncommon. In the following graph, the grey dashed line shows the overall percentage of fatal shootings by police in which a body camera was in use. The solid lines indicate the corresponding percentages within the top three race groups.

From the graph, the Black race group has a higher percentage value than the other two race groups.

In [None]:
# Share of records with body_camera set, per year: overall and by race.
# The mean of a boolean column equals sum / count.
df_bdy0 = df.groupby('year')['body_camera'].mean().reset_index()

df_bdy = df.groupby(['year', 'race'])['body_camera'].mean().reset_index()
df_bdy = df_bdy[df_bdy['race'].isin(['W', 'B', 'H'])]

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_bdy0, x='year', y='body_camera', color='grey',
             linewidth=2.5, label='AllRaces', ax=ax)
# The overall line is the first one drawn on the axes; make it dashed.
ax.lines[0].set_linestyle("--")
sns.lineplot(data=df_bdy, x='year', y='body_camera', hue='race',
             linewidth=2, ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Percentage with Camera in Use', fontsize=20)

# Pin one tick per year and format through formatters so the labels stay
# attached to the actual tick positions.
ax.set_xticks(df_bdy0['year'].tolist())
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.tick_params(axis='both', labelsize=12)

ax.legend(fontsize=14)
plt.show()

# 2.2 Who was shot and Tasered to death?

Only a small percentage of deaths were caused by both shooting and Tasering. In the following graph, the dashed grey line shows the overall percentage of deaths caused by both shooting and Tasering in each year. The solid lines indicate the percentage values for each race group.

Overall, there is no obvious difference in the percentage value among different race groups over time. However, the peak point in 2017 of the Hispanic group stands out. 

In [None]:
# Share of records whose manner of death is 'shot and Tasered', per year:
# overall and by race. Averaging a boolean flag over every group (instead of
# dividing two separately filtered counts) yields 0.0 for groups with no such
# case rather than a missing value, so the lines have no gaps.
known = df.dropna(subset=['manner_of_death'])
is_tasered = known['manner_of_death'].eq('shot and Tasered')

df_sub0 = is_tasered.groupby(known['year']).mean().reset_index()

df_sub = is_tasered.groupby([known['year'], known['race']]).mean().reset_index()
df_sub = df_sub[df_sub['race'].isin(['W', 'B', 'H'])]

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_sub0, x='year', y='manner_of_death', color='grey',
             linewidth=2.5, label='AllRaces', ax=ax)
# The overall line is the first one drawn on the axes; make it dashed.
ax.lines[0].set_linestyle("--")
sns.lineplot(data=df_sub, x='year', y='manner_of_death', hue='race',
             linewidth=2, ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Percentage of Shoot-and-Tasered to Death', fontsize=20)

# Pin one tick per year and format through formatters so the labels stay
# attached to the actual tick positions.
ax.set_xticks(df_sub0['year'].tolist())
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.tick_params(axis='both', labelsize=12)

ax.legend(fontsize=14)
plt.show()

I wonder if it has something to do with the Trump administration's new immigration policies in that year. The following graph shows the historical count of fatal shootings by police over the years. The raw death count also peaks in 2017 for Hispanic civilians.

In [None]:
# Raw number of 'shot and Tasered' records per year and race. Summing a
# boolean flag over every (year, race) group keeps groups with zero cases as
# 0 instead of dropping them, so the lines do not interpolate across them.
is_tasered = df['manner_of_death'].eq('shot and Tasered')
df_sub = is_tasered.groupby([df['year'], df['race']]).sum().reset_index()
df_sub = df_sub[df_sub['race'].isin(['W', 'B', 'H'])]

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_sub, x='year', y='manner_of_death', hue='race',
             linewidth=2, ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('No. of Death', fontsize=20)

# Pin one tick per year and format through formatters so the labels stay
# attached to the actual tick positions.
ax.set_xticks(sorted(df_sub['year'].unique().tolist()))
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.tick_params(axis='both', labelsize=12)

ax.legend(fontsize=14)
plt.show()

# 2.3 Who showed signs of mental illness?

Signs of mental illness are present in only 20-25% of all fatal shootings by police each year (shown by the grey dashed line). The White race group has consistently higher percentages than the Black and Hispanic race groups.

In [None]:
# Share of records with signs_of_mental_illness set, per year: overall and
# by race. The mean of a boolean column equals sum / count.
df_sub0 = df.groupby('year')['signs_of_mental_illness'].mean().reset_index()

df_sub = df.groupby(['year', 'race'])['signs_of_mental_illness'].mean().reset_index()
df_sub = df_sub[df_sub['race'].isin(['W', 'B', 'H'])]

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_sub0, x='year', y='signs_of_mental_illness', color='grey',
             linewidth=2.5, label='AllRaces', ax=ax)
# The overall line is the first one drawn on the axes; make it dashed.
ax.lines[0].set_linestyle("--")
sns.lineplot(data=df_sub, x='year', y='signs_of_mental_illness', hue='race',
             linewidth=2, ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Percentage of Fatal Shooting with Signs of Mental Illness', fontsize=20)

# Pin one tick per year and format through formatters so the labels stay
# attached to the actual tick positions.
ax.set_xticks(df_sub0['year'].tolist())
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.tick_params(axis='both', labelsize=12)

ax.legend(fontsize=14)
plt.show()

However, the Black and Hispanic race groups have much lower average and median ages than the White group.

In [None]:
# Age distribution per year and race, restricted to records flagged with
# signs of mental illness and to the three race groups under study.
flagged = df['signs_of_mental_illness'].eq(True)
in_focus = df['race'].isin(['W', 'B', 'H'])
sub0 = df[flagged & in_focus]

fig, ax = plt.subplots(figsize=(20, 5))
sns.boxplot(x='year', y='age', data=sub0, hue='race', ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Age', fontsize=20)

# Format the y ticks through a formatter so the labels stay attached to the
# actual tick positions; the categorical x labels are left as drawn.
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.tick_params(axis='y', labelsize=12)

ax.legend(fontsize=14, title='Races')
plt.show()

In [None]:
# Mean age per year: overall and by race.
df_sub0 = df.groupby('year')['age'].mean().reset_index()

df_sub = df.groupby(['year', 'race'])['age'].mean().reset_index()
df_sub = df_sub[df_sub['race'].isin(['W', 'B', 'H'])]

fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df_sub0, x='year', y='age', color='grey',
             linewidth=2.5, label='AllRaces', ax=ax)
# The overall line is the first one drawn on the axes; make it dashed.
ax.lines[0].set_linestyle("--")
sns.lineplot(data=df_sub, x='year', y='age', hue='race', linewidth=2, ax=ax)

ax.set_xlabel(ax.get_xlabel(), fontsize=20)
ax.set_ylabel('Average Age', fontsize=20)

# Pin one tick per year and format through formatters so the labels stay
# attached to the actual tick positions.
ax.set_xticks(df_sub0['year'].tolist())
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.tick_params(axis='both', labelsize=12)

ax.legend(fontsize=14)
plt.show()

# 2.4 What are the relationships among certain behaviors?

The following correlation matrix shows some correlations among the behaviors of people who were shot to death by the police.

Civilians who were not armed tend to have a low threat level and to flee on foot. On the other hand, civilians who were armed tend to have a high threat level but not to flee. These correlations are quite consistent with our common understanding.

In [None]:
# One-hot encode the armed flag, fleeing behaviour and threat level, then
# plot the lower triangle of their correlation matrix.
armed = pd.get_dummies(df.armed_gp, prefix='Armed:')
flee = pd.get_dummies(df.flee, prefix='Flee:')
threat = pd.get_dummies(df.threat_level, prefix='Threat:')
df_sub = pd.concat([armed, flee, threat], axis=1)

sns.set(style="white")
# Compute the correlation matrix
corr = df_sub.corr()
# Mask the upper triangle; the builtin ``bool`` is used because the
# ``np.bool`` alias no longer exists in current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
figure, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio. Requesting every
# tick label from seaborn avoids overwriting labels by hand, which breaks
# when the number of drawn ticks differs from the number of columns.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            xticklabels=True, yticklabels=True, ax=ax)

plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=12)

plt.show()

The following matrix is expanded by including race groups. The Hispanic race group has a negative correlation with "attack". The White group has a positive correlation with "not fleeing".

Unlike all other race groups, the Black group has positive correlations with "flee by foot" and "being not armed". It seems that the Black race group felt more threatened by the police.

In [None]:
# One-hot encode race, the armed flag, fleeing behaviour and threat level,
# then plot the lower triangle of their correlation matrix.
race = pd.get_dummies(df.race, prefix='Races:')
armed = pd.get_dummies(df.armed_gp, prefix='Armed:')
flee = pd.get_dummies(df.flee, prefix='Flee:')
threat = pd.get_dummies(df.threat_level, prefix='Threat:')
df_sub = pd.concat([race, armed, flee, threat], axis=1)

sns.set(style="white")
# Compute the correlation matrix
corr = df_sub.corr()
# Mask the upper triangle; the builtin ``bool`` is used because the
# ``np.bool`` alias no longer exists in current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
figure, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio. Requesting every
# tick label from seaborn avoids overwriting labels by hand, which breaks
# when the number of drawn ticks differs from the number of columns.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            xticklabels=True, yticklabels=True, ax=ax)

plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=12)

plt.show()