In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Importing libraries
import numpy as np
import pandas as pd 
import os
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

# Apply seaborn's "ticks" style (chosen by the original author to avoid gridlines);
# comment this line out if you would rather keep the default look.
sns.set(style="ticks")

# Custom six-colour palette, built directly as a seaborn palette object
flatui = sns.color_palette(
    ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
)

from wordcloud import WordCloud

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the fatal police shootings data sheet into a DataFrame
data_path = "../input/data-police-shootings/fatal-police-shootings-data.csv"
df = pd.read_csv(data_path)

In [None]:
# Preview the first 10 rows to get a feel for the columns and their values
df.head(10)

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Print a summary of the DataFrame (df.info() reports column dtypes and non-null counts)

df.info()

In [None]:
# List the column labels
df.columns

In [None]:
# Derive calendar 'month' and 'year' columns from the 'date' column.
# The column is parsed once and the result reused for both fields,
# instead of running pd.to_datetime over the whole column twice.
parsed_dates = pd.to_datetime(df['date'])
df['month'] = parsed_dates.dt.month
df['year'] = parsed_dates.dt.year

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# Text columns are excluded explicitly: since pandas 2.0, DataFrame.corr() raises
# on non-numeric data instead of silently dropping it, so calling it on the full
# frame fails on current pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
f, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(numeric_df.corr(), annot=True, linewidths=0.5, linecolor="red", fmt='.1f', ax=ax)
plt.show()

The heatmap shows very little correlation between the columns: the correlation values reach a maximum of only about 0.1.


In [None]:
# Find the columns that contain at least one missing value and so need imputing

has_missing = df.isna().any()
df.columns[has_missing]

In [None]:
# Fill missing values: "Unknown" for text columns, 0 for numeric columns.
# A blanket fillna(0) would write the integer 0 into text columns as well, which
# then appears as a bogus "0" category in every later group-by and histogram.
text_columns = df.select_dtypes(exclude=['number', 'bool', 'datetime']).columns
df[text_columns] = df[text_columns].fillna('Unknown')
# Numeric gaps keep the 0 placeholder so later cells still receive NaN-free input.
# Treat 0 in a numeric column as "missing", not as a real measurement.
df = df.fillna(0)
df.head(20)

In [None]:
# Group the rows by race and count the non-null entries in every other column
df.groupby("race").count()

We can see that the majority of cases involve victims whose race is recorded as 'W' (White, non-Hispanic).

In [None]:
# Replace the single-letter race codes with readable labels.
# The mapping is applied to the 'race' column only: a frame-wide df.replace would
# also rewrite any cell in any other column that happens to equal 'A', 'B', 'H',
# 'N', 'O' or 'W'.
race_labels = {
    'A': 'Asian',
    'B': 'Black',
    'H': 'Hispanic',
    'N': 'Native American',
    'O': 'Other',
    'W': 'White',
}
df['race'] = df['race'].replace(race_labels)
df.head(20)


In [None]:
# Group the rows by gender and count the non-null entries in every other column
df.groupby("gender").count()

Male victims far outnumber female victims in the cases above.

In [None]:
# Number of cases for each manner of death
df['manner_of_death'].value_counts()

A significantly larger number of victims were shot only, rather than shot and Tasered.

In [None]:
# Rough overview of the age groups involved
# NOTE(review): this groups the rows by each distinct age value and runs describe()
# on the remaining columns per group; if a summary of the ages themselves was
# intended, df['age'].describe() may be what is wanted — confirm.
df.groupby('age').describe()


In [None]:
# Number of cases per state, most frequent first
df['state'].value_counts()

California ('CA') has more cases than any other state.

# **Data visualizations**

In [None]:
# Histogram of race for the cases whose manner of death is exactly 'shot'
shot_only_mask = df['manner_of_death'] == 'shot'
deathbyshooting = df[shot_only_mask]
fig = px.histogram(deathbyshooting, x='race', color='race')
fig.show()

The largest share of deaths involved victims of the 'White' race, as shown by the tallest (red) bar. The fewest deaths fall in the 'Other' category.

In [None]:
# Histogram of cases by age.
# Missing ages were filled with 0 earlier in the notebook, so only positive ages
# are plotted; otherwise every record with an unknown age would show up as a
# spurious bar at age 0.
known_ages = df.loc[df['age'] > 0, ['age']]
fig = px.histogram(known_ages, x='age', color='age')
fig.show()

As can be seen from the graph above, most of the case victims belong to the age group of 20-30 years old. There is a declining trend observed with increasing age.

In [None]:
# Case counts per state, coloured by state
state_column = df['state']
fig = px.histogram(data_frame=state_column, x='state', color='state')
fig.show()

California has the highest number of cases, with a count of 790.

In [None]:
# Bar chart of the number of cases for each manner of death
sns.countplot(data=df, x="manner_of_death")

The overwhelming majority of cases involved victims being shot only, rather than shot and Tasered.

In [None]:

# Case counts for each value of the 'armed' column, coloured by that value
armed_column = df['armed']
fig = px.histogram(data_frame=armed_column, x='armed', color='armed')
fig.show()

A significant majority of the people were armed, mostly with guns and knives. There are 353 unarmed cases as well.

In [None]:
# Take a closer look at the cases where 'armed' is recorded as 'unarmed'
is_unarmed = df['armed'] == 'unarmed'
unarmedcases = df.loc[is_unarmed]
unarmedcases

In [None]:
# Unarmed cases broken down by race
fig = px.histogram(data_frame=unarmedcases, x='race', color='race')
fig.show()

Most of the unarmed cases consisted of people belonging to the 'White' race.

In [None]:
# Unarmed cases broken down by gender
fig = px.histogram(data_frame=unarmedcases, x='gender', color='gender')
fig.show()

A significantly large proportion of unarmed victims were male.

In [None]:
# Unarmed cases broken down by the value of the 'flee' column
fig = px.histogram(data_frame=unarmedcases, x='flee', color='flee')
fig.show()

In most of the unarmed cases, the victim did not flee.

In [None]:
import plotly.figure_factory as ff

# Seed kept from the original cell.
# NOTE(review): nothing in this cell visibly draws random numbers — confirm before removing.
np.random.seed(1)

# Distribution plot of victim ages.
# Missing ages were filled with 0 earlier in the notebook, so only positive ages
# are used; leaving the zeros in adds a spurious spike at age 0 to the plotted
# distribution.
x = df.loc[df['age'] > 0, 'age']
hist_data = [x]
group_labels = ['Age']
fig = ff.create_distplot(hist_data, group_labels)
fig.show()

More visualizations to show the distribution of the overall age groups.

In [None]:
# Yearly number of cases flagged with signs of mental illness, split by race.
# Each matching row is given kills=1 and the ones are summed per (year, race) pair.
mentallyill = (
    df[df.signs_of_mental_illness == True][['year', 'race']]
    .assign(kills=1)
    .groupby(['year', 'race'])
    .sum()
    .reset_index()
)
fig = px.bar(mentallyill, x='year', y='kills', color='race', barmode='group')
fig.show()

Most of the victims who were mentally ill were shot in 2015, with a majority being White.

Similar results for the unarmed cases.

# Conclusion

- Most of the cases occurred in California, and the majority of victims were White and male.
- There has been a decline in the number of shooting cases.
- The largest share of victims falls in the 20-30 age group.
- The unarmed cases show similar statistics, with no particular relation to race or gender.
