In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt 
from collections import Counter
import plotly.express as px
import seaborn as sns
import datetime

# Read the fatal police shootings CSV into the notebook's main DataFrame.
df = pd.read_csv(
    '../input/data-police-shootings/fatal-police-shootings-data.csv'
)

In [None]:
# Show ten randomly chosen rows (unseeded, so the preview differs between runs).
df.sample(n=10)

In [None]:
# Print column names, dtypes and non-null counts of the freshly loaded data.
df.info()

# Data cleaning

## Parsing the date

In [None]:
# Turn the `date` strings (year-month-day) into pandas datetimes.
parsed_dates = pd.to_datetime(df['date'], format="%Y-%m-%d")
df['date'] = parsed_dates

In [None]:
# Split the parsed date into separate month and year columns for later plots.
date_parts = df['date'].dt
df['months'] = date_parts.month
df['years'] = date_parts.year

In [None]:
# Number of missing values in every column.
df.isna().sum()

## Checking whether the rows with a null `armed` value have anything in common

In [None]:
# Display only the rows whose `armed` value is missing.
armed_missing = df['armed'].isna()
df.loc[armed_missing]

In [None]:
# List every distinct raw description found in the `armed` column.
armed_column = df['armed']
armed_column.unique()

## As we can see, there are many distinct weapon descriptions. I will group them into categories by hand.

In [None]:
# Collapse the free-text `armed` descriptions into a handful of broad categories.
# Matching is done on a lower-cased copy so that letter case does not matter.
armed_lower = df['armed'].str.lower()

# Hand-made groups of raw (lower-cased) descriptions, one list per category.
gun=['gun', 'guns and explosives','crossbows','gun and knife','hatchet and gun','machete and gun','gun and sword', 'gun and car','incendiary device','gun and vehicle','vehicle and gun','grenade','crossbow']
perforating_weapon=['nail gun','knife','hatchet','baseball bat and knife','sword', 'machete','box cutter','screwdriver','lawn mower blade','sharp object','meat cleaver','beer bottle','straight edge razor','ax','chain saw', 'garden tool', 'scissors','pick-axe','spear','pitchfork','bayonet','glass shard','metal rake','crowbar','pole and knife','pen','chainsaw','samurai sword', 'bow and arrow','ice pick','pellet gun']
no_perforating_weapon=['shovel','hammer','metal object','flagpole','cordless drill','metal pole', 'metal pipe', 'metal hand tool','blunt object','metal stick','chain', "contractor's level",'stapler','bean-bag gun','baseball bat and fireplace poker', 'brick', 'baseball bat', 'hand torch','pole','flashlight','baton','chair','rock', 'piece of wood','pipe','oar', 'tire iron','air conditioner','baseball bat and bottle','fireworks','wrench','walking stick','barstool']
vehicle=['vehicle','carjack','motorcycle','vehicle and machete','car, knife and mace']
non_lethal=['taser','wasp spray','pepper spray']
fake_gun=['claimed to be armed','toy weapon','bb gun and vehicle','air pistol','airsoft pistol','bb gun']
# 'undertemined' is a misspelling kept for backward compatibility; the
# correctly spelled 'undetermined' is listed next to it.
undertermined=['undertemined','undetermined','unknown weapon',np.nan]

# Category label -> raw descriptions, in the priority order used when a
# description is listed in more than one group (the first group wins).
armed_categories = {
    'Gun': gun,
    'Perforating_weapon': perforating_weapon,
    'no_perforating_weapon': no_perforating_weapon,
    'Vehicle': vehicle,
    'Non_lethal': non_lethal,
    'Fake_gun': fake_gun,
    'undetermined': undertermined,
}

# Flatten the groups into one description -> label dictionary.
armed_lookup = {}
for label, members in armed_categories.items():
    for member in members:
        # Missing values (np.nan) are handled separately below.
        if isinstance(member, str):
            armed_lookup.setdefault(member, label)
# Map each lower-cased label back to its label so that re-running this cell
# on already classified data leaves the column unchanged.
for label in armed_categories:
    armed_lookup.setdefault(label.lower(), label)

# Assign the whole column at once instead of `df['armed'][i] = ...`: chained
# assignment may write to a temporary copy and does not depend on the index
# being 0..n-1. Descriptions without a category (e.g. 'unarmed') keep their
# lower-cased text; missing values become 'undetermined'.
df['armed'] = [
    armed_lookup.get(value, value) if isinstance(value, str) else 'undetermined'
    for value in armed_lower
]

## We need to see the ages now.

In [None]:
# Percentage of rows whose age is missing.
age_is_null = df['age'].isna()
(age_is_null.sum() / len(age_is_null)) * 100

## About 4% of the ages are null. We fill each missing age with the median age of its city, falling back to the median age of its state when the city has fewer than 10 rows.

In [None]:
# Fill missing ages with the median age of the victim's city, or with the
# median age of the state when the city has too few known ages to be reliable.
MIN_CITY_ROWS = 10  # minimum number of known ages needed to trust a city median

# A city is identified by (state, city): the same city name can exist in
# several states.
city_ages = df.groupby(['state', 'city'])['age']
city_median = city_ages.transform('median')
city_known = city_ages.transform('count')  # non-null ages per city
state_median = df.groupby('state')['age'].transform('median')

# Every median is computed once from the observed ages, so imputed values never
# feed into the medians used for other rows.
imputed_age = city_median.where(city_known >= MIN_CITY_ROWS, state_median)

# Rows that already have an age are untouched; a row stays null only when
# neither its city nor its state has a usable median.
df['age'] = df['age'].fillna(imputed_age)

## We're going to fill the null values of race with 'U' (Undetermined)

In [None]:
# Mark unknown races as 'U' (undetermined). The result is assigned back to the
# column: an in-place fillna on `df['race']` may act on a temporary copy and
# leave the DataFrame unchanged.
df['race'] = df['race'].fillna('U')

In [None]:
# Drop every row that still has a missing value in any column, modifying
# `df` in place.
df.dropna(axis=0, how='any', inplace=True)

# Data Analysis

# Deaths by Race:

In [None]:
# Bar chart with the number of deaths per race code, most frequent first.
race_counts = df['race'].value_counts()
race_types = race_counts.index
amount_race = race_counts.values

fig, ax1 = plt.subplots(figsize=(10,8))
ax1.bar(race_types, amount_race)

## In the USA, White people make up 63.4% of the population, while Latinos and Black people make up 15% and 13.4% respectively. Therefore we can assume that Latinos and African Americans die at a higher rate relative to their own population.

# Time Distribution

## Let's take a look in the time distribution.

In [None]:
# Histograms of the shooting dates: by month (left), year (middle) and day of
# the month (right).
fig, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(15,5))

days=df['date'].dt.day


def _integer_hist(ax, values, label):
    """Draw a count histogram of integer `values` on `ax`, one bar per value.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : array-like of integers (month numbers, years, days, ...).
    label : text for the x axis.
    """
    values = np.asarray(values)
    ax.set_xlabel(label)
    if values.size == 0:
        return  # nothing to draw
    # Edges at k - 0.5 give every integer its own, equally wide bin. A fixed
    # number of bins (e.g. 10 bins for 31 days) would merge an unequal number
    # of values per bar or leave empty bars between years.
    edges = np.arange(values.min(), values.max() + 2) - 0.5
    ax.hist(values, bins=edges)


# Plain matplotlib histograms are used because seaborn's `distplot` is
# deprecated.
_integer_hist(ax1, df['months'], 'Months')
_integer_hist(ax2, df['years'], 'Year')
_integer_hist(ax3, days, 'Days')

## The time distribution is somewhat uniform, except for the distribution of months which is right skewed.

In [None]:
# Print column names, dtypes and non-null counts again, now for the cleaned data.
df.info()

# Weapons

In [None]:
# Bar chart with the number of deaths per weapon category.
armed_counts = df['armed'].value_counts()
x = armed_counts.index.values
y = armed_counts.values

fig, ax1 = plt.subplots(figsize=(8,8))
ax1.bar(x, y)
# Tilt the category names so that they do not overlap.
plt.xticks(rotation=20)

## There is a much higher chance of being shot dead when carrying a perforating weapon than when carrying a non-perforating weapon.

# Age and Gender

In [None]:
# Bar chart with the number of deaths per gender.
gender_counts = df['gender'].value_counts()
x = gender_counts.index
y = gender_counts.values

fig, ax1 = plt.subplots(figsize=(8,8))
ax1.bar(x, y)

## The number of men killed by the police is much greater than the number of women

In [None]:
# Bar chart of the number of deaths per decade of age.
fig, ax1 = plt.subplots(figsize=(10,8))
ax1.set_xlabel('Ages')
ax1.set_ylabel('Amount')
# Floor every age to the start of its decade (e.g. 37 -> 30); ages of 90 and
# above share the last bar.
age_values=Counter(min(x//10*10,90) for x in df['age'].values )
# Draw through the Axes object rather than pyplot so that `ax1` keeps
# referring to the Axes, and pass plain lists instead of dict views.
ax1.bar(list(age_values.keys()), list(age_values.values()), width=8)
ax1.set_xticks([10 * i for i in range(11)])

# Most violent States and Cities

In [None]:
# The five states with the most deaths.
top_states = df['state'].value_counts().head(5)
x = top_states.index
y = top_states.values
plt.bar(x, y)

In [None]:
# The five cities with the most deaths.
top_cities = df['city'].value_counts().head(5)
x = top_cities.index
y = top_cities.values
plt.bar(x, y)

# Mental Illness and Age

In [None]:
# Deaths per decade of age (90 and above capped at 90). Computed here but not
# used by the chart below.
ages = Counter(min(age // 10 * 10, 90) for age in df['age'].values)

# Constant helper column: stacking it per age gives the number of deaths,
# coloured by whether signs of mental illness were reported.
df['count'] = 1
fig = px.bar(
    df,
    x="age",
    y='count',
    color="signs_of_mental_illness",
)
fig.show()

## People between 50 and 80 years old generally have a 30% - 35% chance of showing signs of mental illness.

# Races and Threat Level

In [None]:
# Number of deaths for every (race, threat level) pair, as a flat table with
# the columns `race`, `threat_level` and `kills`.
# Counting the group sizes directly avoids adding a helper column to a slice
# of `df`, which raises SettingWithCopyWarning.
threat_level = (
    df.groupby(['race','threat_level'])
    .size()
    .reset_index(name='kills')
)
# Grouped bars: one cluster per threat level, one bar per race.
fig = px.bar(threat_level , y='kills', x='threat_level',color='race', barmode='group')
fig.show()