In [None]:
#importing the essentials

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the shootings records from the CSV file (relative path).
data = pd.read_csv(
    '../input/us-police-shootings/shootings.csv'
)

# <font color="violet"><b>Let's take a look at the Dataset

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Feature engineering: add an 'Age Group' column by bucketing 'age'.
# pd.cut uses right-closed intervals by default: (0, 18], (18, 45], (45, 60], (60, 100].
bins = [0, 18, 45, 60, 100]
group_names = ['Teenager', 'Adult', 'Old', 'Very Old']
data['Age Group'] = pd.cut(data['age'], bins=bins, labels=group_names)

# Convert the date column from str to datetime, then derive calendar parts
# through the .dt accessor.
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['month_year'] = data['date'].dt.to_period('M')

# Number of records (non-null ids) per year, as a two-column frame: year, count.
count_year = data.groupby('year')['id'].count().reset_index(name='count')

# <font color="violet"><b>**Killings by Year**

In [None]:
# Bar chart of the yearly totals; axis labels are left blank, the title carries the meaning.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 7))
sns.barplot(data=count_year, x='year', y='count')
plt.title('Killings by Year')
plt.xlabel("")
plt.ylabel("")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

* The data for 2020 only runs until June, which is why we see a sudden drop after 2019; apart from that, the killings appear to decrease slightly each year.
* Even though the 2020 data stops in June, around 400 casualties had already been recorded by the middle of 2020.
* After the drop in counts from 2015 to 2016, the count of people shot stays almost the same.

# <font color="violet"><b> Killings by Month

In [None]:
plt.style.use('bmh')

# Cast the monthly periods to strings, then count the records in each month.
data['month_year'] = data['month_year'].astype(str)
line_chart = data.groupby('month_year')['id'].count().reset_index(name='count')

# Line chart of the monthly counts with one rotated tick per month.
plt.figure(figsize=(20, 8))
plt.xticks(fontsize=12)
plt.yticks(fontsize=15)
plt.plot(line_chart['month_year'], line_chart['count'])
plt.xticks(ticks=line_chart['month_year'], rotation=90)
plt.title('Killings by Month')
plt.show()

In [None]:
# Average number of deaths per day over the first five years listed in count_year.
# NOTE(review): assumes those five rows are consecutive, complete calendar years
# (the markdown says 2020 is only partial) -- confirm if the dataset changes.
full_years = count_year.iloc[0:5]
first_year = int(full_years['year'].min())
last_year = int(full_years['year'].max())

# Count the actual days in the span so leap days are included, instead of
# assuming every year has exactly 365 days.
days_in_span = (
    pd.Timestamp(year=last_year + 1, month=1, day=1)
    - pd.Timestamp(year=first_year, month=1, day=1)
).days

avg_shot_per_day = full_years['count'].sum() / days_in_span
print('Avg death count per day', avg_shot_per_day)

In [None]:
# Mean death count per month over the first five years in count_year (12 months each).
months_covered = 12 * 5
avg_per_month = count_year['count'].iloc[:5].sum() / months_covered
print('Avg death count per month', avg_per_month)

In [None]:
# Rename the monthly table's columns, then pick the month(s) with the highest
# and the lowest victim counts (ties keep every matching row).
line_chart.columns = ['Month_Year', 'Victim_Count']
is_peak = line_chart['Victim_Count'] == line_chart['Victim_Count'].max()
is_trough = line_chart['Victim_Count'] == line_chart['Victim_Count'].min()
max_1 = line_chart[is_peak].reset_index(drop=True)
min_1 = line_chart[is_trough].reset_index(drop=True)

print('Max amount of Death is in Month of\n', max_1)
print('**********************************************')
print('Min amount of Death is in Month of\n', min_1)

In [None]:
# Count of records for every (age group, manner of death) pair, largest first.
# The column 'manner_of_death' is renamed for display; index labels from the
# grouped frame are kept through the sort.
shot_or_taser = (
    data.groupby(['Age Group', 'manner_of_death'])['id']
    .count()
    .reset_index(name='count')
    .rename(columns={'manner_of_death': 'Manner of Death'})
    .sort_values(by='count', ascending=False)
)

# <font color="violet"><b>Killings by Age Category

In [None]:
# Grouped bars: one group per age bucket, one bar per manner of death.
plt.style.use('seaborn-pastel')
plt.figure(figsize=(20, 10))
sns.barplot(data=shot_or_taser, x="Age Group", y="count", hue="Manner of Death")
plt.title('Killings by Age Category')
plt.xlabel("")
plt.ylabel("")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

* The data points are not clear because the Shot count is comparatively very high. Let's check it on Log scale

In [None]:
# Same grouped bars as before, drawn on a logarithmic count axis so the small
# bars remain visible next to the large ones.
plt.style.use('seaborn-pastel')
plt.figure(figsize=(15, 7))
sns.barplot(
    data=shot_or_taser,
    x="Age Group",
    y="count",
    hue="Manner of Death",
    log=True,
)
plt.title('Killings by Age Group(On Log Axis)')
plt.xlabel("")
plt.ylabel("")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

* We can see that both the Shot and the Shot and Tasered counts are highest in the Adult category

# <font color="violet"><b>Shot or Tasered(in Percentages)

In [None]:
# Share of each manner of death within every age group (fractions of the
# age-group total, not percentages).

# Put the rows in an explicit (age group, manner of death) order instead of
# relying on index labels that happen to survive the earlier sort by count.
ordered_counts = shot_or_taser.sort_values(['Age Group', 'Manner of Death'])

# Total victims per age group; only the numeric 'count' column is summed.
a = ordered_counts.groupby('Age Group', observed=False)['count'].sum().reset_index()
total = a['count']

# Divide every row's count by the total of its own age group.
group_totals = ordered_counts.groupby('Age Group', observed=False)['count'].transform('sum')
ordered_counts = ordered_counts.assign(share=ordered_counts['count'] / group_totals)
list_percent = ordered_counts['share'].tolist()

# Display labels for the pie slices.
# NOTE(review): assumes the two 'Manner of Death' values sort in this same
# order (shot first, then shot and Tasered) -- verify against the data.
list_1 = ['Shot', 'Shot and Tasered']

# Shares per age group, looked up by group name rather than by list position.
shares_by_group = {
    group_name: ordered_counts.loc[ordered_counts['Age Group'] == group_name, 'share'].tolist()
    for group_name in ['Teenager', 'Adult', 'Old', 'Very Old']
}
teenager = shares_by_group['Teenager']
adult = shares_by_group['Adult']
old = shares_by_group['Old']
very_old = shares_by_group['Very Old']

Let's Analyse by Age Category

In [None]:
# For Teenager and Adult: two side-by-side pies of the manner-of-death shares.
plt.style.use('seaborn-pastel')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15))
plt.rcParams.update({'font.size': 18})

for axis, shares, heading in ((ax1, teenager, "Teenager"), (ax2, adult, "Adult")):
    axis.pie(
        shares,
        labels=list_1,
        shadow=True,
        autopct='%1.1f%%',
        wedgeprops={'edgecolor': 'black'},
    )
    axis.set_title(heading)

fig.tight_layout()
plt.show()

In [None]:
# For Old and Very Old: two side-by-side pies of the manner-of-death shares.
plt.style.use('seaborn-pastel')
fig, (ax3, ax4) = plt.subplots(1, 2, figsize=(15, 15))
plt.rcParams.update({'font.size': 18})

for axis, shares, heading in ((ax3, old, "Old"), (ax4, very_old, "Very Old")):
    axis.pie(
        shares,
        labels=list_1,
        shadow=True,
        autopct='%1.1f%%',
        wedgeprops={'edgecolor': 'black'},
    )
    axis.set_title(heading)

plt.tight_layout()
plt.show()

* Looks like that tasers are used comparatively less in case of Very old people
* Let's Look at the actual data points

## <font color="violet"><b>Actual Data points (manner of death by year by Age)

In [None]:
# One point per record: age against year, coloured by manner of death.
plt.style.use('ggplot')
plt.figure(figsize=(20, 10))
sns.swarmplot(x="year", y="age", hue="manner_of_death", data=data)
plt.title('Manner of Death by Year and Age')
plt.xlabel("")
plt.ylabel("")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

* It looks like the number of Shot and Tasered people is lower than in previous years; let's confirm that by checking the data

In [None]:
# Record count (non-null ids) for every (year, manner of death) pair.
data.groupby(['year', 'manner_of_death'])['id'].count().reset_index()

* We can see that not only the Shot and Tasered category but also the Shot category is decreasing year by year

## <font color="violet"><b>Armed or Not

In [None]:
# Number of victims for each value of the 'armed' column, most frequent first,
# and the five most common values.
# A single descending sort replaces the earlier ascending-then-descending pair;
# mergesort is stable, so categories with equal counts keep a deterministic order.
armed_or_not = (
    data.groupby('armed')
    .size()
    .reset_index(name='Count')
    .rename(columns={'armed': 'Armed'})
    .sort_values(by='Count', ascending=False, kind='mergesort')
)
top_5 = armed_or_not.head(5)

In [None]:
# Bar chart of the five most common 'Armed' categories.
# The style is applied before the figure is created so that it also governs
# the figure itself, and the bars are drawn once (previously plt.bar and
# sns.barplot both drew onto the same axes, stacking two sets of bars).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.barplot(x='Armed', y='Count', data=top_5)
plt.title('Armed or Not')
plt.ylabel('Number of Victims')
plt.xlabel('')
plt.show()

* We can see clearly that the majority of people had a gun, probably because of the lenient arms laws in the U.S.
* Let's take a clear picture with help of Pie Chart by summarising all other weapons in an Other Category

In [None]:
# Percentage share of the four most common 'Armed' categories, plus an
# 'Others' row holding the remainder so the shares add up to 100.
records = data.shape[0]
top_armed = armed_or_not.head(4)
list_p = (top_armed['Count'] / records * 100).tolist()
Others_p = 100 - sum(list_p)

# Build a fresh frame instead of mutating a slice of armed_or_not and calling
# DataFrame.append (removed in pandas 2.0); the result has the same two
# columns ('Armed', 'percent') and a 0..4 index.
armed_or_not_pie = pd.DataFrame({
    'Armed': top_armed['Armed'].tolist() + ['Others'],
    'percent': list_p + [Others_p],
})

In [None]:
# Pie chart of the 'percent' column, one slice per 'Armed' label.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(7, 7))
plt.rcParams.update({'font.size': 18})
plt.pie(
    armed_or_not_pie['percent'],
    labels=armed_or_not_pie['Armed'],
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
    wedgeprops={'edgecolor': 'black'},
)
plt.title("Armed or Not")
plt.tight_layout()
plt.show()

* 56.3% of people were armed with a gun, followed by 14.5% of people who were armed with a knife
* Others category include all other categories except top 4

# <font color="violet"><b>Gender Wise Killings per Year

In [None]:
# Record count (non-null ids) for every (year, gender) pair.
gender_shoot = data.groupby(['year', 'gender'])['id'].count().reset_index(name='count')

# Split the yearly counts by gender code: 'M' rows and 'F' rows.
male = gender_shoot[gender_shoot['gender'] == 'M']
female = gender_shoot[gender_shoot['gender'] == 'F']

In [None]:
# Grouped bar chart: male and female victim counts side by side for each year.
# The style is applied before the figure is created so it also governs the figure.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))

years = male['year'].to_numpy()
x_indexes = np.arange(len(years))
width = 0.40

# Align the female counts to the male years (0 where a year has no female
# rows) so both bar series always have the same length and order.
female_counts = female.set_index('year')['count'].reindex(years, fill_value=0).to_numpy()

plt.bar(x_indexes, male['count'].to_numpy(), width=width, label='Male')
plt.bar(x_indexes + width, female_counts, width=width, label='Female')
plt.title('Gender Wise Killings per Year')
# Centre each year label under its pair of bars; labels come from the same
# years that define the bar positions.
plt.xticks(ticks=x_indexes + width / 2, labels=years)
plt.tight_layout()
plt.legend()
plt.show()

* If we talk about killings by gender, then we can see that Men count is much larger than women

## <font color="violet"><b>Killings By Age Group

In [None]:
# Record count (non-null ids) for every (age group, gender) pair, largest first.
df = (
    data.groupby(['Age Group', 'gender'])['id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Total number of victims per age group.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 8))
# df holds one row per (age group, gender). Passing it straight to barplot
# would draw the mean of the per-gender counts (barplot's default estimator)
# with an error bar, so the rows are summed per age group first.
age_group_totals = df.groupby('Age Group', observed=False)['count'].sum().reset_index()
sns.barplot(x='Age Group', y='count', data=age_group_totals)
plt.title('Killings by Age Group')
plt.ylabel('Number of Victims')
plt.show()

* Most people belong to Adult Category, let's take a closer look by including Gender into picture on log scale

In [None]:
# Age-group counts split by gender, drawn on a logarithmic count axis.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.barplot(
    data=df,
    x='Age Group',
    y='count',
    hue='gender',
    log=True,
)
plt.ylabel('Number of Victims')
plt.title('Killings by Age Group(on log)')
plt.show()

# <font color="violet"><b>Killings by Race by Year

In [None]:
# Record count (non-null ids) for every (year, race) pair, largest first.
race_df = (
    data.groupby(['year', 'race'])['id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Grouped bars: one group per year, one bar per race.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.barplot(data=race_df, x="year", y="count", hue="race")
plt.xlabel('')
plt.ylabel('Number of Victims')
plt.title('Killings by Race by Year')
plt.show()

* If we talk about Race, then we can see that every year the count of killings in White Race is much larger than other races
* The above chart proves that the most people are from White race, then Black and then from Hispanic Race

# <font color="violet"><b>Killings by Race by Age

In [None]:
# One translucent, jittered marker per record: age on x, race on y.
plt.style.use('fivethirtyeight')
fig_dims = (20, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.stripplot(x="age", y="race", data=data, s=20, alpha=0.20, jitter=0.30, ax=ax)
plt.xlabel('Age')
plt.ylabel('')
plt.title('Killings by Race by Age')
plt.show()

* We can see that In Native, Other and Asian Race the age primarily lies in range 20-60
* The cases of age Above 60 years usually falling in White Race 
* Let's find out the median ages of the different races

# <font color="violet"><b>Killing by Race (Monthly Average)

In [None]:
# Record count (non-null ids) per (year, month, race); the count stays in the 'id' column.
killings_by_race = data.groupby(['year', 'month_year', 'race'])['id'].count().reset_index()

# Box plot of those monthly counts, one box per race.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.boxplot(data=killings_by_race, x='race', y='id', dodge=False)
plt.xlabel('')
plt.ylabel('Number of Victims')
plt.title('Killings by Race(By Month Average)')
plt.show()

* On Average 37 White people get shot every Month
* On Average 19 Black people get shot every Month
* On Average 13 Hispanic people get shot every Month

# <font color="violet"><b>Median Age by Race

In [None]:
# Box plot of victim age, one box per race.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.boxplot(data=data, x='race', y='age', dodge=False)
plt.xlabel('')
plt.ylabel('Age')
plt.title('Killings by Race by Age')
plt.show()

* It is clear from the above graph that the Other category has the lowest median age, followed by Black, Hispanic and then White

# <font color="violet"><b>Let's visualize top 3 races

In [None]:
# Comparing White V/S Black V/S Hispanic: yearly record counts for those three
# races only, with 'year' cast to str.
white_black_hispanic = (
    data[data['race'].isin(['Black', 'White', 'Hispanic'])]
    .groupby(['year', 'race'])['id']
    .count()
    .reset_index(name='count')
    .assign(year=lambda yearly: yearly['year'].astype(str))
)

In [None]:
# Line chart of the yearly counts, one line per race, using a custom three-colour palette.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))

colors = ["#2874A6", "#B9770E", "#922B21"]
custom_palette = sns.color_palette(colors)
sns.set_palette(custom_palette)

sns.lineplot(x="year", y="count", hue="race", data=white_black_hispanic)
plt.xlabel('')
plt.ylabel('Number of Victims')
plt.title('Killings in Top 3 Races by Year')
plt.show()

* We can say that the most number of people shot are from White race and then from Black Race

# <font color="violet"><b>States with most number of cases

In [None]:
# Bar chart of the number of records per state, in value_counts order (most frequent first).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
state_counts = data['state'].value_counts()
plt.bar(state_counts.index, state_counts.values)
plt.ylabel('Number of Victims')
plt.title('Killings by different States')
plt.xticks(fontsize=11)
plt.show()

* We can see that most of the cases are from California, Texas and Florida

# <font color="violet"><b>Let's look at top 5 states and check whether the person was fleeing or not

In [None]:
# Record count (non-null ids) per (state, flee status) for the five states
# with the most records, largest first.
# The five states are derived from the data instead of being hard-coded, so
# the selection stays correct if the dataset is refreshed.
top_state_codes = data['state'].value_counts().head(5).index
top_5_states = (
    data[data['state'].isin(top_state_codes)]
    .groupby(['state', 'flee'])['id']
    .count()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [None]:
# Grouped bars: one group per state, one bar per flee status.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
sns.barplot(data=top_5_states, x='state', y='count', hue='flee')
plt.title('Killings by States with different flee Status')
plt.xlabel("")
plt.ylabel('Number of Victims')
plt.show()

* It is clear from above chart that most of the people in all top 5 states were not fleeing during the encounter

# <font color="violet"><b>Were they Mentally affected?

In [None]:
# One jittered marker per record: age on x, race on y, coloured by the
# 'signs_of_mental_illness' flag.
plt.style.use('fivethirtyeight')

fig_dims = (20, 20)
fig, ax = plt.subplots(figsize=fig_dims)
sns.stripplot(
    x="age",
    y="race",
    hue="signs_of_mental_illness",
    data=data,
    s=25,
    alpha=0.5,
    jitter=0.40,
    ax=ax,
)
plt.title('Killings by Race on basis of Mental Status')
plt.axis('tight')
plt.xlabel('Age')
plt.show()

* It looks like most of them were mentally stable. Let's Visualize Top 3 Races i.e Black, White, Hispanic by Pie chart

# <font color="violet">Top 3 Races by Mental Status

In [None]:
# Let's check by pie chart: record count (kept in the 'id' column) for every
# (race, signs_of_mental_illness) pair, then one sub-frame per race of interest.
pie_chart = data.groupby(['race', 'signs_of_mental_illness'])['id'].count().reset_index()
black_pie_chart = pie_chart.loc[pie_chart['race'] == 'Black']
white_pie_chart = pie_chart.loc[pie_chart['race'] == 'White']
hispanic_pie_chart = pie_chart.loc[pie_chart['race'] == 'Hispanic']

In [None]:
# Three side-by-side pies (Black, White, Hispanic): share of records with and
# without signs of mental illness, labelled by the flag value.
plt.style.use('seaborn-pastel')
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 15))
plt.rcParams.update({'font.size': 18})

race_panels = (
    (ax1, black_pie_chart, "Race: Black"),
    (ax2, white_pie_chart, "Race - White"),
    (ax3, hispanic_pie_chart, "Race - Hispanic"),
)
for axis, race_counts, heading in race_panels:
    axis.pie(
        race_counts['id'],
        labels=race_counts['signs_of_mental_illness'],
        shadow=True,
        startangle=90,
        autopct='%1.1f%%',
        wedgeprops={'edgecolor': 'black'},
    )
    axis.set_title(heading)

fig.tight_layout()

plt.show()

# <font color="blue">Please Upvote if you Like :)

![](https://d241gzwmzya7ka.cloudfront.net/autosizing_assets/Blog-Post-Images/_1200x630_crop_center-center_82_none/Blog-image-2020-04-07@2x.png?mtime=20200406102912&focal=none&tmtime=20200406103158)