### Import required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import folium

In [None]:
# Load the fatal police shootings dataset from the Kaggle input directory.
DATA_PATH = "/kaggle/input/data-police-shootings/fatal-police-shootings-data.csv"
df = pd.read_csv(DATA_PATH)

### Let's have a look at the data

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Count the missing values in every column and print the totals.
missing_counts = df.isna().sum()
print(missing_counts)

### Data quality report for categorical variables

In [None]:
#Function to get summary statistics for categorical variable.

def dataQuality(data):
    """Build a data quality report for the categorical columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. Only the columns selected by
        ``select_dtypes(include=[object])`` are reported.

    Returns
    -------
    dict
        ``{'categorical': report}`` where ``report`` has one row per object
        column and the columns ``Count`` (non-null values), ``Unique``
        (distinct values, a missing value counts as one level),
        ``Miss_percent`` (fraction of missing values, between 0 and 1) and
        ``Freq_Level`` (most frequent non-null value, NaN when the column
        has no non-null value). The report is empty when ``data`` has no
        object columns.
    """
    report_columns = ["Count", "Unique", "Miss_percent", "Freq_Level"]

    def freq_cat(x):
        # value_counts() leaves out missing values, so an all-null column has
        # no levels; report NaN instead of failing on index[0].
        levels = x.value_counts().sort_values(ascending=False)
        return levels.index[0] if len(levels) else np.nan

    def cat_quality(data):
        #select only categorical data types
        categorical = data.select_dtypes(include=[object])
        # The statistics are computed directly on each column rather than
        # through DataFrame.agg with custom callables.
        rows = {
            name: [
                column.count(),
                len(column.unique()),
                column.isnull().mean(),
                freq_cat(column),
            ]
            for name, column in categorical.items()
        }
        # Passing the column list explicitly keeps the report well formed
        # (zero rows, four columns) when there is no object column.
        return pd.DataFrame.from_dict(rows, orient="index", columns=report_columns)

    return {'categorical': cat_quality(data)}

In [None]:
# Build the data quality report and display its categorical section.
quality_report = dataQuality(df)
quality_report['categorical']

#### What we get from the above report?
* California has the highest number of shootings, and within the state the city of Los Angeles has the highest number of deaths.
* Most of the people who were shot dead by the police were carrying a gun.
* The maximum number of people shot dead are White people.
* The maximum number of people shot dead are males.
* The columns 'armed', 'gender', 'flee' and 'race' have missing values.
* For 9.6% of the records the 'race' of the person is missing.

### What about rate of shootings?
The number of deaths is increasing every year, and the count reached 1000 in the year 2019.

In [None]:
# Parse the incident dates and store the calendar year as a new column.
incident_dates = pd.DatetimeIndex(df['date'])
df['year'] = incident_dates.year

In [None]:
# Apply the seaborn style before the figure is created: set_style only
# affects axes that are created after the call.
sns.set_style('ticks')
plt.figure(figsize=(5,7))
splot=sns.countplot(data=df,x='year',palette='YlGnBu')
# Label every bar with its count. Bar heights are whole numbers, so they are
# formatted without decimals.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.title("Year and number of shootings")
plt.xlabel('Year')
plt.ylabel('No. of deaths')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Apply the seaborn style before the figure is created: set_style only
# affects axes that are created after the call.
sns.set_style('ticks')
plt.figure(figsize=(5,7))
# Only the unarmed people whose threat level was not 'attack' are counted.
splot=sns.countplot(data=df.query("armed == 'unarmed'").query("threat_level != 'attack'"),x='year',palette='YlGnBu')
# Label every bar with its count. Bar heights are whole numbers, so they are
# formatted without decimals.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# The title names the filter so the figure is not mistaken for the overall
# yearly count.
plt.title("Year and number of shootings (unarmed, not attacking)")
plt.xlabel('Year')
plt.ylabel('No. of deaths')
plt.xticks(rotation=45)
plt.show()

### What is the rate of killings relative to race and age?

In [None]:
# Drop the rows where race is missing; gaps in other columns are kept.
REQUIRED_COLUMNS = ['race']
df = df.dropna(subset=REQUIRED_COLUMNS)

# Missing values that remain in each column
df.isna().sum()

In [None]:
#Replace the acronyms with the actual words
# Codes are listed in the order in which they are tested against the value.
_RACE_LABELS = {
    'W': 'White',
    'B': 'Black',
    'A': 'Asian',
    'N': 'Native American',
    'H': 'Hispanic',
    'O': 'Other',
}

def race(x):
    """Return the full race label for a race code.

    Parameters
    ----------
    x : str
        Race code ('W', 'B', 'A', 'N', 'H' or 'O') or an already expanded
        label.

    Returns
    -------
    str or None
        The full label, or None when no known code occurs in ``x``.
    """
    # A value that is already a full label is returned unchanged, so applying
    # the function twice is safe. Without this check 'Native American' would
    # match the code 'A' (the capital in 'American') and become 'Asian'.
    if x in _RACE_LABELS.values():
        return x
    # Otherwise return the label of the first code contained in the value.
    for code, label in _RACE_LABELS.items():
        if code in x:
            return label
    return None
       
# Expand every race code in the column to its full label.
race_labels = df['race'].apply(race)
df['race'] = race_labels

#### Race and Deaths
The maximum number of people shot dead are **White** people followed by **Black** and **Hispanic**

In [None]:
# Apply the seaborn style before the figure is created: set_style only
# affects axes that are created after the call.
sns.set_style('ticks')
plt.figure(figsize= (6,7))
splot= sns.countplot(data=df,x='race',palette='YlGnBu')
# Label every bar with its count. Bar heights are whole numbers, so they are
# formatted without decimals.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

plt.title("Race")
plt.xlabel('Race')
plt.ylabel('No. of deaths')
plt.xticks(rotation=45)
plt.show()

#### Age and Deaths
* The minimum age is 6yrs and the maximum age is 91 yrs
* From the histogram we can see that most of the people shot dead are in the age group 20 yrs to 40 yrs.
* 75% of the population who were shot dead are within 45 yrs of age.


In [None]:
# Descriptive statistics of the age column
age_summary = df['age'].describe()
age_summary

In [None]:
# Histogram to show the distribution of ages
ax = df['age'].plot.hist(grid=True, bins=30, rwidth=0.9,
                         color='darkturquoise')
# Title and axis labels let the figure be read on its own.
ax.set_title("Age distribution")
ax.set_xlabel('Age')
ax.set_ylabel('No. of deaths')
# Render the figure without echoing the Axes repr as the cell output.
plt.show()

#### Let's analyze the cases of Minors and Senior Citizens
Most of the minors and senior citizens were in possession of some sort of weapon, mostly a gun, and had a purpose to attack.

In [None]:
# Age cutoffs (inclusive) used to select the two groups.
# NOTE(review): minors are commonly defined as under 18 - confirm that 16 is
# the intended cutoff.
MINOR_MAX_AGE = 16
SENIOR_MIN_AGE = 65

minor = df.query("age <= @MINOR_MAX_AGE")
print("Minors: ", minor.shape)

senior = df.query("age >= @SENIOR_MIN_AGE")
print("Senior citizens: ", senior.shape)

In [None]:
plt.figure(figsize=(10, 7))
splot = sns.countplot(data=minor, x='armed', hue='threat_level', palette='YlGnBu')
splot.legend(loc='upper right')
# Write each bar's count just above its top edge.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        f"{bar_height:.0f}",
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

sns.set_style('ticks')
plt.title("Minors and Armed")
plt.xlabel('Armed')
plt.ylabel('No. of deaths')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(10, 7))
splot = sns.countplot(data=senior, x='armed', hue='threat_level', palette='YlGnBu')
splot.legend(loc='upper right')
# Write each bar's count just above its top edge.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        f"{bar_height:.0f}",
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )
sns.set_style('ticks')
plt.title("Seniors and Armed")
plt.xlabel('Armed')
plt.ylabel('No. of deaths')
plt.xticks(rotation=90)
plt.show()

### Let's have a look at the unarmed population
* It seems, in most of the cases, the police did not use the body camera.
* The majority of the unarmed population did not even '**attack**.'


In [None]:
plt.figure(figsize= (10,7))
# Unarmed people only: counts per threat level, split by body camera use.
splot= sns.countplot(x='threat_level',hue ='body_camera',data=df.query("armed=='unarmed'"),palette='YlGnBu')
# Label every bar with its count.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
sns.set_style('ticks')
plt.title("Unarmed and threat level")
# The x axis shows the threat_level column, so it is labelled accordingly.
plt.xlabel('Threat level')
plt.ylabel('No. of deaths')
plt.xticks(rotation=90)
plt.show()

### Mental illness and not attacking
* 357 cases were recorded where the person was mentally ill, did not attack and was not fleeing as well.
* At the same time it is evident from the data that over the years, the number of such deaths has been decreasing.

In [None]:
# People who showed signs of mental illness and whose threat level was not
# 'attack'.
mentally_ill = df.query("signs_of_mental_illness == True")
not_attacking = mentally_ill.query("threat_level!= 'attack'")

plt.figure(figsize=(6, 7))
splot = sns.countplot(data=not_attacking, x='flee', hue='year', palette='YlGnBu')
# Write each bar's count just above its top edge.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        f"{bar_height:.0f}",
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

sns.set_style('ticks')
splot.legend(loc='upper right')
plt.title("Mentally ill and were not attacking")
plt.xlabel('flee')
plt.ylabel('No. of deaths')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Create a dataframe holding each state and the number of killings recorded
# in it.
# NOTE(review): rows with a missing race were dropped from df earlier in the
# notebook, so these counts cover only the remaining rows.
state_count = (
    df[['state', 'id']]
    .groupby('state', as_index=False)
    .count()
    .rename(columns={"id": "count"})
)

### Which states have the most kills?
* A choropleth map is an interesting way to show state boundaries with colors depicting the death counts.
* **California** has the maximum number of kills followed by **Texas** and **Florida**

In [None]:
# Load the shape of the zone (US states)
# The boundary file is read straight from the folium repository:
# https://github.com/python-visualization/folium/tree/master/examples/data
# Download it and point state_geo at the local copy to work offline.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'

# Initialize the map:
m = folium.Map(location=[37, -102], zoom_start=4)

# Add the color for the choropleth: state_count['state'] is matched against
# the 'id' of every GeoJSON feature and the fill colour follows the 'count'
# column.
choropleth = folium.Choropleth(
 geo_data=state_geo,
 name='choropleth',
 data=state_count,
 columns=['state', 'count'],
 key_on='feature.id',
 fill_color='GnBu',
 fill_opacity=0.7,
 line_opacity=0.2,
 legend_name='Number of shootings'
).add_to(m)
folium.LayerControl().add_to(m)

# Show the state name in a tooltip when hovering over a state.
choropleth.geojson.add_child(folium.features.GeoJsonTooltip(fields = ['name'],aliases=['State'],style=('background-color: grey; color: white;')))

m

### Top 10 cities with maximum number of deaths

In [None]:
# Count deaths per city. The state is part of the grouping key so that
# cities sharing a name in different states are not added together.
city = (
    df.groupby(['city', 'state'], as_index=False)['id']
    .count()
    .rename(columns={"id": "count"})
    .sort_values(by='count', ascending=False)
    .head(10)
)
city

### Conclusion
* The total number of killings has gone up in the past few years.
* Majority of the people who were killed were White people followed by Black and Hispanic people
* In most of the cases, the police did not use a body camera during the encounters; its use should be enforced by the department for transparency.
* Most of the people who were shot dead were in possession of gun and attacked someone.
* Based on the data, killings of innocent people have dropped in the past few years.



#### Thank you so much for going through my notebook. Please<font color='green'> UPVOTE </font>if you found my analysis interesting. Comments are most welcome, as those will help me get better.