# Shooting Analysis: a data science project on Kaggle
Problem statement:

Questions that this analysis explores, for example:

- What is the rate of shootings over time?
- What is the rate of killings relative to race and age?
- Which states have the most killings?


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data

In [None]:
# Load the fatal-police-shootings CSV from the Kaggle input directory.
csv_path = '/kaggle/input/data-police-shootings/fatal-police-shootings-data.csv'
shooting = pd.read_csv(csv_path)
# Summarise column dtypes and non-null counts.
shooting.info()

# Identifying missing data and replacing missing values with appropriate values

In [None]:
# Count missing values per column instead of dumping the full boolean mask,
# which is as large as the dataset itself and hard to read.
shooting.isnull().sum()


**Importing libraries**

In [None]:

import seaborn as sns
# to render the graphs
import matplotlib.pyplot as plt
# import module to set some ploting parameters
from matplotlib import rcParams


# This function makes the plot directly on browser
%matplotlib inline

# Set a universal default figure size (width, height).
rcParams['figure.figsize'] = (10, 8)
# Visualise where values are missing: null cells show up as yellow lines.
missing_mask = shooting.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

*The factors armed, age, race and flee have missing values.*

**Addressing missing values of age**

In [None]:
# One panel per gender, each showing the distribution of age.
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only render an additional empty figure).
# `height` is the current name of the per-facet size argument; the old
# `size` keyword was renamed and is rejected by newer seaborn releases.
g = sns.FacetGrid(shooting, col='gender', height=5)
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot are the successors); kept so the minimum seaborn
# version required by this notebook does not change.
g = g.map(sns.distplot, "age")
plt.show()

*For both males and females, the mean age at which people were shot is the same, but the median and mode seem to differ. It is clear that young people under 40 years of age have the highest chance of being shot.*

In [None]:
# Box plot of age for each gender.
plt.figure(figsize=(12, 7))
sns.boxplot(data=shooting, x='gender', y='age', palette='winter')

**For both males and females, the median age at which people were shot is the same, but the mode seems to differ. It is clear that:**
* young people under 40 years of age have the highest chance of being shot
* older men have a higher chance of being shot than older women

In [None]:
# Box plot of age for each threat level.
plt.figure(figsize=(12, 7))
sns.boxplot(data=shooting, x='threat_level', y='age', palette='winter')

**From the threat-level perspective:**
* People with threat level "attack" tend to be older than those with the other threat levels
* The mean and median age of those shot at threat levels "attack" and "other" seem to be the same
* The mean and median age of those shot at threat level "undetermined" seem to differ from the others

**Conclusion: to replace a missing age, it is better to look up the threat level and use the median age for that level.**

In [None]:

def impute_age(cols):
    """Return the age for one row, filling a missing age from the threat level.

    Parameters
    ----------
    cols : sequence or pandas.Series
        The first element is the age, the second the threat level
        (as produced by ``shooting[['age', 'threat_level']].apply(..., axis=1)``).

    Returns
    -------
    The original age when it is present; otherwise 38 for threat levels
    "attack" and "other", and 36 for anything else.
    """
    # Unpack by position through iteration: plain integer indexing such as
    # cols[0] on a label-indexed Series relies on a deprecated positional
    # fallback in pandas.
    age, threat_level = tuple(cols)[:2]

    if not pd.isnull(age):
        return age

    # Hard-coded replacement ages per threat level.
    # NOTE(review): presumably the medians read off the box plot — confirm.
    if threat_level in ("attack", "other"):
        return 38
    return 36

In [None]:
# Row-wise imputation: each (age, threat_level) pair is passed to impute_age.
age_inputs = shooting[['age', 'threat_level']]
shooting['age'] = age_inputs.apply(impute_age, axis=1)

**Addressing missing values of the factor gender**

In [None]:
# Fill missing genders with the most frequent value.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not update `shooting` when copy-on-write is enabled.
most_common_gender = shooting['gender'].mode()[0]
shooting['gender'] = shooting['gender'].fillna(most_common_gender)

**Addressing missing values of the factor armed**

In [None]:
# Number of shootings per `armed` category, split by manner of death.
sns.countplot(data=shooting, x='armed', hue='manner_of_death', palette='rainbow')

In [None]:
shooting['armed'].mode()

In [None]:
# Fill missing `armed` values with the most frequent category.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not update `shooting` when copy-on-write is enabled.
most_common_armed = shooting['armed'].mode()[0]
shooting['armed'] = shooting['armed'].fillna(most_common_armed)

**Addressing missing values of the factor race**

In [None]:
shooting['race'].value_counts()

In [None]:
# Re-draw the missing-value map of the current state of `shooting`.
null_mask = shooting.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')

In [None]:

# Number of shootings per race, split by manner of death.
plt.subplot(2, 1, 1)
# The column is passed as `x=`: current seaborn releases accept only `data`
# positionally, so a bare "race" argument would clash with the data= keyword.
sns.countplot(x="race", data=shooting, hue="manner_of_death", palette="hls")
plt.ylabel("count", fontsize=18)
plt.xlabel("race", fontsize=18)
plt.title("race dist ", fontsize=20)
plt.show()

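**White people account for more shootings than any other race in this dataset.** Note that these are raw counts, not adjusted for population size, so they do not by themselves show a higher per-capita chance of being shot.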

In [None]:
# Count rows per race (non-null `state` entries), largest first, and plot as bars.
race_counts = shooting.groupby('race')['state'].count()
mostshotrace = pd.DataFrame(race_counts).sort_values('state', ascending=False)
mostshotrace.head(30).plot(kind="bar")

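**White people account for more shootings than any other race in this dataset.** Note that these are raw counts, not adjusted for population size, so they do not by themselves show a higher per-capita chance of being shot.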

In [None]:
# State x race table of the mean age (pivot_table's default aggregation);
# combinations with no rows are filled with 0.
race_state3 = shooting.pivot_table(
    index='state',
    columns='race',
    values='age',
    aggfunc='mean',
    fill_value=0,
)
race_state3

In [None]:
# Categorical scatter of age per state, coloured by race.
sns.set(style="ticks")
g = sns.catplot(data=shooting, x="state", y="age", hue="race", height=10)

In [None]:
# For every (state, race) pair, count the non-null entries of each other column.
race_state = shooting.groupby(by=['state', 'race']).count()


In [None]:
race_state

In [None]:
race_state=race_state.reset_index()

In [None]:
race_state

In [None]:
# Reshape the per-(state, race) counts into a state x race table, taking the
# `age` column as the value; missing combinations become 0.
race_state2 = race_state.pivot_table(
    index='state',
    columns='race',
    values='age',
    fill_value=0,
)
race_state2

In [None]:
def impute_race(cols):
    """Return the race code for one row, filling a missing race from the state.

    Parameters
    ----------
    cols : sequence or pandas.Series
        The first element is the race code, the second the state abbreviation
        (as produced by ``shooting[['race', 'state']].apply(..., axis=1)``).

    Returns
    -------
    The original race when it is present; otherwise "O" for state "HI",
    "N" for state "AZ", and "W" for every other state.
    """
    # Unpack by position through iteration: plain integer indexing such as
    # cols[0] on a label-indexed Series relies on a deprecated positional
    # fallback in pandas.
    race, state = tuple(cols)[:2]

    if not pd.isnull(race):
        return race

    # Hard-coded fallback race per state.
    # NOTE(review): presumably taken from the state/race tables — confirm.
    if state == "HI":
        return "O"
    if state == "AZ":
        return "N"
    return "W"

In [None]:
# Row-wise imputation: each (race, state) pair is passed to impute_race.
race_inputs = shooting[['race', 'state']]
shooting['race'] = race_inputs.apply(impute_race, axis=1)

In [None]:
# Categorical scatter of age per state, coloured by manner of death.
sns.set(style="ticks")
g = sns.catplot(data=shooting, x="state", y="age", hue="manner_of_death", height=10)

In [None]:
# Box plot of age for each state.
plt.figure(figsize=(12, 7))
sns.boxplot(data=shooting, x='state', y='age', palette='winter')

In [None]:
# Missing-value map of `shooting` after the imputation steps above.
remaining_nulls = shooting.isnull()
sns.heatmap(remaining_nulls, yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Copy of `shooting` without the 'flee' column.
# NOTE(review): the next cell assigns `df` from `shooting` again, so this
# result is overwritten; 'flee' is only removed by a later drop on `df`.
df=shooting.drop(['flee'], axis=1)


In [None]:
# Working copy of `shooting` without the 'id' column.
# NOTE(review): this starts from `shooting`, not from `df`, so the preceding
# drop of 'flee' is not carried over; 'flee' is dropped again further down.
df=shooting.drop(['id'],axis=1)

In [None]:
df.info()

In [None]:
# Parse the 'date' column into datetimes so the .dt accessors can be used.
# NOTE(review): dayfirst=True makes pandas prefer day-before-month when a
# date is ambiguous — confirm this matches the CSV's date format.

df['date']=pd.to_datetime(df['date'],dayfirst=True)

In [None]:
# Derive calendar features from the parsed date column.
dates = df['date'].dt
df['quarter'] = dates.quarter
df['year'] = dates.year
df['month'] = dates.month
df['day'] = dates.day
df['week_day'] = dates.dayofweek  # Monday=0 ... Sunday=6

In [None]:
df.info()

In [None]:
df.drop(['name'],axis=1)

In [None]:
# Remove the 'name' column from the working frame.
df = df.drop(columns=['name'])

In [None]:
# Remove the 'date' column; quarter/year/month/day/week_day were derived from it above.
df = df.drop(columns=['date'])

In [None]:
df.info()

In [None]:
# Remove the 'flee' column from the working frame.
df = df.drop(columns=['flee'])

In [None]:
df.info()

In [None]:
#df = pd.get_dummies(df, columns=["manner_of_death","armed","gender","race","city","state","signs_of_mental_illness","threat_level","body_camera"],\
                         #prefix=["death","arned","gender","race","city","st","ill","threat","photo"], drop_first=False)

In [None]:
#plt.show()
#plt.figure(figsize=(15,12))
#sns.heatmap(df.astype(float).corr(),vmax=1.0,  annot=True)
#plt.show()

In [None]:
df.info()

In [None]:
# Show the category frequencies of every categorical column.
# A notebook cell only renders its last bare expression, so each table is
# printed explicitly; otherwise only the final value_counts() would be shown.
categorical_columns = [
    "manner_of_death",
    "armed",
    "gender",
    "race",
    "city",
    "state",
    "signs_of_mental_illness",
    "threat_level",
    "body_camera",
]
for column in categorical_columns:
    print(f"--- {column} ---")
    print(df[column].value_counts())
    print()


In [None]:
from sklearn import preprocessing

# Integer-encode each categorical column. Missing values are first replaced
# by the placeholder string 'Nan' so that they are encoded as well.
encoder = preprocessing.LabelEncoder()
label_encoded_columns = (
    "manner_of_death",
    "armed",
    "gender",
    "race",
    "city",
    "state",
    "signs_of_mental_illness",
    "threat_level",
    "body_camera",
)
for column in label_encoded_columns:
    # The same encoder object is re-fitted for every column.
    df[column] = encoder.fit_transform(df[column].fillna('Nan'))

In [None]:
df.info()

In [None]:
# Correlation heatmap over all columns of the encoded frame.
# NOTE(review): the label-encoded columns carry arbitrary integer codes, so
# correlations involving them should be interpreted with care.
plt.figure(figsize=(15, 12))
sns.heatmap(df.astype(float).corr(), vmax=1.0, annot=True)
plt.title("Correlation between encoded features", fontsize=20)
plt.show()

In [None]:
df.isnull().sum()


In [None]:
# Per (year, city) group, count the non-null entries of every remaining column.
# NOTE(review): `city` was label-encoded in an earlier cell, so the groups are
# keyed by integer codes rather than city names — confirm this is intended.
rateofshooting=df.groupby(['year','city']).count()
rateofshooting

In [None]:
rateofshooting=rateofshooting.reset_index()

In [None]:
rateofshooting

In [None]:
# Number of shootings per year, split by quarter.
sns.countplot(x='year', data=df, palette="hls", hue="quarter")
plt.xlabel("year", fontsize=16)
# The bars are raw counts, so the axis is labelled as a count, not a rate.
plt.ylabel("number of shootings", fontsize=16)
plt.title("Yearly number of shootings by quarter", fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of shootings per year, split by day of the week.
sns.countplot(x='year', data=df, palette="hls", hue="week_day")
plt.xlabel("year", fontsize=16)
# The bars are raw counts, so the axis is labelled as a count, not a rate.
plt.ylabel("number of shootings", fontsize=16)
plt.title("Yearly number of shootings by day of week", fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Race x state table of the mean age (pivot_table's default aggregation);
# combinations with no rows are filled with 0.
df1 = shooting.pivot_table(
    index='race',
    columns='state',
    values='age',
    aggfunc='mean',
    fill_value=0,
)
df1

In [None]:
# Bar plot of age per race (seaborn's default estimator is the mean).
sns.barplot(data=shooting, x="race", y="age")

In [None]:
# Number of shootings per state, bars ordered from most to fewest.
plt.figure(figsize=(20, 12))

states_by_frequency = shooting['state'].value_counts().index
sns.countplot(x='state', data=shooting, order=states_by_frequency)
plt.show()

In [None]:
shooting.info()

In [None]:
# Ten states with the most shootings, taken from the `shooting` frame so the
# index holds the original state values (in `df` the state column has been
# label-encoded). The former leading df['state'] expression was removed: its
# result was neither assigned nor displayed.
shootingystate = shooting['state'].value_counts().head(10)
shootingystate

In [None]:
import plotly.express as px

# Build a two-column frame with explicit column names. This avoids relying on
# the default labels produced by value_counts()/reset_index(), which changed
# in pandas 2.0 (previously 'index'/'state', now 'state'/'count').
shootingystate1 = shootingystate.rename_axis('state').reset_index(name='shootings')
fig = px.pie(
    shootingystate1,
    values='shootings',
    names='state',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()