In [None]:
# Importing libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#### Loading the dataset

In [None]:
# Location of the police-shootings CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/us-police-shootings/shootings.csv'

# Read the whole file into the working frame and preview its first rows.
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Count NaN cells per column, then add the per-column counts into one total.
missing_total = df.isna().sum().sum()
print(f'# of missing values in the dataset: {missing_total}')

#### How the dataset is distributed by gender and other categorical features

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Categorical columns to summarise, one pie chart each, filled row by row
# into the 2x3 grid below.
pie_columns = ['gender', 'manner_of_death', 'signs_of_mental_illness',
               'threat_level', 'flee', 'body_camera']

fig, axes = plt.subplots(2,3, figsize=(18,8))

# Shared pie-chart options: percentage labels with one decimal, drop shadow,
# and the first wedge starting at 120 degrees.
ft = {'kind':'pie','autopct':'%1.1f%%', 'shadow':True, 'startangle':120}

for column, ax in zip(pie_columns, axes.flat):
    df[column].value_counts().plot(**ft, ax=ax)
    # pandas uses the Series name as the pie's y-label, and on pandas >= 2.0
    # value_counts() names its result 'count' for every column. Without an
    # explicit title the six panels would be indistinguishable, so label each
    # panel with its column and drop the automatic y-label.
    ax.set_title(column)
    ax.set_ylabel('')

In [None]:
# Number of rows per race, most frequent first (value_counts default order).
race_counts = df['race'].value_counts()

# Bar colours, applied in bar order.
bar_colors = ['DarkOrange', 'DarkGreen', 'DarkBlue', 'Gray', 'y']

race_counts.plot(kind='bar', color=bar_colors,
                 xlabel='race', ylabel='# of shootings');


In [None]:
import seaborn as sns

# One group of bars per race, split by manner of death.
g = sns.countplot(data=df, x='race', hue='manner_of_death')

# Axis labels and title applied in a single call.
g.set(xlabel='Race',
      ylabel='# of people',
      title='# of dead people based on their race');

#### Feature engineering


In [None]:
# Parse the raw date column; values that cannot be parsed are coerced to NaT
# instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Datetime accessor of the parsed column, reused for every derived feature.
when = df['date'].dt

# Calendar features derived from the parsed date.
df.loc[:, 'year'] = when.year
df.loc[:, 'weekofyear'] = when.isocalendar().week
df.loc[:, 'month'] = when.month
df.loc[:, 'dayofweek'] = when.dayofweek
# weekday is 0 for Monday .. 6 for Sunday, so >= 5 flags Saturday and Sunday.
df.loc[:, 'weekend'] = (when.weekday >= 5).astype(int)
df.loc[:, 'quarter'] = when.quarter

In [None]:
# Preview the first rows of the frame, which now carries the date-derived
# columns (year, weekofyear, month, dayofweek, weekend, quarter).
df.head()

In [None]:
fig, axes = plt.subplots(2,2, figsize=(16,12))

def plot_count(x, ax, data=None):
    """Draw a count plot of column ``x`` on the given axes.

    Parameters
    ----------
    x : str
        Name of the column whose values are counted.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    data : pandas.DataFrame, optional
        Frame to read ``x`` from. When omitted, the notebook-level ``df`` is
        looked up at call time. A ``data=df`` default would be bound once,
        when the function is defined, and would keep pointing at the old
        frame if ``df`` were later reassigned.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the plot was drawn on, so callers can customise it further.
    """
    if data is None:
        data = df
    s = sns.countplot(x=x, data=data, palette='Set2', ax=ax)
    s.set_title(f'# of people died by {x}')
    return s

# One panel per date-derived feature, filling the 2x2 grid row by row.
for column, panel in zip(['month', 'dayofweek', 'weekend', 'quarter'], axes.flat):
    plot_count(x=column, ax=panel)

#### Aggregated features

In [None]:
# Aggregation spec: column name -> list of statistics computed per city.
aggs = {
    'month': ['nunique', 'mean'],
    'weekofyear': ['nunique', 'mean'],
    'age': ['min', 'max', 'mean', 'sum'],
}

# Group by city, apply the spec, and turn the city index back into a column.
aggs_df = df.groupby('city').agg(aggs).reset_index()
aggs_df.head()

#### Creating new features
- We may want to consider city and race together, so we combine them into a single `city_race` feature.

In [None]:
# Cast both columns to text so they can be concatenated.
city_text = df['city'].astype(str)
race_text = df['race'].astype(str)

# Combined key of the form '<city>_<race>'.
df['city_race'] = city_text + '_' + race_text
df['city_race'].describe()