# Police Violence in US, 2013-2021

1st draft of notebook: data profile and plots

###  Import Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

import os
#  list every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

#  read data
csv_path = '../input/police-violence-in-the-us-20132021/PoliceViolenceUS2013-2021.csv'
df = pd.read_csv(csv_path)

#  the Date column is cast to datetime64[ns] so the .dt accessor
#  can be used on it in the plotting cells below
df['Date'] = df['Date'].astype('datetime64[ns]')

#  preview the first rows
df.head()

##  Data Profile

In [None]:
#  heatmap of the boolean null mask: one row per record, one column per field
null_mask = df.isnull()

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('PoliceViolenceUS2013-2021', fontsize=16)
sns.heatmap(null_mask, cmap='mako', ax=ax)
plt.show()

##  Age Distribution

In [None]:
#  Two-panel figure:
#    left  - density (KDE) of Victim Age, one curve per Victim Race
#    right - Victim Age against year, one line per Victim Race
plt.figure(figsize=(10, 5))

#  LEFT PANEL: AGE/RACE DISTRIBUTION
plt.subplot(1, 2, 1)
age_by_race = df['Victim Age'].groupby(df['Victim Race'])
age_by_race.plot(kind='kde')
plt.xlim([0, 80])
plt.title('Age Distribution per Race', fontsize=16)
plt.xlabel('Victim Age')

#  RIGHT PANEL: AGE/RACE OVER TIME
plt.subplot(1, 2, 2)
year_labels = df['Date'].dt.strftime('%Y').sort_values()    # 'YYYY' strings
race_order = sorted(df['Victim Race'].unique())             # fixed hue ordering
sns.lineplot(
    data=df,
    x=year_labels,
    y='Victim Age',
    hue=df['Victim Race'],
    hue_order=race_order,
    ci=30,
)
plt.title('Age & Race Distribution over Time', fontsize=16)
plt.xlabel('year')
#  anchor the legend outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1))
plt.show()

##  Deaths over Time
#### Police Violence Deaths from 2013 to 2020

In [None]:
#  Two-panel figure:
#    left  - total number of records per year
#    right - number of records per year, split by Victim Race
#  The year label is derived once and shared by both panels.
year = df['Date'].dt.strftime('%Y')    # 'YYYY' string for every record

plt.figure(figsize=(10,5))

plt.subplot(121)
#  size() counts rows per year (rows with a missing Victim Age are still counted)
sns.lineplot(data = df['Victim Age'].groupby(year).size())
plt.title('Total Deaths (2013-2020)', fontsize = 16)
plt.xlim('2013','2020')
plt.ylim([1025,1150])    # hard-coded y-range; counts outside it are clipped
plt.xlabel('YEAR')
plt.ylabel('COUNT')

plt.subplot(122)
#  Year x race table of record counts.
#  .assign() builds a new frame instead of writing into a column subset of df,
#  which raised SettingWithCopyWarning (hidden by the global warnings filter).
dfx = (
    df[['Victim Race']]
    .assign(Date = year)
    .groupby(['Date', 'Victim Race'])
    .size()
    .unstack()
)

plt.plot(dfx)
plt.title('Deaths per Race (2013-2020)', fontsize = 16)
plt.legend(dfx.columns, bbox_to_anchor =(1.05, 1))
plt.xlim('2013','2020')
plt.xlabel('YEAR')
plt.ylabel('COUNT')
plt.show()

##  Categorical Data Plots
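#### Separate Categorical and Numeric Data

Columns with at most 10 unique values are treated as categorical; all other columns are treated as numeric.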

In [None]:
#  Partition the columns by cardinality, visiting them in sorted name order:
#    more than 10 distinct values -> df_NUM
#    10 or fewer distinct values  -> df_CAT (also printed with its count)
#  unique() keeps NaN as a value, so missing data adds one to the count.
df_CAT, df_NUM = [], []
for col in df.columns.sort_values():
    n_unique = len(df[col].unique())
    if n_unique > 10:
        df_NUM.append(col)
        continue
    df_CAT.append(col)
    print('{:>30} {:>3} unique values'.format(col, n_unique))

####  Plot Categorical  Data

In [None]:
#  Grid of count plots, one panel per column listed in df_CAT.
#  Each panel shows at most the 7 most frequent values of the column and
#  labels every bar with its count and its share of the non-null rows.
n_cols = 3                                     # panels per row
n_rows = max(1, -(-len(df_CAT) // n_cols))     # ceil(len/n_cols), at least 1

#  5 inches of height per grid row; sizing the grid by the rows actually
#  needed avoids the large blank area left by one grid row per column.
plt.figure(figsize=(12, n_rows*5))

panel = 1    # 1-based index of the next free subplot slot
for col in df_CAT:
    #  value_counts() is sorted descending, so head(7) keeps the top 7 values;
    #  the same Series drives both the bars and their labels.
    top = df[col].value_counts().head(7)
    if top.empty:
        #  column holds only missing values: nothing to draw
        continue
    n_valid = df[col].count()    # non-null rows: denominator for the percentages

    plt.subplot(n_rows, n_cols, panel)
    sns.countplot(data = df, x = col, order = top.index)
    plt.title(col, fontsize = 14)
    plt.xlabel('')
    plt.xticks(rotation = 90)
    #  20% headroom above the tallest bar so its label fits inside the axes
    plt.ylim([0, round(top.max() * 1.2, 0)])
    for pos, n in enumerate(top):
        label = "{}\n({:.2%})\n".format(n, n/n_valid)
        plt.text(pos, n, label, ha = 'center', va='baseline')
    panel = panel + 1

plt.tight_layout()
plt.show()

**TODO:** This notebook is a work in progress; I plan to complete it when I have time :-)
## THE END