## Quick navigation

* [1. Preprocessing](#1)
* [2. Data visualization](#2)
* [3. Focus on cases in California](#3)

## Color Palettes

In this notebook, I used color palettes from the websites below:
* https://flatuicolors.com/
* https://colorpalettes.net/

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import folium
%matplotlib inline

In [None]:
# Load the shootings CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('../input/us-police-shootings/shootings.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

# Preprocessing
<a id="1"></a>

## Find columns which include null values

In [None]:
def find_null(dataFrame):
    """Print a per-column report of missing values.

    For every column of ``dataFrame`` that contains at least one null, one line
    ``"<column>": <count> null values`` is printed (in column order), followed
    by a separator and a summary of how many columns were affected.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # One vectorised pass gives the null count of every column at once.
    null_counts = dataFrame.isna().sum()
    affected = null_counts[null_counts > 0]

    for name, count in affected.items():
        print(f'"{name}": {count} null values')

    print('---------------------------------------')
    print(f'\n{len(affected)}/{len(dataFrame.columns)} columns have null values')

In [None]:
# Report which columns of the raw data contain nulls.
find_null(df)

## Create "year", "month", "day" and "day of week" columns

In [None]:
# Parse the 'date' column once, keep the parsed values as 'tmp_date',
# and derive the calendar parts used by the time-based plots from it.
parsed_dates = pd.to_datetime(df['date'])
df['tmp_date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month
df['day'] = parsed_dates.dt.day
df['day_of_week'] = parsed_dates.dt.day_name()  # e.g. 'Monday'

In [None]:
# Check the newly added date-derived columns.
df.head()

## Create "age range" column

In [None]:
# Bucket ages into left-closed decades: [0, 10), [10, 20), ..., [110, 120).
decade_edges = np.arange(0, 130, 10)
df['age_range'] = pd.cut(df['age'], bins=decade_edges, right=False)

In [None]:
# Check the newly added 'age_range' column.
df.head()

# Data visualization
<a id="2"></a>

## Mapping (Number of cases by state)

In [None]:
# Number of cases per state; 'id' serves only as the column being counted
# and is renamed to 'case' for the choropleth.
tmp_df = (
    df.groupby('state', as_index=False)['id']
    .count()
    .rename(columns={'id': 'case'})
)

# US state outlines from the folium example data (fetched over the network).
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'

m = folium.Map(location=[48, -102], zoom_start=3)

# Shade each state by its case count. `key_on` names the GeoJSON field that is
# matched against the first entry of `columns` ('state').
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=tmp_df,
    columns=['state', 'case'],
    key_on='feature.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of cases'  # was misspelled 'Numer of cases'
).add_to(m)

folium.LayerControl().add_to(m)

# Last expression of the cell: renders the interactive map.
m

## Number of cases by state

In [None]:
def show_countplot(df, col, color, figsize=(10,5), rotation=0):
    """Draw a bar chart of how often each value of ``df[col]`` occurs.

    Bars are ordered by descending frequency (the order returned by
    ``value_counts``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    col : str
        Column whose values are counted; also used in the title and x label.
    color : str
        Matplotlib color applied to all bars.
    figsize : tuple of (float, float), default (10, 5)
        Figure size in inches.
    rotation : float, default 0
        Rotation of the x tick labels in degrees.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Forward the rotation as `rot`: pandas bar plots apply their own default
    # tick rotation, which would override a rotation set before plotting.
    df[col].value_counts().plot(kind="bar", color=color, rot=rotation, ax=ax)
    # Label after plotting so pandas cannot overwrite the axis labels.
    ax.set_title('number of cases by ' + col, fontsize=20)
    ax.set_xlabel(col, fontsize=15)
    ax.set_ylabel('count', fontsize=15)
    plt.show()

In [None]:
# One bar per state: request vertical tick labels explicitly instead of
# relying on the plotting library's default rotation.
show_countplot(df, 'state', "#ff6d69", rotation=90)

## Gender ratio

In [None]:
def plot_pie(df, col, color):
    """Draw a pie chart of the share of each value in ``df[col]``.

    Slices are ordered by descending count, start at 12 o'clock and run
    clockwise; each slice is annotated with its percentage. Missing values
    are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    col : str
        Column whose value frequencies are plotted; also used in the title.
    color : sequence of str
        Matplotlib colors, assigned to the slices in order.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # value_counts() yields a 1-D Series sorted by descending count. pie()
    # needs 1-D input, so the previous one-column DataFrame is not suitable.
    counts = df[col].value_counts()

    fig, ax = plt.subplots(figsize=(12,8))
    ax.set_title(col + ' ratio', fontsize=20)
    ax.pie(counts.to_numpy(), labels=counts.index, colors=color,
           counterclock=False, startangle=90,
           autopct="%1.1f%%", pctdistance=0.7)
    plt.show()

In [None]:
# Two slice colors for the gender pie; plot_pie draws slices in descending
# count order, so the first color goes to the most frequent value.
color1 = ("#74b9ff", "#e84393")
plot_pie(df, 'gender', color1)

## Manner of death ratio

In [None]:
# Two slice colors for the manner-of-death pie, applied in descending count order.
color2 = ("#ffeaa7","#e17055")
plot_pie(df, 'manner_of_death', color2)

## Race ratio

In [None]:
# 15-entry palette for the race pie (note that "#ffeaa7" appears twice);
# colors are handed to the slices in order, one per slice.
color3 = ("#55efc4","#81ecec","#74b9ff","#a29bfe","#dfe6e9",
          "#ffeaa7","#e17055","#d63031","#e84393","#2d3436",
          "#00b894","#0984e3","#ffeaa7","#fab1a0","#fd79a8")
plot_pie(df, 'race', color3)

## Arms category count

In [None]:
# Category names are text labels: request vertical tick labels explicitly
# instead of relying on the plotting library's default rotation.
show_countplot(df, 'arms_category', "#74b9ff", rotation=90)

## Combination of race and other attributes

In [None]:
# One figure with three stacked panels: case counts by race, split by each
# of the attributes below. (Previously every panel created its own 12x12
# figure and drew into only one third of it.)
hue_columns = ['signs_of_mental_illness', 'flee', 'body_camera']

fig, axes = plt.subplots(len(hue_columns), 1, figsize=(12, 12))
for ax, hue_col in zip(axes, hue_columns):
    sns.countplot(x='race', hue=hue_col, data=df, ax=ax)
    ax.set_title(f'race & {hue_col}', fontsize=15)

# Keep titles and x labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

## Number of cases by year

In [None]:
# Case count per year, years in ascending order.
fig, ax = plt.subplots(figsize=(10, 5))
# A single bar color is passed via `color=`; a one-element `palette` without
# `hue` is deprecated in recent seaborn releases.
sns.countplot(x='year', data=df, order=sorted(df['year'].unique()), color="#7fcdbb", ax=ax)
ax.set_title('number of cases by year', fontsize=15)
plt.show()

## Number of cases by month (2015~2019)

In [None]:
# Case count per calendar month, with 2020 excluded
# (presumably because 2020 is incomplete in this dataset -- TODO confirm).
fig, ax = plt.subplots(figsize=(10, 5))
tmp_df = df[df['year'] != 2020]
# The month order is taken from the full data so every month gets a slot.
# A single bar color is passed via `color=` instead of a one-element palette.
sns.countplot(x='month', data=tmp_df, order=sorted(df['month'].unique()), color="#fd79a8", ax=ax)
ax.set_title('number of cases by month (2015~2019)', fontsize=15)
plt.show()

## Number of cases by day of week

In [None]:
# Case count per weekday, Monday through Sunday.
fig, ax = plt.subplots(figsize=(10, 5))
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# A single bar color is passed via `color=` instead of a one-element palette.
sns.countplot(x='day_of_week', data=df, order=order, color="#6c5ce7", ax=ax)
ax.set_title('number of cases by day_of_week', fontsize=15)
plt.show()

## Number of cases by day (2015~2019)

In [None]:
# Case count per day of the month, with 2020 excluded
# (presumably because 2020 is incomplete in this dataset -- TODO confirm).
fig, ax = plt.subplots(figsize=(10, 5))
tmp_df = df[df['year'] != 2020]
# The day order is taken from the full data so every day number gets a slot.
# A single bar color is passed via `color=` instead of a one-element palette.
sns.countplot(x='day', data=tmp_df, order=sorted(df['day'].unique()), color="#fdcb6e", ax=ax)
ax.set_title('number of cases by day (2015~2019)', fontsize=15)
plt.show()

## Number of cases by age_range

In [None]:
# Case count per 10-year age bin.
fig, ax = plt.subplots(figsize=(10, 5))
# Only the bins that actually occur, in ascending order. dropna() keeps
# sorted() from comparing NaN with Interval objects if any age is missing.
observed_ranges = sorted(df['age_range'].dropna().unique())
# A single bar color is passed via `color=` instead of a one-element palette.
sns.countplot(x='age_range', data=df, order=observed_ranges, color="#013766", ax=ax)
ax.set_title('number of cases by age_range', fontsize=15)
plt.show()

# Focus on cases in California
<a id="3"></a>

California has the largest number of deaths, so this section focuses on California and compares it with the other states.

In [None]:
# Mapping (Number of cases by state)
# Number of cases per state; 'id' serves only as the column being counted
# and is renamed to 'case' for the choropleth.
tmp_df = (
    df.groupby('state', as_index=False)['id']
    .count()
    .rename(columns={'id': 'case'})
)

# US state outlines from the folium example data (fetched over the network).
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'

m = folium.Map(location=[48, -102], zoom_start=3)

# Shade each state by its case count. `key_on` names the GeoJSON field that is
# matched against the first entry of `columns` ('state').
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=tmp_df,
    columns=['state', 'case'],
    key_on='feature.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of cases'  # was misspelled 'Numer of cases'
).add_to(m)

folium.LayerControl().add_to(m)

# Last expression of the cell: renders the interactive map.
m

In [None]:
# Split the data into California rows and all remaining rows using one mask.
is_california = df['state'] == 'CA'
df_ca = df[is_california]
df_other = df[~is_california]

In [None]:
# Row count, dtypes and non-null counts of the California subset.
df_ca.info()

## Race ratio - California vs other states

In [None]:
# Side-by-side case counts by race: California (left) vs. all other states (right).
# Both panels share the same category order and palette, so a given race
# has the same position and color in each.
color = ["#55efc4","#81ecec","#74b9ff","#a29bfe","#dfe6e9","#ffeaa7"]
order = ['Hispanic', 'White', 'Black', 'Asian', 'Native', 'Other']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(x='race', data=df_ca, order=order, palette=color, ax=ax1)
ax1.set_title('Number of cases in CA by race', fontsize=15)

sns.countplot(x='race', data=df_other, order=order, palette=color, ax=ax2)
ax2.set_title('Number of cases in other states by race', fontsize=15)

# plt.show() rather than fig.show(): the latter only warns under the
# non-GUI inline backend this notebook uses.
plt.show()

## Gender ratio - California vs other states

In [None]:
# Side-by-side case counts by gender: California (left) vs. all other states (right).
color = ["#74b9ff", "#e84393"]
# Fix one category order for both panels (most frequent value first, taken
# from the full data). Without it each panel orders categories by its own
# data, so the same gender could get different colors left and right.
gender_order = df['gender'].value_counts().index.tolist()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(x='gender', data=df_ca, order=gender_order, palette=color, ax=ax1)
ax1.set_title('Number of cases in CA by gender', fontsize=15)

sns.countplot(x='gender', data=df_other, order=gender_order, palette=color, ax=ax2)
ax2.set_title('Number of cases in other states by gender', fontsize=15)

# plt.show() rather than fig.show(): the latter only warns under the
# non-GUI inline backend this notebook uses.
plt.show()

In [None]:
# Side-by-side case counts by age range: California (left) vs. all other states (right).
# A single bar color is passed via `color=` instead of a one-element palette.
bar_color = "#013766"

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Titles previously said "by gender" (copy-paste slip); these panels plot age_range.
sns.countplot(x='age_range', data=df_ca, color=bar_color, ax=ax1)
ax1.set_title('Number of cases in CA by age range', fontsize=15)

sns.countplot(x='age_range', data=df_other, color=bar_color, ax=ax2)
ax2.set_title('Number of cases in other states by age range', fontsize=15)

# plt.show() rather than fig.show(): the latter only warns under the
# non-GUI inline backend this notebook uses.
plt.show()

In [None]:
# Side-by-side case counts by year: California (left) vs. all other states (right).
# A single bar color is passed via `color=` instead of a one-element palette.
bar_color = "#7fcdbb"
# One shared, ascending year order (from the full data) so both panels show
# the same x positions even if a subset has no cases in some year.
year_order = sorted(df['year'].unique())

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(x='year', data=df_ca, order=year_order, color=bar_color, ax=ax1)
ax1.set_title('Number of cases in CA by year', fontsize=15)

sns.countplot(x='year', data=df_other, order=year_order, color=bar_color, ax=ax2)
ax2.set_title('Number of cases in other states by year', fontsize=15)

# plt.show() rather than fig.show(): the latter only warns under the
# non-GUI inline backend this notebook uses.
plt.show()