In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import math
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

## Data loading and preprocessing

In [None]:
# Fatal shootings: read the CSV, then expand the categorical columns into
# indicator columns. `gender` is encoded first with drop_first=True; the
# remaining categoricals keep one indicator per category.
shootings = (
    pd.read_csv(
        '../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv',
        encoding='windows-1252',
    )
    .pipe(pd.get_dummies, columns=['gender'], drop_first=True)
    .pipe(
        pd.get_dummies,
        columns=['manner_of_death', 'race', 'threat_level', 'flee'],
        drop_first=False,
    )
)

# USCB county-level data
census = pd.read_csv('../input/us-census-demographic-data/acs2017_county_data.csv')

# city-level data
city_pops = pd.read_csv('../input/us-cities-counties-and-states/us_cities_counties.csv')

# mapping from the `State` column to the `Abbreviation` column of states.csv
abbreviations_dict = (
    pd.read_csv('../input/us-state-county-name-codes/states.csv', index_col='State')
    ['Abbreviation']
    .to_dict()
)

In [None]:
# City table: drop rows without a population, discard the columns that are
# not used, and rename the rest so the join keys are `state` / `city` / `county`.
city_pops = (
    city_pops
    .dropna(subset=['population'])
    .drop(columns=['city_ascii', 'county_fips_all', 'county_fips', 'county_name_all',
                   'source', 'military', 'incorporated', 'timezone', 'ranking',
                   'zips', 'id', 'state_name'])
    .rename(columns={'state_id': 'state', 'population': 'city_pop', 'lat': 'city_lat',
                     'lng': 'city_long', 'county_name': 'county',
                     'density': 'city_pop_density_km2'})
    .sort_values(by='state')
)

# Census table: map the `State` values through abbreviations_dict, drop the
# id/error columns, and rename to the same `state` / `county` keys.
census = (
    census
    .replace({'State': abbreviations_dict})
    .drop(columns=['CountyId', 'IncomePerCapErr', 'IncomeErr'])
    .rename(columns={'State': 'state', 'County': 'county', 'TotalPop': 'county_pop',
                     'Men': 'county_pop_men', 'Women': 'county_pop_women',
                     'Employed': 'county_pop_employed'})
)
# strip the " county" suffix (any letter case) so names match the city table
census['county'] = census['county'].str.replace(' county', '', case=False)

# Shootings: drop the source id; reset_index(drop=False) keeps the old row
# index as an `index` column.
shootings = shootings.drop(columns=['id']).reset_index(drop=False)

In [None]:
# merge the datasets
# Both joins only attach city/county attributes to shooting rows, so each
# lookup table must contribute at most one row per join key. Otherwise the
# left join repeats the shooting row and inflates every count computed from
# `merged` afterwards.
# NOTE(review): assumes a (state, city) pair may appear more than once in the
# city table (same city name in several counties) - confirm against the data.
# When it does, the most populous entry is the one kept.
city_lookup = (
    city_pops
    .sort_values(by='city_pop', ascending=False)
    .drop_duplicates(subset=['state', 'city'])
)
county_lookup = census.drop_duplicates(subset=['state', 'county'])

# validate='many_to_one' makes pandas raise if a lookup key is still duplicated
merged = shootings.merge(city_lookup, how='left', on=['state', 'city'],
                         validate='many_to_one')
merged = merged.merge(county_lookup, how='left', on=['state', 'county'],
                      validate='many_to_one')

# a left join against unique keys must keep exactly one row per shooting
assert len(merged) == len(shootings), 'merge changed the number of shooting rows'

In [None]:
def calculate_pvalues(df):
    '''Compute the Pearson-correlation p-value for every pair of numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only its numeric and boolean columns are used; all other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are the numeric columns of
        ``df`` (in their original order). Each cell holds the p-value returned
        by ``scipy.stats.pearsonr`` for that column pair, rounded to 4
        decimals and computed only on the rows where both columns are
        non-missing. A cell is NaN when fewer than two such rows exist.
    '''
    numeric = df.select_dtypes(include=['number', 'bool'])
    # plain float matrix: booleans become 0.0/1.0 and missing values become NaN
    data = numeric.astype(float).to_numpy()
    present = ~np.isnan(data)

    n_cols = data.shape[1]
    result = np.full((n_cols, n_cols), np.nan)
    for i in range(n_cols):
        # the matrix is symmetric, so each pair is computed once and mirrored
        for j in range(i, n_cols):
            both = present[:, i] & present[:, j]
            if both.sum() < 2:
                # pearsonr needs at least two paired observations
                continue
            p_value = stats.pearsonr(data[both, i], data[both, j])[1]
            result[i, j] = result[j, i] = round(float(p_value), 4)

    return pd.DataFrame(result, index=numeric.columns, columns=numeric.columns)

def removeMissingData(a, b):
    '''Drop the positions at which either of two paired sequences is missing.

    A value counts as missing when it is ``None`` or NaN.

    Parameters
    ----------
    a, b : iterable
        Paired observations of equal length. Non-``None`` values must be
        accepted by ``math.isnan``.

    Returns
    -------
    tuple of (list, list)
        Copies of ``a`` and ``b`` restricted to the positions where both
        values are present, in the original order.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.
    '''
    x = list(a)
    y = list(b)
    if len(x) != len(y):
        raise ValueError(
            'a and b must have the same length, got %d and %d' % (len(x), len(y)))

    kept_x, kept_y = [], []
    for u, v in zip(x, y):
        # None is tested for both values first because math.isnan rejects it
        if u is None or v is None or math.isnan(u) or math.isnan(v):
            continue
        kept_x.append(u)
        kept_y.append(v)
    return (kept_x, kept_y)

            
def graph_confidence_95(pvalues, correlations, row, col):
    '''Plot a heatmap of the correlations that are significant and non-negligible.

    A correlation is shown only when its p-value is at most 0.05 and it does
    not lie strictly between -0.1 and 0.1. The surviving values are laid out
    with ``row`` as the heatmap rows and ``col`` as its columns; rows and
    columns left without any value are dropped.

    Parameters
    ----------
    pvalues : pandas.DataFrame
        Numeric p-values, labelled like ``correlations``.
    correlations : pandas.DataFrame
        Correlation coefficients. This frame is not modified.
    row, col : list
        Labels to place on the heatmap rows and columns respectively.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When nothing passes the
        filters, or plotting raises ``ValueError``, a message is printed
        instead.
    '''
    # Build new frames instead of assigning into the inputs, so the caller's
    # `correlations` keeps its weak values after this call.
    significant = pvalues <= .05
    notable = correlations.where(correlations.abs() >= 0.1)

    plot_df = notable.where(significant).reindex(index=row, columns=col)
    plot_df = plot_df.dropna(axis='columns', how='all')
    plot_df = plot_df.dropna(axis='index', how='all')

    if plot_df.empty:
        print('no correlations pass the rho > |0.1| and p-value < 0.05 filters')
        return

    # Scale the figure with the table but keep it drawable: floor division
    # alone gives a zero-sized dimension for a single row or column.
    width = max(plot_df.shape[1] // 1.5, 4)
    height = max(plot_df.shape[0] // 2, 2)

    try:
        fig, ax = plt.subplots(figsize=(width, height))
        sns.heatmap(plot_df, mask=None, linewidths=.1, linecolor='black',
                    annot=True, vmin=-1, vmax=1, ax=ax)

        title = 'correlations: rho > |0.1| and p-value < 0.05'
        ax.set_title(title, fontsize=16)
        plt.show()
    except ValueError as err:
        # best effort: report the plotting problem without stopping the notebook
        print(err)

In [None]:
# sns.set(style='white', color_codes=True)
# sns.countplot(y='race', data=merged[merged['armed'] == 'unarmed'])
# plt.show()

# sns.set(style='white', color_codes=True)
# sns.countplot(y='race', data=merged[merged['armed'] == 'gun'])
# plt.show()

Plan: for every county and state, compute

- the proportion of people killed, by race
- the proportion of each race in the population

proportion killed (per race) = number killed in the county / total population of that race in the county

In [None]:
# get population by race by county
rs = ['Asian', 'Black', 'Hispanic', 'Native',  'Pacific', 'White']

# Each race column is divided by 100 after being multiplied by the county
# population, giving a head count per race; the source columns are then
# renamed to `<race>_pct`.
merged = merged.assign(
    **{'county_pop_' + race: (merged[race] * merged['county_pop']) / 100
       for race in rs}
).rename(columns={
    'Asian': 'asian_pct',
    'Black': 'black_pct',
    'Hispanic': 'hispanic_pct',
    'Native': 'native_pct',
    'Pacific': 'pacific_pct',
    'White': 'white_pct',
})

In [None]:
# # get only blacks and whites
# b = merged['race_B'] == True
# w = merged['race_W'] == True
# bw_only = merged[(b)|(w)]
# bw_only.drop(columns=['race_W'], inplace=True)

In [None]:
# Display the column labels of `merged` (the cell's last expression is rendered).
merged.columns

In [None]:
# Correlate the race indicator columns with every other column and plot the
# significant ones.
race_cols = ['race_A', 'race_B', 'race_H', 'race_N', 'race_O', 'race_W']

# corr() is restricted to numeric/boolean columns explicitly: `merged` also
# holds text columns, which DataFrame.corr() rejects in pandas >= 2.0.
correlations = merged.select_dtypes(include=['number', 'bool']).corr()
pvalues = calculate_pvalues(merged).astype(float)

row = [name for name in merged.columns if name not in race_cols]
# NOTE(review): the heatmap columns were previously taken by position
# (columns 12 to 17), presumably these six race indicators - confirm.
# Selecting by name keeps the plot correct if the column order changes.
col = [name for name in merged.columns if name in race_cols]
graph_confidence_95(pvalues, correlations, row, col)

In [None]:
# Display the column labels of `merged` again (rendered as the cell output).
merged.columns

In [None]:
# Population by race for each (state, county) present in `merged`.
# The county_pop_* values are derived from county-level census figures, so
# they are identical on every row of a county; first() reports that value
# once, whereas sum() would multiply it by the number of rows in the county.
merged.groupby(by=['state', 'county'])[['county_pop_Asian',
                                        'county_pop_Black',
                                        'county_pop_Hispanic',
                                        'county_pop_White']].first()

In [None]:
# Column totals of the selected race indicator columns, as a one-row frame
# labelled 'sum'.
race_death_cols = ['race_A', 'race_B', 'race_H', 'race_W']
total_deaths = merged[race_death_cols].agg(['sum'])
total_deaths

In [None]:
# Total population by race, in units of 100,000 people.
# Every row of `merged` carries its county's population, so summing over all
# rows would count a county once per row. Keep a single row per
# (state, county) before totalling.
# Only counties that appear in `merged` contribute to these totals.
county_pop_cols = ['county_pop_Asian', 'county_pop_Black',
                   'county_pop_Hispanic', 'county_pop_White']
counties = merged.drop_duplicates(subset=['state', 'county'])
total_pop = (counties[county_pop_cols].agg(['sum']) / 100000).round()
total_pop

In [None]:
# deaths per 100,000 by race
# Element-wise division of the two one-row tables as plain arrays; columns
# are matched by position, not by label.
deaths_by_race = total_deaths.to_numpy()
pop_by_race_100k = total_pop.to_numpy()
prob_killed_race = deaths_by_race / pop_by_race_100k
prob_killed_race

In [None]:
# Show the rates as a plain (nested) Python list instead of an array.
prob_killed_race.tolist()