# Preparing Data for Per Capita Stops and Veil of Darkness Analysis

* This notebook takes the previously cleaned and prepared stops data and census data and uses them to build the aggregated datasets needed for the per capita stops and veil of darkness analyses

In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# Read the two prepared inputs from disk: the combined police-stops table
# and the census table. low_memory=False has pandas parse the (large) stops
# file in one pass rather than in chunks.
df_ca = pd.read_csv(filepath_or_buffer='Data/df_all.csv', low_memory=False)
df_census = pd.read_csv(filepath_or_buffer='Data/df_census.csv')

## Cleaning Police Stops Data

In [3]:
# Show the first five stop records as a quick look at the columns and values
print('Preview data:')
display(df_ca.head(n=5))

Preview data:


Unnamed: 0.1,Unnamed: 0,date,time,lat,lng,subject_age,subject_race,subject_sex,type,outcome,contraband_found,search_conducted,city
0,0,,01:33:00,37.803084,-122.237247,,asian/pacific islander,female,vehicular,citation,,False,Oakland
1,1,,14:48:00,37.767568,-122.19682,,black,male,,,False,True,Oakland
2,2,2013-04-01,00:00:00,37.820598,-122.270734,,white,male,vehicular,warning,False,True,Oakland
3,3,2013-04-01,00:01:00,37.821246,-122.276488,,black,male,vehicular,,,False,Oakland
4,4,2013-04-01,10:41:00,37.802937,-122.271738,,hispanic,female,,citation,,False,Oakland


In [4]:
# List the column labels of the stops data
# (the bare expression on the last line is what the cell displays)
df_ca.columns

Index(['Unnamed: 0', 'date', 'time', 'lat', 'lng', 'subject_age',
       'subject_race', 'subject_sex', 'type', 'outcome', 'contraband_found',
       'search_conducted', 'city'],
      dtype='object')

In [5]:
# 'Unnamed: 0' is the same as the row indices, so that column can be removed
df_ca = df_ca.drop(columns='Unnamed: 0')

In [6]:
# Number of non-missing values in each column of the stops data
df_ca.notna().sum()

date                7636058
time                7180616
lat                 1677816
lng                 1677816
subject_age         1823989
subject_race        7628381
subject_sex         7481251
type                7570426
outcome             2033924
contraband_found     152536
search_conducted    1612399
city                7636246
dtype: int64

In [7]:
# Replace missing values with explicit placeholders in the columns where a
# missing value is acceptable. Columns that are not listed here (date,
# subject_race, type, city) are left untouched.
#
# The fill is done with a single DataFrame.fillna call and the result is
# assigned back. The previous `df_ca[col].fillna(..., inplace=True)` form is
# chained in-place assignment on a column selection: it emits warnings on
# recent pandas and does nothing when copy-on-write is enabled.
#
# NOTE: the string placeholder makes lat, lng and search_conducted
# object-typed columns rather than numeric/boolean ones.
missing_placeholders = {
    'contraband_found': 'N/A',
    'search_conducted': 'N/A',
    'lat': 'N/A',
    'lng': 'N/A',
    'time': 'N/A',
    'subject_sex': 'N/A',
    'outcome': 'N/A',
    # subject_age is filled with -1 so the column stays numeric; -1 means the age is unknown
    'subject_age': -1,
}
df_ca = df_ca.fillna(value=missing_placeholders)

In [8]:
# Discard every row that still contains a missing value, then renumber the
# remaining rows from 0 (the old index is thrown away, not kept as a column)
df_ca = df_ca.dropna().reset_index(drop=True)

In [9]:
# Re-check the number of non-missing values per column after the cleanup
df_ca.notna().sum()

date                7565316
time                7565316
lat                 7565316
lng                 7565316
subject_age         7565316
subject_race        7565316
subject_sex         7565316
type                7565316
outcome             7565316
contraband_found    7565316
search_conducted    7565316
city                7565316
dtype: int64

In [10]:
# Checking datatypes
# Print a label, then display the dtype pandas currently holds for each column
print('Check datatypes:')
display(df_ca.dtypes)

Check datatypes:


date                 object
time                 object
lat                  object
lng                  object
subject_age         float64
subject_race         object
subject_sex          object
type                 object
outcome              object
contraband_found     object
search_conducted     object
city                 object
dtype: object

In [11]:
# Convert column types in one step:
#   subject_age -> numeric values
#   date        -> datetime values
df_ca = df_ca.assign(
    subject_age=pd.to_numeric(df_ca['subject_age']),
    date=pd.DatetimeIndex(df_ca['date']),
)

In [12]:
# Earliest and latest stop date recorded for each city, used to pick a
# period that the cities have in common
dates_by_city = df_ca.groupby(['city'])['date']
print(dates_by_city.min())
print('  ', '  ', sep='\n')
print(dates_by_city.max())
print("So we will drop the Santa Anna dataset, and keep the data from 2014")

city
Bakersfield     2008-03-09
Long Beach      2008-01-01
Los Angeles     2010-01-01
Oakland         2013-04-01
San Diego       2014-01-01
San Francisco   2007-01-01
San Jose        2013-09-01
Santa Ana       2014-06-25
Stockton        2012-01-01
Name: date, dtype: datetime64[ns]
  
  
city
Bakersfield     2018-03-09
Long Beach      2017-12-31
Los Angeles     2018-06-23
Oakland         2017-12-31
San Diego       2017-03-31
San Francisco   2016-06-30
San Jose        2018-03-31
Santa Ana       2018-04-13
Stockton        2016-12-31
Name: date, dtype: datetime64[ns]
So we will drop the Santa Anna dataset, and keep the data from 2014


In [13]:
# Every city except Santa Ana has stops recorded throughout 2014, so the
# city comparison is restricted to stops dated 2014-01-01 through 2014-12-31
# (both ends included). The row index is renumbered afterwards.
in_2014 = df_ca['date'].between('2014-01-01', '2014-12-31')
df_ca_final = df_ca[in_2014].reset_index(drop=True)

In [14]:
# Remove all Santa Ana rows because its data does not overlap enough with
# the other cities, then renumber the row index
not_santa_ana = df_ca_final['city'].ne('Santa Ana')
df_ca_final = df_ca_final[not_santa_ana].reset_index(drop=True)

In [15]:
# Keep only the stops whose subject_race is white, black or hispanic,
# then renumber the row index
races_of_interest = ['white', 'black', 'hispanic']
df_ca_final = df_ca_final[df_ca_final['subject_race'].isin(races_of_interest)].reset_index(drop=True)

In [16]:
# Non-missing values per column in the filtered stops data
df_ca_final.notna().sum()

date                900857
time                900857
lat                 900857
lng                 900857
subject_age         900857
subject_race        900857
subject_sex         900857
type                900857
outcome             900857
contraband_found    900857
search_conducted    900857
city                900857
dtype: int64

In [17]:
# Save the filtered stops data to disk (the row index is written as the
# first column of the file).
# This differs from the visualization data: it only includes three driver
# races, not all races.
df_ca_final.to_csv(path_or_buf='Data/df_ca_clean.csv', index=True)

### Getting Clean Census and Aggregated Stops Data: 

In [18]:
# Getting aggregated stops data: number of stops per city for each race.
def _stops_per_city(stops, race):
    """Count the rows of `stops` whose subject_race equals `race`, per city.

    Parameters
    ----------
    stops : DataFrame with 'subject_race' and 'city' columns.
    race : value of 'subject_race' to count.

    Returns
    -------
    DataFrame indexed by city with a single column, named after `race`,
    holding the number of matching rows in each city.
    """
    of_race = stops[stops['subject_race'] == race]
    return of_race.groupby(['city']).size().to_frame(name=race)


# One single-column table per race (names kept as df1/df2/df3)
df1 = _stops_per_city(df_ca_final, 'black')
df2 = _stops_per_city(df_ca_final, 'hispanic')
df3 = _stops_per_city(df_ca_final, 'white')

# Place the DataFrames side by side, aligned on the city index
df = pd.concat([df1, df2, df3], axis=1)

In [20]:
# Save the per-city stop counts; the city index is written as the first column
df.to_csv(path_or_buf='Data/df_agg_ca_clean.csv', index=True)

In [22]:
# Getting clean census data: keep only the city name, the total population
# and the black / hispanic / white population columns, give them consistent
# lower-case names, and index the table by city.
census_column_names = {
    'City': 'city',
    'Total': 'total_pop',
    'black': 'black_pop',
    'hispanic': 'hispanic_pop',
    'white': 'white_pop',
}
df_cen_1 = (
    df_census[list(census_column_names)]
    .rename(columns=census_column_names)
    .set_index('city')
)

In [24]:
# Save the cleaned census table; the city index is written as the first column
df_cen_1.to_csv(path_or_buf='Data/df_cen_clean.csv', index=True)