# Preparing Data for Per Capita Stops and Veil of Darkness Analysis

* This notebook takes the previously cleaned stops data and census data and builds the aggregated datasets used in the per capita and veil of darkness analyses.

In [11]:
# importing libraries
import pandas as pd
import numpy as np

In [12]:
# Load the previously prepared 2017 stops data and the census data.
# low_memory=False makes pandas parse the stops file in one pass rather than
# in chunks, which avoids mixed-type inference within a column.
df_ca = pd.read_csv('Data/df_ca_2017.csv', low_memory=False)
df_census = pd.read_csv('Data/df_census.csv')

In [13]:
# Show the first five rows of the stops data (head() returns five rows by default)
print('Preview data:')
display(df_ca.head())

Preview data:


Unnamed: 0.1,Unnamed: 0,date,time,lat,lng,subject_age,subject_race,subject_sex,type,outcome,contraband_found,search_conducted,city
0,0,2017-01-01,10:10:00,37.799366,-122.216793,41.0,white,male,pedestrian,warning,,False,Oakland
1,1,2017-01-01,11:58:00,37.837443,-122.266687,34.0,black,female,vehicular,warning,,False,Oakland
2,2,2017-01-01,11:58:00,37.837443,-122.266687,26.0,black,male,vehicular,warning,,False,Oakland
3,3,2017-01-01,14:45:00,37.760424,-122.189575,19.0,hispanic,male,pedestrian,warning,False,True,Oakland
4,4,2017-01-01,15:16:00,37.773503,-122.200769,28.0,black,male,vehicular,citation,,False,Oakland


In [14]:
# List the column labels of the loaded stops data
df_ca.columns

Index(['Unnamed: 0', 'date', 'time', 'lat', 'lng', 'subject_age',
       'subject_race', 'subject_sex', 'type', 'outcome', 'contraband_found',
       'search_conducted', 'city'],
      dtype='object')

In [15]:
# 'Unnamed: 0' just repeats the row indices, so it carries no information;
# remove that column
df_ca = df_ca.drop(columns=['Unnamed: 0'])

In [16]:
# Non-null count per column; a column with a lower count than 'date' has missing values
df_ca.count()

date                693980
time                675421
lat                  88133
lng                  88133
subject_age         693980
subject_race        693980
subject_sex         666361
type                693980
outcome              74477
contraband_found     16687
search_conducted     57778
city                693980
dtype: int64

In [17]:
# Replace missing values with explicit placeholders so that no row is lost
# to a later dropna().
#
# Every column except subject_age gets the string 'N/A'. This turns lat/lng
# into object columns, because a string cannot live in a float column.
# subject_age is filled with -1 so it stays numeric; -1 means "age unknown".
#
# The fill is done with a single DataFrame.fillna call and assigned back.
# Calling `df_ca[col].fillna(..., inplace=True)` operates on a column
# selection: recent pandas versions warn about it, and with Copy-on-Write
# enabled it does not modify df_ca at all.
#
# NOTE: 'N/A' is one of the strings pandas.read_csv treats as missing by
# default, so these placeholders come back as NaN when the saved CSV is
# re-read unless keep_default_na=False is passed.
placeholder_by_column = {
    'contraband_found': 'N/A',
    'search_conducted': 'N/A',
    'lat': 'N/A',
    'lng': 'N/A',
    'time': 'N/A',
    'subject_sex': 'N/A',
    'outcome': 'N/A',
    'subject_age': -1,
}
df_ca = df_ca.fillna(value=placeholder_by_column)

In [8]:
# Dropping any rows that still contain missing values, then renumbering
# the remaining rows from 0
df_ca = df_ca.dropna().reset_index(drop=True)

In [18]:
# Re-check the non-null counts: after filling, every column should report the same number of rows
df_ca.count()

date                693980
time                693980
lat                 693980
lng                 693980
subject_age         693980
subject_race        693980
subject_sex         693980
type                693980
outcome             693980
contraband_found    693980
search_conducted    693980
city                693980
dtype: int64

In [19]:
# Inspect each column's dtype to see which columns still need converting
print('Check datatypes:')
display(df_ca.dtypes)

Check datatypes:


date                 object
time                 object
lat                  object
lng                  object
subject_age         float64
subject_race         object
subject_sex          object
type                 object
outcome              object
contraband_found     object
search_conducted     object
city                 object
dtype: object

In [20]:
# Convert columns to their working types:
#   subject_age -> numeric values
#   date        -> datetime values
for column, convert in (
    ('subject_age', pd.to_numeric),
    ('date', pd.DatetimeIndex),
):
    df_ca[column] = convert(df_ca[column])

In [22]:
# Earliest and latest stop date per city, to confirm the data covers the expected period
first_stop_by_city = df_ca.groupby(['city'])['date'].min()
last_stop_by_city = df_ca.groupby(['city'])['date'].max()
print(
    first_stop_by_city,
    '  ',
    '  ',
    last_stop_by_city,
    "So we have the correct dates. ",
    sep='\n',
)

city
Bakersfield   2017-01-01
Long Beach    2017-01-01
Los Angeles   2017-01-01
Oakland       2017-01-01
San Jose      2017-01-01
Name: date, dtype: datetime64[ns]
  
  
city
Bakersfield   2017-12-31
Long Beach    2017-12-31
Los Angeles   2017-12-31
Oakland       2017-12-31
San Jose      2017-12-31
Name: date, dtype: datetime64[ns]
So we have the correct dates. 


In [24]:
# Start the analysis frame from the cleaned stops data.
# This binds a second name to the same DataFrame object; it is not a copy.
df_ca_final = df_ca

In [25]:
# The analysis only covers white, black, and hispanic subjects: keep those
# rows and renumber the result from 0
races_of_interest = ['white', 'black', 'hispanic']
df_ca_final = (
    df_ca_final[df_ca_final['subject_race'].isin(races_of_interest)]
    .reset_index(drop=True)
)

In [31]:
# Check the non-null counts again, now that only three races remain
df_ca_final.count()

date                624040
time                624040
lat                 624040
lng                 624040
subject_age         624040
subject_race        624040
subject_sex         624040
type                624040
outcome             624040
contraband_found    624040
search_conducted    624040
city                624040
dtype: int64

In [32]:
# Number of remaining stops in each city
df_ca_final['city'].value_counts()

Los Angeles    541427
Oakland         27895
San Jose        21627
Bakersfield     16961
Long Beach      16130
Name: city, dtype: int64

In [34]:
# Number of remaining stops for each subject race
df_ca_final['subject_race'].value_counts()

hispanic    302315
black       186011
white       135714
Name: subject_race, dtype: int64

In [33]:
# Save the race-filtered stops dataset.
# This is different from our visualization data because it only includes three driver races, not all races.
# Note: with the default index=True, the row index is also written as an unnamed first column.
df_ca_final.to_csv('df_ca_clean_2017.csv')

### Getting Clean Census and Aggregated Stops Data: 

In [28]:
# Getting aggregated stops data: for each race, count the stops per city and
# keep the counts as a one-column frame (indexed by city) named after the race
df1, df2, df3 = (
    pd.DataFrame(
        df_ca_final[df_ca_final['subject_race'] == race].groupby(['city']).size()
    ).rename(columns={0: race})
    for race in ('black', 'hispanic', 'white')
)

# Place the per-race count columns side by side, aligned on city
df = pd.concat([df1, df2, df3], axis=1)

In [29]:
# Save the per-city stop counts by race; the city index is written as the first column
df.to_csv('df_agg_ca_clean_2017.csv')

In [30]:
# Display the aggregated table: one row per city, one stop-count column per race
df

Unnamed: 0_level_0,black,hispanic,white
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bakersfield,1903,8126,6932
Long Beach,4732,7471,3927
Los Angeles,158470,265883,117074
Oakland,18488,6638,2769
San Jose,2418,14197,5012


In [22]:
# Getting clean census data: keep only the population columns we need,
# give them consistent lowercase names, and index the table by city
census_column_names = {
    'City': 'city',
    'Total': 'total_pop',
    'black': 'black_pop',
    'hispanic': 'hispanic_pop',
    'white': 'white_pop',
}
df_cen_1 = (
    df_census[list(census_column_names)]
    .rename(columns=census_column_names)
    .set_index('city')
)

In [24]:
# Save the cleaned census table; the city index is written as the first column
df_cen_1.to_csv('df_cen_clean.csv')