authors: Kiersten Johns, Susan Hopper

# Imports:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the CSV into a DataFrame and preview its first two rows.
partial_clean = pd.read_csv('../kiersten/data/dataframe.csv')
partial_clean.head(2)

Unnamed: 0.1,Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause,gun_related,opioid_related,cold_related,heat_related,commissioner_district,incident_city,incident_zip_code,longitude,latitude,residence_city,residence_zip,chicago_community_area,covid_related
0,20,10/24/2023 12:43:00 AM,10/25/2023 12:11:00 AM,22.0,Female,Black,False,ACCIDENT,MULTIPLE BLUNT FORCE INJURIES. MOTOR VEHICLE C...,MULTIPLE BLUNT FORCE INJURIES,MOTOR VEHICLE COLLISION,,,False,False,False,False,1.0,CHICAGO,60644.0,-87.75489,41.872093,Indianapolis,46224.0,AUSTIN,False
1,22,10/24/2023 10:30:00 PM,10/24/2023 09:51:00 PM,35.0,Male,Black,False,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,MULTIPLE GUNSHOT WOUNDS,,,,True,False,False,False,4.0,CHICAGO,60649.0,-87.558292,41.759939,Chicago,60633.0,SOUTH SHORE,False


In [4]:
# Remove the extra 'Unnamed: 0' index column.
partial_clean = partial_clean.drop(columns='Unnamed: 0')

In [5]:
# Keep only rows with a recorded gender: discard nulls and the literal 'Unknown'.
gender_known = partial_clean['gender'].notna() & (partial_clean['gender'] != 'Unknown')
partial_clean = partial_clean.loc[gender_known].copy()

In [None]:
# partial_clean['incident_zip_code'].unique()

In [6]:
# Peek at the first three rows whose residence ZIP is recorded as 0.
partial_clean.loc[partial_clean['residence_zip'].eq(0)].head(3)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause,gun_related,opioid_related,cold_related,heat_related,commissioner_district,incident_city,incident_zip_code,longitude,latitude,residence_city,residence_zip,chicago_community_area,covid_related
3074,01/25/2023 07:26:00 PM,01/27/2023 06:48:00 PM,54.0,Female,Black,False,ACCIDENT,COMPLICATIONS OF MULTIPLE INJURIES. PEDESTRIAN...,COMPLICATIONS OF MULTIPLE INJURIES,PEDESTRIAN STRUCK BY MOTOR VEHICLE,,,False,False,False,False,1.0,CHICAGO,60651.0,-87.726197,41.895388,,0.0,HUMBOLDT PARK,False
3300,01/10/2023 04:30:00 PM,01/10/2023 04:44:00 PM,40.0,Male,Black,False,ACCIDENT,"COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...","COMBINED DRUG (FENTANYL, DESPROPIONYL FENTANYL...",,,,False,True,False,False,2.0,CHICAGO,60602.0,-87.627971,41.882697,,0.0,LOOP,False
3499,12/29/2022 10:08:00 AM,12/29/2022 10:28:00 AM,45.0,Male,White,False,ACCIDENT,"COMBINED DRUG (COCAINE, FENTANYL, DESPROPIONYL...","COMBINED DRUG (COCAINE, FENTANYL, DESPROPIONYL...",,,PROBABLE COLD EXPOSURE,False,True,False,False,5.0,RIVERDALE,60827.0,-87.636228,41.646622,,0.0,,False


In [7]:
# Replace 0 values with NaN in the ZIP code fields, so they get replaced with
# 'no_text' later.
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `partial_clean[col]`: under pandas
# Copy-on-Write that chained in-place call modifies a temporary and leaves the
# DataFrame untouched. `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0.
for zip_col in ('incident_zip_code', 'residence_zip'):
    partial_clean[zip_col] = partial_clean[zip_col].replace(0, np.nan)

In [8]:
# Make ZIP codes into 5-character strings and preserve the nulls.

def _zip_to_five_chars(zips, nulls):
    """Render numeric ZIP codes as 5-character strings, keeping nulls as NaN.

    Parameters
    ----------
    zips : pd.Series
        Numeric ZIP codes (floats such as 60644.0), possibly containing NaN.
    nulls : pd.Series of bool
        Mask of the entries of ``zips`` that are null; these come back as NaN.

    Returns
    -------
    pd.Series
        Object-dtype Series of 5-character strings, with NaN where ``nulls``
        is True.
    """
    return (
        zips
        .fillna(0)        # placeholder so the integer cast succeeds; masked out again below
        .astype(int)      # 60644.0 -> 60644 (drops the trailing '.0')
        .astype(str)
        .str.zfill(5)     # restore leading zeros: 2134 -> '02134' (slicing '2134.0' gave '2134.')
        .str[:5]          # longer values are still cut to their first five characters
        .mask(nulls, np.nan)
    )


inc_zip_nulls = partial_clean['incident_zip_code'].isna()
partial_clean['incident_zip_code'] = _zip_to_five_chars(partial_clean['incident_zip_code'], inc_zip_nulls)

res_zip_nulls = partial_clean['residence_zip'].isna()
partial_clean['residence_zip'] = _zip_to_five_chars(partial_clean['residence_zip'], res_zip_nulls)

In [9]:
# Fill nulls with agreed-upon values.
# 'no_text' is also used for secondary_cause nulls.
#
# Each column is reassigned explicitly rather than filled with
# `partial_clean[col].fillna(..., inplace=True)`: under pandas Copy-on-Write
# that chained in-place call fills a temporary and never reaches the DataFrame.
null_fill_values = {
    'race': 'Unknown',
    'primary_cause_line_b': 'no_text',
    'primary_cause_line_c': 'no_text',
    'secondary_cause': 'no_text',
    'commissioner_district': 'no_text',
    'incident_city': 'no_text',
    'incident_zip_code': 'no_text',
    'residence_city': 'no_text',
    'residence_zip': 'no_text',
    'chicago_community_area': 'no_text',
}

for fill_col, fill_value in null_fill_values.items():
    partial_clean[fill_col] = partial_clean[fill_col].fillna(fill_value)

In [10]:
# Death date and age are the special cases for null filling.
# Start by listing the rows that have no death date.
missing_death_date = partial_clean['date_of_death'].isna()
partial_clean.loc[missing_death_date]

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause,gun_related,opioid_related,cold_related,heat_related,commissioner_district,incident_city,incident_zip_code,longitude,latitude,residence_city,residence_zip,chicago_community_area,covid_related
31775,03/26/2016 09:07:00 AM,,70.0,Female,White,False,ACCIDENT,COMPLICATIONS OF SUBDURAL HEMATOMA . DUE TO FA...,COMPLICATIONS OF SUBDURAL HEMATOMA,DUE TO FALL DOWNSTAIRS,no_text,no_text,False,False,False,False,17.0,LA GRANGE PARK,60526,-87.867411,41.836726,La Grange Park,60526,no_text,False


In [11]:
# Fill a missing death timestamp with the same row's incident timestamp.
# For the single affected record this is '03/26/2016 09:07:00 AM', the value
# that was previously hard-coded here. The column is reassigned because a
# chained `.fillna(..., inplace=True)` does not update the DataFrame under
# pandas Copy-on-Write.
partial_clean['date_of_death'] = partial_clean['date_of_death'].fillna(partial_clean['date_of_incident'])

In [12]:
# Mark missing ages with the sentinel 99999999, which age_buckets() labels
# 'Unknown'. The column is reassigned because a chained
# `.fillna(..., inplace=True)` does not update the DataFrame under pandas
# Copy-on-Write.
partial_clean['age'] = partial_clean['age'].fillna(99999999)

In [13]:
def age_buckets(x):
    """Map an age in years to a coarse age-range label.

    Parameters
    ----------
    x : int, float, numeric string, or None
        Age in years. Missing values (None / NaN) and the placeholder
        99999999 (or anything at or above it) are treated as unknown.

    Returns
    -------
    str
        One of 'Under 5', '5-14', '15-24', '25-64', '65+' or 'Unknown'.

    Raises
    ------
    ValueError
        If ``x`` is a string that cannot be parsed as a number.
    """
    # A missing age used to crash in int(x); report it as 'Unknown' instead.
    if pd.isna(x):
        return 'Unknown'

    age = float(x)  # convert once instead of once per branch

    # Exclusive upper bound of each bracket, in ascending order. The final
    # bound is the sentinel used for missing ages.
    brackets = (
        (5, 'Under 5'),
        (15, '5-14'),
        (25, '15-24'),
        (65, '25-64'),
        (99999999, '65+'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'Unknown'

In [14]:
# Label every row with its age bracket.
age_labels = partial_clean['age'].map(age_buckets)
partial_clean['age_range'] = age_labels

In [15]:
# Encode the two-valued columns as integers.

# Boolean flags: False -> 0, True -> 1.
flag_codes = {False : 0, True : 1}
flag_columns = (
    'latino',
    'cold_related',
    'heat_related',
    'gun_related',
    'opioid_related',
    'covid_related',
)
for flag_col in flag_columns:
    partial_clean[flag_col] = partial_clean[flag_col].map(flag_codes)

# Gender: Male -> 0, Female -> 1.
gender_codes = {'Male' : 0, 'Female' : 1}
partial_clean['gender'] = partial_clean['gender'].map(gender_codes)

In [16]:
# For each raw timestamp column: parse it, then derive a date-only column,
# a time-of-day column, and the weekday name.
# The death column is parsed strictly; the incident column is parsed with
# errors='coerce', so unparseable incident values become NaT.
timestamp_specs = (
    ('date_of_death', 'death_date', 'death_time', 'death_day', {}),
    ('date_of_incident', 'inc_date', 'inc_time', 'inc_day', {'errors': 'coerce'}),
)

for source_col, date_col, time_col, day_col, parse_kwargs in timestamp_specs:
    parsed = pd.to_datetime(partial_clean[source_col], **parse_kwargs)
    partial_clean[source_col] = parsed
    partial_clean[date_col] = pd.to_datetime(parsed.dt.date)
    partial_clean[time_col] = parsed.dt.time
    partial_clean[day_col] = partial_clean[date_col].dt.day_name()

In [17]:
# Summarize the frame: column dtypes and non-null counts.
partial_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37151 entries, 0 to 37157
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_of_incident        36688 non-null  datetime64[ns]
 1   date_of_death           37151 non-null  datetime64[ns]
 2   age                     37151 non-null  float64       
 3   gender                  37151 non-null  int64         
 4   race                    37151 non-null  object        
 5   latino                  37151 non-null  int64         
 6   manner_of_death         37151 non-null  object        
 7   primary_cause           37151 non-null  object        
 8   primary_cause_line_a    37151 non-null  object        
 9   primary_cause_line_b    37151 non-null  object        
 10  primary_cause_line_c    37151 non-null  object        
 11  secondary_cause         37151 non-null  object        
 12  gun_related             37151 non-null  int64 

In [18]:
# Spot-check three random rows. The fixed seed keeps the displayed sample
# identical on every Restart & Run All.
partial_clean.sample(3, random_state=42)

Unnamed: 0,date_of_incident,date_of_death,age,gender,race,latino,manner_of_death,primary_cause,primary_cause_line_a,primary_cause_line_b,primary_cause_line_c,secondary_cause,gun_related,opioid_related,cold_related,heat_related,commissioner_district,incident_city,incident_zip_code,longitude,latitude,residence_city,residence_zip,chicago_community_area,covid_related,age_range,death_date,death_time,death_day,inc_date,inc_time,inc_day
6319,2022-06-15 11:59:00,2022-06-15 12:01:00,23.0,0,Black,0,ACCIDENT,"ALPRAZOLAM, PROMETHAZINE, DIPHENHYDRAMINE, COD...","ALPRAZOLAM, PROMETHAZINE, DIPHENHYDRAMINE, COD...",no_text,no_text,no_text,0,1,0,0,5.0,RIVERDALE,60827,-87.629242,41.640593,Riverdale,60827,no_text,0,15-24,2022-06-15,12:01:00,Wednesday,2022-06-15,11:59:00,Wednesday
31100,2016-08-01 00:47:00,2016-08-04 13:55:00,52.0,1,Black,0,ACCIDENT,COMPLICATIONS OF PHENCYCLIDINE TOXICITY,COMPLICATIONS OF PHENCYCLIDINE TOXICITY,no_text,no_text,"HYPERTENSIVE CARDIOVASCULAR DISEASE, MORBID OB...",0,0,0,0,2.0,CHICAGO,60621,-87.642901,41.773085,Chicago,60637,ENGLEWOOD,0,25-64,2016-08-04,13:55:00,Thursday,2016-08-01,00:47:00,Monday
31452,2016-06-28 23:29:00,2016-06-29 00:20:00,29.0,1,White,1,ACCIDENT,OPIATE TOXICITY,OPIATE TOXICITY,no_text,no_text,no_text,0,1,0,0,1.0,WESTCHESTER,60154,-87.868913,41.856751,Chicago,60634,no_text,0,25-64,2016-06-29,00:20:00,Wednesday,2016-06-28,23:29:00,Tuesday


In [19]:
# Write the frame to CSV; index=False leaves the row index out of the file.
partial_clean.to_csv('../susan/data/partial_clean_data_kjsh.csv', index=False)