# Crime data prep **Part 1**:
---
- Cleaning records from source
- Refactoring (renaming columns to a shared schema and keeping only the needed ones)
- Standardizing county names
- Generating new columns based on date (year, quarter, month, day of week)
- Combining the 97-15 data and 16-19 data

In [1]:
import pandas as pd, numpy as np
import df_util
from df_util import head
def input_path(name):
    """Return the relative path of the CSV called `name` in ../input-data."""
    return f'../input-data/{name}.csv'


def work_path(name):
    """Return the relative path of the CSV called `name` in ../working-data."""
    return f'../working-data/{name}.csv'

In [2]:
# Read the two raw extracts: df1 holds 2016-19, df2 holds 1997-2015
df1, df2 = (
    pd.read_csv(input_path(raw_name))
    for raw_name in ('crime_16_19__j6g4-gayk', 'crime_97_15__6vnq-az4b')
)

# Preview both frames
head(df1, df2)

9 cols x 1851996 rows


Unnamed: 0,pub_agency_name,county_name,incident_date,incident_hour,offense_name,crime_against,offense_category_name,offense_group,age_num
0,Westminster,JEFFERSON; ADAMS,2017-08-26,17.0,Aggravated Assault,Person,Assault Offenses,A,52.0
1,Westminster,JEFFERSON; ADAMS,2017-11-22,20.0,Aggravated Assault,Person,Assault Offenses,A,29.0
2,Westminster,JEFFERSON; ADAMS,2017-12-28,21.0,Motor Vehicle Theft,Property,Motor Vehicle Theft,A,


10 cols x 4952282 rows


Unnamed: 0,agency_name,agency_type_name,city_name,primary_county,offense_name,crime_against,offense_category_name,age_num,incident_date,incident_hour
0,Lyons Police Department,City,Lyons,Boulder,,,,,,
1,Kremmling Police Department,City,Kremmling,Grand,,,,,,
2,Oak Creek Police Department,City,Oak Creek,Routt,,,,,,


### Crime 16-19

In [3]:
# Remove State Patrol and CBI crimes, and drop rows where the agency name is null
excluded_agencies = ['State Patrol', 'Colorado Bureau of Investigation']
df1 = df1.loc[
    ~df1.pub_agency_name.isin(excluded_agencies) & df1.pub_agency_name.notna()
]

# Rename columns to the schema shared with the 97-15 dataset
df1 = df1.rename(columns={
        'county_name':      'county',
        'pub_agency_name':  'police_dept',
        'incident_date':    'date',
        'incident_hour':    'hour',
        'age_num':          'age',
        'offense_category_name': 'offense_category',
    })

# Since we're focused on county and not police department, replace dual county
# police department county values (Ex: "JEFFERSON; ADAMS") with just the primary
# (Ex: "JEFFERSON"), and store county names in upper case.
# This runs after the rename (which returns a new frame), so the assignment
# does not write into the filtered slice created above.
df1['county'] = df1.county.str.split('; ').str[0].str.upper()

# Parse the date column once and derive every calendar field from that result
dates_16_19 = pd.DatetimeIndex(df1.date)
df1['year'] = dates_16_19.year
df1['day_of_week'] = dates_16_19.day_of_week
df1['month'] = dates_16_19.month
df1['quarter'] = dates_16_19.quarter

# Select only needed columns
df1 = df1[[
    'year', 'county', 'police_dept', 'date', 'quarter', 'month', 'day_of_week', 'hour',
    'age', 'crime_against', 'offense_name', 'offense_category'
    ]]

head(df1)

12 cols x 1845650 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,2017,JEFFERSON,Westminster,2017-08-26,3,8,5,17.0,52.0,Person,Aggravated Assault,Assault Offenses
1,2017,JEFFERSON,Westminster,2017-11-22,4,11,2,20.0,29.0,Person,Aggravated Assault,Assault Offenses
2,2017,JEFFERSON,Westminster,2017-12-28,4,12,3,21.0,,Property,Motor Vehicle Theft,Motor Vehicle Theft


### Crime 97-15

In [4]:
# Rows with a null date appear to be empty placeholder records (the raw preview
# shows them with no offense details either), so they are removed together
# with State Patrol and CBI crimes.
excluded_agencies = ['State Patrol', 'Colorado Bureau of Investigation']
df2 = df2.loc[
    df2.incident_date.notna() & ~df2.agency_name.isin(excluded_agencies)
]

# Rename columns to the schema shared with the 16-19 dataset
df2 = df2.rename(columns={
        'primary_county':   'county',
        'agency_name':      'police_dept',
        'incident_date':    'date',
        'incident_hour':    'hour',
        'age_num':          'age',
        'offense_category_name': 'offense_category',
    })

# Store county names in upper case
df2['county'] = df2.county.str.upper()

# Parse the date column once and derive every calendar field from that result
dates_97_15 = pd.DatetimeIndex(df2.date)
df2['year'] = dates_97_15.year
df2['day_of_week'] = dates_97_15.day_of_week
df2['month'] = dates_97_15.month
df2['quarter'] = dates_97_15.quarter

# Select only needed columns
df2 = df2[[
    'year', 'county', 'police_dept', 'date', 'quarter', 'month', 'day_of_week', 'hour',
    'age', 'crime_against', 'offense_name', 'offense_category']]

head(df2)

12 cols x 4925016 rows


Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
24,1997,BOULDER,Longmont Police Department,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
25,1997,BOULDER,Longmont Police Department,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
26,1997,KIT CARSON,Kit Carson County Sheriff's Office,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses


### Save

In [5]:
# Write each cleaned dataset to the working-data folder, without the index column
for cleaned, out_name in ((df1, 'crime_16_19'), (df2, 'crime_97_15')):
    cleaned.to_csv(work_path(out_name), index=False)

## Combine into single dataset
---

In [6]:
# Stack the 97-15 rows ahead of the 16-19 rows, then discard the
# police department column from the combined frame
df = (
    pd.concat([df2, df1])
    .drop(columns='police_dept')
)

In [7]:
# Preview the combined frame before it is written out
head(df)

11 cols x 6770666 rows


Unnamed: 0,year,county,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
24,1997,BOULDER,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
25,1997,BOULDER,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
26,1997,KIT CARSON,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses


In [8]:
# Write the combined dataset to the working-data folder, without the index column
combined_csv = work_path('crime_all')
df.to_csv(combined_csv, index=False)