# Consolidated Pre-processing Notebook

In [87]:
import pandas as pd
import numpy as np

In [88]:
#Load dataset from ../raw_data/
df = pd.read_csv("../raw_data/chicago.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257077 entries, 0 to 257076
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   CASE#                   257077 non-null  object 
 1   DATE  OF OCCURRENCE     257077 non-null  object 
 2   BLOCK                   257077 non-null  object 
 3    IUCR                   257077 non-null  object 
 4    PRIMARY DESCRIPTION    257077 non-null  object 
 5    SECONDARY DESCRIPTION  257077 non-null  object 
 6    LOCATION DESCRIPTION   256032 non-null  object 
 7   ARREST                  257077 non-null  object 
 8   DOMESTIC                257077 non-null  object 
 9   BEAT                    257077 non-null  int64  
 10  WARD                    257077 non-null  int64  
 11  FBI CD                  257077 non-null  object 
 12  X COORDINATE            257011 non-null  float64
 13  Y COORDINATE            257011 non-null  float64
 14  LATITUDE            

In [89]:
df.head(3)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,JH117298,01/16/2024 01:00:00 AM,038XX W DIVERSEY AVE,0810,THEFT,OVER $500,STREET,N,N,2524,35,06,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)"
1,JG561057,12/31/2023 04:30:00 PM,004XX N WABASH AVE,0460,BATTERY,SIMPLE,STREET,N,N,1834,42,08B,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)"
2,JH117691,01/16/2024 06:50:00 PM,010XX W 99TH ST,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,STREET,Y,N,2232,21,15,1170976.0,1839080.0,41.713905,-87.649425,"(41.713904887, -87.649424515)"


Raw datset columns has typographical errors. Below code resolves this:

In [90]:
# Remove leading and trailing spaces from each column name
df.columns = df.columns.str.strip()

# Apply strip() to each column where the data type is string (object)
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# If the column name has multiple spaces (e.g., "DATE  OF OCCURRENCE"), replace them
df.columns = df.columns.str.replace('  ', ' ', regex=False)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Drop na values in location coordinates (total 66 rows): 

In [91]:
# Remove rows where any of the specified columns have missing data
df = df.dropna(subset=['X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION'])

Feature Engineer the following features: 
1. Time of Day (Early Morning, etc.), per 4h increment
2. Weekend? (i.e. Friday 5pm onwards to Sunday 11:59pm)
3. Month

In [92]:
# Convert 'DATE OF OCCURRENCE' to datetime format
df['DATE OF OCCURRENCE'] = pd.to_datetime(df['DATE OF OCCURRENCE'], errors='coerce')

# Create a function to categorize time into buckets
def categorize_time(hour):
    if 0 <= hour < 6:
        return "Late Evening"
    elif 6 <= hour < 9:
        return "Early Morning"
    elif 9 <= hour < 12:
        return "Late Morning"
    elif 12 <= hour < 15:
        return "Early Noon"
    elif 15 <= hour < 18:
        return "Late Noon"
    else:
        return "Early Evening"

# Apply the time categorization to create 'TIME OF DAY' column
df['TIME OF DAY'] = df['DATE OF OCCURRENCE'].dt.hour.map(categorize_time)

In [93]:
# Extract the month from 'DATE  OF OCCURRENCE' and create a new column 'MONTH'
df['MONTH'] = df['DATE OF OCCURRENCE'].dt.month_name()

In [94]:
df['WEEKDAY'] = df['DATE OF OCCURRENCE'].dt.strftime('%A')

In [95]:
df['WEEKDAY NUM'] = df['DATE OF OCCURRENCE'].dt.weekday

In [96]:
df['WEEKEND'] = np.where(df['WEEKDAY NUM'] <= 4, 'NO','YES')

In [97]:
df.drop(['WEEKEND'],axis=1,inplace = True)

In [98]:
# Extract day of week and hour
df['hour'] = df['DATE OF OCCURRENCE'].dt.hour
# Use conditions to determine 'Weekend'
df['WEEKEND'] = (
    (df['WEEKDAY NUM'] == 4) & (df['hour'] >= 17) |  # Friday after 5 PM
    (df['WEEKDAY NUM'].isin([5, 6]))                 # Saturday or Sunday
).map({True: 'Yes', False: 'No'})
# Drop intermediate columns if desired
df.drop(['hour'], axis=1, inplace=True)

Consolidate Offenses: 

In [99]:
# Create a new column 'Offenses' based on 'PRIMARY DESCRIPTION'
df['OFFENSES'] = df['PRIMARY DESCRIPTION']

In [100]:
# Get the value counts for 'Offenses' and identify offenses with fewer than 500 occurrences
value_counts = df['OFFENSES'].value_counts()
# Replace offenses that occur fewer than 500 times with "OTHER OFFENSE"
to_replace = value_counts[value_counts<500].index
df['OFFENSES'] = df['OFFENSES'].replace(to_replace, "OTHER OFFENSE")

In [101]:
# Consolidate specific offenses into other categories
df['OFFENSES'] = df['OFFENSES'].replace({
    "PUBLIC PEACE VIOLATION": "PUBLIC ORDER",  # Consolidate Public Peace Violation into new category: 'PUBLIC ORDER'
    "INTERFERENCE WITH PUBLIC OFFICER": "PUBLIC ORDER",  # Consolidate Interference with Public Officer into new category: 'PUBLIC ORDER'
    "CRIMINAL SEXUAL ASSAULT": "SEX OFFENSE"  # Conslidate Criminal Sexual Assault into: 'SEX OFFENSE'
})

In [102]:
list(df)

['CASE#',
 'DATE OF OCCURRENCE',
 'BLOCK',
 'IUCR',
 'PRIMARY DESCRIPTION',
 'SECONDARY DESCRIPTION',
 'LOCATION DESCRIPTION',
 'ARREST',
 'DOMESTIC',
 'BEAT',
 'WARD',
 'FBI CD',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION',
 'TIME OF DAY',
 'MONTH',
 'WEEKDAY',
 'WEEKDAY NUM',
 'WEEKEND',
 'OFFENSES']

In [103]:
processed_df = df[['WARD', 'TIME OF DAY', 'MONTH', 'WEEKEND', 'DATE OF OCCURRENCE', 'OFFENSES', 'X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION']]
processed_df

Unnamed: 0,WARD,TIME OF DAY,MONTH,WEEKEND,DATE OF OCCURRENCE,OFFENSES,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,35,Late Evening,January,No,2024-01-16 01:00:00,THEFT,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)"
1,42,Late Noon,December,Yes,2023-12-31 16:30:00,BATTERY,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)"
2,21,Early Evening,January,No,2024-01-16 18:50:00,WEAPONS VIOLATION,1170976.0,1839080.0,41.713905,-87.649425,"(41.713904887, -87.649424515)"
3,27,Early Morning,November,No,2023-11-30 07:28:00,NARCOTICS,1153117.0,1905117.0,41.895490,-87.713086,"(41.895490399, -87.713086271)"
4,42,Late Evening,December,Yes,2023-12-31 00:55:00,WEAPONS VIOLATION,1175975.0,1903895.0,41.891653,-87.629172,"(41.891653037, -87.62917162)"
...,...,...,...,...,...,...,...,...,...,...,...
257072,27,Early Morning,November,Yes,2024-11-23 08:25:00,OTHER OFFENSE,1150853.0,1903735.0,41.891743,-87.721438,"(41.891742661, -87.721437661)"
257073,44,Early Noon,November,Yes,2024-11-23 14:45:00,THEFT,1170069.0,1921303.0,41.939552,-87.650352,"(41.939552474, -87.650352367)"
257074,21,Late Morning,November,Yes,2024-11-23 11:04:00,OTHER OFFENSE,1172801.0,1836375.0,41.706442,-87.642820,"(41.706441994, -87.642820119)"
257075,27,Late Evening,November,Yes,2024-11-23 00:54:00,BATTERY,1156727.0,1899212.0,41.879214,-87.699988,"(41.879214143, -87.699987616)"


In [26]:
# # Convert 'DATE  OF OCCURRENCE' to datetime format
# df['DATE  OF OCCURRENCE'] = pd.to_datetime(df['DATE  OF OCCURRENCE'], errors='coerce')

# # Create a function to categorize time into buckets
# def categorize_time(hour):
#     if 0 <= hour < 4:
#         return "Early Morning"
#     elif 4 <= hour < 8:
#         return "Morning"
#     elif 8 <= hour < 12:
#         return "Late Morning"
#     elif 12 <= hour < 16:
#         return "Afternoon"
#     elif 16 <= hour < 20:
#         return "Evening"
#     else:  # 20 <= hour < 24
#         return "Late Evening"

# # Apply the time categorization to create 'TIME OF DAY' column
# df['TIME OF DAY'] = df['DATE  OF OCCURRENCE'].dt.hour.apply(categorize_time)

Include 'MONTH' column:

In [29]:
# Extract the month from 'DATE  OF OCCURRENCE' and create a new column 'MONTH'
df['MONTH'] = df['DATE  OF OCCURRENCE'].dt.month_name()

df

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION,TIME OF DAY,WEEKEND,MONTH
0,JH117298,2024-01-16 01:00:00,038XX W DIVERSEY AVE,0810,THEFT,OVER $500,STREET,N,N,2524,35,06,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)",Early Morning,No,January
1,JG561057,2023-12-31 16:30:00,004XX N WABASH AVE,0460,BATTERY,SIMPLE,STREET,N,N,1834,42,08B,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)",Early Evening,Yes,December
2,JH117691,2024-01-16 18:50:00,010XX W 99TH ST,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,STREET,Y,N,2232,21,15,1170976.0,1839080.0,41.713905,-87.649425,"(41.713904887, -87.649424515)",Late Evening,No,January
3,JG522770,2023-11-30 07:28:00,034XX W CHICAGO AVE,2024,NARCOTICS,POSSESS - HEROIN (WHITE),ALLEY,Y,N,1121,27,18,1153117.0,1905117.0,41.895490,-87.713086,"(41.895490399, -87.713086271)",Late Morning,No,November
4,JG560426,2023-12-31 00:55:00,0000X W GRAND AVE,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,STREET,Y,N,1831,42,15,1175975.0,1903895.0,41.891653,-87.629172,"(41.891653037, -87.62917162)",Early Morning,Yes,December
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257072,JH518398,2024-11-23 08:25:00,038XX W OHIO ST,502P,OTHER OFFENSE,FALSE / STOLEN / ALTERED TRP,STREET,N,N,1122,27,26,1150853.0,1903735.0,41.891743,-87.721438,"(41.891742661, -87.721437661)",Late Morning,Yes,November
257073,JH519771,2024-11-23 14:45:00,031XX N CLARK ST,0890,THEFT,FROM BUILDING,APARTMENT,N,N,1933,44,06,1170069.0,1921303.0,41.939552,-87.650352,"(41.939552474, -87.650352367)",Late Noon,Yes,November
257074,JH518538,2024-11-23 11:04:00,103XX S HALSTED ST,5011,OTHER OFFENSE,LICENSE VIOLATION,CONVENIENCE STORE,Y,N,2232,21,26,1172801.0,1836375.0,41.706442,-87.642820,"(41.706441994, -87.642820119)",Early Noon,Yes,November
257075,JH518220,2024-11-23 00:54:00,029XX W WILCOX ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,N,Y,1124,27,08B,1156727.0,1899212.0,41.879214,-87.699988,"(41.879214143, -87.699987616)",Early Morning,Yes,November
