# Consolidated Pre-processing Notebook

In [3]:
import pandas as pd
import numpy as np

In [14]:
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt

# Convert WKT strings to Shapely geometry objects.
# NOTE: `ward_bound` is read from ../raw_data/WARDS.csv by the "Ward data link"
# cell further down this notebook; that cell has to be executed before this one.
# Only string values are parsed, so re-running this cell after the column has
# already been converted leaves the existing geometries alone instead of failing.
ward_bound['the_geom'] = ward_bound['the_geom'].apply(
    lambda geom: wkt.loads(geom) if isinstance(geom, str) else geom
)

# Create a GeoDataFrame whose active geometry column is `the_geom`
gdf = gpd.GeoDataFrame(ward_bound, geometry='the_geom')

# Function to find the ward for a given latitude and longitude
def find_ward(lat, lon, geodataframe):
    """Return the ward whose polygon contains the given coordinate.

    Parameters
    ----------
    lat : float
        Latitude of the location to look up.
    lon : float
        Longitude of the location to look up.
    geodataframe : DataFrame or GeoDataFrame
        Ward boundaries with a ``the_geom`` column of Shapely geometries and a
        ``WARD`` column.  Assumed to use the same lon/lat coordinates as the
        inputs -- TODO confirm the CRS of the boundary file.

    Returns
    -------
    The ``WARD`` value of the first row whose geometry contains the point, or
    ``None`` when no row contains it.
    """
    point = Point(lon, lat)  # Create a Point (longitude first)
    # One vectorised containment test instead of a Python-level loop over rows.
    inside = gpd.GeoSeries(geodataframe['the_geom']).contains(point).to_numpy()
    wards = geodataframe.loc[inside, 'WARD']
    if len(wards) == 0:
        return None  # No ward contains the point
    return wards.iloc[0]
# Example usage: look up the ward for a single coordinate
new_lat, new_lon = 41.889, -87.627
ward = find_ward(new_lat, new_lon, gdf)

# find_ward signals "no match" with None, so test for None explicitly rather
# than relying on truthiness (a falsy ward id would be misreported otherwise).
if ward is not None:
    print(f"The point ({new_lat}, {new_lon}) lies in Ward {ward}.")
else:
    print(f"The point ({new_lat}, {new_lon}) does not lie in any ward.")


The point (41.889, -87.627) lies in Ward 42.


#### Ward data link 
  https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Wards-2015-2023-/sp34-6z76

In [12]:
# Load the ward boundary table. As the preview below shows, `the_geom` holds each
# ward's outline as WKT text, next to WARD, SHAPE_Leng and SHAPE_Area.
ward_bound = pd.read_csv("../raw_data/WARDS.csv")
ward_bound.head(1)

Unnamed: 0,the_geom,WARD,SHAPE_Leng,SHAPE_Area
0,MULTIPOLYGON (((-87.69623470134458 41.85755495...,12,93073.340838,116096500.0


In [4]:
# Load the raw incident dataset from ../raw_data/ and print its schema.
# Several raw column names carry stray spaces (e.g. " IUCR", "DATE  OF OCCURRENCE");
# these are cleaned up in a later cell.
df = pd.read_csv("../raw_data/chicago.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257077 entries, 0 to 257076
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   CASE#                   257077 non-null  object 
 1   DATE  OF OCCURRENCE     257077 non-null  object 
 2   BLOCK                   257077 non-null  object 
 3    IUCR                   257077 non-null  object 
 4    PRIMARY DESCRIPTION    257077 non-null  object 
 5    SECONDARY DESCRIPTION  257077 non-null  object 
 6    LOCATION DESCRIPTION   256032 non-null  object 
 7   ARREST                  257077 non-null  object 
 8   DOMESTIC                257077 non-null  object 
 9   BEAT                    257077 non-null  int64  
 10  WARD                    257077 non-null  int64  
 11  FBI CD                  257077 non-null  object 
 12  X COORDINATE            257011 non-null  float64
 13  Y COORDINATE            257011 non-null  float64
 14  LATITUDE            

In [5]:
# Preview the first two raw records
df.head(2)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,JH117298,01/16/2024 01:00:00 AM,038XX W DIVERSEY AVE,810,THEFT,OVER $500,STREET,N,N,2524,35,06,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)"
1,JG561057,12/31/2023 04:30:00 PM,004XX N WABASH AVE,460,BATTERY,SIMPLE,STREET,N,N,1834,42,08B,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)"


In [6]:
# Hex colour for every ward number 1-50; the palette is listed in ward order,
# five wards per row, and enumerate() attaches the ward number as the key.
ward_colors = dict(enumerate([
    '#e6194B', '#3cb44b', '#ffe119', '#4363d8', '#f58231',  # wards 1-5
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',  # wards 6-10
    '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',  # wards 11-15
    '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',  # wards 16-20
    '#ffffff', '#000000', '#a6d3de', '#fab2ea', '#c7e3a3',  # wards 21-25
    '#df7b8f', '#b76ba3', '#7fcdb4', '#4a9f59', '#d7a8f5',  # wards 26-30
    '#89cce0', '#eeaf61', '#ba9278', '#ccf3a5', '#b4a7d6',  # wards 31-35
    '#b2df8a', '#ffb3ba', '#c8a2c8', '#ffdfba', '#b9fbc0',  # wards 36-40
    '#cddc39', '#ff6f61', '#ffc107', '#607d8b', '#8bc34a',  # wards 41-45
    '#795548', '#ff5722', '#9c27b0', '#03a9f4', '#673ab7',  # wards 46-50
], start=1))


In [11]:
# NOTE: the folium map code below is disabled by being wrapped in a string
# literal; nothing in it runs, and the cell simply evaluates to that string.
""" import folium
import pandas as pd



# Drop rows with missing values
df = df.dropna(subset=["LATITUDE", "LONGITUDE"])

# Create a base map
m = folium.Map(location=[41.8781, -87.6298], zoom_start=10)  # Centered on Chicago


# Add points to the map
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        radius=8,
        color=ward_colors.get(row['WARD'], 'gray'),  # Default color for missing wards
        fill=True,
        fill_color=ward_colors.get(row['WARD'], 'gray'),
        fill_opacity=0.7,
        popup=f"Ward: {row['WARD']}"
    ).add_to(m)

# Display the map
m
 """

' import folium\nimport pandas as pd\n\n\n\n# Drop rows with missing values\ndf = df.dropna(subset=["LATITUDE", "LONGITUDE"])\n\n# Create a base map\nm = folium.Map(location=[41.8781, -87.6298], zoom_start=10)  # Centered on Chicago\n\n\n# Add points to the map\nfor _, row in df.iterrows():\n    folium.CircleMarker(\n        location=[row[\'LATITUDE\'], row[\'LONGITUDE\']],\n        radius=8,\n        color=ward_colors.get(row[\'WARD\'], \'gray\'),  # Default color for missing wards\n        fill=True,\n        fill_color=ward_colors.get(row[\'WARD\'], \'gray\'),\n        fill_opacity=0.7,\n        popup=f"Ward: {row[\'WARD\']}"\n    ).add_to(m)\n\n# Display the map\nm\n '

The raw dataset's column names contain typographical errors (stray leading/trailing spaces and doubled spaces). The code below resolves this:

In [4]:
# Normalise column names: trim both ends and collapse any run of whitespace
# (e.g. "DATE  OF OCCURRENCE") into a single space.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)


def _strip_text_column(column):
    """Return `column` with its str values stripped; non-text columns pass through unchanged."""
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if not is_text:
        return column
    # Non-string cells (e.g. NaN) are returned as they are.
    return column.map(lambda value: value.strip() if isinstance(value, str) else value)


# Strip surrounding whitespace from every string cell.  Done column by column
# with Series.map because DataFrame.applymap is deprecated in recent pandas.
df = df.apply(_strip_text_column)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Drop na values in location coordinates (total 66 rows): 

In [5]:
# Discard every record that lacks at least one of the location fields
location_fields = ['X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION']
df = df.dropna(how='any', subset=location_fields)

Feature Engineer the following features: 
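1. Time of Day (Early Morning, etc.): 3-hour buckets from 06:00 to 18:00, plus 6-hour buckets for 18:00–24:00 ("Early Evening") and 00:00–06:00 ("Late Evening")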
2. Weekend? (i.e. Friday 5pm onwards to Sunday 11:59pm)
3. Month

In [6]:
# Convert 'DATE OF OCCURRENCE' to datetime.  The raw values look like
# "01/16/2024 01:00:00 AM" (month/day/year, 12-hour clock), so the format is
# spelled out instead of being inferred; anything that does not match becomes NaT.
df['DATE OF OCCURRENCE'] = pd.to_datetime(
    df['DATE OF OCCURRENCE'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)

# Create a function to categorize time into buckets
def categorize_time(hour):
    """Map an hour of the day to its time-of-day label.

    Parameters
    ----------
    hour : int or float
        Hour of the day (0-23).  May be missing (NaN/None) when the source
        timestamp could not be parsed.

    Returns
    -------
    str or None
        "Late Evening" (00-06), "Early Morning" (06-09), "Late Morning" (09-12),
        "Early Noon" (12-15), "Late Noon" (15-18) or "Early Evening" (otherwise).
        ``None`` for a missing hour, so unparsed timestamps are not silently
        counted as "Early Evening".
    """
    # NaN fails every range comparison below and would otherwise reach the
    # final fallback label.
    if pd.isna(hour):
        return None
    if 0 <= hour < 6:
        return "Late Evening"
    elif 6 <= hour < 9:
        return "Early Morning"
    elif 9 <= hour < 12:
        return "Late Morning"
    elif 12 <= hour < 15:
        return "Early Noon"
    elif 15 <= hour < 18:
        return "Late Noon"
    else:
        return "Early Evening"

# Label every record with the bucket its occurrence hour falls into
occurrence_hours = df['DATE OF OCCURRENCE'].dt.hour
df['TIME OF DAY'] = occurrence_hours.apply(categorize_time)

In [7]:
# Derive the full month name (e.g. "January") from 'DATE OF OCCURRENCE'
# and store it in a new 'MONTH' column
df['MONTH'] = df['DATE OF OCCURRENCE'].dt.month_name()

In [8]:
# Full weekday name (e.g. "Tuesday").  day_name() is used rather than
# strftime('%A') so the labels do not depend on the process locale, matching
# how MONTH is derived with month_name().
df['WEEKDAY'] = df['DATE OF OCCURRENCE'].dt.day_name()

In [9]:
# Numeric day of week: Monday = 0 ... Sunday = 6
df['WEEKDAY NUM'] = df['DATE OF OCCURRENCE'].dt.dayofweek

In [10]:
# Preliminary weekend flag: weekday numbers 0-4 -> 'NO', anything else -> 'YES'.
# This column is dropped again in the next cell and rebuilt further down with
# the Friday-evening rule.
df['WEEKEND'] = np.where(df['WEEKDAY NUM'] <= 4, 'NO','YES')

In [11]:
# Discard the preliminary WEEKEND flag.  errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns=['WEEKEND'], errors='ignore')

In [12]:
# Weekend window: Friday from 17:00 onwards, plus all of Saturday and Sunday.
# The hour is kept in a local Series instead of a scratch 'hour' column that is
# added to and then dropped from the frame, and each condition is named so the
# result does not hinge on `&` binding tighter than `|`.
hour_of_day = df['DATE OF OCCURRENCE'].dt.hour
friday_evening = (df['WEEKDAY NUM'] == 4) & (hour_of_day >= 17)   # Friday after 5 PM
saturday_or_sunday = df['WEEKDAY NUM'].isin([5, 6])               # Saturday or Sunday
df['WEEKEND'] = (friday_evening | saturday_or_sunday).map({True: 'Yes', False: 'No'})

Consolidate Offenses: 

In [13]:
# Start the 'OFFENSES' column as a copy of 'PRIMARY DESCRIPTION';
# the following cells consolidate its categories
df['OFFENSES'] = df['PRIMARY DESCRIPTION']

In [14]:
# Tally how often each offense label occurs
value_counts = df['OFFENSES'].value_counts()
# Labels seen fewer than 500 times are folded into "OTHER OFFENSE"
to_replace = value_counts.index[(value_counts < 500).to_numpy()]
is_rare = df['OFFENSES'].isin(to_replace)
df['OFFENSES'] = df['OFFENSES'].mask(is_rare, "OTHER OFFENSE")

In [15]:
# Merge closely related offenses into broader categories:
#   - Public Peace Violation and Interference with Public Officer -> 'PUBLIC ORDER' (new category)
#   - Criminal Sexual Assault -> 'SEX OFFENSE'
offense_groups = {
    "PUBLIC PEACE VIOLATION": "PUBLIC ORDER",
    "INTERFERENCE WITH PUBLIC OFFICER": "PUBLIC ORDER",
    "CRIMINAL SEXUAL ASSAULT": "SEX OFFENSE",
}
df['OFFENSES'] = df['OFFENSES'].replace(offense_groups)

In [16]:
# Show all column names after feature engineering
df.columns.tolist()

['CASE#',
 'DATE OF OCCURRENCE',
 'BLOCK',
 'IUCR',
 'PRIMARY DESCRIPTION',
 'SECONDARY DESCRIPTION',
 'LOCATION DESCRIPTION',
 'ARREST',
 'DOMESTIC',
 'BEAT',
 'WARD',
 'FBI CD',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION',
 'TIME OF DAY',
 'MONTH',
 'WEEKDAY',
 'WEEKDAY NUM',
 'WEEKEND',
 'OFFENSES']

In [17]:
# Keep only the engineered features plus the location fields
processed_columns = [
    'WARD', 'TIME OF DAY', 'MONTH', 'WEEKEND', 'DATE OF OCCURRENCE', 'OFFENSES',
    'X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION',
]
processed_df = df.loc[:, processed_columns]
processed_df

Unnamed: 0,WARD,TIME OF DAY,MONTH,WEEKEND,DATE OF OCCURRENCE,OFFENSES,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,35,Late Evening,January,No,2024-01-16 01:00:00,THEFT,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)"
1,42,Late Noon,December,Yes,2023-12-31 16:30:00,BATTERY,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)"
2,21,Early Evening,January,No,2024-01-16 18:50:00,WEAPONS VIOLATION,1170976.0,1839080.0,41.713905,-87.649425,"(41.713904887, -87.649424515)"
3,27,Early Morning,November,No,2023-11-30 07:28:00,NARCOTICS,1153117.0,1905117.0,41.895490,-87.713086,"(41.895490399, -87.713086271)"
4,42,Late Evening,December,Yes,2023-12-31 00:55:00,WEAPONS VIOLATION,1175975.0,1903895.0,41.891653,-87.629172,"(41.891653037, -87.62917162)"
...,...,...,...,...,...,...,...,...,...,...,...
257072,27,Early Morning,November,Yes,2024-11-23 08:25:00,OTHER OFFENSE,1150853.0,1903735.0,41.891743,-87.721438,"(41.891742661, -87.721437661)"
257073,44,Early Noon,November,Yes,2024-11-23 14:45:00,THEFT,1170069.0,1921303.0,41.939552,-87.650352,"(41.939552474, -87.650352367)"
257074,21,Late Morning,November,Yes,2024-11-23 11:04:00,OTHER OFFENSE,1172801.0,1836375.0,41.706442,-87.642820,"(41.706441994, -87.642820119)"
257075,27,Late Evening,November,Yes,2024-11-23 00:54:00,BATTERY,1156727.0,1899212.0,41.879214,-87.699988,"(41.879214143, -87.699987616)"


In [18]:
# # Convert 'DATE  OF OCCURRENCE' to datetime format
# df['DATE  OF OCCURRENCE'] = pd.to_datetime(df['DATE  OF OCCURRENCE'], errors='coerce')

# # Create a function to categorize time into buckets
# def categorize_time(hour):
#     if 0 <= hour < 4:
#         return "Early Morning"
#     elif 4 <= hour < 8:
#         return "Morning"
#     elif 8 <= hour < 12:
#         return "Late Morning"
#     elif 12 <= hour < 16:
#         return "Afternoon"
#     elif 16 <= hour < 20:
#         return "Evening"
#     else:  # 20 <= hour < 24
#         return "Late Evening"

# # Apply the time categorization to create 'TIME OF DAY' column
# df['TIME OF DAY'] = df['DATE  OF OCCURRENCE'].dt.hour.apply(categorize_time)

#### Encode the Month column (cyclical sine/cosine) and the Time of Day column (ordinal)

    - Ordinal Encoding: Works well if months have a natural order relevant to the problem (e.g., sales data trends over a year).
    - One-Hot Encoding: Suitable for models like logistic regression that cannot infer order from numerical values.
    - Cyclical Encoding: Best when the cyclical nature of months is important (e.g., temperature data over a year).

In [28]:
# List the distinct time-of-day labels present in the data
pd.unique(df['TIME OF DAY'])

array(['Late Evening', 'Late Noon', 'Early Evening', 'Early Morning',
       'Early Noon', 'Late Morning'], dtype=object)

Include 'MONTH' column:

In [None]:
# Month name -> calendar number (1-12), built from the names in calendar order
month_order = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
        start=1,
    )
}

In [24]:
# Translate each month name to its 1-12 number using month_order (defined in the cell above)
df['MONTH_ENCODED'] = df['MONTH'].map(month_order)

In [26]:
# Place the month number on the unit circle (12 steps per turn) and keep both
# coordinates, so that December and January end up next to each other
month_angle = 2 * np.pi * df['MONTH_ENCODED'] / 12
df['MONTH_SIN'] = np.sin(month_angle)
df['MONTH_COS'] = np.cos(month_angle)

In [27]:
# Check the new MONTH_ENCODED / MONTH_SIN / MONTH_COS columns on the first two rows
df.head(2)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,...,LOCATION,TIME OF DAY,MONTH,WEEKDAY,WEEKDAY NUM,WEEKEND,OFFENSES,MONTH_ENCODED,MONTH_SIN,MONTH_COS
0,JH117298,2024-01-16 01:00:00,038XX W DIVERSEY AVE,810,THEFT,OVER $500,STREET,N,N,2524,...,"(41.931843966, -87.722950868)",Late Evening,January,Tuesday,1,No,THEFT,1,0.5,0.866025
1,JG561057,2023-12-31 16:30:00,004XX N WABASH AVE,460,BATTERY,SIMPLE,STREET,N,N,1834,...,"(41.888993854, -87.626934833)",Late Noon,December,Sunday,6,Yes,BATTERY,12,-2.449294e-16,1.0


##### Why Ordinal Encoding Works for XGBoost
Tree-Based Model: XGBoost is based on decision trees, which split on feature thresholds. It does not assume any linear relationship between the numerical values of the encoded categories, so ordinal encoding works well.
Efficiency: Ordinal encoding is computationally efficient compared to one-hot encoding, which increases the number of features significantly.

In [29]:
# Ordinal rank (1-6) for each time-of-day label, assigned in the order listed
time_order = {
    label: rank
    for rank, label in enumerate(
        [
            'Early Morning',
            'Late Morning',
            'Early Noon',
            'Late Noon',
            'Early Evening',
            'Late Evening',
        ],
        start=1,
    )
}
# Apply the mapping: each 'TIME OF DAY' label is replaced by its rank from time_order
# (a label that is not a key of time_order would come out as NaN)
df['TIME ENCODED'] = df['TIME OF DAY'].map(time_order)

In [30]:
# Check the new 'TIME ENCODED' column on the first two rows
df.head(2)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,...,TIME OF DAY,MONTH,WEEKDAY,WEEKDAY NUM,WEEKEND,OFFENSES,MONTH_ENCODED,MONTH_SIN,MONTH_COS,TIME ENCODED
0,JH117298,2024-01-16 01:00:00,038XX W DIVERSEY AVE,810,THEFT,OVER $500,STREET,N,N,2524,...,Late Evening,January,Tuesday,1,No,THEFT,1,0.5,0.866025,6
1,JG561057,2023-12-31 16:30:00,004XX N WABASH AVE,460,BATTERY,SIMPLE,STREET,N,N,1834,...,Late Noon,December,Sunday,6,Yes,BATTERY,12,-2.449294e-16,1.0,4
