# Consolidated Pre-processing Notebook

In [None]:
import pandas as pd
import numpy as np

In [77]:
# Read the raw Chicago crime export stored under ../raw_data/
raw_csv_path = "../raw_data/chicago.csv"
df = pd.read_csv(raw_csv_path)
# Summarise column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257077 entries, 0 to 257076
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   CASE#                   257077 non-null  object 
 1   DATE  OF OCCURRENCE     257077 non-null  object 
 2   BLOCK                   257077 non-null  object 
 3    IUCR                   257077 non-null  object 
 4    PRIMARY DESCRIPTION    257077 non-null  object 
 5    SECONDARY DESCRIPTION  257077 non-null  object 
 6    LOCATION DESCRIPTION   256032 non-null  object 
 7   ARREST                  257077 non-null  object 
 8   DOMESTIC                257077 non-null  object 
 9   BEAT                    257077 non-null  int64  
 10  WARD                    257077 non-null  int64  
 11  FBI CD                  257077 non-null  object 
 12  X COORDINATE            257011 non-null  float64
 13  Y COORDINATE            257011 non-null  float64
 14  LATITUDE            

In [78]:
# Preview the first three records of the raw frame
df.head(n=3)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,JH117298,01/16/2024 01:00:00 AM,038XX W DIVERSEY AVE,0810,THEFT,OVER $500,STREET,N,N,2524,35,06,1150337.0,1918345.0,41.931844,-87.722951,"(41.931843966, -87.722950868)"
1,JG561057,12/31/2023 04:30:00 PM,004XX N WABASH AVE,0460,BATTERY,SIMPLE,STREET,N,N,1834,42,08B,1176592.0,1902931.0,41.888994,-87.626935,"(41.888993854, -87.626934833)"
2,JH117691,01/16/2024 06:50:00 PM,010XX W 99TH ST,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,STREET,Y,N,2232,21,15,1170976.0,1839080.0,41.713905,-87.649425,"(41.713904887, -87.649424515)"


The raw dataset's column names contain typographical errors (stray leading/trailing and doubled spaces). The code below resolves this:

In [79]:
# Normalise the column labels: trim surrounding whitespace and collapse any
# internal run of whitespace (e.g. "DATE  OF OCCURRENCE") into a single space.
# A regex is used so that runs of three or more spaces are collapsed as well.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

# Trim surrounding whitespace from every string cell. Numeric columns cannot
# hold strings, so they are skipped. Series.map is used per column because
# DataFrame.applymap is deprecated and emits a FutureWarning on recent pandas.
for column in df.select_dtypes(exclude='number').columns:
    df[column] = df[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Drop na values in location coordinates (total 66 rows): 

In [80]:
# Location fields that must all be present for a record to be kept
location_columns = ['X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION']
# Discard every record that is missing at least one of them
df = df.dropna(subset=location_columns)

Feature Engineer the following features: 
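1. Time of Day (Early Morning, etc.), in 3-hour buckets from 06:00 to 18:00 and 6-hour buckets for the evening (18:00–24:00) and late evening (00:00–06:00)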
2. Weekend? (i.e. Friday 5pm onwards to Sunday 11:59pm)
3. Month

In [81]:
# Parse the 'DATE OF OCCURRENCE' strings into datetimes; values that cannot
# be parsed become NaT instead of raising.
occurrence_text = df['DATE OF OCCURRENCE']
df['DATE OF OCCURRENCE'] = pd.to_datetime(occurrence_text, errors='coerce')

# Create a function to categorize time into buckets
def categorize_time(hour):
    """Map an hour of the day to its time-of-day bucket label.

    Buckets (start inclusive, end exclusive):
        00-06 -> "Late Evening"
        06-09 -> "Early Morning"
        09-12 -> "Late Morning"
        12-15 -> "Early Noon"
        15-18 -> "Late Noon"
        18-24 -> "Early Evening"

    Parameters
    ----------
    hour : int or float
        Hour of the day, expected in the range [0, 24). May be missing
        (None / NaN), e.g. when the source timestamp could not be parsed.

    Returns
    -------
    str or None
        The bucket label, or None when ``hour`` is missing or lies outside
        [0, 24).
    """
    # A missing hour fails every range comparison below; without this guard
    # it would silently fall through to the last bucket.
    if pd.isna(hour):
        return None
    if 0 <= hour < 6:
        return "Late Evening"
    elif 6 <= hour < 9:
        return "Early Morning"
    elif 9 <= hour < 12:
        return "Late Morning"
    elif 12 <= hour < 15:
        return "Early Noon"
    elif 15 <= hour < 18:
        return "Late Noon"
    elif 18 <= hour < 24:
        return "Early Evening"
    # Anything else is not a valid hour of the day, so do not invent a label.
    return None

# Derive the 'TIME OF DAY' label from the hour component of each timestamp
occurrence_hours = df['DATE OF OCCURRENCE'].dt.hour
df['TIME OF DAY'] = occurrence_hours.map(categorize_time)

In [82]:
# Store the month name of each 'DATE OF OCCURRENCE' timestamp in a new 'MONTH' column
occurrence_timestamps = df['DATE OF OCCURRENCE']
df['MONTH'] = occurrence_timestamps.dt.month_name()

In [83]:
# Full weekday name of each occurrence (strftime '%A')
weekday_names = df['DATE OF OCCURRENCE'].dt.strftime('%A')
df['WEEKDAY'] = weekday_names

In [84]:
# Day of the week as a number, Monday=0 ... Sunday=6
df['WEEKDAY NUM'] = df['DATE OF OCCURRENCE'].dt.dayofweek

In [85]:
# Provisional weekend flag: weekday numbers 0-4 (Monday-Friday) -> 'NO', anything else -> 'YES'
falls_on_weekday = df['WEEKDAY NUM'] <= 4
df['WEEKEND'] = np.where(falls_on_weekday, 'NO', 'YES')

In [86]:
# Remove the provisional 'WEEKEND' column in place
df.drop(columns=['WEEKEND'], inplace=True)

In [87]:
# Hour of each occurrence, held in a local Series rather than a temporary column
occurrence_hour = df['DATE OF OCCURRENCE'].dt.hour

# A record counts as "weekend" when it falls on Friday from 17:00 onwards ...
friday_evening = (df['WEEKDAY NUM'] == 4) & (occurrence_hour >= 17)
# ... or at any time on Saturday (5) or Sunday (6)
saturday_or_sunday = df['WEEKDAY NUM'].isin([5, 6])

# Encode the flag as the strings '1' (weekend) / '0' (not weekend)
df['WEEKEND'] = (friday_evening | saturday_or_sunday).map({True: '1', False: '0'})

#### Convert Month and Time of Day columns to sine/cosine

    - Ordinal Encoding: Works well if months have a natural order relevant to the problem (e.g., sales data trends over a year).
    - One-Hot Encoding: Suitable for models like logistic regression that cannot infer order from numerical values.
    - Cyclical Encoding: Best when the cyclical nature of months is important (e.g., temperature data over a year).

In [88]:
# Month names in calendar order; each name's 1-based position is its month number
_month_names = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]
# Lookup table: month name -> month number (1-12)
month_order = {name: number for number, name in enumerate(_month_names, start=1)}

In [89]:
# Translate each month name into its month number via the month_order lookup
month_numbers = df['MONTH'].map(month_order)
df['MONTH_ENCODED'] = month_numbers

In [90]:
# Place each month number on a 12-step circle and store its sine / cosine
month_angle = 2 * np.pi * df['MONTH_ENCODED'] / 12
df['MONTH_SIN'] = np.sin(month_angle)
df['MONTH_COS'] = np.cos(month_angle)

In [91]:
# Time-of-day labels listed in their ordinal sequence (first label -> 1)
_time_of_day_sequence = [
    'Early Morning',
    'Late Morning',
    'Early Noon',
    'Late Noon',
    'Early Evening',
    'Late Evening',
]
# Ordinal lookup: time-of-day label -> position 1-6
time_order = dict(zip(_time_of_day_sequence, range(1, len(_time_of_day_sequence) + 1)))
# Translate each 'TIME OF DAY' label into its ordinal position
time_positions = df['TIME OF DAY'].map(time_order)
df['TIME ENCODED'] = time_positions

### Splitting THEFT into two groups: 'THEFT UNDER $500' and 'THEFT OVER $500'

In [92]:
# Filter for 'PRIMARY DESCRIPTION' = 'THEFT'
theft_data = df[df['PRIMARY DESCRIPTION'] == 'THEFT']

# Count occurrences per ('PRIMARY DESCRIPTION', 'SECONDARY DESCRIPTION') pair.
# This table is informational only; the grouping below does not depend on its
# row order.
theft_grouped_data = theft_data.groupby(['PRIMARY DESCRIPTION', 'SECONDARY DESCRIPTION']).size().reset_index(name='Count')

# Labels that replace 'THEFT' in 'PRIMARY DESCRIPTION'
group_1_description = 'THEFT UNDER $500'
group_2_description = 'THEFT OVER $500'

# Counts observed when the grouping was decided:
#   PRIMARY DESCRIPTION                 SECONDARY DESCRIPTION   Count
#               THEFT                          $500 AND UNDER       19167
#               THEFT                           ATTEMPT THEFT         386
#               THEFT                DELIVERY CONTAINER THEFT          32
#               THEFT                           FROM BUILDING        5422
#               THEFT    FROM COIN-OPERATED MACHINE OR DEVICE          13
#               THEFT                               OVER $500       18248
#               THEFT                          POCKET-PICKING        1371
#               THEFT                         PURSE-SNATCHING         275
#               THEFT                            RETAIL THEFT       13432
#               THEFT                THEFT FROM MOTOR VEHICLE        1748

# Manual grouping, spelled out by name rather than by row position so that it
# keeps working if a category is absent, a new one appears, or the cell is
# re-run after 'THEFT' has already been relabelled.
group_1_secondary = [
    '$500 AND UNDER',
    'ATTEMPT THEFT',
    'DELIVERY CONTAINER THEFT',
    'FROM BUILDING',
    'FROM COIN-OPERATED MACHINE OR DEVICE',
    'POCKET-PICKING',
    'PURSE-SNATCHING',
    'THEFT FROM MOTOR VEHICLE',
]
group_2_secondary = [
    'OVER $500',
    'RETAIL THEFT',
]

# Rows of the count table belonging to each group (same content as before)
group_1 = theft_grouped_data[theft_grouped_data['SECONDARY DESCRIPTION'].isin(group_1_secondary)]
group_2 = theft_grouped_data[theft_grouped_data['SECONDARY DESCRIPTION'].isin(group_2_secondary)]

# Mapping: secondary description -> new primary description
mapping = {secondary_desc: group_1_description for secondary_desc in group_1_secondary}
mapping.update({secondary_desc: group_2_description for secondary_desc in group_2_secondary})

# Relabel the THEFT rows of the original DataFrame. A secondary description
# that is not in the mapping keeps the label 'THEFT' instead of becoming NaN.
is_theft = df['PRIMARY DESCRIPTION'] == 'THEFT'
relabelled = df.loc[is_theft, 'SECONDARY DESCRIPTION'].map(mapping)
df.loc[is_theft, 'PRIMARY DESCRIPTION'] = relabelled.fillna('THEFT')


Consolidate Offenses: 

In [93]:
# Seed the 'OFFENSES' column with the values of 'PRIMARY DESCRIPTION'
primary_description = df['PRIMARY DESCRIPTION']
df['OFFENSES'] = primary_description
# Show how many records fall into each offense category
df['OFFENSES'].value_counts()

OFFENSES
BATTERY                              45766
THEFT OVER $500                      31671
CRIMINAL DAMAGE                      28638
THEFT UNDER $500                     28400
ASSAULT                              23508
MOTOR VEHICLE THEFT                  22331
OTHER OFFENSE                        16773
DECEPTIVE PRACTICE                   15059
ROBBERY                               9442
BURGLARY                              8178
WEAPONS VIOLATION                     7914
NARCOTICS                             5821
CRIMINAL TRESPASS                     4884
OFFENSE INVOLVING CHILDREN            1637
CRIMINAL SEXUAL ASSAULT               1521
SEX OFFENSE                           1218
PUBLIC PEACE VIOLATION                 976
INTERFERENCE WITH PUBLIC OFFICER       682
HOMICIDE                               575
STALKING                               497
ARSON                                  492
PROSTITUTION                           280
CONCEALED CARRY LICENSE VIOLATION      201
LI

In [94]:
# Count how many records each 'OFFENSES' category has
value_counts = df['OFFENSES'].value_counts()
# Categories with fewer than 10,000 records. NOTE: `to_remove` is not referenced
# by the cells shown below; the next cell filters with the complementary
# `value_counts >= 10000` condition instead.
to_remove = value_counts[value_counts<10000].index

In [95]:
# Keep only the records whose offense category has at least 10,000 occurrences.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not operate on a slice of the previous frame
# (which can raise SettingWithCopyWarning).
df = df.loc[df['OFFENSES'].isin(value_counts[value_counts >= 10000].index)].copy()

In [96]:
# Record counts per offense category after the filter
offense_column = df['OFFENSES']
offense_column.value_counts()

OFFENSES
BATTERY                45766
THEFT OVER $500        31671
CRIMINAL DAMAGE        28638
THEFT UNDER $500       28400
ASSAULT                23508
MOTOR VEHICLE THEFT    22331
OTHER OFFENSE          16773
DECEPTIVE PRACTICE     15059
Name: count, dtype: int64

In [97]:
# Recount the records per 'OFFENSES' category
value_counts = df['OFFENSES'].value_counts()
# Categories with fewer than 500 records are folded into "OTHER OFFENSE".
# NOTE: an earlier cell keeps only categories with >= 10000 records, so this
# is expected to match nothing when the notebook is run top to bottom.
to_replace = value_counts.loc[value_counts.lt(500)].index
df['OFFENSES'] = df['OFFENSES'].replace(to_replace, "OTHER OFFENSE")

In [98]:
# Offense labels that are merged into a broader category (old label -> new label)
offense_consolidation = {
    # Public Peace Violation and Interference with Public Officer form the new category 'PUBLIC ORDER'
    "PUBLIC PEACE VIOLATION": "PUBLIC ORDER",
    "INTERFERENCE WITH PUBLIC OFFICER": "PUBLIC ORDER",
    # Criminal Sexual Assault is consolidated into the existing 'SEX OFFENSE' category
    "CRIMINAL SEXUAL ASSAULT": "SEX OFFENSE",
}
df['OFFENSES'] = df['OFFENSES'].replace(offense_consolidation)

In [99]:
# Columns carried forward into the pre-processed dataset
processed_columns = [
    'WARD', 'TIME ENCODED', 'MONTH_SIN', 'MONTH_COS', 'WEEKEND',
    'DATE OF OCCURRENCE', 'OFFENSES', 'LATITUDE', 'LONGITUDE',
]
processed_df = df.loc[:, processed_columns]
processed_df

Unnamed: 0,WARD,TIME ENCODED,MONTH_SIN,MONTH_COS,WEEKEND,DATE OF OCCURRENCE,OFFENSES,LATITUDE,LONGITUDE
0,35,6,5.000000e-01,8.660254e-01,0,2024-01-16 01:00:00,THEFT OVER $500,41.931844,-87.722951
1,42,4,-2.449294e-16,1.000000e+00,1,2023-12-31 16:30:00,BATTERY,41.888994,-87.626935
5,16,3,5.000000e-01,8.660254e-01,1,2024-01-06 12:50:00,DECEPTIVE PRACTICE,41.793299,-87.664566
6,1,3,8.660254e-01,-5.000000e-01,1,2024-04-07 13:56:00,THEFT OVER $500,41.906797,-87.671862
8,49,4,1.000000e+00,6.123234e-17,0,2024-03-22 15:30:00,THEFT UNDER $500,42.007825,-87.670842
...,...,...,...,...,...,...,...,...,...
257072,27,1,-5.000000e-01,8.660254e-01,1,2024-11-23 08:25:00,OTHER OFFENSE,41.891743,-87.721438
257073,44,3,-5.000000e-01,8.660254e-01,1,2024-11-23 14:45:00,THEFT UNDER $500,41.939552,-87.650352
257074,21,2,-5.000000e-01,8.660254e-01,1,2024-11-23 11:04:00,OTHER OFFENSE,41.706442,-87.642820
257075,27,6,-5.000000e-01,8.660254e-01,1,2024-11-23 00:54:00,BATTERY,41.879214,-87.699988


In [100]:
# NOTE: the modelling cell below reads '../raw_data/preprocessed_chicago.csv'.
# The export is left commented out; uncomment and run it whenever that file
# needs to be (re)generated from `processed_df`.
# processed_df.to_csv('../raw_data/preprocessed_chicago.csv', index=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv("../raw_data/preprocessed_chicago.csv")

# Step 1: Define target and features
target = 'OFFENSES'
X = data.drop(columns=[target])  # Drop target column
y = data[target]  # Extract target column

# Step 2: Remove 'DATE OF OCCURRENCE'
if 'DATE OF OCCURRENCE' in X.columns:
    X = X.drop(columns=['DATE OF OCCURRENCE'])

# Step 3: Sin-Cos Transformation for "TIME ENCODED"
if 'TIME ENCODED' in X.columns:
    time_col = X['TIME ENCODED']
    X['TIME_ENCODED_sin'] = np.sin(2 * np.pi * time_col / 6)  # Cyclical encoding
    X['TIME_ENCODED_cos'] = np.cos(2 * np.pi * time_col / 6)
    X = X.drop(columns=['TIME ENCODED'])  # Drop the original TIME ENCODED column

# Step 4: Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object' or X[col].dtype == 'category']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Print columns before processing to validate
print("Categorical columns before preprocessing:", categorical_cols)
print("Numerical columns before preprocessing:", numerical_cols)

# Create missing indicator columns for numerical features.
# This is a row-wise flag (nothing is learned from the data), so it is safe
# to compute before the train/test split.
for col in numerical_cols:
    if X[col].isnull().any():  # Add missing indicator only if there are missing values
        X[f'is_{col}_missing'] = X[col].isnull().astype(int)

# Step 5: Decide the train/test rows (70-30, stratified) BEFORE fitting any
# data-dependent transformation, so that imputation medians and scaling
# statistics are learned from the training rows only (no test-set leakage).
row_positions = np.arange(len(X))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.3, random_state=42, stratify=y
)

# Impute missing values for categorical columns with the most frequent value
# if categorical_cols:
#     cat_imputer = SimpleImputer(strategy='most_frequent')
#     X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Impute missing values for numerical columns with the median of the
# training rows, then apply those medians to every row.
if numerical_cols:
    num_imputer = SimpleImputer(strategy='median')
    num_imputer.fit(X.iloc[train_idx][numerical_cols])
    X[numerical_cols] = num_imputer.transform(X[numerical_cols])

# Step 6: Preprocessing pipeline for both numerical and categorical features
transformers = []

# Add transformations for categorical columns
if categorical_cols:
    transformers.append(('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols))

# Add transformations for numerical columns
if numerical_cols:
    transformers.append(('num', StandardScaler(), numerical_cols))

# Ensure at least one transformer exists
if not transformers:
    raise ValueError("No valid columns found for transformation.")

# ColumnTransformer to apply preprocessing steps
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough'  # Keeps other columns as is, in case there are other features
)

# Fit the preprocessing on the training rows only, then transform all rows
preprocessor.fit(X.iloc[train_idx])
X_processed = preprocessor.transform(X)

# Materialise the train/test partitions chosen in Step 5
X_train, X_test = X_processed[train_idx], X_processed[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 7: Model definition (HistGradientBoostingClassifier)
model = HistGradientBoostingClassifier(max_iter=50, random_state=42)  # Reduced iterations for faster training

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
# NOTE: zero_division=1 reports a score of 1 for a class that is never
# predicted; keep that in mind when reading per-class precision.
report = classification_report(y_test, y_pred, output_dict=True, zero_division=1)

# Display the results
print("Model: HistGradientBoostingClassifier")
print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{report}")

Categorical columns before preprocessing: []
Numerical columns before preprocessing: ['WARD', 'MONTH_SIN', 'MONTH_COS', 'WEEKEND', 'LATITUDE', 'LONGITUDE', 'TIME_ENCODED_sin', 'TIME_ENCODED_cos']
Model: HistGradientBoostingClassifier
Accuracy: 0.2777
Classification Report:
{'ASSAULT': {'precision': 0.18518518518518517, 'recall': 0.004254112308564946, 'f1-score': 0.008317161075686166, 'support': 7052.0}, 'BATTERY': {'precision': 0.2585657661162853, 'recall': 0.774435542607429, 'f1-score': 0.38769073706087176, 'support': 13730.0}, 'CRIMINAL DAMAGE': {'precision': 0.23409018228333867, 'recall': 0.08519553072625698, 'f1-score': 0.12492533492618824, 'support': 8592.0}, 'DECEPTIVE PRACTICE': {'precision': 0.2826780021253985, 'recall': 0.05887560867640549, 'f1-score': 0.09745374610734567, 'support': 4518.0}, 'MOTOR VEHICLE THEFT': {'precision': 0.24233983286908078, 'recall': 0.05194805194805195, 'f1-score': 0.08555623847572219, 'support': 6699.0}, 'OTHER OFFENSE': {'precision': 0.263513513513