In [1]:
# check for required file

import os
from pathlib import Path

# Location of the merged NYPD complaint dataset, relative to this notebook.
FILE_PATH = '../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced_Merged.csv'

# Path wrapper around the same location, used only for the existence check.
file_path = Path(FILE_PATH)

# Build the status line first, then emit it once; the check is informational
# and does not stop execution when the file is missing.
dataset_present = file_path.exists()
status_message = (
    f"File found: {file_path}"
    if dataset_present
    else f"File not found: {file_path}. Please ensure the file is downloaded correctly."
)
print(status_message)

File found: ../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced_Merged.csv


In [2]:
# Load the dataset
import pandas as pd
import numpy as np

# Read the merged complaint CSV into a DataFrame for inspection in this notebook.
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588788 entries, 0 to 1588787
Data columns (total 13 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   OFNS_DESC                        1588788 non-null  object 
 1   BORO_NM                          1588788 non-null  object 
 2   PREM_TYP_DESC                    1588788 non-null  object 
 3   Latitude                         1588788 non-null  float64
 4   Longitude                        1588788 non-null  float64
 5   VIC_AGE_GROUP                    1588788 non-null  object 
 6   VIC_RACE                         1588788 non-null  object 
 7   VIC_SEX                          1588788 non-null  object 
 8   Hour                             1588788 non-null  float64
 9   OFNS_DESC_Severity_Score         1588788 non-null  int64  
 10  Crime_Category                   1588788 non-null  object 
 11  local_population_count_1km       1588788 non-null 

In [3]:
# Preview the first five rows (the pandas default, spelled out here).
df.head(n=5)

Unnamed: 0,OFNS_DESC,BORO_NM,PREM_TYP_DESC,Latitude,Longitude,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,Hour,OFNS_DESC_Severity_Score,Crime_Category,local_population_count_1km,mental_health_service_count_1km
0,ROBBERY,BROOKLYN,STREET,40.701593,-73.948747,<18,WHITE,M,19.0,8,Violent Crimes,201434.0,4
1,RAPE,MANHATTAN,STREET,40.737203,-73.983273,<18,BLACK,F,23.0,10,Violent Crimes,310938.0,20
2,RAPE,BRONX,HOSPITAL,40.810352,-73.924942,<18,BLACK,F,23.0,10,Violent Crimes,224242.0,12
3,FELONY ASSAULT,QUEENS,OTHER,40.59368,-73.790074,25-44,WHITE,F,12.0,9,Violent Crimes,214372.0,0
4,ROBBERY,BRONX,STREET,40.859853,-73.894368,<18,BLACK HISPANIC,M,20.0,8,Violent Crimes,229936.0,8


In [4]:
# Train an ML model to perform classification. The target will be Crime_Category.
# Features should omit the OFNS_DESC and OFNS_DESC_Severity_Score columns, because the signal from these
# columns is reflected in the target; they should not be used for training, to avoid overfitting.

# One Hot Encode Categorical Columns
# Scale Ordinal/Numerical Columns


In [5]:
# Model training logic is encapsulated in the nyc_crime_category_random_forest_model.py file

# Initialize the NycCrimeCategoryRandomForestModel with configurable parameters.

#         Args:
#             random_state (int): Random state for reproducibility.
#             test_size (float): Proportion of the dataset to include in the test split.
#             n_estimators (int): Number of trees in the RandomForest.
#             max_depth (int): Maximum depth of the trees in the RandomForest.
#             drop_columns (list): Columns to drop from the dataset.
#             target_column (str): The name of the target column.
#             model_filename (str): Default filename for saving and loading the model.
#             verbose_level (int): Controls the verbosity of the RandomForestClassifier.
#             sampling_strategy (str, dict, or callable): Strategy for undersampling the majority classes.

from nyc_crime_category_random_forest_model import NycCrimeCategoryRandomForestModel

# Hyperparameters for this run, named once so that the saved-model filename
# below always matches the settings actually used for training.
N_ESTIMATORS = 180
MAX_DEPTH = 18

# Encode the hyperparameters in the pickle name so each configuration is saved
# under its own file rather than a hand-edited name that can go stale.
# For the values above this yields '..._random_forest_model_180_18.pkl'.
MODEL_FILENAME = f'../data/nyc_crime_category_random_forest_model_{N_ESTIMATORS}_{MAX_DEPTH}.pkl'

# Initialize the model with optional parameters
model = NycCrimeCategoryRandomForestModel(
    random_state=42,
    test_size=0.30,
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    # These columns carry the same signal as the target, so they are excluded
    # from the features.
    drop_columns=['OFNS_DESC', 'OFNS_DESC_Severity_Score'],
    target_column='Crime_Category',
    model_filename=MODEL_FILENAME,
    verbose_level=1
)

# Load and preprocess the data
model.load_data(FILE_PATH)
model.preprocess_data()

# Split the data
model.split_data()

# Print the class distribution of the training data
model.print_class_distribution()

# Train the model.
# NOTE: slow - the recorded run with 180 trees at depth 18 took about 53 minutes.
model.train_model()

# Save the model (the recorded run reports the path given as model_filename)
model.save_model()

# KNOWN ISSUE - class distribution - minority classes are very underrepresented in the data.
# Tried using SMOTE (Synthetic Minority Over-sampling Technique) to bulk up the minority classes.
# Tried using RandomUnderSampler to reduce the majority classes by randomly removing instances.
# In both cases I could not get it to work.

Class distribution in training data (percentage):
Crime_Category
7     28.232227
9     27.279749
1     19.552111
3      8.452719
6      7.521011
5      6.269742
8      1.529918
2      0.743874
10     0.272805
0      0.138740
4      0.007103
Name: proportion, dtype: float64


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 53.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   17.6s finished


Model accuracy: 0.4048
Model saved to ../data/nyc_crime_category_random_forest_model_180_18.pkl


In [6]:


# n_estimators=30, max_depth=3 => 15 sec => Model accuracy: 0.36
# n_estimators=50, max_depth=5 => 30 sec => Model accuracy: 0.37
# n_estimators=100, max_depth=10 => 5 mins => Model accuracy: 0.38
# n_estimators=150, max_depth=15 => 20 mins => Model accuracy: 0.39
# n_estimators=180, max_depth=18 => +50 mins => Model accuracy: 0.40
