In [1]:
# check for required file

import os
from pathlib import Path
import matplotlib.pyplot as plt

FILE_PATH = '../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced_Merged.csv'
file_path = Path(FILE_PATH)

# Report whether the dataset is present; this only prints a message and never raises,
# so a missing file surfaces later when the CSV is actually read.
print(
    f"File found: {file_path}"
    if file_path.exists()
    else f"File not found: {file_path}. Please ensure the file is downloaded correctly."
)

File found: ../data/NYPD_Complaint_Data_Historic_Cleaned_Reduced_Merged.csv


In [2]:
# Load the dataset
import pandas as pd
import numpy as np

# Read the CSV located at FILE_PATH into a DataFrame
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Print column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588788 entries, 0 to 1588787
Data columns (total 13 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   OFNS_DESC                        1588788 non-null  object 
 1   BORO_NM                          1588788 non-null  object 
 2   PREM_TYP_DESC                    1588788 non-null  object 
 3   Latitude                         1588788 non-null  float64
 4   Longitude                        1588788 non-null  float64
 5   VIC_AGE_GROUP                    1588788 non-null  object 
 6   VIC_RACE                         1588788 non-null  object 
 7   VIC_SEX                          1588788 non-null  object 
 8   Hour                             1588788 non-null  float64
 9   OFNS_DESC_Severity_Score         1588788 non-null  int64  
 10  Crime_Category                   1588788 non-null  object 
 11  local_population_count_1km       1588788 non-null 

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,OFNS_DESC,BORO_NM,PREM_TYP_DESC,Latitude,Longitude,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,Hour,OFNS_DESC_Severity_Score,Crime_Category,local_population_count_1km,mental_health_service_count_1km
0,ROBBERY,BROOKLYN,STREET,40.701593,-73.948747,<18,WHITE,M,19.0,8,Violent Crimes,201434.0,4
1,RAPE,MANHATTAN,STREET,40.737203,-73.983273,<18,BLACK,F,23.0,10,Violent Crimes,310938.0,20
2,RAPE,BRONX,HOSPITAL,40.810352,-73.924942,<18,BLACK,F,23.0,10,Violent Crimes,224242.0,12
3,FELONY ASSAULT,QUEENS,OTHER,40.59368,-73.790074,25-44,WHITE,F,12.0,9,Violent Crimes,214372.0,0
4,ROBBERY,BRONX,STREET,40.859853,-73.894368,<18,BLACK HISPANIC,M,20.0,8,Violent Crimes,229936.0,8


In [4]:
# CLASS IMBALANCE CHECK
# Crime_Category is the classification target, so look at how evenly its
# classes are represented before training anything.

# Share of rows per class, expressed as a percentage (sorted most to least frequent)
class_distribution = df['Crime_Category'].value_counts(normalize=True).mul(100)

print("Class distribution (percentage):")
class_distribution

Class distribution (percentage):


Crime_Category
Theft and Larceny                           28.185447
Violent Crimes                              27.295586
Family and Personal Offenses                19.553144
Miscellaneous and Specific Offenses          8.471615
Public Order and Administrative Offenses     7.527121
Property Crimes                              6.265594
Traffic                                      1.530978
Fraud and Financial Crimes                   0.750509
Weapons and Dangerous Offenses               0.272912
Drug and Alcohol-Related Offenses            0.139729
Negligence and Careless Acts                 0.007364
Name: proportion, dtype: float64

In [5]:
# train ML model to perform classification. The target will be Crime_Category. 
# Features should omit the OFNS_DESC and OFNS_DESC_Severity_Score columns: the signal in these columns
# is reflected in the target, so using them for training would leak the target into the features.

# One Hot Encode Categorical Columns
# Scale Ordinal/Numerical Columns

## Address class imbalance issues
# SMOTE (Synthetic Minority Over-sampling Technique)
# Random Under-Sampling: Reduces the majority classes by randomly removing instances.
# SMOTEENN does both

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

# Build the modelling frame WITHOUT overwriting `df`, so this cell can be re-run on its
# own. (Reassigning `df` after dropping columns makes a second run fail with KeyError,
# and leaves earlier cells' displayed view of `df` stale.)
# OFNS_DESC and OFNS_DESC_Severity_Score are excluded from the features.
model_df = df.drop(columns=['OFNS_DESC', 'OFNS_DESC_Severity_Score'])

# Separate features and target
X = model_df.drop(columns='Crime_Category')

# Label encode the target into integer codes; `label_encoder.classes_` maps codes back to names.
# The result is kept as a Series so the resampled target supports `.value_counts()` below.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(model_df['Crime_Category']),
    index=model_df.index,
    name='Crime_Category',
)

# Identify categorical and numerical feature columns (target already removed from X)
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Create the preprocessing pipeline: scale numerical features, one-hot encode categorical ones.
# Columns in neither list are dropped by ColumnTransformer's default remainder behaviour.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Apply preprocessing to the entire dataset before resampling
X_encoded = preprocessor.fit_transform(X)
print("preprocessor finished")

# ENN cleaning step using a single nearest neighbour
enn = EditedNearestNeighbours(n_neighbors=1)

# Apply SMOTEENN (SMOTE over-sampling followed by ENN cleaning) to balance the classes.
# NOTE: this resamples the FULL dataset. Any train/test split taken from X_resampled will
# contain synthetic and cleaned samples in the test portion, so scores measured on such a
# split are optimistic and do not reflect the original class distribution.
smote_enn = SMOTEENN(smote=SMOTE(random_state=42),
                     enn=enn,
                     random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_encoded, y)

# Re-examine the class distribution after SMOTEENN (index holds the encoded class codes)
class_distribution_resampled = y_resampled.value_counts(normalize=True) * 100
print("Class distribution after SMOTEENN (percentage):")
class_distribution_resampled

preprocessor finished


In [None]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# Hold out a test set from the ORIGINAL (un-resampled) data. Splitting the resampled data
# instead puts SMOTE-synthesised / ENN-cleaned samples into the test set and evaluates on
# an artificially balanced distribution, which inflates the reported score.
# `stratify=y` keeps the rare classes represented in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit scaling / one-hot encoding on the training rows only, then apply it to the test rows.
# A clone is used so the already-fitted `preprocessor` object is left untouched.
eval_preprocessor = clone(preprocessor)
X_train_encoded = eval_preprocessor.fit_transform(X_train_raw)
X_test = eval_preprocessor.transform(X_test_raw)

# Balance the classes in the training partition only; the test set keeps its real distribution.
X_train, y_train = clone(smote_enn).fit_resample(X_train_encoded, y_train_raw)

# Fit the pipeline to the (resampled) training data
pipeline = Pipeline(steps=[
    ('classifier', RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42, verbose=1))
])

pipeline.fit(X_train, y_train)

# Use the pipeline to make predictions on the untouched hold-out set
y_pred = pipeline.predict(X_test)

# Evaluate the model. Plain accuracy is dominated by the majority classes on imbalanced
# data, so balanced accuracy and a per-class report are shown as well.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
))