In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, classification_report, confusion_matrix
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

# Haversine formula to calculate distance between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return r * c


In [3]:
# Define police districts with coordinates
police_districts = {
    "District 1 (Central)": (41.8345, -87.6216),
    "District 2 (Wentworth)": (41.8027, -87.6185),
    "District 3 (Grand Crossing)": (41.752, -87.6001),
    "District 4 (South Chicago)": (41.7531, -87.5573),
    "District 5 (Calumet)": (41.7365, -87.607),
    "District 6 (Gresham)": (41.7445, -87.6616),
    "District 7 (Englewood)": (41.7843, -87.6745),
    "District 8 (Chicago Lawn)": (41.7794, -87.6864),
    "District 9 (Deering)": (41.827, -87.667),
    "District 10 (Ogden)": (41.8782, -87.7119),
    "District 11 (Harrison)": (41.8589, -87.7107),
    "District 12 (Near West Side)": (41.8844, -87.6456),
    "District 13 (Jefferson Park)": (41.8914, -87.7377),
    "District 14 (Shakespeare)": (41.8986, -87.6743),
    "District 15 (Austin)": (41.8763, -87.7724),
    "District 16 (Albion Park)": (41.9762, -87.7243),
    "District 17 (Woodlawn)": (41.7874, -87.592),
    "District 18 (Pullman)": (41.7317, -87.6079),
    "District 19 (Southwest)": (41.794, -87.74),
    "District 20 (North Lawndale)": (41.8655, -87.7111),
    "District 21 (Near North Side)": (41.9264, -87.6482),
    "District 22 (Lincoln Park)": (41.9252, -87.6549),
}

In [4]:
# Load dataset
file_path = '../raw_data/preprocessed_chicago.csv'  # Update with your file path
data = pd.read_csv(file_path)


In [5]:
# Ensure 'DATE OF OCCURRENCE' is in datetime format
data['DATE'] = pd.to_datetime(data['DATE OF OCCURRENCE'])
data['HOUR'] = data['DATE'].dt.floor('h')

# Aggregate data to hourly counts
hourly_counts = data.groupby('HOUR').size().reset_index(name='CRIME_COUNT')
hourly_dominant_crime = data.groupby('HOUR')['OFFENSES'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)

hourly_data = pd.merge(hourly_counts, hourly_dominant_crime, on='HOUR', how='left')

# Create cyclical features
hourly_data['TIME_SIN'] = np.sin(2 * np.pi * hourly_data['HOUR'].dt.hour / 24)
hourly_data['TIME_COS'] = np.cos(2 * np.pi * hourly_data['HOUR'].dt.hour / 24)
hourly_data['MONTH_SIN'] = np.sin(2 * np.pi * hourly_data['HOUR'].dt.month / 12)
hourly_data['MONTH_COS'] = np.cos(2 * np.pi * hourly_data['HOUR'].dt.month / 12)
hourly_data['ROLLING_7DAY'] = hourly_data['CRIME_COUNT'].rolling(window=7).mean()
hourly_data['DAY_OF_WEEK'] = hourly_data['HOUR'].dt.dayofweek
hourly_data['CRIME_COUNT_LAG1'] = hourly_data['CRIME_COUNT'].shift(1)
hourly_data['CRIME_COUNT_LAG24'] = hourly_data['CRIME_COUNT'].shift(24)

# Drop NaN values
hourly_data = hourly_data.dropna()

# Add Latitude and Longitude-based features
latitude = data['LATITUDE']
longitude = data['LONGITUDE']

In [26]:
def calculate_nearest_distance(lat, lon):
    distances = [haversine(lat, lon, station[0], station[1]) for station in police_districts.values()]
    return min(distances)

data['DISTANCE_TO_POLICE'] = [calculate_nearest_distance(lat, lon) for lat, lon in zip(latitude, longitude)]

# Merge spatial features with hourly data
hourly_data['DISTANCE_TO_POLICE'] = data.groupby('HOUR')['DISTANCE_TO_POLICE'].transform('mean')

# Add WARD feature (Label Encoding)
label_encoder_ward = LabelEncoder() #-- label_encoder_ward.pkl
data['WARD_ENCODED'] = label_encoder_ward.fit_transform(data['WARD'])

hourly_data['WARD'] = data.groupby('HOUR')['WARD_ENCODED'].transform(lambda x: x.mode()[0] if not x.mode().empty else None)

# Feature matrix and target variable
X = hourly_data[['TIME_SIN', 'TIME_COS', 'CRIME_COUNT_LAG1', 'CRIME_COUNT_LAG24', 'ROLLING_7DAY', 'DISTANCE_TO_POLICE', 'WARD', 'DAY_OF_WEEK', 'MONTH_SIN', 'MONTH_COS']]

y_reg = hourly_data['CRIME_COUNT']
y_class = hourly_data['OFFENSES']

# Encode categorical target for classification
label_encoder = LabelEncoder()
y_class_encoded = label_encoder.fit_transform(y_class)

with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)


In [7]:
# Train-test split
X_train, X_test, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.3, random_state=42, shuffle=True)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class_encoded, test_size=0.3, random_state=42, shuffle=True, stratify=y_class_encoded)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_res, y_class_res = smote.fit_resample(X_train_class, y_train_class)


In [9]:
import pickle

In [16]:
# Initialize models
regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)
classifier = RandomForestClassifier(n_estimators=200, random_state=42)

# RandomizedSearchCV for Random Forest
param_dist_rf = {
    'n_estimators': [200],
    'max_depth': [20],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'bootstrap': [False]
}
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=2,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
random_search_rf.fit(X_res, y_class_res)
best_rf_classifier = random_search_rf.best_estimator_

# Save the trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(random_search_rf, f)

# # Train models
# regressor.fit(X_train, y_train_reg)
# best_rf_classifier.fit(X_res, y_class_res)



In [None]:
# Predictions
y_test_pred_xgb = regressor.predict(X_test)
y_test_pred_rf = best_rf_classifier.predict(X_test_class)

# Regression evaluation
mse_test_xgb = mean_squared_error(y_test_reg, y_test_pred_xgb)
mae_test_xgb = mean_absolute_error(y_test_reg, y_test_pred_xgb)
r2_test_xgb = regressor.score(X_test, y_test_reg)

print(f"XGBoost Regression MSE: {mse_test_xgb}")
print(f"XGBoost Regression MAE: {mae_test_xgb}")
print(f"XGBoost Regression R²: {r2_test_xgb}")

In [None]:
# Classification evaluation
print("Classification Report:\n", classification_report(y_test_class, y_test_pred_rf))

# Evaluate using Accuracy for classification
accuracy_test_rf = accuracy_score(y_test_class, y_test_pred_rf)
print(f"Random Forest Classification Accuracy on Test Set: {accuracy_test_rf}")

# Compute confusion matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test_class, y_test_pred_rf)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Feature importance for Random Forest
feature_importances = best_rf_classifier.feature_importances_
plt.barh(X.columns, feature_importances)
plt.xlabel("Feature Importance")
plt.title("Feature Importance from Random Forest")
plt.show()

# Residuals of XGBoost Regression
plt.scatter(y_test_reg, y_test_pred_xgb - y_test_reg)
plt.hlines(0, xmin=y_test_reg.min(), xmax=y_test_reg.max(), colors='r', linestyles='--')
plt.xlabel('True Values')
plt.ylabel('Residuals')
plt.title('Residuals of XGBoost Regression')
plt.show()


In [41]:
def preprocess_input(input_data):

    # Ensure 'DATE OF OCCURRENCE' is in datetime format
    input_data['DATE'] = pd.to_datetime(input_data['DATE OF OCCURRENCE'])
    input_data['HOUR'] = input_data['DATE'].dt.floor('h')

    # Aggregate data to hourly counts
    input_data = input_data.groupby('HOUR').size().reset_index(name='CRIME_COUNT')

    #hourly_dominant_crime = data.groupby('HOUR')['OFFENSES'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)

    #hourly_data = pd.merge(hourly_counts, hourly_dominant_crime, on='HOUR', how='left')


    # Convert 'DATE OF OCCURRENCE' to datetime and extract hour
    input_data['DATE'] = pd.to_datetime(input_data['DATE OF OCCURRENCE'])
    input_data['HOUR'] = input_data['DATE'].dt.floor('h')

    # Generate cyclical features (sine and cosine of time and month)
    input_data['TIME_SIN'] = np.sin(2 * np.pi * input_data['HOUR'].dt.hour / 24)
    input_data['TIME_COS'] = np.cos(2 * np.pi * input_data['HOUR'].dt.hour / 24)
    input_data['MONTH_SIN'] = np.sin(2 * np.pi * input_data['HOUR'].dt.month / 12)
    input_data['MONTH_COS'] = np.cos(2 * np.pi * input_data['HOUR'].dt.month / 12)

    # Calculate distance to nearest police district
    input_data['DISTANCE_TO_POLICE'] = [calculate_nearest_distance(lat, lon) for lat, lon in zip(input_data['LATITUDE'], input_data['LONGITUDE'])]

    # Generate lag features and rolling mean for crime counts (assuming data with past crime counts is available)
    input_data['CRIME_COUNT_LAG1'] = input_data['CRIME_COUNT'].shift(1)
    input_data['CRIME_COUNT_LAG24'] = input_data['CRIME_COUNT'].shift(24)
    input_data['ROLLING_7DAY'] = input_data['CRIME_COUNT'].rolling(window=7).mean()


    # Encode 'WARD' (you should have a predefined mapping for 'WARD' if necessary)
    label_encoder_ward = LabelEncoder()
    input_data['WARD_ENCODED'] = label_encoder.fit_transform(input_data['WARD'])

    # Add 'DAY_OF_WEEK' feature
    input_data['DAY_OF_WEEK'] = input_data['HOUR'].dt.dayofweek

    # Drop NaN values due to lag features and rolling mean
    input_data = input_data.dropna()

    return input_data

In [42]:
#Prediction function
def predict_top_5_crimes(input_data):
    # Preprocess the input data
    preprocessed_data = preprocess_input(input_data)

    # Extract features for prediction (same as used during training)
    X_input = preprocessed_data[['TIME_SIN', 'TIME_COS', 'CRIME_COUNT_LAG1', 'CRIME_COUNT_LAG24', 'ROLLING_7DAY',
                                 'DISTANCE_TO_POLICE', 'WARD_ENCODED', 'DAY_OF_WEEK', 'MONTH_SIN', 'MONTH_COS']]

    # Get class probabilities from the Random Forest classifier
    probas = classifier.predict_proba(X_input)

    # Get top 5 classes with highest probabilities
    top_5_idx = np.argsort(probas[0])[-5:][::-1]  # Sort in descending order
    top_5_classes = label_encoder.classes_[top_5_idx]
    top_5_probabilities = probas[0][top_5_idx]

    # Create a dictionary for the top 5 predicted crimes and their probabilities
    result = {
        'Top 5 Crimes': {top_5_classes[i]: top_5_probabilities[i] for i in range(5)}
    }

    return result

# Sample input data
sample_input_data = {
    'WARD': [27],
    'DATE OF OCCURRENCE': ['1/16/2025 1:00'],
    'LATITUDE': [41.79329893],
    'LONGITUDE': [-87.66456619]
}

# Convert to DataFrame
input_df = pd.DataFrame(sample_input_data)

# Call the prediction function
top_5_crimes = predict_top_5_crimes(input_df)

# Output the result
print(top_5_crimes)

KeyError: 'DATE OF OCCURRENCE'

In [43]:
print(input_df.columns)


Index(['WARD', 'DATE OF OCCURRENCE', 'LATITUDE', 'LONGITUDE', 'DATE', 'HOUR'], dtype='object')


In [44]:
sample_input_data

{'WARD': [27],
 'DATE OF OCCURRENCE': ['1/16/2025 1:00'],
 'LATITUDE': [41.79329893],
 'LONGITUDE': [-87.66456619]}

In [17]:
import pickle

# Load the LabelEncoder
with open("label_encoder_ward.pkl", "rb") as file:
    label_encoder_ward = pickle.load(file)

# Load the Random Forest model
with open("model.pkl", "rb") as file:
    classifier = pickle.load(file)

print("LabelEncoder and Random Forest model successfully loaded!")


LabelEncoder and Random Forest model successfully loaded!


In [24]:
import pandas as pd
import numpy as np

# Sample input data
sample_input_data = {
    'WARD': [4],
    'DATE OF OCCURRENCE': ['1/16/2025 1:00'],
    'LATITUDE': [41.79329893],
    'LONGITUDE': [-87.66456619]
}

# Convert input data to DataFrame
input_df = pd.DataFrame(sample_input_data)

# Preprocess the input data (without encoding 'WARD')
def preprocess_input(input_data):
    # Ensure 'DATE OF OCCURRENCE' is in datetime format
    input_data['DATE'] = pd.to_datetime(input_data['DATE OF OCCURRENCE'])
    input_data['HOUR'] = input_data['DATE'].dt.floor('h')

    # Generate cyclical features (sine and cosine of time and month)
    input_data['TIME_SIN'] = np.sin(2 * np.pi * input_data['HOUR'].dt.hour / 24)
    input_data['TIME_COS'] = np.cos(2 * np.pi * input_data['HOUR'].dt.hour / 24)
    input_data['MONTH_SIN'] = np.sin(2 * np.pi * input_data['HOUR'].dt.month / 12)
    input_data['MONTH_COS'] = np.cos(2 * np.pi * input_data['HOUR'].dt.month / 12)

    # Don't encode 'WARD', just keep it as it is
    # input_data['WARD_ENCODED'] = label_encoder_ward.transform(input_data['WARD'])

    # Add 'DAY_OF_WEEK' feature
    input_data['DAY_OF_WEEK'] = input_data['HOUR'].dt.dayofweek

    # Add placeholder features (you can replace them with actual calculations)
    input_data['CRIME_COUNT_LAG1'] = 0
    input_data['CRIME_COUNT_LAG24'] = 0
    input_data['ROLLING_7DAY'] = 0
    input_data['DISTANCE_TO_POLICE'] = 0

    return input_data

# Preprocess the data
preprocessed_input = preprocess_input(input_df)

# Extract the features for prediction (including 'WARD')
X_input = preprocessed_input[['TIME_SIN', 'TIME_COS', 'CRIME_COUNT_LAG1', 'CRIME_COUNT_LAG24', 'ROLLING_7DAY',
                              'DISTANCE_TO_POLICE', 'WARD', 'DAY_OF_WEEK', 'MONTH_SIN', 'MONTH_COS']]

# Generate predictions
probas = classifier.predict_proba(X_input)

# Get the top 5 predictions
top_5_idx = np.argsort(probas[0])[-5:][::-1]  # Sort probabilities in descending order
top_5_classes = classifier.classes_[top_5_idx]
top_5_probabilities = probas[0][top_5_idx]

# Display results
result = {
    'Top 5 Crimes': {top_5_classes[i]: top_5_probabilities[i] for i in range(5)}
}
print("Top 5 Predicted Crimes:")
print(result)

Top 5 Predicted Crimes:
{'Top 5 Crimes': {1: 0.25625, 2: 0.21958333333333332, 0: 0.18125, 5: 0.10124999999999998, 4: 0.09583333333333334}}


In [46]:
print(classifier)

RandomForestClassifier(n_estimators=200, random_state=42)
