Importing libraries. The model uses RandomForestRegressor together with KMeans clustering.

In [20]:
import os
os.environ["OMP_NUM_THREADS"] = "4"  
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from joblib import dump, load
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error

Data preprocessing

In [21]:
# Columns used from the input data: latitude, longitude, crime-severity, male-female-ratio, timestamp

def preprocess_data(df):
    """Derive temporal features from the ``timestamp`` column.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The input columns without ``timestamp`` (and without ``datetime``,
        should one exist), plus ``hour``, ``day_of_week``, ``month`` and the
        cyclical encodings ``hour_sin``, ``hour_cos``, ``day_sin``, ``day_cos``.

    Raises
    ------
    KeyError
        If ``df`` has no ``timestamp`` column.
    """
    timestamps = pd.to_datetime(df['timestamp'])

    # Raw temporal features
    hour = timestamps.dt.hour
    day_of_week = timestamps.dt.dayofweek
    month = timestamps.dt.month

    # Cyclical encoding: map hour (period 24) and weekday (period 7) onto a
    # circle so the first and last values of each cycle end up adjacent.
    hour_angle = 2 * np.pi * hour / 24
    day_angle = 2 * np.pi * day_of_week / 7

    # drop() and assign() both return new frames, so the caller's `df` is
    # left untouched and re-running a notebook cell stays idempotent.
    return df.drop(columns=['timestamp', 'datetime'], errors='ignore').assign(
        hour=hour,
        day_of_week=day_of_week,
        month=month,
        hour_sin=np.sin(hour_angle),
        hour_cos=np.cos(hour_angle),
        day_sin=np.sin(day_angle),
        day_cos=np.cos(day_angle),
    )


Creating spatial features using KMeans

In [22]:
def create_spatial_features(df, kmeans_model):
    """Attach cluster-based spatial features to a copy of ``df``.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.
    kmeans_model : object
        Fitted clustering model exposing ``transform(coords)`` (per-row
        distances to every cluster centre) and ``predict(coords)`` (label of
        the assigned cluster), e.g. a fitted ``sklearn.cluster.KMeans``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus two columns: ``min_cluster_dist`` (row-wise minimum of
        the ``transform`` output) and ``cluster`` (the ``predict`` output).
    """
    coords = df[['latitude', 'longitude']].values
    centroid_distances = kmeans_model.transform(coords)

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        min_cluster_dist=centroid_distances.min(axis=1),
        cluster=kmeans_model.predict(coords),
    )

Train pipeline with train-test split

In [66]:
#{'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}
def train_model(data_path, n_clusters=10,
                model_path='crime_prediction_model.joblib'):
    """Train the crime-severity regressor and save it with its artifacts.

    Steps: read the CSV, derive temporal features, fit KMeans on the
    coordinates, attach the per-cluster mean severity, build the
    severity-adjusted ratio, drop low-severity/high-ratio rows, fit a
    StandardScaler + RandomForestRegressor pipeline on an 80/20 split and
    print MAE / MSE measured on the held-out 20%.

    Parameters
    ----------
    data_path : str or path-like
        CSV file passed to ``pd.read_csv``. The code reads the columns
        ``latitude``, ``longitude``, ``crime-severity``,
        ``male-female-ratio`` and ``timestamp``.
    n_clusters : int, default 10
        Number of KMeans clusters fitted on (latitude, longitude).
    model_path : str or path-like, default 'crime_prediction_model.joblib'
        Where the artifact dictionary (pipeline, KMeans model, per-cluster
        severity, spatial threshold, feature list) is written with
        ``joblib.dump``.

    Returns
    -------
    tuple
        ``(model, kmeans)``: the fitted pipeline and the fitted KMeans model.
    """
    # Load and preprocess data
    df = pd.read_csv(data_path)
    df = preprocess_data(df)

    # Spatial clustering on the raw coordinates
    coords = df[['latitude', 'longitude']].values
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(coords)

    # Adds 'min_cluster_dist' and 'cluster'
    df = create_spatial_features(df, kmeans)

    # Mean crime severity of every cluster, used as a feature.
    # NOTE(review): this mean is taken over ALL rows before the train/test
    # split below, so the target of the held-out rows leaks into the feature
    # and the printed MAE / MSE are likely optimistic.
    cluster_severity = df.groupby('cluster')['crime-severity'].mean().to_dict()
    df['cluster_severity'] = df['cluster'].map(cluster_severity)

    # Severity-adjusted ratio: in clusters whose mean severity is <= 3 the
    # male-female ratio is given no weight and the cluster severity is used
    # as-is; otherwise the ratio scales the cluster severity.
    df['severity_adjusted_ratio'] = np.where(
        df['cluster_severity'] <= 3,
        df['cluster_severity'],
        df['male-female-ratio'] * df['cluster_severity']
    )

    # 95th percentile of the distance to the nearest cluster centre; stored
    # in the artifacts so that prediction code can recognise locations that
    # are far from the training data.
    spatial_threshold = df['min_cluster_dist'].quantile(0.95)

    # Features and target
    features = ['latitude', 'longitude', 'severity_adjusted_ratio',
                'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
                'month', 'cluster_severity']

    # Remove low-severity samples that have a high male-female ratio
    high_ratio_mask = (df['male-female-ratio'] > 1.2) & (df['crime-severity'] <= 3)
    df_filtered = df[~high_ratio_mask]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df_filtered[features],
        df_filtered['crime-severity'],
        test_size=0.2,
        random_state=42
    )

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(
            n_estimators=300,
            random_state=42,
            max_depth=10,
            min_samples_split=10,
            min_samples_leaf=1
        ))
    ])
    model.fit(X_train, y_train)

    # Hold-out errors
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")

    # Persist everything the prediction step needs in a single file
    artifacts = {
        'model': model,
        'kmeans': kmeans,
        'cluster_severity': cluster_severity,
        'spatial_threshold': spatial_threshold,
        'features': features
    }
    dump(artifacts, model_path)

    return model, kmeans

In [83]:
# Main: train on the full dataset using 250 spatial clusters
train_model('women_crime_data_2.csv', n_clusters=250)

Mean Absolute Error: 1.18
Mean Squared Error: 2.38


(Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  RandomForestRegressor(max_depth=10, min_samples_split=10,
                                        n_estimators=300, random_state=42))]),
 KMeans(n_clusters=250, random_state=42))

In [69]:
def predict_severity(input_data, model_path='crime_prediction_model.joblib',
                     verbose=True):
    """Predict the crime severity for a single observation.

    Parameters
    ----------
    input_data : dict
        One record with the keys ``latitude``, ``longitude``, ``timestamp``
        and ``male-female-ratio``.
    model_path : str or path-like, default 'crime_prediction_model.joblib'
        Joblib file holding the artifact dictionary with the keys ``model``,
        ``kmeans``, ``cluster_severity``, ``spatial_threshold`` and
        ``features``.
    verbose : bool, default True
        When True, print the intermediate values (distance to the nearest
        cluster centre, cluster label, cluster severity) for inspection.
        Pass False to silence them.

    Returns
    -------
    float
        The predicted severity, or ``0.0`` when the location is farther from
        its nearest cluster centre than the stored spatial threshold.
    """
    # joblib.load unpickles the file, which can execute arbitrary code:
    # only load artifact files that come from a trusted source.
    artifacts = load(model_path)
    model = artifacts['model']
    kmeans = artifacts['kmeans']
    cluster_severity = artifacts['cluster_severity']
    spatial_threshold = artifacts['spatial_threshold']
    features = artifacts['features']

    # Build a one-row frame and apply the same feature steps as in training
    df = pd.DataFrame([input_data])
    df = preprocess_data(df)
    df = create_spatial_features(df, kmeans)

    if verbose:
        print(df['min_cluster_dist'].values)
        print(df['cluster'].values)

    # Location is far away from the training data: report zero severity
    if df['min_cluster_dist'].iloc[0] > spatial_threshold:
        return 0.0

    # Add cluster-based features
    df['cluster_severity'] = df['cluster'].map(cluster_severity)
    if verbose:
        print(df['cluster_severity'].values)

    # In clusters whose mean severity is <= 3 the male-female ratio is given
    # no weight; otherwise it scales the cluster severity.
    df['severity_adjusted_ratio'] = np.where(
        df['cluster_severity'] <= 3,
        df['cluster_severity'],
        df['male-female-ratio'] * df['cluster_severity']
    )

    # Cast so both return paths yield a plain Python float
    return float(model.predict(df[features])[0])

In [92]:
# Example 1: a location picked as a low-risk area that is not present in the
# training set.
input_data = {
    'latitude': 22.95168,
    'longitude': 88.38841,
    'timestamp': '2025-02-10 08:30:00',
    'male-female-ratio': 2.5,
}

prediction = predict_severity(input_data)
print(f"Predicted crime severity: {prediction:.2f}")

[0.00271141]
[55]
[4.]
Predicted crime severity: 4.93


In [87]:
# Example 2: a location picked as a high-risk area that partially matches the
# training set.
input_data = {
    'latitude': 22.94821,
    'longitude': 88.38052,
    'timestamp': '2025-02-04 21:30:00',
    'male-female-ratio': 1.5,
}

prediction = predict_severity(input_data)
print(f"Predicted crime severity: {prediction:.2f}")

[0.00382189]
[18]
[5.]
Predicted crime severity: 4.22


Grid Search for Hyperparameter Tuning

In [49]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search for the RandomForest pipeline.
# The feature construction below repeats the steps of train_model, with two
# differences visible in this notebook: KMeans uses 100 clusters here (the
# training call uses 250) and the low-severity/high-ratio rows are not removed.
df = pd.read_csv('women_crime_data_2.csv')
df = preprocess_data(df)

# Spatial clustering
coords = df[['latitude', 'longitude']].values
kmeans = KMeans(n_clusters=100, random_state=42)
kmeans.fit(coords)

# Add 'min_cluster_dist' and 'cluster' columns
df = create_spatial_features(df, kmeans)

# Mean crime severity of each cluster, mapped back onto every row.
# NOTE(review): computed on all rows before the split / CV folds, so the
# target leaks into this feature and the scores below are likely optimistic.
cluster_severity = df.groupby('cluster')['crime-severity'].mean().to_dict()
df['cluster_severity'] = df['cluster'].map(cluster_severity)

# Create severity-adjusted ratio.
# If the area (cluster) has low mean crime severity (<= 3) the male-female
# ratio is not given weight; otherwise it multiplies the cluster severity.
df['severity_adjusted_ratio'] = np.where(
    df['cluster_severity'] <= 3,
    df['cluster_severity'],
    df['male-female-ratio'] * df['cluster_severity']
)

# Features and target
features = ['latitude', 'longitude', 'severity_adjusted_ratio',
                'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
                'month', 'cluster_severity']

X = df[features]
y = df['crime-severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Same pipeline structure as in train_model (scaler + random forest), with
# the regressor left at its defaults so the grid can set the hyperparameters.
rf_model = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(random_state=42))
    ])


# Hyperparameter values to try; the 'regressor__' prefix routes each
# parameter to the 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf_model,
                           param_grid=param_grid,
                           n_jobs=6,                        # 6 parallel jobs, chosen for an 8-core machine
                           cv=5,                      
                           scoring='neg_mean_squared_error', 
                           verbose=1)                

# Fit grid search to training data
grid_search.fit(X_train, y_train)

# Retrieve best parameters and score; the scorer is the negated MSE, so the
# sign is flipped to report a plain MSE.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_ 

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Score (MSE):", best_score)

# Evaluate on test set using the best estimator found by grid search
y_pred = grid_search.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error:", test_mse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}
Best Cross-Validation Score (MSE): 3.8575543527755665
Test Mean Squared Error: 3.91132992388097
