In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn modules
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    roc_curve
)

# For model loading and saving
import joblib

# For rich display of DataFrames in notebook output
from IPython.display import display

# Visualization settings
sns.set(style='whitegrid')
%matplotlib inline

# Suppress warnings
# NOTE(review): filterwarnings('ignore') with no category silences every
# warning for the rest of the session, including ones raised by pandas or
# scikit-learn that may point at real problems. Consider narrowing this to
# the specific categories that are known to be noise.
import warnings
warnings.filterwarnings('ignore')

# Load model

In [2]:
# Load the scaler and the best model (paths are relative to the notebook's
# working directory).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts that come from a trusted source.
scaler_X = joblib.load('scaler_X.joblib')
best_rf = joblib.load('best_random_forest_model.joblib')  # Ensure this is the correct model file name

# Load the training data to retrieve the median crime rate in the original scale
preprocessed_train = pd.read_csv('../Data/preprocessed_train.csv')

# The target column is treated as log1p-transformed here, so np.expm1 (the
# inverse of np.log1p) maps it back to the original scale before the median
# is taken. This median is the Safe / High-Risk threshold used by predict_safety.
original_crime_rates = np.expm1(preprocessed_train['crime_rate_per_100000'])
crime_rate_median_original = original_crime_rates.median()
print(f"Median Crime Rate (Threshold) in Original Scale: {crime_rate_median_original:.4f}")

Median Crime Rate (Threshold) in Original Scale: 176.4591


# Prediction Function

In [3]:
def predict_safety(input_data):
    """
    Predict the crime rate and safety level for new input data.

    Relies on three notebook-level globals: ``scaler_X`` (object whose
    ``transform`` is applied to the features), ``best_rf`` (model whose
    predictions are treated as log1p-scale crime rates) and
    ``crime_rate_median_original`` (classification threshold in the
    original scale).

    Parameters:
    - input_data: DataFrame containing the same features used in training,
      in raw (untransformed) scale. Extra columns are ignored and the column
      order does not matter. The caller's frame is never modified.

    Returns:
    - DataFrame holding the eight feature columns exactly as they were fed to
      the scaler (the skewed features are log1p-transformed, NOT the raw
      inputs), plus 'Predicted_Crime_Rate' (original scale) and
      'Safety_Level' ('Safe' when the predicted rate is <= the threshold,
      otherwise 'High-Risk').

    Raises:
    - KeyError: if any required feature column is missing from input_data.
    """
    # Ensure the features are in the correct order
    required_columns = [
        'murder', 'rape', 'robbery', 'burglary',
        'unemployment_rate_2022', 'population_estimate_2022',
        'poverty_index', 'education_improvement'
    ]

    missing_columns = [col for col in required_columns if col not in input_data.columns]
    if missing_columns:
        raise KeyError(
            f"input_data is missing required feature column(s): {missing_columns}"
        )

    # Work on an explicit copy: assigning into the bare column selection is
    # chained assignment and triggers SettingWithCopyWarning whenever the
    # caller's frame has extra or reordered columns.
    features = input_data[required_columns].copy()

    # Apply log1p transformation to skewed features (as done in preprocessing)
    skewed_features = [
        'murder', 'rape', 'robbery', 'burglary',
        'unemployment_rate_2022', 'population_estimate_2022'
    ]
    features[skewed_features] = np.log1p(features[skewed_features])

    # Apply the scaler
    features_scaled = scaler_X.transform(features)

    # Predict crime rates in log-transformed space, then invert log1p to get
    # crime rates in the original scale
    predicted_crime_rates = np.expm1(best_rf.predict(features_scaled))

    # Classify safety levels using the median in original scale
    safety_levels = np.where(
        predicted_crime_rates <= crime_rate_median_original, 'Safe', 'High-Risk'
    )

    # Prepare the results DataFrame (assign returns a new frame)
    return features.assign(
        Predicted_Crime_Rate=predicted_crime_rates,
        Safety_Level=safety_levels,
    )

In [5]:
# Function to collect user input
def user_input_features():
    """
    Prompt on stdin for every model feature and return the answers.

    Each answer is parsed as soon as it is entered, so an unparsable value
    raises ValueError at that prompt.

    Returns:
    - One-row DataFrame with one column per feature, in prompt order.
    """
    print("Please enter the following details:")

    # (column name, prompt text, parser) in the order the questions are asked
    questions = (
        ('murder', "Number of murders: ", float),
        ('rape', "Number of rapes: ", float),
        ('robbery', "Number of robberies: ", float),
        ('burglary', "Number of burglaries: ", float),
        ('unemployment_rate_2022', "Unemployment rate in 2022 (%): ", float),
        ('population_estimate_2022', "Population estimate in 2022: ", int),
        ('poverty_index', "Poverty index (sum of poverty percentages): ", float),
        ('education_improvement',
         "Education improvement (difference in no HS degree percentages): ", float),
    )

    answers = {}
    for column, prompt_text, parse in questions:
        # Single-element list -> one row in the resulting DataFrame
        answers[column] = [parse(input(prompt_text))]

    return pd.DataFrame(answers)

# Collect user input (blocks on stdin until all eight prompts are answered;
# an unparsable answer raises ValueError)
new_input_data = user_input_features()

Please enter the following details:


In [25]:
# Predict safety level for the input data
prediction_results = predict_safety(new_input_data)

# Display the results
# NOTE: the feature columns in this table are the values that were fed to the
# scaler -- the skewed features appear log1p-transformed, not as typed in.
print("\nPrediction Results:")
display(prediction_results)


Prediction Results:


Unnamed: 0,murder,rape,robbery,burglary,unemployment_rate_2022,population_estimate_2022,poverty_index,education_improvement,Predicted_Crime_Rate,Safety_Level
0,4.787492,5.303305,7.483807,8.516393,1.410987,12.564425,88.8,2.83,736.44302,High-Risk


In [26]:
# Summarise the first prediction row in plain language
predicted_crime_rate = prediction_results['Predicted_Crime_Rate'].iloc[0]
safety_level = prediction_results['Safety_Level'].iloc[0]

print(
    f"\nBased on the input data, the predicted crime rate per 100,000 is {predicted_crime_rate:.2f}.",
    f"The area is classified as '{safety_level}'.",
    sep="\n",
)


Based on the input data, the predicted crime rate per 100,000 is 736.44.
The area is classified as 'High-Risk'.
