In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("train.csv")
print("Dataset Shape:", df.shape)

Dataset Shape: (20000, 22)


In [3]:
df.fillna({
    "Cross_Street": "Unknown",
    "Weapon_Used_Code": df["Weapon_Used_Code"].median(),
    "Weapon_Description": "Unknown",
    "Victim_Age": df["Victim_Age"].median(),
    "Victim_Sex": "Unknown",
    "Victim_Descent": "Unknown",
}, inplace=True)

In [4]:
df["Date_Occurred"] = pd.to_datetime(df["Date_Occurred"], errors='coerce')
df["Hour"] = df["Time_Occurred"] // 100
df.drop(columns=["Date_Reported", "Date_Occurred", "Time_Occurred"], inplace=True)

In [5]:
label_encoders = {}
categorical_cols = ["Crime_Category"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [6]:
df = df[["Longitude", "Latitude", "Hour", "Crime_Category"]]

In [7]:
X, y = df.drop(columns=['Crime_Category']), df['Crime_Category']
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

In [9]:
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(rf, param_grid, scoring='accuracy', cv=3, verbose=1)
grid_search_rf.fit(X_scaled, y_resampled)
print("Best Random Forest parameters:", grid_search_rf.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Random Forest parameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}


In [10]:
best_rf = grid_search_rf.best_estimator_
best_rf.fit(X_scaled, y_resampled)
y_pred_rf = best_rf.predict(X_scaled)
print("Random Forest Accuracy:", accuracy_score(y_resampled, y_pred_rf))
print(classification_report(y_resampled, y_pred_rf))

Random Forest Accuracy: 0.9946996971255501
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11666
           1       0.99      1.00      1.00     11666
           2       1.00      1.00      1.00     11666
           3       1.00      1.00      1.00     11666
           4       0.99      0.99      0.99     11666
           5       0.99      0.99      0.99     11666

    accuracy                           0.99     69996
   macro avg       0.99      0.99      0.99     69996
weighted avg       0.99      0.99      0.99     69996



In [11]:
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(set(y_resampled)), activation='softmax')
])
mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = mlp_model.fit(X_scaled, y_resampled, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.2463 - loss: 1.7184 - val_accuracy: 0.0000e+00 - val_loss: 2.4546
Epoch 2/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2636 - loss: 1.6985 - val_accuracy: 0.0000e+00 - val_loss: 2.4170
Epoch 3/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2638 - loss: 1.6933 - val_accuracy: 0.0000e+00 - val_loss: 2.4280
Epoch 4/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2710 - loss: 1.6866 - val_accuracy: 0.0000e+00 - val_loss: 2.4794
Epoch 5/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2747 - loss: 1.6811 - val_accuracy: 0.0000e+00 - val_loss: 2.4349
Epoch 6/50
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2794 - loss: 1.6771 - val_accuracy: 0.0000e+00 - val_loss:

In [12]:
y_pred_mlp = np.argmax(mlp_model.predict(X_scaled), axis=1)
print("MLP Accuracy:", accuracy_score(y_resampled, y_pred_mlp))
print(classification_report(y_resampled, y_pred_mlp))

[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 625us/step
MLP Accuracy: 0.27110120578318764
              precision    recall  f1-score   support

           0       0.31      0.51      0.38     11666
           1       0.28      0.10      0.15     11666
           2       0.28      0.59      0.38     11666
           3       0.00      0.00      0.00     11666
           4       0.22      0.43      0.29     11666
           5       0.00      0.00      0.00     11666

    accuracy                           0.27     69996
   macro avg       0.18      0.27      0.20     69996
weighted avg       0.18      0.27      0.20     69996



In [14]:
joblib.dump(best_rf, "random_forest_model.joblib")
mlp_model.save("mlp_modelrf.keras")
joblib.dump(scaler, "scalerrf.joblib")
joblib.dump(label_encoders, "label_encodersrf.joblib")
print("Models, scaler, and label encoders saved successfully!")

Models, scaler, and label encoders saved successfully!
