In [None]:

import sys
from pathlib import Path

# Make the project's local "src" directory importable: the project imports
# that follow in this cell are resolved through this sys.path entry.
root = Path.cwd().resolve()
src_path = str(root / "src")

sys.path.append(src_path)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dashboard.update_model import load_model
from dashboard.models import Model
from analytics.ml import ML

from utils.utils import get_data

In [None]:

# Load the alerts object through the project helper imported from utils.utils.
alerts = get_data()
# NOTE(review): the return value is not captured, so this presumably filters
# `alerts` in place. The meaning of the two arguments (120, True) is not visible
# from this notebook — confirm against the definition of filter_by_group_time.
alerts.filter_by_group_time(120, True)

In [None]:

# Sanity check: number of ACCIDENT alerts recorded for group 144.
in_group_144 = alerts.data["group"] == 144
is_accident = alerts.data["type"] == "ACCIDENT"
len(alerts.data[in_group_144 & is_accident])

In [None]:
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:

# Build the modelling table from the alert records via the project's ML helper.
# NOTE(review): judging by its name, the helper adds simulated negative examples
# to the observed alerts; the training cell later expects a "happen" target
# column in the table derived from this result — confirm against ML's source.
full_data = ML.generate_neg_simulated_data(alerts.data)
# Preview the first rows of the generated table.
full_data.head()

In [None]:

def _one_hot_expand(frame, column):
    """Replace ``column`` in ``frame`` with one-hot indicator columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``column``. It is not modified.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    expanded : pandas.DataFrame
        New frame with a fresh 0..n-1 index, ``column`` removed and the
        indicator columns appended on the right.
    encoder : OneHotEncoder
        The encoder fitted on ``column``; keep it to encode unseen rows the
        same way later on.
    feature_names : list
        Names of the appended indicator columns, in column order.
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoded = encoder.fit_transform(frame[[column]])
    feature_names = list(encoder.get_feature_names_out())
    # Both pieces carry a fresh RangeIndex so the column-wise concat lines rows
    # up by position rather than by whatever index `frame` happened to have.
    remaining = frame.drop(columns=[column]).reset_index(drop=True)
    indicators = pd.DataFrame(encoded.toarray(), columns=feature_names)
    return pd.concat([remaining, indicators], axis=1), encoder, feature_names


# Encode the two categorical columns one after the other, keeping each fitted
# encoder under its own name instead of overwriting a single variable.
data, type_encoder, type_features = _one_hot_expand(full_data, "type")
data, group_encoder, group_features = _one_hot_expand(data, "group")

# `ohe` previously ended this cell bound to the group encoder; keep that alias.
ohe = group_encoder

# Feature list used for training and prediction: the four calendar columns
# followed by the type indicators and then the group indicators.
x_vars = ["hour", "day_type", "week_day", "day", *type_features, *group_features]

In [None]:

# Numeric codes assigned to the two `day_type` labels.
m = {"s": 1, "f": 0}


def _encode_day_type(value):
    """Return the numeric code for a single ``day_type`` value.

    Labels found in ``m`` are translated to their code. Values that already
    are one of the codes are returned unchanged, so re-running this cell is a
    no-op instead of failing. Any other value raises ``KeyError``, exactly as
    the plain ``m[value]`` lookup would.
    """
    if value in m:
        return m[value]
    if value in m.values():
        # Already encoded by a previous run of this cell.
        return value
    raise KeyError(value)


data["day_type"] = data["day_type"].map(_encode_day_type)

In [None]:

import warnings

# Silence UserWarning messages from here on (the filter is process-wide).
warnings.filterwarnings("ignore", category=UserWarning)

RANDOM_STATE = 42

# --- Build X, y
assert "happen" in data.columns, "Target column 'happen' not found."
y = data["happen"].astype(int).values
X = data[x_vars]

# Keep only numeric/bool feature columns so the classifier receives a purely
# numeric frame.
numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
kept_cols = set(numeric_cols)
dropped_cols = [col for col in x_vars if col not in kept_cols]
if dropped_cols:
    # Other cells select features with `x_vars`; if anything is dropped here the
    # model is trained on fewer columns than those cells will pass to it.
    # RuntimeWarning is used so the UserWarning filter above does not hide it.
    warnings.warn(
        f"Non-numeric feature columns excluded from training: {dropped_cols}",
        RuntimeWarning,
    )
X = X[numeric_cols]

# --- Split (stratified on the target so both parts keep the class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

# --- Model
xgb = XGBClassifier(
    random_state=RANDOM_STATE,
    n_estimators=80,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="auc",
    n_jobs=-1,
    colsample_bytree=0.8,
    gamma=1.0,
    learning_rate=0.1,
    max_depth=10,
    min_child_weight=1,
    subsample=0.8,
)

xgb.fit(X_train, y_train)

In [None]:

# Spot check: predicted class probabilities for every ACCIDENT row of group 145.
accident_rows = data["type_ACCIDENT"] == 1
group_145_rows = data["group_145"] == 1
xgb.predict_proba(data.loc[accident_rows & group_145_rows, x_vars])

In [None]:

from sklearn.model_selection import cross_val_score

# 20-fold cross-validation scores for the XGBoost configuration.
# NOTE(review): this runs on the held-out split (X_test, y_test) rather than on
# the training split — confirm that is intended.
cv_scores = cross_val_score(estimator=xgb, X=X_test, y=y_test, cv=20)
cv_scores

