In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report, accuracy_score, f1_score
import itertools

In [2]:
# Load the semicolon-separated source extract into `df`.
# Alternative input that was used previously:
#   r"C:\Users\jens.nilsen\OneDrive - Bouvet Norge AS\Documents\GitHub\trfkaipoc\data_2022-2025.csv"
# NOTE(review): this is an absolute, user-specific path; the notebook will not run
# on another machine until it is replaced by a configurable/relative location.
file_path = r"C:\Users\jens.nilsen\OneDrive - Bouvet Norge AS\Documents\GitHub\trfkaipoc\data_2022-2025_norge.csv"
df = pd.read_csv(filepath_or_buffer=file_path, delimiter=";")

In [3]:
# Keep only rows where the decision column "EGS.VEDTAK.10670" has a value.
df = df.dropna(subset=["EGS.VEDTAK.10670"])

In [4]:
# Binary target: 1 when the decision is exactly "Avslag", 0 for every other value.
df['Avslag_ind'] = df['EGS.VEDTAK.10670'].eq("Avslag").astype("int64")

In [5]:
# Some extracts name the horizontal-curvature column "Kurvatur, horisontalelement".
# When that name is present, expose it under the name the rest of the notebook uses.
has_element_column = "Kurvatur, horisontalelement" in df.columns
if has_element_column:
    df["Kurvatur, horisontal"] = df["Kurvatur, horisontalelement"]

In [6]:
# Columns carried into the modelling frame. The target ('Avslag_ind') is listed
# first and travels with the predictors so that the row-wise NaN filtering applied
# to df_encoded afterwards keeps features and target aligned; it is separated
# from the predictors later when X and y are built.
target_col = 'Avslag_ind'
predictor_cols = [
    "ÅDT, total",
    "ÅDT, andel lange kjøretøy",
    "Fartsgrense",
    "Avkjørsel, holdningsklasse",
    "Funksjonsklasse",
    "Avkjørsler",
    "Trafikkulykker",
    "EGS.BRUKSOMRÅDE.1256",
    "Kurvatur, horisontal",   # "Kurvatur, horisontalelement" is not selected directly
    "Kurvatur, stigning",
]
features = [target_col, *predictor_cols]

# One-hot encode the selected columns; pandas' get_dummies expands the
# object/category columns and passes the remaining columns through.
df_encoded = pd.get_dummies(df.loc[:, features])

In [7]:
# Discard every row that still has a missing value in any column.
df_encoded = df_encoded.dropna(axis=0, how="any")

In [8]:
# Engineered features derived from curvature and traffic volume.
abs_horizontal = df_encoded['Kurvatur, horisontal'].abs()
abs_gradient = df_encoded['Kurvatur, stigning'].abs()

df_encoded = df_encoded.assign(
    # 0 when |horizontal curvature| exceeds 99000, otherwise 1.
    sving_ind=np.where(abs_horizontal > 99000, 0, 1),
    # |horizontal curvature| when it is below 99000, otherwise 0.
    # NOTE(review): a value of exactly 99000 yields sving_ind = 1 but sving = 0 -
    # confirm whether the two thresholds are meant to be complementary.
    sving=np.where(abs_horizontal < 99000, abs_horizontal, 0),
    # Absolute value of the gradient column.
    bakke=abs_gradient,
    # Logistic squashing of 0.001 * |horizontal curvature|.
    # NOTE(review): despite the "bakke" prefix this is computed from
    # 'Kurvatur, horisontal', not 'Kurvatur, stigning' - confirm this is intended.
    bakke_sigmoid=1 / (1 + np.exp(-0.001 * abs_horizontal)),
    # Total traffic multiplied by the long-vehicle share given in percent.
    antall_lange_kj=df_encoded['ÅDT, total'] * df_encoded['ÅDT, andel lange kjøretøy'] / 100,
).drop(columns=['Kurvatur, horisontal', 'Kurvatur, stigning'])

# Split into target vector and predictor matrix.
y = df_encoded['Avslag_ind']
X = df_encoded.drop(columns=['Avslag_ind'])

In [9]:
# Expand the predictors with interaction terms (products of up to three distinct
# columns, no bias column, no pure powers).
poly = PolynomialFeatures(3, include_bias=False, interaction_only=True)

# fit_transform returns a bare array, so the row index has to be passed back
# explicitly. Without it X would get a fresh 0..n-1 index while y keeps the
# index left over from the earlier row filtering, and any index-aligned
# operation between the two would silently pair up the wrong rows.
X = pd.DataFrame(
    poly.fit_transform(X),
    columns=poly.get_feature_names_out(X.columns),
    index=X.index,
)

In [10]:
# Flag each column whose non-missing values are all 0 or 1; those are treated as
# binary indicators and left untouched, everything else is treated as continuous.
binary_mask = [
    bool(np.isin(X[col].dropna().unique(), [0, 1]).all())
    for col in X.columns
]
binary_cols = [col for col, is_binary in zip(X.columns, binary_mask) if is_binary]
continuous_cols = [col for col, is_binary in zip(X.columns, binary_mask) if not is_binary]

# Power-transform the continuous columns only.
# NOTE(review): the transformer is fitted on every row of X, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# fitted transform.
scaler = PowerTransformer()
transformed_values = scaler.fit_transform(X[continuous_cols])
X_cont_scaled = pd.DataFrame(transformed_values, index=X.index, columns=continuous_cols)

# Reassemble the matrix: transformed continuous columns first, binary columns after.
X = pd.concat([X_cont_scaled, X[binary_cols]], axis=1)

In [11]:
# Rank all candidate features with a class-weighted random forest and keep the
# strongest ones.
# NOTE(review): the ranking is fitted on all rows of X, including rows that end
# up in the test split later in the notebook.
model = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
model.fit(X, y)

# Impurity-based importances, indexed by feature name and sorted descending.
importances = pd.Series(model.feature_importances_, index=X.columns)
importances_sorted = importances.sort_values(ascending=False)

# Number of top-ranked features carried forward.
N_TOP_FEATURES = 25
top_features = importances_sorted.index[:N_TOP_FEATURES]
X_top = X[top_features]

In [12]:
# From here on X refers to the reduced matrix that holds only the columns in X_top.
X=X_top

In [13]:
# Hold out 25 % of the rows for evaluation. The split is stratified on y so both
# partitions keep the same share of refusals; the notebook treats the target as
# imbalanced (balanced class weights, balanced forests, scale_pos_weight), and an
# unstratified split can leave the test set with too few positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=43, stratify=y
)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Search space for the balanced random forest.
# scipy's randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale], so
# sampling_strategy is sampled from [0.1, 0.6].
param_dist = dict(
    n_estimators=randint(300, 5000),
    max_depth=randint(5, 25),
    min_samples_split=randint(2, 10),
    sampling_strategy=uniform(0.1, 0.5),
)

# Note: this rebinds `model`, which previously held the feature-ranking forest.
model = BalancedRandomForestClassifier(replacement=True, n_jobs=-1, random_state=42)

# 100 random candidates, 4-fold cross-validation, ranked by average precision.
search_settings = dict(
    n_iter=100,
    cv=4,
    scoring="average_precision",
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    **search_settings,
)
random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)


Fitting 4 folds for each of 100 candidates, totalling 400 fits


In [None]:
# Hard class predictions and class probabilities from the refitted best candidate.
y_pred = random_search.predict(X_test)
proba_matrix = random_search.predict_proba(X_test)
# Keep the second probability column (index 1).
y_proba = proba_matrix[:, 1]

In [None]:
# Per-class precision / recall / F1 on the test split, printed with four decimals.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

In [None]:
# Pair each test label with its predicted probability.
df_plot = pd.DataFrame({'y_test': y_test, 'y_proba': y_proba})

# One filled density curve per actual class, drawn on the same axes.
plt.figure(figsize=(8, 5))
for actual_class, legend_label in ((0, 'Actual class 0'), (1, 'Actual class 1')):
    class_scores = df_plot.loc[df_plot['y_test'] == actual_class, 'y_proba']
    sns.kdeplot(data=class_scores, label=legend_label, fill=True)
plt.title("Distribution of predicted probabilities by actual class")
plt.xlabel("Predicted probability (positive class)")
plt.ylabel("Density")
plt.legend()

In [None]:
from sklearn import set_config

# Turn on scikit-learn's metadata-routing mode via the global configuration.
# NOTE(review): presumably enabled so fit-time arguments such as eval_set could be
# forwarded to the ensemble's sub-estimators; the visible search.fit call passes
# no such arguments - confirm this switch is still needed.
set_config(enable_metadata_routing=True)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Base learners for the ensemble. No early-stopping arguments are configured on
# any of these constructors.

# Balanced random forest with library defaults apart from seed and parallelism.
brf = BalancedRandomForestClassifier(n_jobs=-1, random_state=42)

# XGBoost: binary logistic objective, evaluated with area under the PR curve.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -
# confirm against the installed version.
xgb_settings = dict(
    objective="binary:logistic",
    eval_metric="aucpr",
    n_jobs=-1,
    random_state=42,
    use_label_encoder=False,
)
xgb = XGBClassifier(**xgb_settings)

# CatBoost: log-loss objective, AUC as evaluation metric, silent training.
cat_settings = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=0,
    random_state=42,
)
cat = CatBoostClassifier(**cat_settings)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three base learners.
base_learners = [('brf', brf), ('xgb', xgb), ('cat', cat)]
ensemble = VotingClassifier(estimators=base_learners, voting='soft', n_jobs=-1)


In [None]:
# Search space for the voting ensemble. Keys use the "<estimator name>__<param>"
# form so each entry is routed to the matching sub-estimator.
# scipy's randint(low, high) samples integers from [low, high);
# uniform(loc, scale) samples floats from [loc, loc + scale].
brf_space = {
    'brf__n_estimators': randint(300, 1200),
    'brf__max_depth': randint(5, 25),
    'brf__min_samples_split': randint(2, 10),
    'brf__sampling_strategy': uniform(0.2, 0.3),      # [0.2, 0.5]
}

xgb_space = {
    'xgb__n_estimators': randint(400, 1500),
    'xgb__learning_rate': uniform(0.01, 0.2),          # [0.01, 0.21]
    'xgb__max_depth': randint(3, 10),
    'xgb__scale_pos_weight': uniform(1, 10),           # [1, 11]
    'xgb__subsample': uniform(0.6, 0.4),               # [0.6, 1.0]
    'xgb__colsample_bytree': uniform(0.5, 0.5),        # [0.5, 1.0]
}

cat_space = {
    'cat__iterations': randint(500, 2000),
    'cat__depth': randint(4, 10),
    'cat__learning_rate': uniform(0.01, 0.1),          # [0.01, 0.11]
    'cat__scale_pos_weight': uniform(1, 10),           # [1, 11]
}

param_distributions = {**brf_space, **xgb_space, **cat_space}


In [None]:
# Randomised search over the ensemble: 30 candidates, 3-fold cross-validation,
# ranked by average precision.
search = RandomizedSearchCV(
    estimator=ensemble,
    param_distributions=param_distributions,
    scoring="average_precision",
    n_iter=30,
    cv=3,
    random_state=42,
    n_jobs=1,  # Windows-safe
    verbose=2,
)



In [None]:
# Early-stopping fit arguments for the XGBoost and CatBoost members, keyed with the
# "<estimator name>__<argument>" prefix.
# NOTE(review): no visible cell passes this dict to a fit call (the ensemble search
# is fitted without fit params), so it currently has no effect.
# NOTE(review): both eval sets point at the test split; using them for early
# stopping would leak test information into training.
fit_params = {
    'xgb__eval_set': [(X_test, y_test)],
    'xgb__early_stopping_rounds': 50,
    'cat__eval_set': [(X_test, y_test)],
    'cat__early_stopping_rounds': 50
}

In [None]:
# 1. Hyper-parameter search on the training split (no early stopping here).
search.fit(X_train, y_train)

# 2. Ensemble refitted by the search with the best parameter combination.
best_model = search.best_estimator_

# 3. Refit the boosted members with early stopping.
#    Early stopping needs a monitoring set. It must NOT be the test split: the
#    stopping point would then be tuned on the very rows used for the final
#    evaluation and the reported scores would be optimistic. A validation slice
#    is carved out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

#    XGBoost: early_stopping_rounds is an estimator parameter (XGBoost >= 1.6);
#    passing it to fit() is rejected by XGBoost 2.x.
#    Note: the parameter stays set on the estimator, so any later refit of this
#    XGBoost member also needs an eval_set.
xgb_best = best_model.named_estimators_['xgb']
xgb_best.set_params(early_stopping_rounds=50)
xgb_best.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=20
)

#    CatBoost accepts early_stopping_rounds directly in fit().
best_model.named_estimators_['cat'].fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    verbose=20
)


In [None]:
# Ensemble predictions on the test split: hard labels plus the second
# probability column (index 1).
y_pred = best_model.predict(X_test)
ensemble_proba = best_model.predict_proba(X_test)
y_proba = ensemble_proba[:, 1]

In [None]:
# Test labels side by side with the ensemble's predicted probabilities.
df_plot = pd.DataFrame({'y_test': y_test, 'y_proba': y_proba})

# Filled density curve of the predicted probability for each actual class.
plt.figure(figsize=(8, 5))
curve_labels = {0: 'Actual class 0', 1: 'Actual class 1'}
for class_value, curve_label in curve_labels.items():
    in_class = df_plot['y_test'] == class_value
    sns.kdeplot(data=df_plot.loc[in_class, 'y_proba'], label=curve_label, fill=True)
plt.title("Distribution of predicted probabilities by actual class")
plt.xlabel("Predicted probability (positive class)")
plt.ylabel("Density")
plt.legend()