# Training and Evaluation of Tree Models on SF Incident Report Data

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gc

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna
from optuna.integration import XGBoostPruningCallback

## Experiments
Start off with initial training of target models on 80/20 train test split to see initial performance. Then move on to OOB CV and fine-tuning.

### Decision Trees
DT and RF don't handle categorical's natively. Train these initial models on non-normalized, ordinal encoding dataset. Further consideration of encodings can be treated later as a fine-tuning parameter. 

In [2]:
treedata = pd.read_csv('tree_dataset.csv', index_col=0)
#display(treedata.head())

ord_enc = OrdinalEncoder()
cat_cols = ['day','a_neigh','intsct','pd'] # neigh alr encoded
treedata_ordinal = treedata.copy()
treedata_ordinal[cat_cols] = ord_enc.fit_transform(treedata_ordinal[cat_cols])
display(treedata_ordinal.head())


Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,4.0,17.5,37.76229,-122.401324,28.0,54.0,712.0,0.0,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,6.0,8.3,37.753837,-122.418594,18.0,53.0,1102.0,3.0,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,0.0,9.67,37.785893,-122.419739,35.0,20.0,5178.0,4.0,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,1.0,12.33,37.783214,-122.410765,35.0,20.0,9111.0,10.0,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,5.0,16.5,37.775953,-122.408846,33.0,32.0,5583.0,8.0,6.0,6.0,10.0,1.0,Sex Offense


In [30]:
# get train test split
train, test = train_test_split(treedata_ordinal, test_size=0.3)
print(f'Train size: {len(train)}, Test size: {len(test)}')

trainX = train.drop('cat', axis=1)
trainY = train['cat']
testX = test.drop('cat', axis=1)
testY = test['cat']

model = DecisionTreeClassifier(random_state=42, max_depth=25) 
model.fit(trainX, trainY)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log loss: {score}')

Train size: 428614, Test size: 183692
Log loss: 18.358534313380257


### Random Forests

In [31]:
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(trainX, trainY)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log loss: {score}')

Log loss: 3.5281388141251746


### XGBoost

In [32]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(tree_method='hist', enable_categorical=True, eval_metric='logloss',
                          n_estimators=80, max_depth=15, verbosity=1)
model.fit(trainX, trainY)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log loss: {score}')

Log loss: 1.8146900870940692


### LightGBM

In [33]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

lgb_train = lgb.Dataset(trainX, label=trainY, categorical_feature=cat_cols)
lgb_test = lgb.Dataset(testX, label=testY, categorical_feature=[cat_cols], reference=lgb_train)

# lgb training params
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 17,
}

model = lgb.train(params, lgb_train, valid_sets=[lgb_test])

probs = model.predict(testX)
score = log_loss(testY, probs)
print(f'Log Loss: {score}')

New categorical_feature is ['day', 'a_neigh', 'intsct', 'pd']


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9096
[LightGBM] [Info] Number of data points in the train set: 489844, number of used features: 14
[LightGBM] [Info] Start training from score -5.448822
[LightGBM] [Info] Start training from score -2.371989
[LightGBM] [Info] Start training from score -2.490861
[LightGBM] [Info] Start training from score -5.659936
[LightGBM] [Info] Start training from score -3.634227
[LightGBM] [Info] Start training from score -3.910991
[LightGBM] [Info] Start training from score -3.204725
[LightGBM] [Info] Start training from score -3.015075
[LightGBM] [Info] Start training from score -8.160200
[LightGBM] [Info] Start training from score -3.424879
[LightGBM] [Info] Start training from score -2.504508
[LightGBM] [Info] Start training from score -3.39985

### CatBoost

In [35]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

model = CatBoostClassifier(
    iterations=100,
    cat_features=cat_cols,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    verbose=20
)

model.fit(trainX, trainY, eval_set=(testX, testY), early_stopping_rounds=10)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log Loss: {score}')

Learning rate set to 0.29453
0:	learn: 2.1942963	test: 2.1945704	best: 2.1945704 (0)	total: 3.68s	remaining: 6m 4s
20:	learn: 1.8253740	test: 1.8206266	best: 1.8206266 (20)	total: 1m 37s	remaining: 6m 5s
40:	learn: 1.7974090	test: 1.7972357	best: 1.7972357 (40)	total: 3m 13s	remaining: 4m 38s
60:	learn: 1.7730444	test: 1.7757998	best: 1.7757998 (60)	total: 4m 54s	remaining: 3m 8s
80:	learn: 1.7590789	test: 1.7658269	best: 1.7658269 (80)	total: 6m 34s	remaining: 1m 32s
99:	learn: 1.7487093	test: 1.7591181	best: 1.7591181 (99)	total: 8m 11s	remaining: 0us

bestTest = 1.759118139
bestIteration = 99

Log Loss: 1.7591181393988837


## Fine-tuning

### Decision Tree

In [None]:
treedata = pd.read_csv('tree_dataset.csv', index_col=0)

ord_enc = OrdinalEncoder()
cat_cols = ['day','a_neigh','intsct','pd'] # neigh alr encoded
treedata_ordinal = treedata.copy()
treedata_ordinal[cat_cols] = ord_enc.fit_transform(treedata_ordinal[cat_cols])
display(treedata_ordinal.head())

Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,4.0,17.5,37.76229,-122.401324,28.0,54.0,712.0,0.0,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,6.0,8.3,37.753837,-122.418594,18.0,53.0,1102.0,3.0,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,0.0,9.67,37.785893,-122.419739,35.0,20.0,5178.0,4.0,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,1.0,12.33,37.783214,-122.410765,35.0,20.0,9111.0,10.0,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,5.0,16.5,37.775953,-122.408846,33.0,32.0,5583.0,8.0,6.0,6.0,10.0,1.0,Sex Offense


In [None]:
X = treedata_ordinal.drop(labels=['cat'], axis=1).to_numpy()
Y = treedata_ordinal['cat'].to_numpy()

def dt_objective(trial):
    # define parameters to search thru
    max_depth = trial.suggest_int('max_depth', 3, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)
    criterion =  trial.suggest_categorical('criterion', ['gini', 'entropy'])

    # initialize model
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        criterion=criterion,
        random_state=42,
        class_weight='balanced'
    )

    # get cv split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    splits = kf.split(X)

    # train and validate
    mean_scores = 0.0
    for train_idx, val_idx in splits:
        x = X[train_idx]
        y = Y[train_idx]
        valx = X[val_idx]
        valy = Y[val_idx]

        model.fit(x, y)
        probs = model.predict_proba(valx)
        score = log_loss(valy, probs)
        mean_scores = mean_scores + score
    mean_scores = mean_scores / 5

    return mean_scores

# train
study = optuna.create_study(direction='minimize')
study.optimize(dt_objective, n_trials=100)

best_params = study.best_params
print(f'Best params: {best_params}')
print(f'Best score: {study.best_value}')

[I 2025-04-10 17:46:39,293] A new study created in memory with name: no-name-945005c8-d9c3-4a84-9090-f3e0f829ce2e
[I 2025-04-10 17:46:48,729] Trial 0 finished with value: 2.713520751312558 and parameters: {'max_depth': 32, 'min_samples_split': 17, 'min_samples_leaf': 2, 'max_leaf_nodes': 11, 'criterion': 'entropy'}. Best is trial 0 with value: 2.713520751312558.
[I 2025-04-10 17:46:59,528] Trial 1 finished with value: 2.666909725808474 and parameters: {'max_depth': 28, 'min_samples_split': 12, 'min_samples_leaf': 16, 'max_leaf_nodes': 51, 'criterion': 'gini'}. Best is trial 1 with value: 2.666909725808474.
[I 2025-04-10 17:47:11,342] Trial 2 finished with value: 2.6575333702805786 and parameters: {'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_leaf_nodes': 70, 'criterion': 'gini'}. Best is trial 2 with value: 2.6575333702805786.
[I 2025-04-10 17:47:22,966] Trial 3 finished with value: 2.6590737771047754 and parameters: {'max_depth': 24, 'min_samples_split': 2, 'mi

Best params: {'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_leaf_nodes': 100, 'criterion': 'gini'}
Best score: 2.6474022205348673


### RandomForests

In [40]:
X = treedata_ordinal.drop(labels='cat', axis=1).to_numpy()
Y = treedata_ordinal['cat'].to_numpy()

def rf_objective(trial):
    # define hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 20, 500)
    max_depth = trial.suggest_int('max_depth', 3, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt','log2'])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=True,
        random_state=42
    )

    # get cv split
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    splits = kf.split(X, Y)

    # train and validate
    total_scores = 0.0
    for idx, (train_idx, val_idx) in enumerate(splits):
        x = X[train_idx]
        y = Y[train_idx]
        valx = X[val_idx]
        valy = Y[val_idx]

        model.fit(x, y)
        probs = model.predict_proba(valx)
        score = log_loss(valy, probs)
        total_scores = total_scores + score

        # check for pruning
        trial.report(score, step=idx)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
    return total_scores / 5

# train
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(rf_objective, n_trials=20)

best_params = study.best_params
print(f'Best params: {best_params}')
print(f'Best score: {study.best_value}')

[I 2025-04-10 21:25:58,477] A new study created in memory with name: no-name-8c8b88f1-6f35-483f-a991-518ecd88830c
[I 2025-04-10 21:30:25,871] Trial 0 finished with value: 1.7798537826050378 and parameters: {'n_estimators': 159, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 20, 'max_features': 'log2'}. Best is trial 0 with value: 1.7798537826050378.
[I 2025-04-10 21:37:04,884] Trial 1 finished with value: 1.722189023187346 and parameters: {'n_estimators': 178, 'max_depth': 31, 'min_samples_split': 16, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 1 with value: 1.722189023187346.
[I 2025-04-10 21:57:58,285] Trial 2 finished with value: 1.7289113006692127 and parameters: {'n_estimators': 185, 'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 1 with value: 1.722189023187346.
[I 2025-04-10 22:15:47,682] Trial 3 finished with value: 1.845533502499859 and parameters: {'n_estimators': 326, 'max_depth': 10, 'mi

Best params: {'n_estimators': 467, 'max_depth': 21, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best score: 1.7091375533201152


### XGBoost

In [None]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

import warnings
warnings.filterwarnings("ignore", message="The reported value is ignored because this `step`.*") # ignore warning about reporting scores

# encode target
label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

def xgb_objective(trial):
    # Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 20, 90)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.3, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    l2 = trial.suggest_float('lambda', 1e-4, 10, log=True)
    l1 = trial.suggest_float('alpha', 1e-4, 10, log=True)

    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 17,
        'tree_method': 'hist',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'lambda': l2,
        'alpha': l1,
        'verbosity': 0,
        'enable_categorical': True
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    total_scores = 0.0

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, Y_enc)):
        x_train, y_train = X.iloc[train_idx], Y_enc[train_idx]
        x_val, y_val = X.iloc[val_idx], Y_enc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
        dval = xgb.DMatrix(x_val, label=y_val, enable_categorical=True)

        watchlist = [(dtrain, 'train'), (dval, 'valid')]

        pruning_callback = XGBoostPruningCallback(trial, 'valid-mlogloss')

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            evals=watchlist,
            callbacks=[pruning_callback],
            early_stopping_rounds=20,
            verbose_eval=False
        )

        preds = model.predict(dval)
        score = log_loss(y_val, preds)
        total_scores += score

        #trial.report(score, fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

        # having alot of issues with kernel crashes due to memory, so explicitly free memory here
        del model, dtrain, dval
        gc.collect()

    return total_scores / 3

pruner = optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=1)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(xgb_objective, n_trials=20, gc_after_trial=True)

best_params = study.best_params
print(f'Best params: {best_params}')
print(f'Best score: {study.best_value}')


[I 2025-04-11 23:36:47,306] A new study created in memory with name: no-name-f3eafbf3-cdb6-4616-8cb6-de3c23d44942
[I 2025-04-11 23:40:29,747] Trial 0 finished with value: 2.6605469471593435 and parameters: {'n_estimators': 69, 'max_depth': 10, 'learning_rate': 0.0015708985304745703, 'subsample': 0.5334449979352542, 'colsample_bytree': 0.743364179081981, 'lambda': 0.041297585294020056, 'alpha': 0.03261340645855585}. Best is trial 0 with value: 2.6605469471593435.
[I 2025-04-11 23:42:11,648] Trial 1 finished with value: 1.910405770992236 and parameters: {'n_estimators': 42, 'max_depth': 9, 'learning_rate': 0.0470773707913018, 'subsample': 0.7955850010943698, 'colsample_bytree': 0.6682725194508972, 'lambda': 0.00032562505331460025, 'alpha': 0.25973908728398015}. Best is trial 1 with value: 1.910405770992236.
[I 2025-04-11 23:43:37,240] Trial 2 finished with value: 2.6397102275165376 and parameters: {'n_estimators': 65, 'max_depth': 5, 'learning_rate': 0.002058710432932007, 'subsample': 0.

Best params: {'n_estimators': 67, 'max_depth': 19, 'learning_rate': 0.14163877525270438, 'subsample': 0.7365240318079787, 'colsample_bytree': 0.500628249073628, 'lambda': 0.707567617688826, 'alpha': 0.0019699445302213004}
Best score: 1.7072941222081317


### LightGBM

In [4]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

def lgb_objective(trial):
    # parameters
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 17,
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 32, 512),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-4, 10, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-4, 10, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'verbose': -1
    }

    # split
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    splits = kf.split(X, Y_enc)
    total_scores = 0.0
    for idx, (train_idx, val_idx) in enumerate(splits):
        lgb_train = lgb.Dataset(X.iloc[train_idx], label=Y_enc[train_idx], categorical_feature=cat_cols)
        lgb_test = lgb.Dataset(X.iloc[val_idx], label=Y_enc[val_idx], categorical_feature=cat_cols, reference=lgb_train)

        model = lgb.train(params, lgb_train, valid_sets=[lgb_test])

        probs = model.predict(X.iloc[val_idx])
        score = log_loss(Y_enc[val_idx], probs)
        total_scores += score

        trial.report(score, idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

        del model, lgb_train, lgb_test
        gc.collect()
    
    return total_scores / 3

pruner = optuna.pruners.MedianPruner(n_warmup_steps=3, n_min_trials=1)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(lgb_objective, n_trials=20, gc_after_trial=True)

best_params = study.best_params
print(f'Best params: {best_params}')
print(f'Best score: {study.best_value}')

[I 2025-04-12 15:11:44,663] A new study created in memory with name: no-name-86c74260-08c0-45b2-b020-caf88b250b61
[I 2025-04-12 15:16:54,895] Trial 0 finished with value: 1.741943244293828 and parameters: {'learning_rate': 0.023183856662859825, 'num_leaves': 344, 'max_depth': 17, 'lambda_l1': 0.005843293254335405, 'lambda_l2': 2.4657904707348424, 'feature_fraction': 0.7223684553906875, 'bagging_fraction': 0.5643119215628734}. Best is trial 0 with value: 1.741943244293828.
[I 2025-04-12 15:19:24,408] Trial 1 finished with value: 2.034428326304679 and parameters: {'learning_rate': 0.0002586045909739928, 'num_leaves': 348, 'max_depth': 9, 'lambda_l1': 3.7264041576182017, 'lambda_l2': 0.00019950947022326758, 'feature_fraction': 0.6679199144717741, 'bagging_fraction': 0.5331812784237497}. Best is trial 0 with value: 1.741943244293828.
[I 2025-04-12 15:20:32,770] Trial 2 finished with value: 1.748786266036235 and parameters: {'learning_rate': 0.16557990283552415, 'num_leaves': 57, 'max_depth

KeyboardInterrupt: 