# Training and Evaluation of Tree Models on SF Incident Report Data

## Imports

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna

## Experiments
Start off with initial training of target models on 80/20 train test split to see initial performance. Then move on to OOB CV and fine-tuning.

### Decision Trees
DT and RF don't handle categorical's natively. Train these initial models on non-normalized, ordinal encoding dataset. Further consideration of encodings can be treated later as a fine-tuning parameter. 

In [2]:
treedata = pd.read_csv('tree_dataset.csv', index_col=0)
#display(treedata.head())

ord_enc = OrdinalEncoder()
cat_cols = ['day','a_neigh','intsct','pd'] # neigh alr encoded
treedata_ordinal = treedata.copy()
treedata_ordinal[cat_cols] = ord_enc.fit_transform(treedata_ordinal[cat_cols])
display(treedata_ordinal.head())


Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,4.0,17.5,37.76229,-122.401324,28.0,54.0,712.0,0.0,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,6.0,8.3,37.753837,-122.418594,18.0,53.0,1102.0,3.0,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,0.0,9.67,37.785893,-122.419739,35.0,20.0,5178.0,4.0,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,1.0,12.33,37.783214,-122.410765,35.0,20.0,9111.0,10.0,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,5.0,16.5,37.775953,-122.408846,33.0,32.0,5583.0,8.0,6.0,6.0,10.0,1.0,Sex Offense


In [None]:
# get train test split
train, test = train_test_split(treedata_ordinal, test_size=0.2, random_state=42)
print(f'Train size: {len(train)}, Test size: {len(test)}')

trainX = train.drop('cat', axis=1)
trainY = train['cat']
testX = train.drop('cat', axis=1)
testY = train['cat']

model = DecisionTreeClassifier(random_state=42, max_depth=25) # also 94% at min_samples_leaf = 3
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Train size: 489844, Test size: 122462
Accuracy: 0.8042642147295874


### Random Forests

In [29]:
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Accuracy: 0.9430083863434073


### XGBoost

In [3]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(tree_method='hist', enable_categorical=True, eval_metric='logloss',
                          n_estimators=80, max_depth=15, verbosity=1)
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Accuracy: 0.45670493704169457


### LightGBM

In [8]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

lgb_train = lgb.Dataset(trainX, label=trainY, categorical_feature=cat_cols)
lgb_test = lgb.Dataset(testX, label=testY, categorical_feature=[cat_cols], reference=lgb_train)

# lgb training params
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 17,
    'boosting_type': 'gbdt',
    'num_leaves': 62,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

model = lgb.train(params, lgb_train, valid_sets=[lgb_test])

pred = np.argmax(model.predict(testX), axis=1)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

New categorical_feature is ['day', 'a_neigh', 'intsct', 'pd']


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9096
[LightGBM] [Info] Number of data points in the train set: 489844, number of used features: 14
[LightGBM] [Info] Start training from score -5.448822
[LightGBM] [Info] Start training from score -2.371989
[LightGBM] [Info] Start training from score -2.490861
[LightGBM] [Info] Start training from score -5.659936
[LightGBM] [Info] Start training from score -3.634227
[LightGBM] [Info] Start training from score -3.910991
[LightGBM] [Info] Start training from score -3.204725
[LightGBM] [Info] Start training from score -3.015075
[LightGBM] [Info] Start training from score -8.160200
[LightGBM] [Info] Start training from score -3.424879
[LightGBM] [Info] Start training from score -2.504508
[LightGBM] [Info] Start training from score -3.39985

### CatBoost

In [14]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    cat_features=cat_cols,
    loss_function='MultiClass',
    eval_metric='MultiClass'
)

model.fit(trainX, trainY, eval_set=(testX, testY), early_stopping_rounds=10)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

0:	learn: 2.5546110	test: 2.5544083	best: 2.5544083 (0)	total: 5.84s	remaining: 9m 38s
1:	learn: 2.4206589	test: 2.4203804	best: 2.4203804 (1)	total: 11.6s	remaining: 9m 29s
2:	learn: 2.3296555	test: 2.3293282	best: 2.3293282 (2)	total: 18.1s	remaining: 9m 46s
3:	learn: 2.2633601	test: 2.2626812	best: 2.2626812 (3)	total: 24.3s	remaining: 9m 42s
4:	learn: 2.2091157	test: 2.2085499	best: 2.2085499 (4)	total: 30s	remaining: 9m 30s
5:	learn: 2.1662214	test: 2.1656258	best: 2.1656258 (5)	total: 36.2s	remaining: 9m 26s
6:	learn: 2.1311676	test: 2.1307211	best: 2.1307211 (6)	total: 42.3s	remaining: 9m 21s
7:	learn: 2.1021412	test: 2.1018688	best: 2.1018688 (7)	total: 48.3s	remaining: 9m 15s
8:	learn: 2.0759448	test: 2.0757264	best: 2.0757264 (8)	total: 54.7s	remaining: 9m 12s
9:	learn: 2.0547242	test: 2.0546699	best: 2.0546699 (9)	total: 1m	remaining: 9m 6s
10:	learn: 2.0366470	test: 2.0366935	best: 2.0366935 (10)	total: 1m 6s	remaining: 8m 59s
11:	learn: 2.0198471	test: 2.0200660	best: 2.02

## Fine-tuning

### Decision Tree

In [None]:
treedata = pd.read_csv('tree_dataset.csv', index_col=0)

ord_enc = OrdinalEncoder()
cat_cols = ['day','a_neigh','intsct','pd'] # neigh alr encoded
treedata_ordinal = treedata.copy()
treedata_ordinal[cat_cols] = ord_enc.fit_transform(treedata_ordinal[cat_cols])
display(treedata_ordinal.head())

X = treedata_ordinal.drop(labels=['cat'], axis=1).to_numpy()
Y = treedata_ordinal['cat'].to_numpy()

Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,4.0,17.5,37.76229,-122.401324,28.0,54.0,712.0,0.0,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,6.0,8.3,37.753837,-122.418594,18.0,53.0,1102.0,3.0,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,0.0,9.67,37.785893,-122.419739,35.0,20.0,5178.0,4.0,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,1.0,12.33,37.783214,-122.410765,35.0,20.0,9111.0,10.0,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,5.0,16.5,37.775953,-122.408846,33.0,32.0,5583.0,8.0,6.0,6.0,10.0,1.0,Sex Offense
