# Training and Evaluation of Tree Models on SF Incident Report Data

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

## Experiments
Start off with initial training of target models on 80/20 train test split to see initial performance. Then move on to OOB CV and fine-tuning.

### Decision Trees
DT and RF don't handle categorical's natively. Train these initial models on non-normalized, ordinal encoding dataset. Further consideration of encodings can be treated later as a fine-tuning parameter. 

In [2]:
treedata = pd.read_csv('tree_dataset.csv', index_col=0)
#display(treedata.head())

ord_enc = OrdinalEncoder()
cat_cols = ['day','a_neigh','intsct','pd'] # neigh alr encoded
treedata_ordinal = treedata.copy()
treedata_ordinal[cat_cols] = ord_enc.fit_transform(treedata_ordinal[cat_cols])
display(treedata_ordinal.head())


Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,4.0,17.5,37.76229,-122.401324,28.0,54.0,712.0,0.0,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,6.0,8.3,37.753837,-122.418594,18.0,53.0,1102.0,3.0,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,0.0,9.67,37.785893,-122.419739,35.0,20.0,5178.0,4.0,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,1.0,12.33,37.783214,-122.410765,35.0,20.0,9111.0,10.0,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,5.0,16.5,37.775953,-122.408846,33.0,32.0,5583.0,8.0,6.0,6.0,10.0,1.0,Sex Offense


In [None]:
# get train test split
train, test = train_test_split(treedata_ordinal, test_size=0.2, random_state=42)
print(f'Train size: {len(train)}, Test size: {len(test)}')

trainX = train.drop('cat', axis=1)
trainY = train['cat']
testX = train.drop('cat', axis=1)
testY = train['cat']

model = DecisionTreeClassifier(random_state=42, max_depth=25) # also 94% at min_samples_leaf = 3
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Train size: 489844, Test size: 122462
Accuracy: 0.8042642147295874


### Random Forests

In [29]:
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Accuracy: 0.9430083863434073


### XGBoost

In [3]:
treedata[cat_cols] = treedata[cat_cols].astype('category')
X = treedata.drop('cat', axis=1)
Y = treedata['cat']

label_enc = LabelEncoder()
Y_enc = label_enc.fit_transform(Y)

trainX, testX, trainY, testY = train_test_split(X, Y_enc, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(tree_method='hist', enable_categorical=True, eval_metric='logloss',
                          n_estimators=80, max_depth=15, verbosity=1)
model.fit(trainX, trainY)

pred = model.predict(testX)
score = accuracy_score(testY, pred)
print(f'Accuracy: {score}')

Accuracy: 0.45670493704169457


### LightGBM

### CatBoost

## Fine-tuning