# Training and Evaluation of ML Models on SF Incident Report Data

## Imports

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import optuna

## Loading Data
Using autoencoded data for these models

In [2]:
data = pd.read_csv('preprocessed_data.csv', index_col=0)
X = data.to_numpy()

labels = pd.read_csv('tree_dataset.csv', index_col=0)
Y = labels['cat']
enc = LabelEncoder()
Y = enc.fit_transform(Y)

del data, labels
print(X.shape)
print(Y.shape)

(612306, 8)
(612306,)


## Logistic Regression Baseline

In [3]:
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=42)

model = LogisticRegression(solver='lbfgs', max_iter=500, class_weight='balanced')
model.fit(trainX, trainY)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log Loss: {score}')

Log Loss: 2.7513173774818314


In [4]:
def lg_objective(trial):
    params = {
        'solver': 'lbfgs',
        'max_iter': 500,
        'class_weight': 'balanced',
        'C': trial.suggest_float('C', 1e-4, 1e3, log=True)
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    total_score = 0.0
    for idx, (train_idx, val_idx) in enumerate(kf.split(X, Y)):
        x = X[train_idx]
        y = Y[train_idx]
        valx = X[val_idx]
        valy = Y[val_idx]

        model = LogisticRegression(**params)
        model.fit(x,y)

        probs = model.predict_proba(valx)
        score = log_loss(valy, probs)
        total_score += score

        trial.report(score, idx)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return total_score / 3

pruner = optuna.pruners.MedianPruner(n_warmup_steps=3)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(lg_objective, n_trials=5)

print(study.best_params)
print(study.best_value)

[I 2025-04-14 11:15:18,467] A new study created in memory with name: no-name-52fffb13-dc95-4489-b3e3-90658d801d73
[I 2025-04-14 11:15:26,660] Trial 0 finished with value: 2.7513513173826234 and parameters: {'C': 6.680865956413374}. Best is trial 0 with value: 2.7513513173826234.
[I 2025-04-14 11:15:34,469] Trial 1 finished with value: 2.7524488501201154 and parameters: {'C': 0.0021929924425371744}. Best is trial 0 with value: 2.7513513173826234.
[I 2025-04-14 11:15:42,404] Trial 2 finished with value: 2.7513525353494823 and parameters: {'C': 0.7080957984901818}. Best is trial 0 with value: 2.7513513173826234.
[I 2025-04-14 11:15:50,142] Trial 3 finished with value: 2.751354926506009 and parameters: {'C': 0.25007150117366356}. Best is trial 0 with value: 2.7513513173826234.
[I 2025-04-14 11:15:55,681] Trial 4 finished with value: 2.7643207641807095 and parameters: {'C': 0.00022194089045665132}. Best is trial 0 with value: 2.7513513173826234.


{'C': 6.680865956413374}
2.7513513173826234


## KNN

In [7]:
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=42)

model = KNeighborsClassifier(n_neighbors=50, weights='distance')
model.fit(trainX, trainY)

probs = model.predict_proba(testX)
score = log_loss(testY, probs)
print(f'Log Loss: {score}')

Log Loss: 6.057461532399648


In [8]:
def knn_objective(trial):
    params = {
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'n_neighbors': trial.suggest_int('n_neighbors', 10, 200),
        'p': trial.suggest_categorical('p', [1,2])
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    total_score = 0.0
    for idx, (train_idx, val_idx) in enumerate(kf.split(X,Y)):
        x = X[train_idx]
        y = Y[train_idx]
        valx = X[val_idx]
        valy = Y[val_idx]

        model = KNeighborsClassifier(**params)
        model.fit(x,y)
        
        probs = model.predict_proba(valx)
        score = log_loss(valy, probs)
        total_score += score

        trial.report(score, idx)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
    return total_score / 3

pruner = optuna.pruners.MedianPruner(n_warmup_steps=3)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(knn_objective, n_trials=10)

[I 2025-04-15 10:21:17,841] A new study created in memory with name: no-name-02b7ae2f-0c08-49c5-acbf-7b808306799a
[I 2025-04-15 10:22:11,239] Trial 0 finished with value: 2.495832976583977 and parameters: {'weights': 'uniform', 'n_neighbors': 100, 'p': 2}. Best is trial 0 with value: 2.495832976583977.
[I 2025-04-15 10:22:46,511] Trial 1 finished with value: 3.4066970589084042 and parameters: {'weights': 'uniform', 'n_neighbors': 50, 'p': 2}. Best is trial 0 with value: 2.495832976583977.
[I 2025-04-15 10:24:25,007] Trial 2 finished with value: 2.318619977662842 and parameters: {'weights': 'uniform', 'n_neighbors': 122, 'p': 1}. Best is trial 2 with value: 2.318619977662842.
[I 2025-04-15 10:26:29,644] Trial 3 finished with value: 4.691516879339209 and parameters: {'weights': 'distance', 'n_neighbors': 176, 'p': 1}. Best is trial 2 with value: 2.318619977662842.
[I 2025-04-15 10:28:34,523] Trial 4 finished with value: 4.669947742172231 and parameters: {'weights': 'distance', 'n_neighbo

## Neural Networks