# Aim

1. train a decision tree model

In [53]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree

In [2]:
# Root folder holding the train/ and test/ splits, relative to this notebook.
DATA_ROOT = "../data"

In [3]:
# Read the pickled train and test frames from under DATA_ROOT (train first, then test).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        f"{DATA_ROOT}/train/model/data.pkl",
        f"{DATA_ROOT}/test/model/data.pkl",
    )
)

# Class weight

1. each class has a different number of occurrences
1. adding class weights handles imbalances in the label distribution


In [12]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weighting follows
#     w = n_samples / (n_classes * np.bincount(y))
# so the rarer a class is, the larger its weight.
classes = [1, 2, 3, 4]
w = compute_class_weight(
    "balanced",
    # Recent scikit-learn releases validate `classes` as an ndarray and
    # reject a plain Python list, so convert explicitly.
    classes=np.asarray(classes),
    y=df_train["Severity"],
)
# Map each label to its weight; keys stay plain ints taken from `classes`.
class_weights = dict(zip(classes, w))

# Number of training rows per label.
value_counts = df_train["Severity"].value_counts().to_dict()

In [31]:
# Show the per-label row counts, ordered by label.
print("value_counts of labels in train set:")
print({label: value_counts[label] for label in sorted(value_counts)})

# Show the per-label class weights, ordered by label.
print("\nweights of labels:")
print({label: class_weights[label] for label in sorted(class_weights)})

value_counts of labels in train set:
{1: 969, 2: 1993515, 3: 887615, 4: 92331}

weights of labels:
{1: 767.3968008255933, 2: 0.37301324544836634, 3: 0.8377590509398782, 4: 8.053714353792333}


In [44]:
# Features are every column except the last one; the target is "Severity".
# NOTE(review): this presumes "Severity" is the final column of df_train — confirm.
X = df_train.iloc[:, :-1]
Y = df_train["Severity"]

# scikit-learn randomly permutes the features at each split, so the seed is
# pinned to make the fitted tree (and the metrics derived from it)
# reproducible across kernel restarts.
clf = tree.DecisionTreeClassifier(
    max_depth=10,
    class_weight=class_weights,
    random_state=42,
).fit(X, Y)

In [45]:
# Predictions for the training rows, using every column but the last as features.
y_pred_train = clf.predict(X=df_train.iloc[:, :-1])

In [46]:
# Ground-truth "Severity" labels of each split, as arrays for the metric calls.
y_train, y_test = (frame["Severity"].values for frame in (df_train, df_test))

In [47]:
from sklearn.metrics import confusion_matrix

In [48]:
# Training labels against themselves: a purely diagonal matrix whose entries
# are the number of training rows per class.
confusion_matrix(y_true=y_train, y_pred=y_train)

array([[    969,       0,       0,       0],
       [      0, 1993515,       0,       0],
       [      0,       0,  887615,       0],
       [      0,       0,       0,   92331]])

In [49]:
# Rows are the true training labels, columns the tree's predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred_train)

array([[   771,     75,    123,      0],
       [509445, 815954, 390386, 277730],
       [112266,  35021, 636170, 104158],
       [   208,   2459,   2675,  86989]])

In [50]:
# Predictions for the test rows, using every column but the last as features.
y_pred_test = clf.predict(X=df_test.iloc[:, :-1])

In [51]:
# Test labels against themselves: a purely diagonal matrix whose entries
# are the number of test rows per class.
confusion_matrix(y_true=y_test, y_pred=y_test)

array([[ 28205,      0,      0,      0],
       [     0, 379695,      0,      0],
       [     0,      0, 111298,      0],
       [     0,      0,      0,  19989]])

In [52]:
# Rows are the true test labels, columns the tree's predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[    33,  12833,    454,  14885],
       [ 47072, 221620,  53751,  57252],
       [  7350,  17922,  58280,  27746],
       [    92,   2543,   1275,  16079]])

In [68]:
# Feature names: all column labels except the last one.
fn = df_train.columns[:-1].tolist()
# Class names: the labels 1-4 rendered as strings.
cn = list(map(str, (1, 2, 3, 4)))

In [73]:
# NOTE(review): wildcard import — `explain_prediction_path`, used further down,
# presumably comes from here; confirm and import the needed names explicitly.
from dtreeviz.trees import *

In [90]:
ix = 4  # positional index of the training row to explain

# A one-row DataFrame (rather than a list holding a Series) keeps the column
# names, so scikit-learn receives the same feature names it was fitted with
# and does not emit its feature-name mismatch warning.
print(clf.predict(df_train.iloc[[ix], :-1]))

# Decision path followed by that row, rendered as plain-English split conditions.
print(
    explain_prediction_path(
        clf,
        df_train.iloc[ix, :-1],
        feature_names=fn,
        class_names=cn,
        explanation_type="plain_english",
    )
)

[4]
kw_ramp < 0.5
Astronomical_Twilight_1 < 0.5
Timezone_1 < 0.5
0.5 <= Timezone_2 
County_1 < 0.5
Side_0 < 0.5
TMC < 100.0
0.0 <= Distance(mi)  < 1.07
zip_len < 7.5

