In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Load the dataset used for modelling. The file name suggests it was cleaned
# upstream, but that step is not visible in this notebook.
CLEAN_DATA_PATH = '../data/clean_data.csv'
data = pd.read_csv(CLEAN_DATA_PATH)

In [3]:
# One seed shared by the split, the CV folds and the tree so that a
# Restart & Run All reproduces the same numbers.
RANDOM_STATE = 1234

# Separate the target column ('Severity') from the feature columns.
X = data.drop('Severity', axis=1)
y = data['Severity']

# Hold out half of the rows for the final evaluation.
# NOTE(review): the split is not stratified while the CV folds are; if the
# Severity classes are imbalanced, consider passing stratify=y here -- confirm
# against the class distribution before changing, since it alters the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=RANDOM_STATE
)

model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Shuffled, stratified 5-fold CV used to score every hyper-parameter combination.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

param_grid = {
    'max_depth': [3, 10, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Candidates are ranked by macro-averaged F1; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    model, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), keeping the seeded base estimator's random_state. Reusing that
# estimator avoids training a second, unseeded tree whose splits (and therefore
# the reported test metrics) could differ from run to run.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

{'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [4]:
# Score the held-out predictions once, then report each metric.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1_weighted = f1_score(y_test, y_pred, average="weighted")
test_f1_macro = f1_score(y_test, y_pred, average="macro")

print(f'Accuracy: {test_accuracy}')
print(f'F1 Score weightened: {test_f1_weighted}')
print(f'F1 Score macro: {test_f1_macro}')

Accuracy: 0.8090496402969954
F1 Score weightened: 0.8020028243658216
F1 Score macro: 0.48434050623616254


In [None]:
# DISABLED CELL: everything below is commented out and does not run.
# When enabled, it binarizes y_test against the class labels [1, 2, 3, 4],
# takes per-class probabilities from best_model.predict_proba(X_test), computes
# one ROC curve and AUC per class, and plots the four curves with a diagonal
# reference line.
# NOTE(review): assumes Severity takes exactly the values 1-4 and that the
# predict_proba columns come in that order (see best_model.classes_) -- confirm
# before re-enabling. roc_curve, auc and label_binarize are not imported in the
# first cell; matplotlib.pyplot already is.

# from sklearn.metrics import roc_curve, auc
# from sklearn.preprocessing import label_binarize
# import matplotlib.pyplot as plt

# y_test_binarized = label_binarize(y_test, classes=[1, 2, 3, 4])
# y_pred_proba = best_model.predict_proba(X_test)


# fpr = dict()
# tpr = dict()
# roc_auc = dict()
# for i in range(4): 
#     fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_pred_proba[:, i])
#     roc_auc[i] = auc(fpr[i], tpr[i])

# plt.figure()
# colors = ['blue', 'red', 'green', 'orange'] 
# for i, color in zip(range(4), colors):
#     plt.plot(fpr[i], tpr[i], color=color, lw=2,
#              label='ROC curve of class {0} (area = {1:0.2f})'
#              ''.format(i+1, roc_auc[i]))

# plt.plot([0, 1], [0, 1], 'k--', lw=2)
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc="lower right")
# plt.show()
