In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

pd.options.display.max_columns = 1000

In [2]:
train_data = pd.read_csv('data/train_cleaned.csv', index_col=0)

numerical_columns = ['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1']

train_data.head()

Unnamed: 0_level_0,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,cp_data,outcome
surgery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1.0,0.0,530001,38.1,132.0,24.0,1.0,3.0,2.0,2.0,1.0,0.0,3.0,2.0,0.0,6.5,1.0,1.0,57.0,8.5,2.0,3.4,1.0,2209,0.0,0.0
1.0,0.0,533836,37.5,88.0,12.0,1.0,2.0,4.0,2.0,3.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,33.0,64.0,2.0,2.0,1.0,2208,0.0,1.0
1.0,0.0,529812,38.3,120.0,28.0,1.0,3.0,5.0,1.0,2.0,3.0,0.0,2.0,2.0,3.5,1.0,0.0,37.0,6.4,2.0,3.4,1.0,5124,0.0,2.0
1.0,0.0,5262541,37.1,72.0,30.0,0.0,3.0,5.0,2.0,3.0,3.0,0.0,2.0,1.0,2.0,1.0,1.0,53.0,7.0,1.0,3.9,1.0,2208,1.0,2.0
0.0,0.0,5299629,38.0,52.0,48.0,2.0,2.0,3.0,1.0,0.0,3.0,1.0,2.0,0.0,7.0,3.0,3.0,47.0,7.3,1.0,2.6,0.0,0,1.0,2.0


In [3]:
target_names = ['lived', 'euthanized', 'died']
feature_names = ['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse', 'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen', 'packed_cell_volume', 'total_protein', 'abdomo_appearance', 'abdomo_protein', 'surgical_lesion', 'lesion_1', 'cp_data']

opdel data i features og labels

In [4]:
X = train_data.drop(columns=['outcome'])
y = train_data['outcome']

X_train, X_test,\
    y_train, y_test = train_test_split(X, y,
                                       test_size=0.2,
                                       random_state=42)

In [5]:
scaler = StandardScaler().fit(X_train[numerical_columns])

print(scaler.mean_)

[3.81846154e+01 7.94787449e+01 3.00465587e+01 4.41639676e+00
 4.97484818e+01 2.14990891e+01 3.29655870e+00 3.89474393e+03]


In [6]:
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [7]:
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350],  # Antal beslutningstræer
    'max_depth': [None, 5, 10, 15],   # Maksimal dybde af træerne
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],   # Minimum antal prøver påkrævet for at opdele en knude
    'min_samples_leaf': [1, 2],     # Minimum antal prøver i en bladknude
    'max_features': ['sqrt', 'log2'],  # Antallet af funktioner at overveje ved hver opdeling
    'bootstrap': [True]         # Om bootstrap sampling skal udføres
}

In [8]:
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro',
    'roc_auc': 'roc_auc_ovr'  # Use roc_auc_ovr for multiclass classification
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)  # cv er antallet af krydsvalideringsfold

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


In [9]:
# Get the best model and evaluate on the test data
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Evaluate multiple metrics on the test data
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test), average='macro', multi_class='ovr')

# Print the best hyperparameters and test metrics
print("Best Hyperparameters:", grid_search.best_params_)
print("Test Metrics:")
print("  Accuracy:", accuracy)
print("  Precision:", precision)
print("  Recall:", recall)
print("  F1 Score:", f1)
print("  ROC AUC Score:", roc_auc)

Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 300}
Test Metrics:
  Accuracy: 0.7246963562753036
  Precision: 0.731453976001233
  Recall: 0.7021860014284256
  F1 Score: 0.7082988267770877
  ROC AUC Score: 0.8479793270992831


In [10]:
# Extract single tree
specific_estimator = best_model.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(specific_estimator, out_file='tree.dot', 
                feature_names = feature_names,
                class_names = target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

ValueError: Length of feature_names, 25 does not match number of features, 24