In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [11]:
file_path = 'Data/colon-dataset-processed.csv'
data = pd.read_csv(file_path)
data['Class'] = data['Class']

In [12]:

X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:

tree_clf = DecisionTreeClassifier(random_state=42)
path = tree_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [14]:
trees = []
for ccp_alpha in ccp_alphas:
    tree_clf = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    tree_clf.fit(X_train, y_train)
    trees.append(tree_clf)

In [15]:

tree_scores = [accuracy_score(y_test, tree.predict(X_test)) for tree in trees]
max_accuracy_idx = np.argmax(tree_scores)
best_alpha = ccp_alphas[max_accuracy_idx]
best_tree = trees[max_accuracy_idx]
best_accuracy = tree_scores[max_accuracy_idx]

In [16]:

best_tree.fit(X_train, y_train)
y_pred_best_tree = best_tree.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best_tree)
conf_matrix_best_tree = confusion_matrix(y_test, y_pred_best_tree)
class_report_best_tree = classification_report(y_test, y_pred_best_tree)
print(f"Best ccp_alpha: {best_alpha}")
print(f"Test Accuracy: {best_accuracy}")
print("Confusion Matrix:")
print(conf_matrix_best_tree)
print("Classification Report:")
print(class_report_best_tree)

Best ccp_alpha: 0.007019927536231884
Test Accuracy: 0.8695652173913043
Confusion Matrix:
[[11  1]
 [ 2  9]]
Classification Report:
              precision    recall  f1-score   support

   diagnosed       0.85      0.92      0.88        12
     healthy       0.90      0.82      0.86        11

    accuracy                           0.87        23
   macro avg       0.87      0.87      0.87        23
weighted avg       0.87      0.87      0.87        23



In [17]:
#save model
import pickle
with open('Models/DT_colon.pkl', 'wb') as f:
    pickle.dump(best_tree, f)