In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Breast Cancer dataset
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(data=np.c_[cancer['data'], cancer['target']],
                          columns=np.append(cancer['feature_names'], 'target'))


In [3]:
# Feature Selection
features = cancer['feature_names']


In [4]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(cancer_df, test_size=0.2, random_state=42)


In [5]:
# Preprocessing
def preprocess_data(data, features):
    X = data[features]
    y = data['target']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y


In [6]:
# Train a Decision Tree Classifier model
def train_decision_tree(X_train, y_train):
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_model.fit(X_train, y_train)
    return dt_model

In [7]:
# Evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    return accuracy, report, matrix

In [8]:
# Cross-validate the model
def cross_validate_decision_tree(X, y, folds=5):
    dt_model = DecisionTreeClassifier(random_state=42)
    scores = cross_val_score(dt_model, X, y, cv=folds, scoring='accuracy')
    return scores

In [9]:
# Preprocess the training data
X_train_scaled, y_train = preprocess_data(train_data, features)

In [10]:
# Train a Decision Tree Classifier model
dt_model = train_decision_tree(X_train_scaled, y_train)

In [11]:
# Preprocess the test data
X_test_scaled, y_test = preprocess_data(test_data, features)

In [12]:
# Evaluate the model
accuracy, report, confusion_matrix = evaluate_model(dt_model, X_test_scaled, y_test)
print(f'Accuracy of the model on the test set: {accuracy:.2f}')
print('\nClassification Report:')
print(report)
print('\nConfusion Matrix:')
print(confusion_matrix)

Accuracy of the model on the test set: 0.92

Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.86      0.89        43
         1.0       0.92      0.96      0.94        71

    accuracy                           0.92       114
   macro avg       0.92      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114


Confusion Matrix:
[[37  6]
 [ 3 68]]


In [13]:
# Cross-validate the model
X_all_scaled, y_all = preprocess_data(cancer_df, features)
cross_val_scores = cross_validate_decision_tree(X_all_scaled, y_all)
print(f'\nCross-Validation Scores: {cross_val_scores}')


Cross-Validation Scores: [0.9122807  0.90350877 0.92982456 0.95614035 0.88495575]
