In [None]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Fetch the breast-cancer dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

# Echo the feature labels first, then the class labels.
for label_array in (breast_cancer.feature_names, breast_cancer.target_names):
    print(label_array)

In [None]:
# Feature matrix and label vector exactly as provided by the dataset object.
X, y = breast_cancer.data, breast_cancer.target

# One column per feature, with the label appended as a final 'target' column.
df = pd.DataFrame(data=X, columns=breast_cancer.feature_names).assign(target=y)

# Preview the first five rows (rendered as the cell output).
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test set, keeping the class ratio of `y` the same in both parts.
# The fixed seed makes the split -- and therefore every score, plot and
# pruning alpha further down -- identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

print(X_train.shape)
print(X_test.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with no growth limits; random_state fixes the estimator's
# internal randomness so the fitted tree is repeatable.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
model  # show the fitted estimator as the cell output

In [None]:
# Predicted labels from the current `model` for the training and held-out rows.
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))

In [None]:
from sklearn import tree

# Labels are taken straight from the dataset object: `df.columns` also holds
# the appended 'target' column, which is not one of the features the model was
# trained on. Plain lists are passed because some scikit-learn releases reject
# ndarray / Index inputs for these two arguments.
features = list(breast_cancer.feature_names)
classes = list(breast_cancer.target_names)

# Draw the fitted tree, colouring each node by its majority class.
plt.figure(figsize=(20,20))
tree.plot_tree(model,feature_names=features,class_names=classes,filled=True)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows of the matrix are the true classes and columns the predicted ones,
# in the order given by `labels` (0 then 1, matching `classes`).
cm = confusion_matrix(y_test, y_test_pred, labels=[0,1])
ax = sns.heatmap(cm,annot=True,yticklabels=classes, xticklabels=classes, cmap='Blues', fmt='g')
# Label the axes so the orientation of the matrix can be read off the figure.
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes the ground truth first, then the predictions.
print(f'Test score {accuracy_score(y_test, y_test_pred)}')

### Constraining the tree with pre-pruning hyperparameters

In [None]:
# Same classifier as before, but growth is capped at depth 2.
model = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_train, y_train)
model  # show the fitted estimator as the cell output

In [None]:
# Labels are taken straight from the dataset object: `df.columns` also holds
# the appended 'target' column, which is not one of the features the model was
# trained on. Plain lists are passed because some scikit-learn releases reject
# ndarray / Index inputs for these two arguments.
features = list(breast_cancer.feature_names)
classes = list(breast_cancer.target_names)

# Draw the depth-limited tree, colouring each node by its majority class.
plt.figure(figsize=(20,20))
tree.plot_tree(model,feature_names=features,class_names=classes,filled=True)
plt.show()

# Pre-pruning with a grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate pre-pruning settings; GridSearchCV cross-validates every combination.
params = {'max_depth': [2,4,6],
         'min_samples_split': [2,3,4],
         'min_samples_leaf': [1,2]}

# Seed the base estimator so the search selects the same tree on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(X_train,y_train)

In [None]:
# GridSearchCV (refit=True by default) has already refitted the winning
# configuration on all of X_train, so best_estimator_ is used as-is instead of
# being fitted a second time.
model = gcv.best_estimator_
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# accuracy_score takes the ground truth first, then the predictions.
print(f'Test score {accuracy_score(y_test, y_test_pred)}')

# Rows of the matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_test_pred, labels=[0,1])
ax = sns.heatmap(cm,annot=True,yticklabels=classes, xticklabels=classes, cmap='Blues', fmt='g')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
# Labels are taken straight from the dataset object: `df.columns` also holds
# the appended 'target' column, which is not one of the features the model was
# trained on. Plain lists are passed because some scikit-learn releases reject
# ndarray / Index inputs for these two arguments.
features = list(breast_cancer.feature_names)
classes = list(breast_cancer.target_names)

# Draw the grid-searched tree, colouring each node by its majority class.
plt.figure(figsize=(20,20))
tree.plot_tree(model,feature_names=features,class_names=classes,filled=True)
plt.show()

#### Post-pruning demo

In [None]:
# Compute the pruning path for the same kind of tree that is refitted for each
# alpha below (no growth limits, random_state=0). `model` at this point is the
# grid-searched, pre-pruned tree, so its alphas would describe a different tree.
path = tree.DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(ccp_alphas)
print(impurities)

In [None]:
# Fit one tree per candidate alpha, stored in the same order as ccp_alphas.
clfs = [
    tree.DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

In [None]:
# The largest alpha normally prunes the tree down to a single node; drop that
# trivial tree and its alpha. The check keeps the cell safe to re-run: an
# unconditional slice would remove one more tree on every execution.
if len(clfs) > 0 and clfs[-1].tree_.node_count == 1:
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

# Size of each remaining tree, in the same order as ccp_alphas.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]

plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.xlabel('ccp_alpha')
plt.ylabel('count')
plt.title('Tree size vs alpha')
plt.legend()
plt.show()

In [None]:
# Accuracy of every pruned tree on both splits, in the same order as
# ccp_alphas. The predictions stay local to the comprehensions so the
# notebook-level y_train_pred / y_test_pred are not overwritten with the
# output of whichever tree happens to be last.
train_acc = [accuracy_score(y_train, pruned.predict(X_train)) for pruned in clfs]
test_acc = [accuracy_score(y_test, pruned.predict(X_test)) for pruned in clfs]

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.xlabel('ccp_alpha')
plt.ylabel('accuracy')
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

In [None]:
# Refit a single tree at one fixed pruning strength and evaluate it.
# NOTE(review): 0.008 is hard-coded -- presumably read off the accuracy-vs-alpha
# plot; re-check it against that plot whenever the train/test split changes.
clf_ = tree.DecisionTreeClassifier(random_state=0,ccp_alpha=0.008)
clf_.fit(X_train,y_train)
y_train_pred = clf_.predict(X_train)
y_test_pred = clf_.predict(X_test)

# accuracy_score takes the ground truth first, then the predictions.
print(f'Train score {accuracy_score(y_train, y_train_pred)}')
print(f'Test score {accuracy_score(y_test, y_test_pred)}')

# Rows of the matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_test_pred, labels=[0,1])
ax = sns.heatmap(cm,annot=True,yticklabels=classes, xticklabels=classes, cmap='Blues', fmt='g')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
# Feature labels come from the dataset object: `df.columns` also holds the
# appended 'target' column, which is not one of the features the model saw.
features = list(breast_cancer.feature_names)

# class_names must follow the numeric label order. Deriving the display names
# from target_names keeps label 0 and label 1 attached to the right class;
# a hand-written ['Benign', 'Malignant'] list would swap them, because the
# dataset lists 'malignant' first.
classes = [name.capitalize() for name in breast_cancer.target_names]

# Draw the post-pruned tree, colouring each node by its majority class.
plt.figure(figsize=(20,20))
tree.plot_tree(clf_,feature_names=features,class_names=classes,filled=True)
plt.show()