In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

# Visualization: used to render the fitted decision tree
import graphviz

# Experiment 1 : Default parameters

In [2]:
# Experiment 1: fit a decision tree with default hyper-parameters and compare
# train vs. test accuracy to see how strongly it overfits.

# Load the breast-cancer dataset
cancer_data = load_breast_cancer(as_frame=True)

# Features and target labels
X = cancer_data["data"]
Y = cancer_data["target"]

# Feature names and class labels, used later to annotate the tree visualization
feature_names = cancer_data["feature_names"]
target_names = cancer_data["target_names"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Seed the tree too: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently per run.
# NOTE(review): the saved output of this cell predates the seed; re-run to refresh it.
clf = tree.DecisionTreeClassifier(random_state=0)

# Train
clf = clf.fit(X_train, Y_train)

# Size of the fitted tree
print("depth of the decision tree ",clf.get_depth())
print("no of leaves of the decision tree",clf.get_n_leaves())

# Predict on the training data (to check for overfitting) ...
y_train_predicted_lables = clf.predict(X_train)
# ... and on the held-out test data
y_test_predicted_lables = clf.predict(X_test)

# Accuracy
print("\n train accuracy",accuracy_score(Y_train,y_train_predicted_lables))
print("\n test accuracy", accuracy_score(Y_test,y_test_predicted_lables))


depth of the decision tree  7
no of leaves of the decision tree 19

 train accuracy 1.0

 test accuracy 0.9181286549707602


# Let's visualize the decision tree we just built

In [6]:
# Export the fitted tree as Graphviz DOT text, labelling nodes with the
# dataset's feature names and class names.
dot_data = tree.export_graphviz(
    clf,
    filled=True,
    class_names=target_names,
    feature_names=feature_names,
)

# Wrap the DOT text and render it to disk; render() returns the output path,
# which the notebook shows as the cell result.
graph = graphviz.Source(dot_data)
graph.render("default_params")

'default_params.pdf'

# Experiment 2 : max_depth

In [7]:
# Experiment 2: cap the tree depth with max_depth and compare train vs. test
# accuracy against the unconstrained tree.

# Load the breast-cancer dataset
cancer_data = load_breast_cancer(as_frame=True)

# Features and target labels
X = cancer_data["data"]
Y = cancer_data["target"]

# Feature names and class labels, used later to annotate the tree visualization
feature_names = cancer_data["feature_names"]
target_names = cancer_data["target_names"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# max_depth=4 limits how deep the tree may grow.
# Seed the tree too: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently per run.
# NOTE(review): the saved output of this cell predates the seed; re-run to refresh it.
clf = tree.DecisionTreeClassifier(max_depth=4, random_state=0)

# Train
clf = clf.fit(X_train, Y_train)

# Size of the fitted tree
print("depth of the decision tree ",clf.get_depth())
print("no of leaves of the decision tree",clf.get_n_leaves())

# Predict on the training data (to check for overfitting) ...
y_train_predicted_lables = clf.predict(X_train)
# ... and on the held-out test data
y_test_predicted_lables = clf.predict(X_test)

# Accuracy
print("\n train accuracy",accuracy_score(Y_train,y_train_predicted_lables))
print("\n test accuracy", accuracy_score(Y_test,y_test_predicted_lables))


depth of the decision tree  4
no of leaves of the decision tree 10

 train accuracy 0.9773869346733668

 test accuracy 0.935672514619883


**In Experiment 2 we make the tree shallower by capping its depth with `max_depth`.**

In [8]:
# Export the depth-limited tree as Graphviz DOT text, labelling nodes with the
# dataset's feature names and class names.
dot_data = tree.export_graphviz(clf,feature_names=feature_names,
                               class_names=target_names,
                               filled=True)

graph = graphviz.Source(dot_data)
# Use a file name specific to this experiment so the PDF rendered for the
# default-parameter tree ("default_params.pdf") is not overwritten.
graph.render("max_depth")

'max_depth.pdf'

# Experiment 3 : min_samples_leaf

In [9]:
# Experiment 3: require a minimum number of training samples in every leaf
# (min_samples_leaf) and compare train vs. test accuracy.

# Load the breast-cancer dataset
cancer_data = load_breast_cancer(as_frame=True)

# Features and target labels
X = cancer_data["data"]
Y = cancer_data["target"]

# Feature names and class labels, used later to annotate the tree visualization
feature_names = cancer_data["feature_names"]
target_names = cancer_data["target_names"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# min_samples_leaf=50: a split is only kept if both children hold >= 50 samples.
# Seed the tree too: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently per run.
# NOTE(review): the saved output of this cell predates the seed; re-run to refresh it.
clf = tree.DecisionTreeClassifier(min_samples_leaf=50, random_state=0)

# Train
clf = clf.fit(X_train, Y_train)

# Size of the fitted tree
print("depth of the decision tree ",clf.get_depth())
print("no of leaves of the decision tree",clf.get_n_leaves())

# Predict on the training data (to check for overfitting) ...
y_train_predicted_lables = clf.predict(X_train)
# ... and on the held-out test data
y_test_predicted_lables = clf.predict(X_test)

# Accuracy
print("\n train accuracy",accuracy_score(Y_train,y_train_predicted_lables))
print("\n test accuracy", accuracy_score(Y_test,y_test_predicted_lables))


depth of the decision tree  3
no of leaves of the decision tree 5

 train accuracy 0.9296482412060302

 test accuracy 0.8947368421052632


In [10]:
# Export the min_samples_leaf-constrained tree as Graphviz DOT text, labelling
# nodes with the dataset's feature names and class names.
dot_data = tree.export_graphviz(clf,feature_names=feature_names,
                               class_names=target_names,
                               filled=True)

graph = graphviz.Source(dot_data)
# Use a file name specific to this experiment so the PDFs rendered by the
# earlier experiments are not overwritten.
graph.render("min_samples_leaf")

'min_samples_leaf.pdf'

# Experiment 4 : max_leaf_nodes

In [11]:
# Experiment 4: limit the number of leaves with max_leaf_nodes and compare
# train vs. test accuracy.

# Load the breast-cancer dataset
cancer_data = load_breast_cancer(as_frame=True)

# Features and target labels
X = cancer_data["data"]
Y = cancer_data["target"]

# Feature names and class labels, used later to annotate the tree visualization
feature_names = cancer_data["feature_names"]
target_names = cancer_data["target_names"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# max_leaf_nodes=8 lets the tree grow at most 8 leaves.
# Seed the tree too: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently per run.
# NOTE(review): the saved output of this cell predates the seed; re-run to refresh it.
clf = tree.DecisionTreeClassifier(max_leaf_nodes=8, random_state=0)

# Train
clf = clf.fit(X_train, Y_train)

# Size of the fitted tree
print("depth of the decision tree ",clf.get_depth())
print("no of leaves of the decision tree",clf.get_n_leaves())

# Predict on the training data (to check for overfitting) ...
y_train_predicted_lables = clf.predict(X_train)
# ... and on the held-out test data
y_test_predicted_lables = clf.predict(X_test)

# Accuracy
print("\n train accuracy",accuracy_score(Y_train,y_train_predicted_lables))
print("\n test accuracy", accuracy_score(Y_test,y_test_predicted_lables))


depth of the decision tree  5
no of leaves of the decision tree 8

 train accuracy 0.9773869346733668

 test accuracy 0.9415204678362573


In [12]:
# Export the max_leaf_nodes-constrained tree as Graphviz DOT text, labelling
# nodes with the dataset's feature names and class names.
dot_data = tree.export_graphviz(clf,feature_names=feature_names,
                               class_names=target_names,
                               filled=True)

graph = graphviz.Source(dot_data)
# Use a file name specific to this experiment so the PDFs rendered by the
# earlier experiments are not overwritten.
graph.render("max_leaf_nodes")

'max_leaf_nodes.pdf'

# Final Experiment : use all params (max_leaf_nodes, max_depth, min_samples_leaf)

In [13]:
# Final experiment: combine max_depth, min_samples_leaf and max_leaf_nodes and
# compare train vs. test accuracy.

# Load the breast-cancer dataset
cancer_data = load_breast_cancer(as_frame=True)

# Features and target labels
X = cancer_data["data"]
Y = cancer_data["target"]

# Feature names and class labels, used later to annotate the tree visualization
feature_names = cancer_data["feature_names"]
target_names = cancer_data["target_names"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# All three constraints from the previous experiments applied together.
# Seed the tree too: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently per run.
# NOTE(review): the saved output of this cell predates the seed; re-run to refresh it.
clf = tree.DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=50,
    max_leaf_nodes=8,
    random_state=0,
)

# Train
clf = clf.fit(X_train, Y_train)

# Size of the fitted tree
print("depth of the decision tree ",clf.get_depth())
print("no of leaves of the decision tree",clf.get_n_leaves())

# Predict on the training data (to check for overfitting) ...
y_train_predicted_lables = clf.predict(X_train)
# ... and on the held-out test data
y_test_predicted_lables = clf.predict(X_test)

# Accuracy
print("\n train accuracy",accuracy_score(Y_train,y_train_predicted_lables))
print("\n test accuracy", accuracy_score(Y_test,y_test_predicted_lables))


depth of the decision tree  3
no of leaves of the decision tree 5

 train accuracy 0.9296482412060302

 test accuracy 0.8947368421052632


In [14]:
# Export the tree fitted with all three constraints as Graphviz DOT text,
# labelling nodes with the dataset's feature names and class names.
dot_data = tree.export_graphviz(clf,feature_names=feature_names,
                               class_names=target_names,
                               filled=True)

graph = graphviz.Source(dot_data)
# Use a file name specific to this experiment so the PDFs rendered by the
# earlier experiments are not overwritten.
graph.render("all_params")

'all_params.pdf'