### Decision Tree using scikit-learn

In [16]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# NOTE(review): a bare `iris` as the last expression makes the notebook display
# the entire returned object, including the full `data` array. Consider showing
# a compact summary (e.g. `iris.data.shape`) instead.
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
# Feature matrix (input variables) and class labels (output variable).
X, y = iris.data, iris.target

In [8]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report how many samples landed in each partition.
print("Number of training Samples: ", len(X_train))
print("Number of test Samples: ", len(X_test))

Number of training Samples:  105
Number of test Samples:  45


In [13]:
# Fit one decision tree per depth limit and compare its accuracy on the
# training set against its accuracy on the held-out test set.
depths = [1, 2, 3]

for depth in depths:
    # fit() returns the fitted estimator, so construction and training chain.
    model = DecisionTreeClassifier(random_state=42, max_depth=depth).fit(X_train, y_train)

    # Predict on both partitions, then score each against its true labels.
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
    training_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Three report lines per depth, followed by a blank separator line.
    print(
        f"Max Depth: {depth}",
        f"Training Accuracy: {training_accuracy}",
        f"Test Accuracy: {test_accuracy}",
        sep="\n",
        end="\n\n",
    )

Max Depth: 1
Training Accuracy: 0.6476190476190476
Test Accuracy: 0.7111111111111111

Max Depth: 2
Training Accuracy: 0.9428571428571428
Test Accuracy: 0.9777777777777777

Max Depth: 3
Training Accuracy: 0.9523809523809523
Test Accuracy: 1.0



### Signs of Underfitting and Overfitting

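With Max Depth = 1, both the training accuracy (≈0.65) and the test accuracy (≈0.71) are relatively low. This is a sign of **underfitting**: a depth-1 tree makes a single split and therefore has only two leaves, so it can predict at most two of the three iris classes and cannot capture the underlying patterns in the data.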

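With Max Depth = 2 and Max Depth = 3, both the training and test accuracy are high (at least 0.94). This means that the model captures the underlying patterns well and generalizes to unseen data. Note that the test accuracy is slightly higher than the training accuracy here; with only 45 test samples, this is most likely an artifact of this particular split rather than a real effect.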

If the training accuracy were high but the test accuracy were noticeably lower, then the model would be **overfitted**, which is not the case for any of these depths.