In [85]:
import pandas as pd
import pydotplus as pdp
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [11]:
# Load the Iris CSV. The file carries no header row, so the column names are
# supplied explicitly here.
# NOTE(review): 'length' and 'width' presumably hold the sepal measurements - confirm.
column_names = ['length', 'width', 'petal_length', 'petal_width', 'class']
data = pd.read_csv("iris_dataset.csv", names=column_names)
print(data.head())

   length  width  petal_length  petal_width        class
0     5.1    3.5           1.4          0.2  Iris-setosa
1     4.9    3.0           1.4          0.2  Iris-setosa
2     4.7    3.2           1.3          0.2  Iris-setosa
3     4.6    3.1           1.5          0.2  Iris-setosa
4     5.0    3.6           1.4          0.2  Iris-setosa


In [12]:
# Count missing entries per column; every count comes back 0, so nothing
# needs to be imputed or dropped.
missing_per_column = data.isna().sum()
print(missing_per_column)

length          0
width           0
petal_length    0
petal_width     0
class           0
dtype: int64


In [13]:
# Sanity check on the dataset size, shown as (rows, columns).
data.shape

(150, 5)

In [14]:
# Display the frame; pandas abbreviates the middle rows with "..." in the output.
data

Unnamed: 0,length,width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [15]:
# Separate the species label (target) from the four measurement columns (features).
target_column = "class"
y = data[target_column]
X = data.drop(target_column, axis=1)

In [17]:
# Confirm the feature matrix and the label vector cover the same number of rows.
for part in (X, y):
    print(part.shape)

(150, 4)
(150,)


In [38]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): no stratify= argument is passed, so the class proportions of the
# two splits are not forced to match the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=32)

In [39]:
# Report the size of each split: training features/labels first, then test features/labels.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(105, 4)
(105,)
(45, 4)
(45,)


#### Preprocessing

In [40]:
# Learn the scaling statistics from the training split only, then apply those
# same statistics to both splits so the test rows never influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
# Inspect the standardized training features (displayed as a NumPy array).
X_train_scaled

array([[ 2.16472815e+00,  2.03891415e+00,  1.63135491e+00,
         1.26564316e+00],
       [ 2.43574800e-01, -5.46665389e-01,  4.65308175e-01,
        -6.99601745e-02],
       [ 3.63646884e-01, -2.88107435e-01,  2.32098827e-01,
         6.36001586e-02],
       [ 2.43574800e-01, -2.88107435e-01,  4.65308175e-01,
         1.97160492e-01],
       [-1.07721813e+00,  7.46124382e-01, -1.45866894e+00,
        -1.40556351e+00],
       [ 4.83718969e-01, -1.32233925e+00,  5.81912849e-01,
         3.30720825e-01],
       [ 2.43574800e-01, -2.95494805e-02,  4.07005838e-01,
         1.97160492e-01],
       [-8.37073960e-01,  1.00468234e+00, -1.45866894e+00,
        -1.40556351e+00],
       [-1.19729021e+00, -1.32233925e+00,  3.48703501e-01,
         5.97841491e-01],
       [-1.19729021e+00, -1.58089721e+00, -3.50924542e-01,
        -3.37080841e-01],
       [-1.19729021e+00,  2.29008474e-01, -1.40036661e+00,
        -1.53912384e+00],
       [-1.43743438e+00,  4.87566428e-01, -1.51697128e+00,
      

#### Decision Tree Classification

In [42]:
# Baseline model: a decision tree with a fixed seed and otherwise default
# settings, trained on the standardized training split.
clf = DecisionTreeClassifier(random_state=32)
clf.fit(X=X_train_scaled, y=y_train)

In [43]:
# Predict species labels for the held-out test rows with the baseline tree.
y_pred = clf.predict(X_test_scaled)

In [45]:
# Show the predicted label for each of the 45 test samples.
y_pred

array(['Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-setosa'], dtype=object)

In [44]:
# Fraction of test rows the baseline tree labels correctly, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percent = accuracy*100
print("Accuracy:", accuracy_percent, "%")

Accuracy: 95.55555555555556 %


In [87]:
# Per-class precision / recall / F1 for the baseline (untuned) tree on the test split.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        19
Iris-versicolor       1.00      0.83      0.91        12
 Iris-virginica       0.88      1.00      0.93        14

       accuracy                           0.96        45
      macro avg       0.96      0.94      0.95        45
   weighted avg       0.96      0.96      0.96        45



In [58]:
# Render the fitted baseline tree to "decision_tree.png" via Graphviz.
# NOTE: clf was trained on the standardized features, so the split thresholds
# drawn in the image are in standardized units, not the raw measurement units.
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a .dot file
    # Plain lists are passed because some scikit-learn releases reject a pandas
    # Index / NumPy array for these arguments during parameter validation.
    feature_names=list(X.columns),
    class_names=list(clf.classes_),  # label each node with its majority species
)
graph = pdp.graph_from_dot_data(dot_data)
# Last expression on purpose: write_png's return value (True) is the cell output.
graph.write_png("decision_tree.png")

True

### Hyperparameter Tuning

In [70]:
# Search space for the decision tree: split-quality criterion crossed with
# maximum depth (2 x 3 = 6 candidate settings).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7],
)


In [78]:
# Try every combination in param_grid with 3-fold cross-validation, using the
# training split only so the test rows stay untouched for the final evaluation.
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X=X_train_scaled, y=y_train)

In [79]:
# Pull out the winning parameter combination and the cross-validation score it achieved.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_score)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 3}
Best Accuracy: 0.9333333333333332


In [90]:
# Score the best model found by the grid search on the held-out test split.
best_estimator = grid_search.best_estimator_
y_pred_tuned = best_estimator.predict(X=X_test_scaled)


In [91]:
# Per-class precision / recall / F1 for the tuned tree, for comparison with the baseline report.
classification_report_tuned = classification_report(y_true=y_test, y_pred=y_pred_tuned)
print(classification_report_tuned)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        19
Iris-versicolor       0.92      1.00      0.96        12
 Iris-virginica       1.00      0.93      0.96        14

       accuracy                           0.98        45
      macro avg       0.97      0.98      0.97        45
   weighted avg       0.98      0.98      0.98        45

