In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [34]:
df = pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [35]:
X = df[df.columns.difference(['Id','Species'])]
y = LabelEncoder().fit_transform(df['Species'])

X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                   test_size=0.3,
                                                   stratify=y)

In [36]:
nm = StandardScaler()
dt = DecisionTreeClassifier()
pipe = Pipeline(steps=[('normalizer', nm), ('classifier', dt)]) 
pipe.steps

[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('classifier',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort=False,
                         random_state=None, splitter='best'))]

In [38]:
dt = DecisionTreeClassifier()
scores = cross_validate(pipe, X_train, Y_train, cv=3,
                        scoring=('f1_macro', 'precision_macro'),
                        return_train_score=True)
model = pipe.fit(X_train, Y_train)
print('Precision:')
print(scores['test_precision_macro'])
print('=============================================')
print('F1-Score:')
print(scores['test_f1_macro'])

Precision:
[0.94444444 0.88888889 0.86309524]
F1-Score:
[0.94444444 0.88888889 0.84561404]


In [46]:
distribution = {
    'classifier__max_depth': sp_randInt(5, 10),
    'classifier__max_features': sp_randInt(1,4),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__min_samples_split': sp_randInt(5,10),
    'classifier__min_samples_leaf': sp_randInt(1,5)
}
cv_randm = RandomizedSearchCV(pipe, distribution, cv=3, random_state=42)
randm_search = cv_randm.fit(X_train, Y_train)



{'classifier__criterion': 'gini',
 'classifier__max_depth': 9,
 'classifier__max_features': 1,
 'classifier__min_samples_leaf': 3,
 'classifier__min_samples_split': 6}

In [63]:
param_grid = {
   'classifier__max_depth': [randm_search.best_params_['classifier__max_depth']-1, randm_search.best_params_['classifier__max_depth'], 
                 randm_search.best_params_['classifier__max_depth']+1],
    'classifier__min_samples_leaf': [randm_search.best_params_['classifier__min_samples_leaf']-1, randm_search.best_params_['classifier__min_samples_leaf'],
                        randm_search.best_params_['classifier__min_samples_leaf']+1],
    'classifier__min_samples_split':[randm_search.best_params_['classifier__min_samples_split']-1, randm_search.best_params_['classifier__min_samples_split'],
                        randm_search.best_params_['classifier__min_samples_split']+1],
}
cv_grid = GridSearchCV(pipe, param_grid, n_jobs=-1)
grid_clf = cv_grid.fit(X_train, Y_train)



In [64]:
test_data = X_test.copy()
test_data['y_actual'] = Y_test

test_data['pred'] = grid_clf.predict(X_test)
report = pd.DataFrame(classification_report(test_data['y_actual'], test_data['pred'], output_dict = True)).transpose()
display(report)

Unnamed: 0,precision,recall,f1-score,support
0,1.0,1.0,1.0,15.0
1,1.0,0.933333,0.965517,15.0
2,0.9375,1.0,0.967742,15.0
accuracy,0.977778,0.977778,0.977778,0.977778
macro avg,0.979167,0.977778,0.977753,45.0
weighted avg,0.979167,0.977778,0.977753,45.0
