In [110]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
# Load the Iris table from the CSV next to the notebook and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Iris.csv')
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [121]:
class ModifiedLabelEncoder(LabelEncoder):
    """Label encoder whose ``fit`` can encode labels supplied at construction time.

    Parameters
    ----------
    target : array-like of labels, optional
        Labels to encode. When given, ``fit`` encodes these and ignores the
        ``y`` it is called with. When left as ``None``, ``fit`` encodes the
        ``y`` argument instead, so the class can also be built without
        arguments.
    """

    def __init__(self, target=None):
        # Stored unmodified under the parameter's own name.
        self.target = target

    def fit(self, y, *args, **kwargs):
        """Fit the encoder and return the encoded labels as a single column.

        Parameters
        ----------
        y : array-like of labels
            Only used when no ``target`` was given to the constructor.
        *args, **kwargs
            Accepted and ignored.

        Returns
        -------
        The integer-encoded labels reshaped to one column (``reshape(-1, 1)``).

        NOTE(review): this returns the encoded labels rather than ``self``,
        which differs from the usual scikit-learn ``fit`` convention. It is
        kept as-is so existing callers of ``fit`` see the same result.
        """
        # Prefer the labels bound at construction; fall back to the call argument.
        labels = y if self.target is None else self.target
        return super().fit_transform(labels).reshape(-1, 1)
    


class ReturnValue:
    """Plain holder for the four parts of a train/test split."""

    def __init__(self, X_train, X_test, y_train, y_test):
        # Each argument is stored untouched under an attribute of the same name.
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test


class ModifiedTrainTestSplit():
    """Stratified train/test split of a stored table, exposed through ``fit``.

    Parameters
    ----------
    df : table indexed by column label (a pandas DataFrame in this notebook)
        Holds both the feature columns and the target column.
    train_features : column label(s)
        Used to select the feature columns from ``df``.
    target : column label
        Used to select the label column from ``df``.
    test_len : value forwarded to ``train_test_split`` as ``test_size``.
    """

    def __init__(self, df, train_features, target, test_len):
        self.test_len = test_len
        self.df = df
        self.target = target
        self.train_features = train_features

    def fit(self, features, labels, test_len_1=None):
        """Split the stored table into train and test parts.

        ``features``, ``labels`` and ``test_len_1`` are accepted but not used:
        the data and the test size always come from the values given to the
        constructor.

        Returns
        -------
        ReturnValue
            Carries ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
        """
        # Read from the frame handed to the constructor, not from a
        # notebook-level global that merely happens to share the name `df`.
        features = self.df[self.train_features]
        labels = self.df[self.target]

        # Stratify on the labels; the seed is fixed so the split is repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            stratify=labels,
            test_size=self.test_len,
            random_state=42,
        )

        return ReturnValue(X_train, X_test, y_train, y_test)

In [126]:
# Encoder bound to the raw species labels.
le = ModifiedLabelEncoder(target=df['Species'])

# Feature columns are all columns except 'Id', 'Species' and 'y_actual'.
split = ModifiedTrainTestSplit(
    df,
    train_features=df.columns.difference(['Id', 'Species', 'y_actual']).tolist(),
    target='y_actual',
    test_len=0.3,
)

In [129]:
# Pipeline with the label encoder as its only step.
# TODO: the ('train_test_split', split) step is still left out of the pipeline.
pipe = Pipeline(steps=[('LabelEncoder', le)])
x = pipe.fit(df)
# Display the fitted pipeline (the name `y` is not defined in this notebook).
x

Pipeline(memory=None,
         steps=[('LabelEncoder',
                 ModifiedLabelEncoder(target=0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object))],
         verbose=False)

In [None]:
# Second pipeline, created with an empty list of steps.
pipe1 = Pipeline([])

In [23]:
# Build the helpers from the classes defined in this notebook. Both take
# constructor arguments; the values mirror the ones used where `le` and
# `split` are first created.
split = ModifiedTrainTestSplit(
    df,
    train_features=list(df.columns.difference(['Id', 'Species', 'y_actual'])),
    target='y_actual',
    test_len=0.3,
)
le = ModifiedLabelEncoder(target=df['Species'])
dt = DecisionTreeClassifier()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,y_actual
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [16]:
# 3-fold cross-validation of a default decision tree, scored with macro F1
# and macro precision.
dt = DecisionTreeClassifier()
scores = cross_validate(
    dt,
    X_train,
    y_train,
    scoring=('f1_macro', 'precision_macro'),
    cv=3,
    return_train_score=True,
)

# Fit the same estimator on the whole training split; later cells use `model`.
model = dt.fit(X_train, y_train)

# Each call prints the heading and the per-fold test scores on separate lines.
print('Precision:', scores['test_precision_macro'], sep='\n')
print('=============================================')
print('F1-Score:', scores['test_f1_macro'], sep='\n')

Precision:
[0.93333333 1.         0.91111111]
F1-Score:
[0.91534392 1.         0.90890269]


In [17]:
# Search space sampled by the randomized hyper-parameter search.
distribution = dict(
    max_depth=sp_randInt(5, 10),
    max_features=sp_randInt(1, 4),
    criterion=['gini', 'entropy'],
    min_samples_split=sp_randInt(5, 10),
    min_samples_leaf=sp_randInt(1, 5),
)

# 3-fold randomized search with a fixed seed, run on the training split.
clf_randm = RandomizedSearchCV(
    estimator=model,
    param_distributions=distribution,
    cv=3,
    random_state=42,
)
randm_search = clf_randm.fit(X_train, y_train)
randm_search.best_params_



{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 9}

In [18]:
def _neighbourhood(best, lower=1, upper=None):
    """Return best-1, best, best+1 clipped to [lower, upper], sorted and de-duplicated."""
    clipped = set()
    for candidate in (best - 1, best, best + 1):
        candidate = max(lower, candidate)
        if upper is not None:
            candidate = min(upper, candidate)
        clipped.add(candidate)
    return sorted(clipped)


# Refine the randomized-search result with a small grid around each best value.
best = randm_search.best_params_
param_grid = {
    # Keep the split criterion chosen by the randomized search instead of
    # silently falling back to the estimator's default.
    'criterion': [best['criterion']],
    'max_depth': _neighbourhood(best['max_depth']),
    # Integer max_features must stay between 1 and the number of feature columns.
    'max_features': _neighbourhood(best['max_features'], upper=X_train.shape[1]),
    # A leaf needs at least one sample, so best-1 must not drop to 0.
    'min_samples_leaf': _neighbourhood(best['min_samples_leaf']),
    # A split needs at least two samples.
    'min_samples_split': _neighbourhood(best['min_samples_split'], lower=2),
}
grid_search = GridSearchCV(model, param_grid, n_jobs=-1)
clf = grid_search.fit(X_train, y_train)



In [21]:
# Test features together with the true and the predicted class of every row.
test_data = X_test.assign(y_actual=y_test, pred=clf.predict(X_test))

# Classification metrics as a table (transposed so each metric group is a row).
report = pd.DataFrame(
    classification_report(test_data['y_actual'], test_data['pred'], output_dict=True)
).T
report

Unnamed: 0,precision,recall,f1-score,support
0,1.0,1.0,1.0,15.0
1,0.933333,0.933333,0.933333,15.0
2,0.933333,0.933333,0.933333,15.0
accuracy,0.955556,0.955556,0.955556,0.955556
macro avg,0.955556,0.955556,0.955556,45.0
weighted avg,0.955556,0.955556,0.955556,45.0
