In [1]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.datasets import make_classification

# Synthetic classification dataset: 10 features, of which 5 are informative
# and 2 are redundant. flip_y injects label noise, and the fixed random_state
# keeps the generated data reproducible across runs.
X, y = make_classification(
    n_samples=5000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=2,
    class_sep=1.1,
    flip_y=0.09,
    random_state=4184,
)

In [3]:
# Sanity-check the dimensions of the generated feature matrix.
X.shape

(5000, 10)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Shape of the training features after the split.
X_train.shape

(4000, 10)

In [6]:
# Shape of the training labels after the split.
y_train.shape

(4000,)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with hand-picked hyperparameters: entropy splits, limited depth.
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_split=10,
    random_state=42,
)

In [8]:
# Echo the estimator's repr to inspect its (still unfitted) configuration.
clf

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [9]:
# Fit the tree on the training split; fit() returns the estimator, so its repr is displayed.
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [10]:
from sklearn.metrics import confusion_matrix, f1_score

# Evaluate the fitted tree on the held-out test split.
y_pred = clf.predict(X_test)
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[432  63]
 [189 316]]


In [13]:
# F1 of the positive class (label 1), which is f1_score's default for binary targets.
print("F1 score = {:.2f}".format(f1_score(y_test, y_pred)))

F1 score = 0.71


In [17]:
from sklearn.model_selection import GridSearchCV

base_clf = DecisionTreeClassifier(splitter="best", class_weight=None, random_state=42)
parameters_grid = {'criterion': ('gini', 'entropy'),
                  'max_depth': [2, 10, 50, 100, 200],
                  'max_leaf_nodes': [None, 5, 10, 50, 100],
                  'min_impurity_decrease': [0, 0.1, 0.2],
                  'min_samples_leaf': [1, 10, 50],
                  'min_samples_split': [2, 10, 50]}
clf_gs = GridSearchCV(base_clf, param_grid=parameters_grid, scoring="f1", n_jobs=3, cv=5, return_train_score=True)
%time clf_gs.fit(X_train, y_train)

Wall time: 32.7 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=3,
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [2, 10, 50, 100, 200],
                     

In [18]:
# Hyperparameter combination with the best mean cross-validated score.
clf_gs.best_params_

{'criterion': 'entropy',
 'max_depth': 50,
 'max_leaf_nodes': 100,
 'min_impurity_decrease': 0,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

In [19]:
# Predict with the search's best estimator (GridSearchCV refits it on the
# whole training set by default) and score it on the held-out test split.
y_pred_gs = clf_gs.predict(X_test)
print(confusion_matrix(y_test, y_pred_gs))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_gs)))

[[405  90]
 [111 394]]
F1 Score = 0.80


In [20]:
def cv_results_to_df(cv_results):
    """Summarise a search's ``cv_results_`` as a DataFrame ranked by test score.

    Parameters
    ----------
    cv_results : mapping
        The ``cv_results_`` attribute of a fitted ``GridSearchCV`` or
        ``RandomizedSearchCV`` (or any mapping exposing the same keys).

    Returns
    -------
    pandas.DataFrame
        One row per candidate. The first columns are the candidate's
        hyperparameters, followed by the fit/score timings and the score
        summaries. Rows are sorted by ``mean_test_score`` in descending
        order and keep the candidate's original position as the index.

    Raises
    ------
    KeyError
        If ``params``, a timing column or a test-score column is missing.

    Notes
    -----
    ``mean_train_score`` and ``std_train_score`` are only present when the
    search was run with ``return_train_score=True``; they are skipped when
    absent instead of raising.
    """
    # Train-score summaries are the only columns a search may legitimately omit.
    optional_columns = ('mean_train_score', 'std_train_score')
    summary_columns = (
        'mean_fit_time',
        'mean_score_time',
        'mean_train_score',
        'std_train_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score',
    )

    results = pd.DataFrame(list(cv_results['params']))
    for column in summary_columns:
        if column in optional_columns and column not in cv_results:
            continue
        results[column] = cv_results[column]

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

In [21]:
# Tabulate every grid-search candidate, best mean test score first.
results_gs = cv_results_to_df(clf_gs.cv_results_)
results_gs

Unnamed: 0,criterion,max_depth,max_leaf_nodes,min_impurity_decrease,min_samples_leaf,min_samples_split,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
1326,entropy,200,100.0,0.0,10,2,0.038381,0.001000,0.879076,0.004364,0.815684,0.008439,1
1327,entropy,200,100.0,0.0,10,10,0.042800,0.001292,0.879076,0.004364,0.815684,0.008439,1
1057,entropy,50,100.0,0.0,10,10,0.040398,0.001000,0.879076,0.004364,0.815684,0.008439,1
1056,entropy,50,100.0,0.0,10,2,0.040500,0.001200,0.879076,0.004364,0.815684,0.008439,1
1191,entropy,100,100.0,0.0,10,2,0.042600,0.001201,0.879076,0.004364,0.815684,0.008439,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,entropy,10,5.0,0.0,1,2,0.020499,0.001200,0.678763,0.008997,0.667952,0.029160,1315
838,entropy,10,5.0,0.0,1,10,0.020998,0.001202,0.678763,0.008997,0.667952,0.029160,1315
839,entropy,10,5.0,0.0,1,50,0.021900,0.001000,0.678763,0.008997,0.667952,0.029160,1315
840,entropy,10,5.0,0.0,10,2,0.019999,0.001101,0.678763,0.008997,0.667952,0.029160,1315


In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

parameters_dist = {
              'criterion': ["entropy", "gini", "gini"],
              'max_depth': randint(2, 200),
              'max_features': [None, "auto"],
              'max_leaf_nodes': randint(5, 500),
              'min_impurity_decrease': uniform(0.0, 0.5),
              'min_samples_leaf': randint(2, 50),
              'min_samples_split': randint(2, 50)}

clf_rs = RandomizedSearchCV(base_clf, param_distributions=parameters_dist, cv=5, n_jobs=3, scoring="f1", return_train_score=True, n_iter=5000)

%time clf_rs.fit(X_train, y_train)

Wall time: 1min


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=42,
                                                    splitter='best'),


In [23]:
# Predict with the randomized search's best estimator (refit on the whole
# training set by default) and score it on the held-out test split.
y_pred_rs = clf_rs.predict(X_test)
print(confusion_matrix(y_test, y_pred_rs))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_rs)))

[[410  85]
 [ 93 412]]
F1 Score = 0.82


In [24]:
# Tabulate every randomized-search candidate, best mean test score first.
results_rs = cv_results_to_df(clf_rs.cv_results_)
results_rs

Unnamed: 0,criterion,max_depth,max_features,max_leaf_nodes,min_impurity_decrease,min_samples_leaf,min_samples_split,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
3073,gini,75,,75,0.001162,2,3,0.028106,0.001501,0.864988,0.007504,0.801445,0.013550,1
1641,entropy,113,,373,0.001492,20,34,0.037483,0.001116,0.850497,0.003251,0.795571,0.008642,2
1684,entropy,31,,347,0.002441,27,24,0.028614,0.001007,0.828335,0.005931,0.792448,0.014432,3
4266,gini,15,,472,0.000939,32,15,0.019216,0.001100,0.828934,0.007141,0.792255,0.008882,4
738,gini,86,,246,0.000309,35,41,0.019399,0.001194,0.824625,0.006918,0.787084,0.014087,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
958,entropy,181,,451,0.081685,48,36,0.011509,0.001101,0.544182,0.034021,0.525858,0.025312,4980
1136,entropy,134,,33,0.085046,46,32,0.011411,0.001089,0.544182,0.034021,0.525858,0.025312,4980
3065,entropy,62,,177,0.076913,37,20,0.012608,0.001092,0.544182,0.034021,0.525858,0.025312,4980
4771,gini,2,auto,438,0.010581,41,28,0.002600,0.001004,0.528540,0.039431,0.515660,0.053271,4999
