In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [4]:
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42
)

### Method 1: Evaluate the model using train, test split and tune parameters by trail and error

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(criterion="gini", max_depth=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       130
           1       0.80      0.83      0.82       120

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



Method 2: Evaluate the model using Cross_Val_Score

In [10]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=5), X, y, cv=5)
scores

array([0.775, 0.795, 0.745, 0.805, 0.77 ])

In [11]:
scores = cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=10), X, y, cv=5)
scores

array([0.755, 0.725, 0.805, 0.795, 0.815])

In [12]:
scores = cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=5), X, y, cv=5)
scores

array([0.765, 0.78 , 0.75 , 0.815, 0.78 ])

In [13]:
scores = cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=10), X, y, cv=5)
scores

array([0.775, 0.79 , 0.815, 0.775, 0.78 ])

In [15]:
criterion = ["entropy", "gini"]
max_depth = [5, 10, 15]

arg_scores = {}

for c in criterion:
    for d in max_depth:
        scores_list = cross_val_score(DecisionTreeClassifier(criterion=c, max_depth=d), X, y, cv=5)
        arg_scores[c + "_" + str(d)] = np.average(scores_list)
        # print(c, d, np.average(scores_list))

arg_scores

{'entropy_5': 0.78,
 'entropy_10': 0.7869999999999999,
 'entropy_15': 0.817,
 'gini_5': 0.781,
 'gini_10': 0.78,
 'gini_15': 0.79}

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 15]
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, return_train_score=False)
grid_search.fit(X, y)

grid_search.cv_results_

{'mean_fit_time': array([0.00700102, 0.00946598, 0.0100141 , 0.00840592, 0.01307178,
        0.01289182]),
 'std_fit_time': array([0.00049256, 0.00051904, 0.00077197, 0.00011646, 0.00202164,
        0.0004453 ]),
 'mean_score_time': array([0.00092931, 0.00094981, 0.00108047, 0.00100603, 0.00126171,
        0.00106287]),
 'std_score_time': array([4.37315570e-05, 1.20936282e-05, 1.34893633e-04, 1.54067894e-04,
        3.02699631e-04, 4.33132481e-05]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': 'entropy',

In [18]:
df = pd.DataFrame(grid_search.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009862,0.001206,0.001772,0.000108,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.785,0.745,0.795,0.77,0.775,0.017029,6
1,0.016552,0.005072,0.003169,0.00289,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.785,0.735,0.805,0.78,0.81,0.783,0.026571,4
2,0.024941,0.007392,0.002853,0.002236,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.795,0.73,0.825,0.82,0.825,0.799,0.036249,2
3,0.01414,0.00245,0.002441,0.001407,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.78,0.75,0.81,0.78,0.777,0.0199,5
4,0.017241,0.000523,0.0022,0.000741,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.77,0.805,0.815,0.785,0.785,0.792,0.016,3
5,0.020005,0.002934,0.001734,7.4e-05,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.76,0.79,0.825,0.79,0.87,0.807,0.03763,1


In [19]:
df[["param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.775
1,gini,10,0.783
2,gini,15,0.799
3,entropy,5,0.777
4,entropy,10,0.792
5,entropy,15,0.807


In [20]:
grid_search.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [21]:
grid_search.best_estimator_

In [24]:
from sklearn import svm

model_params = {
    "decision_tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "criterion": ["gini", "entropy"],
            "max_depth": [5, 10, 15]
        }
    },
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {
            "C": [1, 10, 20],
            "kernel": ["rbf", "linear"]
        }
    }
}

scores = []

for model_name, mp in model_params.items():
    grid_search = GridSearchCV(mp["model"], mp["params"], cv=5, return_train_score=False)
    grid_search.fit(X, y)
    scores.append({
        "model": model_name,
        "best_score": grid_search.best_score_,
        "best_params": grid_search.best_params_
    })

scores

[{'model': 'decision_tree',
  'best_score': 0.8100000000000002,
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': 0.9260000000000002,
  'best_params': {'C': 1, 'kernel': 'rbf'}}]

In [25]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.81,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
