<a href="https://colab.research.google.com/github/pranavsrinivas29/hyperparameter_opt/blob/main/Decision_tree_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Decision Tree for classification

Without Hyperparameter Optimization

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_text

In [None]:
# Pull the Titanic passenger table that ships with seaborn.
titanic = sns.load_dataset("titanic")

# Discard passengers (rows) whose age or embarkation port is missing, then keep
# only the predictors used below together with the "survived" target.
titanic = titanic.dropna(subset=["age", "embarked"])[
    ["sex", "class", "age", "sibsp", "parch", "fare", "embarked", "survived"]
]

# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each variable so the dummy columns are not redundant.
titanic_encoded = pd.get_dummies(titanic, columns=["sex", "class", "embarked"], drop_first=True)

# Separate the target from the feature matrix.
y = titanic_encoded["survived"]
X = titanic_encoded.drop(columns=["survived"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Baseline model: a decision tree left at its default hyperparameters (only the
# seed is fixed), trained on the training split. fit() returns the estimator
# itself, so construction and training are chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = clf.predict(X_test)

In [None]:
# Score the baseline predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", class_report)




Accuracy: 0.73
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77       122
           1       0.70      0.64      0.67        92

    accuracy                           0.73       214
   macro avg       0.72      0.72      0.72       214
weighted avg       0.73      0.73      0.73       214



With Randomized-Search-Based Hyperparameter Optimization

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for each decision-tree hyperparameter explored by the search.
#
# NOTE: "auto" is deliberately absent from max_features. For
# DecisionTreeClassifier it was only an alias of "sqrt"; it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it fails parameter
# validation. None (consider every feature, the estimator's default) takes its
# place so the baseline setting is part of the search space.
param_dist_random = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 5, 10, 15, 20],  # None lets the tree grow until leaves are pure
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"],
}

# Sample 50 parameter combinations at random and score each with 5-fold
# cross-validation on the training data. The search clones `clf`, so the
# already-fitted baseline tree is left untouched. random_state makes the
# sampled candidates repeatable; n_jobs=-1 uses all available cores.
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist_random, n_iter=50, cv=5, n_jobs=-1, random_state=42
)

# Fit the randomized search to the training data
random_search.fit(X_train, y_train)

# Best estimator found (refit on the full training split by the search)
best_clf_random = random_search.best_estimator_

# Make predictions on the test data using the best model
y_pred_random = best_clf_random.predict(X_test)

In [None]:
# Score the randomized-search model's predictions against the held-out labels.
accuracy_random = accuracy_score(y_true=y_test, y_pred=y_pred_random)
class_report = classification_report(y_true=y_test, y_pred=y_pred_random)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy_random:.2f}")
print("Classification Report:\n", class_report)


Accuracy: 0.77
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.93      0.82       122
           1       0.85      0.55      0.67        92

    accuracy                           0.77       214
   macro avg       0.79      0.74      0.74       214
weighted avg       0.78      0.77      0.76       214



With Grid-Search-Based Hyperparameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search: every combination in param_dist_random (the same space
# used for the randomized search) is scored with 5-fold cross-validation on
# the training data. fit() returns the search object, so construction and
# fitting are chained.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_dist_random,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Best estimator found by the grid search.
best_clf_grid = grid_search.best_estimator_

# Predicted labels for the held-out test rows.
y_pred_grid = best_clf_grid.predict(X_test)



In [None]:
# Score the grid-search model's predictions against the held-out labels.
accuracy_grid = accuracy_score(y_true=y_test, y_pred=y_pred_grid)
class_report = classification_report(y_true=y_test, y_pred=y_pred_grid)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy_grid:.2f}")
print("Classification Report:\n", class_report)

Accuracy: 0.79
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.90      0.83       122
           1       0.83      0.64      0.72        92

    accuracy                           0.79       214
   macro avg       0.80      0.77      0.78       214
weighted avg       0.80      0.79      0.78       214



Bayesian Search

In [None]:
from skopt import BayesSearchCV


# Bayesian optimisation over the same search space as the other searches.
# random_state is fixed so the sequence of sampled candidates (and therefore
# the selected model and reported scores) is repeatable across runs, matching
# the seeded train/test split, baseline tree and randomized search.
bayes_search = BayesSearchCV(
    clf,
    search_spaces=param_dist_random,
    n_iter=50,  # Number of iterations/evaluations
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available cores
    random_state=42,
)

In [None]:
# Run the Bayesian search on the training split.
bayes_search.fit(X_train, y_train)

# Best estimator found by the search, and its predictions on the held-out rows.
best_clf_bayesian = bayes_search.best_estimator_
y_pred_bayesian = best_clf_bayesian.predict(X_test)

# Score those predictions against the held-out labels.
accuracy_bayesian = accuracy_score(y_true=y_test, y_pred=y_pred_bayesian)
class_report = classification_report(y_true=y_test, y_pred=y_pred_bayesian)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy_bayesian:.2f}")
print("Classification Report:\n", class_report)




Accuracy: 0.76
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.81      0.80       122
           1       0.74      0.70      0.72        92

    accuracy                           0.76       214
   macro avg       0.76      0.75      0.76       214
weighted avg       0.76      0.76      0.76       214





In [None]:
# Report the winning hyperparameters of each search, one line per strategy,
# in the order the searches were run above.
for _label, _search in (
    ("Best Hyperparameters (Random Search):", random_search),
    ("Best Hyperparameters (Grid Based Search):", grid_search),
    ("Best Hyperparameters (Bayesian Search):", bayes_search),
):
    print(_label, _search.best_params_)

Best Hyperparameters (Random Search): {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5, 'criterion': 'entropy'}
Best Hyperparameters (Grid Based Search): {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Hyperparameters (Bayesian Search): OrderedDict([('criterion', 'entropy'), ('max_depth', 10), ('max_features', 'auto'), ('min_samples_leaf', 2), ('min_samples_split', 5)])
