In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Read the raw heart-attack dataset from the sibling data/ directory and
# preview its first rows.
df = pd.read_csv('../data/heart_attack_prediction_dataset.csv')
df.head()

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Per-column dtypes, to spot columns that were read as text.
df.dtypes

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Normalise the headers to snake_case: lower-case every name and swap
# spaces for underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_", regex=False)


In [None]:
# Split the "systolic/diastolic" strings in blood_pressure into two columns.
# One vectorised split replaces two row-wise apply(lambda ...) passes.
# The pieces are still strings at this point; a later cell casts them to int.
pressure_parts = df['blood_pressure'].str.split("/", expand=True)

df['systolic_pressure'] = pressure_parts[0]   # first number
df['diastolic_pressure'] = pressure_parts[1]  # second number
df = df.drop(columns='blood_pressure')

df

In [None]:
# Remove the patient_id column. Rebinding `df` (rather than mutating in place)
# matches how the other drop cells in this notebook are written.
df = df.drop(columns='patient_id')
df

In [None]:
# Remove the continent and hemisphere columns from the working frame.
df = df.drop(columns=['continent','hemisphere'])

In [None]:
# Cast the target and both blood-pressure columns to integers in one pass.
# The pressure columns come from a string split, so they start out as text.

df = df.astype({
    'heart_attack_risk': int,
    'systolic_pressure': int,
    'diastolic_pressure': int,
})
df.dtypes

### EDA

In [None]:
# Quick EDA: pairplot of column positions 0-4, coloured by the target.
# .assign() returns a new frame, so the target is never written into a slice
# of `df` (the original pattern can raise pandas' SettingWithCopyWarning).

subset1 = df.iloc[:, 0:5].assign(heart_attack_risk=df['heart_attack_risk'])

sns.pairplot(subset1, hue='heart_attack_risk', height=2.5)
plt.show()

In [None]:
# Pairplot of column positions 6-10, coloured by the target.
# .assign() builds a new frame instead of writing into a slice of `df`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 0:5 slice used for subset1 and this 6:11 slice leave
# column position 5 unplotted - confirm that is intentional.
subset2 = df.iloc[:, 6:11].assign(heart_attack_risk=df['heart_attack_risk'])

sns.pairplot(subset2, hue='heart_attack_risk', height=2.5)
plt.show()

In [None]:
# Pairplot of column positions 12-16, coloured by the target.
# .assign() builds a new frame instead of writing into a slice of `df`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 6:11 slice used for subset2 and this 12:17 slice leave
# column position 11 unplotted - confirm that is intentional.
subset3 = df.iloc[:, 12:17].assign(heart_attack_risk=df['heart_attack_risk'])

sns.pairplot(subset3, hue='heart_attack_risk', height=2.5)
plt.show()

In [None]:
# Pairplot of column positions 18-21, coloured by the target.
# .assign() builds a new frame instead of writing into a slice of `df`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 12:17 slice used for subset3 and this 18:22 slice leave
# column position 17 unplotted - confirm that is intentional.
subset4 = df.iloc[:, 18:22].assign(heart_attack_risk=df['heart_attack_risk'])

sns.pairplot(subset4, hue='heart_attack_risk', height=2.5)
plt.show()

In [None]:
# Pairplot of column positions 23-25, coloured by the target.
# .assign() builds a new frame instead of writing into a slice of `df`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 18:22 slice used for subset4 and this 23:26 slice leave
# column position 22 unplotted - confirm that is intentional.
subset5 = df.iloc[:, 23:26].assign(heart_attack_risk=df['heart_attack_risk'])

sns.pairplot(subset5, hue='heart_attack_risk', height=2.5)
plt.show()

In [None]:
# Heatmap of absolute pairwise correlations between the numeric columns.
numerical_columns = df.select_dtypes(include='number')

corr = numerical_columns.corr().abs()  # corr(x, y) == corr(y, x) and corr(x, x) == 1

# Mask the diagonal and upper triangle: they only repeat the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Draw on an explicitly created axes so the figure size is guaranteed to apply.
# (The previously built diverging colormap was never passed to the heatmap,
# so it has been dropped; the default colormap is used exactly as before.)
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, mask=mask, vmax=1, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, annot=True, ax=ax)

plt.show()

### Balance Check

In [None]:
# Bar chart of how many rows fall in each target class, to check class balance.
heart_attack_risk = df['heart_attack_risk'].value_counts()
heart_attack_risk.plot(kind='bar')
plt.show()

### Oversampling

In [None]:
from sklearn.utils import resample

# Split the rows by class so each group can be resampled on its own.
risk = df[df['heart_attack_risk'] == 1]
no_risk = df[df['heart_attack_risk'] == 0]

# Report the class sizes. A bare expression in the middle of a cell is never
# displayed, so print it explicitly.
print(f"risk rows: {len(risk)}, no-risk rows: {len(no_risk)}")

# Draw risk rows with replacement until the group is as large as no_risk.
# NOTE(review): this resampling happens before the train/test split further
# down, so copies of the same row can end up in both sets and inflate the
# test scores - consider splitting first and resampling only the training part.
yes_oversampled = resample(risk, replace=True, n_samples=len(no_risk), random_state=0)

over_sampling = pd.concat([yes_oversampled, no_risk])

# Visual check that both classes now have the same number of rows.
heart_attack_risk_plt = over_sampling['heart_attack_risk'].value_counts()
heart_attack_risk_plt.plot(kind='bar')
plt.show()

### Undersampling

In [None]:
# Draw no_risk rows without replacement until the group is as small as risk.
# (replace=False requires len(risk) <= len(no_risk).)
no_heart_attack_undersampled = resample(no_risk,
                                    replace=False,
                                    n_samples = len(risk),
                                    random_state=0)

under_sampling = pd.concat([no_heart_attack_undersampled, risk])

# Keep the class counts in a normal variable. The original assigned them to a
# `.plt` attribute on the DataFrame, which pandas warns about and which is
# easily mistaken for a column.
no_heart_attack_undersampled_plt = under_sampling['heart_attack_risk'].value_counts()
no_heart_attack_undersampled_plt.plot(kind='bar')
plt.show()

### Normalization and Transformation

In [None]:
#NUMERICAL TRANSFORMATION
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Give the oversampled frame a fresh 0..n-1 index so the two transformed
# frames built below line up row by row when concatenated.
over_sampling = over_sampling.reset_index(drop=True)

categorical_cols = ['diabetes', 'family_history', 'smoking', 'obesity',
                    'alcohol_consumption', 'diet', 'previous_heart_problems',
                    'medication_use', 'sex', 'country']
numerical_cols = ["age", "cholesterol", "heart_rate", "exercise_hours_per_week",
                  "stress_level", "sedentary_hours_per_day", "income", "bmi",
                  "triglycerides", "physical_activity_days_per_week",
                  "systolic_pressure", "diastolic_pressure"]

# One-hot encode the categorical columns.
df_categorical_columns = over_sampling[categorical_cols]
ohe = OneHotEncoder(sparse_output=False)
cat_trans_np = ohe.fit_transform(df_categorical_columns)
cat_df = pd.DataFrame(cat_trans_np, columns=ohe.get_feature_names_out())

# Min-max scale the numerical columns.
df_numerical_columns = over_sampling[numerical_cols]
normalizer = MinMaxScaler()
num_trans_np = normalizer.fit_transform(df_numerical_columns)
num_df = pd.DataFrame(num_trans_np, columns=df_numerical_columns.columns, index=over_sampling.index)

# Scaled numerics + encoded categoricals + untouched target.
df_norm_over = pd.concat([num_df, cat_df], axis=1)
df_norm_over["heart_attack_risk"] = over_sampling["heart_attack_risk"]
df_norm_over

In [None]:
#NUMERICAL TRANSFORMATION
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Give the undersampled frame a fresh 0..n-1 index so the two transformed
# frames built below line up row by row when concatenated.
under_sampling = under_sampling.reset_index(drop=True)

categorical_cols = ['diabetes', 'family_history', 'smoking', 'obesity',
                    'alcohol_consumption', 'diet', 'previous_heart_problems',
                    'medication_use', 'sex', 'country']
numerical_cols = ["age", "cholesterol", "heart_rate", "exercise_hours_per_week",
                  "stress_level", "sedentary_hours_per_day", "income", "bmi",
                  "triglycerides", "physical_activity_days_per_week",
                  "systolic_pressure", "diastolic_pressure"]

# One-hot encode the categorical columns.
df_categorical_columns = under_sampling[categorical_cols]
ohe = OneHotEncoder(sparse_output=False)
cat_trans_np = ohe.fit_transform(df_categorical_columns)
cat_df = pd.DataFrame(cat_trans_np, columns=ohe.get_feature_names_out())

# Min-max scale the numerical columns.
df_numerical_columns = under_sampling[numerical_cols]
normalizer = MinMaxScaler()
num_trans_np = normalizer.fit_transform(df_numerical_columns)
num_df = pd.DataFrame(num_trans_np, columns=df_numerical_columns.columns, index=under_sampling.index)

# Scaled numerics + encoded categoricals + untouched target. The index was
# already reset at the top of the cell, so no second reset is needed here.
df_norm_under = pd.concat([num_df, cat_df], axis=1)
df_norm_under["heart_attack_risk"] = under_sampling["heart_attack_risk"]
df_norm_under

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target in the oversampled, transformed frame.
features = df_norm_over.drop(columns = ['heart_attack_risk'])
target = df_norm_over['heart_attack_risk']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(features, target, test_size = 0.20, random_state=0)

In [None]:
# Class counts in the oversampled, transformed frame (before splitting).
heart_attack_risk_over = df_norm_over['heart_attack_risk'].value_counts()
heart_attack_risk_over.plot(kind='bar')
plt.show()

In [None]:
# Separate the predictors from the target in the undersampled, transformed frame.
# `features` and `target` are rebound here, replacing the oversampled versions.
features = df_norm_under.drop(columns = ['heart_attack_risk'])
target = df_norm_under['heart_attack_risk']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(features, target, test_size = 0.20, random_state=0)

# Class counts in the undersampled, transformed frame (before splitting).
heart_attack_risk_under = df_norm_under['heart_attack_risk'].value_counts()
heart_attack_risk_under.plot(kind='bar')
plt.show()

### KNNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the oversampled training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_over, y_train_over)
# `pred` is not referenced again in this cell; the accuracy below comes from knn.score.
pred = knn.predict(X_test_over)


print(f"The accuracy of the model is {knn.score(X_test_over, y_test_over)*100: .2f}%")

In [None]:
# 5-nearest-neighbours classifier trained on the undersampled training split.
# `knn` and `pred` are rebound here, replacing the oversampled versions.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_under, y_train_under)
# `pred` is not referenced again in this cell; the accuracy below comes from knn.score.
pred = knn.predict(X_test_under)


print(f"The accuracy of the model is {knn.score(X_test_under, y_test_under)*100: .2f}%")

## Bagging

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score

# Bagging ensemble of 100 depth-20 decision trees, each fitted on 1000 rows
# sampled from the oversampled training split.
bagging_clas_over = BaggingClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100, max_samples=1000)
bagging_clas_over.fit(X_train_over, y_train_over)
y_pred_test_bag_over = bagging_clas_over.predict(X_test_over)


print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_bag_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_bag_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_bag_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_bag_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {bagging_clas_over.score(X_test_over, y_test_over):.2f}")

In [None]:
# Bagging ensemble of 100 depth-20 decision trees, each fitted on 1000 rows
# sampled from the undersampled training split.
bagging_clas_under = BaggingClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100, max_samples=1000)
bagging_clas_under.fit(X_train_under, y_train_under)
y_pred_test_bag_under = bagging_clas_under.predict(X_test_under)


print(f"Accuracy: {accuracy_score(y_test_under, y_pred_test_bag_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_test_bag_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_test_bag_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_test_bag_under):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {bagging_clas_under.score(X_test_under, y_test_under):.2f}")

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (100 trees, depth <= 20) on the oversampled training split.
forest_over = RandomForestClassifier(n_estimators=100, max_depth=20)
forest_over.fit(X_train_over, y_train_over)
y_pred_test_rf_over = forest_over.predict(X_test_over)

print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_rf_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_rf_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_rf_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_rf_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {forest_over.score(X_test_over, y_test_over):.2f}")

In [None]:
# Random forest (100 trees, depth <= 20) on the undersampled training split.
forest_under = RandomForestClassifier(n_estimators=100, max_depth=20)
forest_under.fit(X_train_under, y_train_under)
y_pred_test_rf_under = forest_under.predict(X_test_under)

print(f"Accuracy: {accuracy_score(y_test_under, y_pred_test_rf_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_test_rf_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_test_rf_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_test_rf_under):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {forest_under.score(X_test_under, y_test_under):.2f}")

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting (100 stages, depth <= 20) on the oversampled training split.
gb_clas_over = GradientBoostingClassifier(max_depth=20, n_estimators=100)
gb_clas_over.fit(X_train_over, y_train_over)
# Predict on the held-out test split. The original predicted on the training
# data, stored the result under an "_under" name, and then reported the
# random-forest predictions instead of this model's.
y_pred_test_gb_over = gb_clas_over.predict(X_test_over)

print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_gb_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_gb_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_gb_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_gb_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {gb_clas_over.score(X_test_over, y_test_over): .2f}")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting (100 stages, depth <= 20) on the undersampled training split.
gb_clas_under = GradientBoostingClassifier(max_depth=20, n_estimators=100)
gb_clas_under.fit(X_train_under, y_train_under)
# Predict on the held-out test split. The original predicted on the training
# data, stored the result under an "_over" name, and then reported the
# random-forest predictions instead of this model's.
y_pred_test_gb_under = gb_clas_under.predict(X_test_under)

print(f"Accuracy: {accuracy_score(y_test_under, y_pred_test_gb_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_test_gb_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_test_gb_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_test_gb_under):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {gb_clas_under.score(X_test_under, y_test_under): .2f}")

### Adaptive Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over depth-20 decision trees (100 rounds) on the oversampled training split.
ada_clas_over = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100)
ada_clas_over.fit(X_train_over, y_train_over)
# Predict on the held-out test split. The original predicted on the training
# data and then reported the random-forest predictions instead of this model's.
y_pred_test_ada_over = ada_clas_over.predict(X_test_over)

print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_ada_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_ada_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_ada_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_ada_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {ada_clas_over.score(X_test_over, y_test_over): .2f}")

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over depth-20 decision trees (100 rounds) on the undersampled training split.
ada_clas_under = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100)
ada_clas_under.fit(X_train_under, y_train_under)
# Predict on the held-out test split. The original predicted on the training
# data and then reported the random-forest predictions instead of this model's.
y_pred_test_ada_under = ada_clas_under.predict(X_test_under)

print(f"Accuracy: {accuracy_score(y_test_under, y_pred_test_ada_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_test_ada_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_test_ada_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_test_ada_under):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {ada_clas_under.score(X_test_under, y_test_under): .2f}")

### Grid Search

In [None]:
# First we need to setup a dictionary with all the values that we want to try for each hyprerparameter
import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import scipy.stats as st


# Candidate values to try for each decision-tree hyperparameter.
parameter_grid = {"max_depth": [10, 50],
                  "min_samples_split": [4, 16],
                  "max_leaf_nodes": [250, 100],
                  "max_features": ["sqrt", "log2"]}

# Base estimator whose hyperparameters are searched.
dt = DecisionTreeClassifier(random_state=123)

# Both values are needed below to build a confidence interval on the CV score.
confidence_level = 0.95
folds = 10

# Exhaustive search over the grid; "cv" sets the number of folds.
gs_over = GridSearchCV(dt, param_grid=parameter_grid, cv=folds, verbose=10)

start_time = time.time()
gs_over.fit(X_train_over, y_train_over)
end_time = time.time()

print("\n")
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print("\n")


# With no `scoring` argument the search uses the classifier's own .score(),
# i.e. mean accuracy - not R2 - so the labels below say "accuracy".
print(f"The best combination of hyperparameters has been: {gs_over.best_params_}")
print(f"The mean CV accuracy is: {gs_over.best_score_: .4f}")

results_gs_df_over = pd.DataFrame(gs_over.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Read the best row's statistics by column name rather than by position, so
# the lookup does not depend on the column order of cv_results_.
gs_mean_score_over = results_gs_df_over["mean_test_score"].iloc[0]
gs_sem_over = results_gs_df_over["std_test_score"].iloc[0] / np.sqrt(folds)

# Two-sided t interval around the mean fold score.
gs_tc_over = st.t.ppf(1-((1-confidence_level)/2), df=folds-1)
gs_lower_bound_over = gs_mean_score_over - ( gs_tc_over * gs_sem_over)
gs_upper_bound_over = gs_mean_score_over + ( gs_tc_over * gs_sem_over)

print(f"The accuracy confidence interval for the best combination of hyperparameters is: \
    ({gs_lower_bound_over: .4f}, {gs_mean_score_over: .4f}, {gs_upper_bound_over: .4f}) ")

display(results_gs_df_over)

# Keep the estimator refitted with the best hyperparameters.
best_model_over = gs_over.best_estimator_

# Evaluate it on the held-out test split. One prediction is enough; the second
# name is kept only because the original cell defined it.
y_pred_test_over = best_model_over.predict(X_test_over)
y_pred_test_df_over = y_pred_test_over

# Report this model's predictions (the original printed the random-forest ones).
print("\n")
print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_over):.2f}")
print(f"Test mean accuracy (.score):  {best_model_over.score(X_test_over, y_test_over): .4f}")
print("\n")

In [None]:
# Same grid search as for the oversampled data, now on the undersampled split.
# Reuses dt, parameter_grid, folds and confidence_level from the previous cell.
gs_under = GridSearchCV(dt, param_grid=parameter_grid, cv=folds, verbose=10)

start_time = time.time()
gs_under.fit(X_train_under, y_train_under)
end_time = time.time()

print("\n")
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print("\n")


# Report this search's best parameters (the original printed gs_over's).
# With no `scoring` argument the search uses the classifier's own .score(),
# i.e. mean accuracy - not R2 - so the labels below say "accuracy".
print(f"The best combination of hyperparameters has been: {gs_under.best_params_}")
print(f"The mean CV accuracy is: {gs_under.best_score_: .4f}")

results_gs_df_under = pd.DataFrame(gs_under.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Read the best row's statistics by column name rather than by position, so
# the lookup does not depend on the column order of cv_results_.
gs_mean_score_under = results_gs_df_under["mean_test_score"].iloc[0]
gs_sem_under = results_gs_df_under["std_test_score"].iloc[0] / np.sqrt(folds)

# Two-sided t interval around the mean fold score.
gs_tc_under = st.t.ppf(1-((1-confidence_level)/2), df=folds-1)
gs_lower_bound_under = gs_mean_score_under - ( gs_tc_under * gs_sem_under)
gs_upper_bound_under = gs_mean_score_under + ( gs_tc_under * gs_sem_under)

print(f"The accuracy confidence interval for the best combination of hyperparameters is: \
    ({gs_lower_bound_under: .4f}, {gs_mean_score_under: .4f}, {gs_upper_bound_under: .4f}) ")

display(results_gs_df_under)

# Keep the estimator refitted with the best hyperparameters.
best_model_under = gs_under.best_estimator_

# Evaluate it on the held-out test split. One prediction is enough; the second
# name is kept only because the original cell defined it.
y_pred_test_under = best_model_under.predict(X_test_under)
y_pred_test_df_under = y_pred_test_under

# Report this model's predictions (the original printed the random-forest ones).
print("\n")
print(f"Accuracy: {accuracy_score(y_test_under, y_pred_test_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_test_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_test_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_test_under):.2f}")
print(f"Test mean accuracy (.score):  {best_model_under.score(X_test_under, y_test_under): .4f}")
print("\n")

### DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score

# Single depth-10 decision tree on the oversampled training split.
tree_over = DecisionTreeClassifier(max_depth=10)
tree_over.fit(X_train_over, y_train_over)
y_pred_tree_over = tree_over.predict(X_test_over)

# Map each feature name to its importance in the fitted tree.
tree_importance_over = dict(zip(X_train_over.columns, tree_over.feature_importances_))

print(f"Accuracy: {accuracy_score(y_test_over, y_pred_tree_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_tree_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_tree_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_tree_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {tree_over.score(X_test_over, y_test_over):.2f}")

In [None]:
import graphviz

# Fit a shallow (depth-3) tree purely for plotting. It gets its own name so the
# depth-10 `tree_over` evaluated in the previous cell is not overwritten.
tree_over_viz = DecisionTreeClassifier(max_depth=3)
tree_over_viz.fit(X_train_over, y_train_over)

# With out_file set, export_graphviz writes the DOT source to disk and returns
# None, so the result is not bound to a variable; the file is read back instead.
export_graphviz(tree_over_viz, out_file="tree.dot", filled=True, rounded=True, feature_names=X_train_over.columns)
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [None]:
# Single depth-10 decision tree on the undersampled training split.
tree_under = DecisionTreeClassifier(max_depth=10)
tree_under.fit(X_train_under, y_train_under)
y_pred_tree_under = tree_under.predict(X_test_under)

# Map each feature name to its importance in the fitted tree.
tree_importance_under = dict(zip(X_train_under.columns, tree_under.feature_importances_))

print(f"Accuracy: {accuracy_score(y_test_under, y_pred_tree_under):.2f}")
print(f"Precision: {precision_score(y_test_under, y_pred_tree_under):.2f}")
print(f"Recall: {recall_score(y_test_under, y_pred_tree_under):.2f}")
print(f"F1 Score: {f1_score(y_test_under, y_pred_tree_under):.2f}")
# Score the model trained in this cell (the original scored `tree_over`, the
# oversampled model, on the undersampled test data). A classifier's .score()
# is mean accuracy, not R2, so label it as such.
print(f"Mean accuracy (.score): {tree_under.score(X_test_under, y_test_under):.2f}")

In [None]:
# Fit a shallow (depth-3) tree purely for plotting. It gets its own name so the
# depth-10 `tree_under` evaluated in the previous cell is not overwritten.
tree_under_viz = DecisionTreeClassifier(max_depth=3)
tree_under_viz.fit(X_train_under, y_train_under)

# With out_file set, export_graphviz writes the DOT source to disk and returns
# None, so the result is not bound to a variable; the file is read back instead.
export_graphviz(tree_under_viz, out_file="tree.dot", filled=True, rounded=True, feature_names=X_train_under.columns)
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

### Bayesian Search

In [None]:
from sklearn.model_selection import cross_val_score

def objective(trial, confidence_level, folds):
    """Optuna objective: mean cross-validated score of one decision tree.

    Draws one hyperparameter combination from ``trial``, cross-validates a
    ``DecisionTreeClassifier`` built from it on the oversampled training data
    (``X_train_over`` / ``y_train_over``), and records a t-based confidence
    interval for the mean fold score on the trial.

    Parameters
    ----------
    trial
        Optuna trial used to suggest the hyperparameter values.
    confidence_level : float
        Coverage of the stored interval, e.g. 0.95.
    folds : int
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean of the per-fold scores. No ``scoring`` is passed to
        ``cross_val_score``, so the estimator's own ``score`` is used, which
        for a classifier is accuracy rather than R2.
    """
    # Optuna picks the combination to try; we only declare the ranges.
    # Dict entries are evaluated top to bottom, so the suggestions are drawn
    # in the same order as before.
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 4, 16),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 250, 1000),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
    }
    tree = DecisionTreeClassifier(random_state=123, **search_space)

    # One score per held-out fold; "cv" is the number of folds K.
    scores = cross_val_score(tree, X_train_over, y_train_over, cv=folds)
    mean_score = np.mean(scores)

    # Half-width of the two-sided t interval around the mean fold score.
    sem = np.std(scores, ddof=1) / np.sqrt(folds)
    tc = st.t.ppf(1 - ((1 - confidence_level) / 2), df=folds - 1)
    half_width = tc * sem

    # The objective may return only one value (Optuna optimises on it), so the
    # interval travels with the trial as a user attribute instead.
    trial.set_user_attr(
        "CV_score_summary",
        [round(mean_score - half_width, 4), round(mean_score, 4), round(mean_score + half_width, 4)],
    )

    return mean_score

In [None]:
import optuna
import optuna.visualization as vis

# Settings forwarded to objective() for its confidence interval.
confidence_level = 0.95
folds = 10

start_time = time.time()
# Maximise the value returned by objective(), i.e. the mean cross-validated score.
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, confidence_level, folds), n_trials=45)
end_time = time.time()

print("\n")
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print("\n")
print("The best combination of hyperparameters found was: ", study.best_params)
# The optimised value is a classifier's cross-validated accuracy, not R2.
print(f"The best mean CV accuracy found was: {study.best_value: .4f}")

In [None]:
# Objective value of every trial in the study, in the order they were run.
vis.plot_optimization_history(study)

In the previous plot, each marker represents a unique combination of hyperparameters. However, the plot does not show which hyperparameter values each combination used. To gain more insight into this, we can draw a slice plot.

In [None]:
# Slice plot: one panel per hyperparameter, showing the values tried in the study.
slice_plot = vis.plot_slice(study)
slice_plot.show()

It's also interesting to know which hyperparameter was the most important for improving the model's performance.

In [None]:
# Plot the relative importance of each hyperparameter for the study's objective.
vis.plot_param_importances(study)

In [None]:
# Refit a tree with the best hyperparameters found by the Optuna study and
# evaluate it on the held-out oversampled test split.
# Note: this rebinds best_model_over / y_pred_test_over from the grid-search cell.
best_model_over = DecisionTreeClassifier(random_state=123, **study.best_params)
best_model_over.fit(X_train_over, y_train_over)
y_pred_test_over = best_model_over.predict(X_test_over)

print(f"Accuracy: {accuracy_score(y_test_over, y_pred_test_over):.2f}")
print(f"Precision: {precision_score(y_test_over, y_pred_test_over):.2f}")
print(f"Recall: {recall_score(y_test_over, y_pred_test_over):.2f}")
print(f"F1 Score: {f1_score(y_test_over, y_pred_test_over):.2f}")
# A classifier's .score() is mean accuracy, not R2, so label it as such.
print(f"Test mean accuracy (.score):  {best_model_over.score(X_test_over, y_test_over): .3f}")

As we can see, the score on the test set (accuracy, for this classifier) is not within the confidence interval. However, keep in mind that this should only happen for about 5% of all test sets, since the confidence interval comprises 95% of the test cases.