In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

from sklearn.model_selection import train_test_split
# Import evaluation metrics (accuracy and ROC AUC)
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.model_selection import GridSearchCV

# Single seed shared by every randomized step so results are reproducible.
SEED = 1

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from
# pandas / scikit-learn are hidden as well.
import warnings
warnings.simplefilter('ignore')

### Read data

In [19]:
# Load the dataset from disk and print a per-column summary
# (column names, non-null counts, dtypes).
wbc_csv = r"../data/wbc.csv"
df = pd.read_csv(wbc_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

### Encode the target as binary (M → 1, B → 0)

In [20]:
# Encode the diagnosis label as an integer target: "M" -> 1, "B" -> 0.
# `Series.map` is used instead of `Series.replace`: replacing strings with
# ints relies on implicit object -> int64 downcasting, which pandas 2.2
# deprecated (FutureWarning). Once that downcasting is removed the target
# would silently remain `object` dtype.
DIAGNOSIS_CODES = {"M": 1, "B": 0}
y = df["diagnosis"].map(DIAGNOSIS_CODES)

# `map` turns any label missing from the mapping into NaN; fail loudly here
# rather than hand an incomplete target to the model.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), "diagnosis"].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels: {unexpected}")
y = y.astype("int64")

# Feature matrix: every column except the identifier, the label, and the
# "Unnamed: 32" column (presumably an empty artifact column - verify).
X = df.drop(columns=["id", "diagnosis", "Unnamed: 32"])

### Split data into train and test

In [21]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance the same in both splits; the seed makes the split repeatable.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

### View tunable hyperparameters

In [22]:
# Unfitted decision tree with a fixed seed; its parameter dict lists every
# hyperparameter that can be tuned, together with the current value.
dt = DecisionTreeClassifier(random_state=SEED)
dt_params = dt.get_params()
print(dt_params)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 1, 'splitter': 'best'}


### Define the hyperparameter grid and the grid search

In [23]:
# Candidate values for each tuned hyperparameter:
# 4 x 3 x 4 = 48 combinations in total.
params_dt = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_leaf=[0.04, 0.06, 0.08],
    max_features=[0.2, 0.4, 0.6, 0.8],
)

# Exhaustive search over the grid, scored by accuracy with cv=10,
# using all available CPU cores (n_jobs=-1).
grid_dt = GridSearchCV(dt, params_dt, scoring="accuracy", cv=10, n_jobs=-1)

### Run the grid search and evaluate the best model

In [24]:
# Fit every candidate in the grid on the training split.
grid_dt.fit(X_train, y_train)

# Pull the search results off the fitted grid object up front.
best_hyperparams = grid_dt.best_params_
best_CV_score = grid_dt.best_score_
best_model = grid_dt.best_estimator_

# Accuracy of the selected model on the held-out test split.
test_acc = accuracy_score(y_test, best_model.predict(X_test))

# Report: winning configuration, its cross-validated accuracy, then test accuracy.
print("Best hyperparameters:\n", best_hyperparams)
print("Best CV accuracy {:.3f}".format(best_CV_score))
print("test set accuracy of best model: {:.3f}".format(test_acc))

Best hyperparameters:
 {'max_depth': 4, 'max_features': 0.4, 'min_samples_leaf': 0.04}
Best CV accuracy 0.941
test set accuracy of best model: 0.947
