# Using the validation set

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

## Test data

In [None]:
# Load the breast-cancer dataset as a feature matrix and a label vector.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Hold out 15% of the samples as a test set that stays untouched until the
# final evaluation. The fixed random_state makes the split reproducible, so
# the reported test score refers to the same samples on every run.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

## Train & validation data

In [None]:
# Split the non-test data again: 80% to fit models, 20% to validate them.
# The fixed random_state keeps the validation scores (and therefore the
# hyperparameters selected from them) reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.20, random_state=42
)

## Define hyperparameters

In [None]:
# Candidate values for the decision-tree search below; every combination of
# the two lists is tried.
max_depth = [3, 4, 5]  # passed as DecisionTreeClassifier(max_depth=...)
min_samples_split = [10, 50, 100]  # passed as DecisionTreeClassifier(min_samples_split=...)

In [None]:
# Every (max_depth, min_samples_split) combination, in the same order the
# nested loops would visit them.
param_grid = [(d, s) for d in max_depth for s in min_samples_split]

# Fit one tree per combination on the training split and record its score
# on the validation split, keyed by the hyperparameter pair.
val_scores = {}
for depth, split in param_grid:
    dt = DecisionTreeClassifier(
        max_depth=depth, min_samples_split=split
    ).fit(X_train, y_train)
    val_scores[depth, split] = dt.score(X_val, y_val)


In [None]:
# Show the validation score recorded for each (max_depth, min_samples_split) pair.
val_scores

In [None]:
# Select the hyperparameter pair with the highest validation score; on ties
# the pair that was inserted first wins.
best_hp, _best_score = max(val_scores.items(), key=lambda item: item[1])
best_hp

## Compare different models

In [None]:
# Baseline to compare against: a 3-nearest-neighbours classifier fitted on
# the training split and scored on the validation split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn.score(X_val, y_val)

In [None]:
# Decision tree with the selected hyperparameters, evaluated the same way as
# the KNN model: fitted on the training split and scored on the validation
# split. Model comparison must not touch the test set, otherwise the final
# test score is no longer an unbiased estimate.
dt = DecisionTreeClassifier(max_depth=best_hp[0], min_samples_split=best_hp[1])
dt.fit(X_train, y_train)
dt.score(X_val, y_val)

## Final model

In [None]:
# Recombine the training and validation splits (row-wise) so the final model
# can be fitted on all non-test data.
X_train_full = np.concatenate([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val], axis=0)

In [None]:
# Final model: the selected hyperparameters, fitted on train + validation
# data and scored on the held-out test set.
model_final = DecisionTreeClassifier(
    max_depth=best_hp[0],
    min_samples_split=best_hp[1],
).fit(X_train_full, y_train_full)
model_final.score(X_test, y_test)