In [1]:
# pandas loads the dataset and numpy builds the randomized-search ranges below;
# pyplot is imported but not referenced in the cells visible here.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices from the
# libraries used below -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the hosted numeric_bank CSV into a DataFrame and preview its first rows.
DATA_URL = 'https://static.bc-edx.com/mbc/ai/m5/datasets/numeric_bank.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
0,30,1787,19,79,1,-1,0,0
1,33,4789,11,220,1,339,4,0
2,35,1350,16,185,1,330,1,0
3,30,1476,3,199,4,-1,0,0
4,59,0,5,226,1,-1,0,0


In [3]:
# Display names handed to classification_report in the evaluation cells below.
target_names = ["negative", "positive"]
# The `y` column is the label the classifiers are trained to predict.
target = df["y"]

In [4]:
# Feature frame: every column of df except the label `y`.
# NOTE(review): the preview lists an "Unnamed: 0" column. If that is a real CSV
# column (a saved row index) rather than an artifact of how the table was
# rendered, it is being fed to KNN as a feature -- confirm against the raw file.
data = df.drop(columns="y")
data.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,30,1787,19,79,1,-1,0
1,33,4789,11,220,1,339,4
2,35,1350,16,185,1,330,1
3,30,1476,3,199,4,-1,0
4,59,0,5,226,1,-1,0


In [5]:
# Hold out a test split; the fixed random_state keeps the partition reproducible.
# NOTE(review): no `stratify=` is passed even though the classes are imbalanced
# (1006 negative vs 125 positive in the test split) -- confirm that is intended.
from sklearn.model_selection import train_test_split
splits = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = splits

In [6]:
# Build three independent default-configured KNN classifiers: one baseline,
# one for the grid search, and one for the randomized search.
from sklearn.neighbors import KNeighborsClassifier
untuned_model, grid_tuned_model, random_tuned_model = (
    KNeighborsClassifier() for _ in range(3)
)

In [7]:
## Baseline: fit the default KNN, predict on the held-out split, and report
from sklearn.metrics import classification_report
untuned_y_pred = untuned_model.fit(X_train, y_train).predict(X_test)
untuned_report = classification_report(
    y_test, untuned_y_pred, target_names=target_names
)
print(untuned_report)

              precision    recall  f1-score   support

    negative       0.91      0.96      0.93      1006
    positive       0.38      0.22      0.28       125

    accuracy                           0.87      1131
   macro avg       0.64      0.59      0.60      1131
weighted avg       0.85      0.87      0.86      1131



In [8]:
# Grid-search estimator plus the parameter grid it will exhaustively evaluate.
#   n_neighbors: odd values 1 through 19
#   weights:     both uniform and distance weighting
#   leaf_size:   10, 50, 100, and 500
# NOTE(review): the CV log below shows the same fold scores for every leaf_size
# at equal n_neighbors/weights, so leaf_size likely does not affect accuracy here.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': list(range(1, 20, 2)),
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500],
}
grid_clf = GridSearchCV(
    estimator=grid_tuned_model,
    param_grid=param_grid,
    verbose=3,
)

In [9]:
# Run the grid search on the training split: every parameter combination in the
# grid is cross-validated with the KNN model (progress is logged via verbose=3).
grid_clf.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.867 total time=   0.0s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.833 total time=   0.0s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.817 total time=   0.0s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.848 total time=   0.0s
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.867 total time=   0.0s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.833 total time=   0.0s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.817 total time=   0.0s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.848 total time=   0.0s
[CV 1/5] END leaf_size=

[CV 1/5] END leaf_size=50, n_neighbors=1, weights=uniform;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=50, n_neighbors=1, weights=uniform;, score=0.867 total time=   0.0s
[CV 3/5] END leaf_size=50, n_neighbors=1, weights=uniform;, score=0.833 total time=   0.0s
[CV 4/5] END leaf_size=50, n_neighbors=1, weights=uniform;, score=0.817 total time=   0.0s
[CV 5/5] END leaf_size=50, n_neighbors=1, weights=uniform;, score=0.848 total time=   0.0s
[CV 1/5] END leaf_size=50, n_neighbors=1, weights=distance;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=50, n_neighbors=1, weights=distance;, score=0.867 total time=   0.0s
[CV 3/5] END leaf_size=50, n_neighbors=1, weights=distance;, score=0.833 total time=   0.0s
[CV 4/5] END leaf_size=50, n_neighbors=1, weights=distance;, score=0.817 total time=   0.0s
[CV 5/5] END leaf_size=50, n_neighbors=1, weights=distance;, score=0.848 total time=   0.0s
[CV 1/5] END leaf_size=50, n_neighbors=3, weights=uniform;, score=0.876 total time=  

[CV 3/5] END leaf_size=50, n_neighbors=19, weights=uniform;, score=0.891 total time=   0.0s
[CV 4/5] END leaf_size=50, n_neighbors=19, weights=uniform;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=50, n_neighbors=19, weights=uniform;, score=0.886 total time=   0.0s
[CV 1/5] END leaf_size=50, n_neighbors=19, weights=distance;, score=0.882 total time=   0.0s
[CV 2/5] END leaf_size=50, n_neighbors=19, weights=distance;, score=0.891 total time=   0.0s
[CV 3/5] END leaf_size=50, n_neighbors=19, weights=distance;, score=0.889 total time=   0.0s
[CV 4/5] END leaf_size=50, n_neighbors=19, weights=distance;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=50, n_neighbors=19, weights=distance;, score=0.883 total time=   0.0s
[CV 1/5] END leaf_size=100, n_neighbors=1, weights=uniform;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=100, n_neighbors=1, weights=uniform;, score=0.867 total time=   0.0s
[CV 3/5] END leaf_size=100, n_neighbors=1, weights=uniform;, score=0.833 to

[CV 2/5] END leaf_size=100, n_neighbors=19, weights=uniform;, score=0.888 total time=   0.0s
[CV 3/5] END leaf_size=100, n_neighbors=19, weights=uniform;, score=0.891 total time=   0.0s
[CV 4/5] END leaf_size=100, n_neighbors=19, weights=uniform;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=100, n_neighbors=19, weights=uniform;, score=0.886 total time=   0.0s
[CV 1/5] END leaf_size=100, n_neighbors=19, weights=distance;, score=0.882 total time=   0.0s
[CV 2/5] END leaf_size=100, n_neighbors=19, weights=distance;, score=0.891 total time=   0.0s
[CV 3/5] END leaf_size=100, n_neighbors=19, weights=distance;, score=0.889 total time=   0.0s
[CV 4/5] END leaf_size=100, n_neighbors=19, weights=distance;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=100, n_neighbors=19, weights=distance;, score=0.883 total time=   0.0s
[CV 1/5] END leaf_size=500, n_neighbors=1, weights=uniform;, score=0.842 total time=   0.0s
[CV 2/5] END leaf_size=500, n_neighbors=1, weights=uniform;, score

[CV 1/5] END leaf_size=500, n_neighbors=19, weights=uniform;, score=0.882 total time=   0.0s
[CV 2/5] END leaf_size=500, n_neighbors=19, weights=uniform;, score=0.888 total time=   0.0s
[CV 3/5] END leaf_size=500, n_neighbors=19, weights=uniform;, score=0.891 total time=   0.0s
[CV 4/5] END leaf_size=500, n_neighbors=19, weights=uniform;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=500, n_neighbors=19, weights=uniform;, score=0.886 total time=   0.0s
[CV 1/5] END leaf_size=500, n_neighbors=19, weights=distance;, score=0.882 total time=   0.0s
[CV 2/5] END leaf_size=500, n_neighbors=19, weights=distance;, score=0.891 total time=   0.0s
[CV 3/5] END leaf_size=500, n_neighbors=19, weights=distance;, score=0.889 total time=   0.0s
[CV 4/5] END leaf_size=500, n_neighbors=19, weights=distance;, score=0.888 total time=   0.0s
[CV 5/5] END leaf_size=500, n_neighbors=19, weights=distance;, score=0.883 total time=   0.0s


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [10, 50, 100, 500],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [10]:
# Show the parameter combination the grid search selected for this dataset
best_grid_params = grid_clf.best_params_
print(best_grid_params)

{'leaf_size': 10, 'n_neighbors': 17, 'weights': 'distance'}


In [11]:
# Predict on the held-out split with the grid search's selected model and
# print its classification report
grid_y_pred = grid_clf.predict(X_test)
grid_report = classification_report(
    y_test, grid_y_pred, target_names=target_names
)
print(grid_report)

              precision    recall  f1-score   support

    negative       0.91      0.98      0.94      1006
    positive       0.48      0.18      0.26       125

    accuracy                           0.89      1131
   macro avg       0.69      0.58      0.60      1131
weighted avg       0.86      0.89      0.86      1131



In [12]:
# Search space for the randomized search estimator.
#   n_neighbors: odd values 1 through 19
#   leaf_size:   every integer from 1 through 499 (np.arange excludes its stop value)
#   weights:     both uniform and distance weighting
param_grid = dict(
    n_neighbors=np.arange(1, 20, 2),
    weights=['uniform', 'distance'],
    leaf_size=np.arange(1, 500),
)
param_grid

{'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
 'weights': ['uniform', 'distance'],
 'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        1

In [13]:
# Randomized-search estimator over the search space defined above.
# random_state pins which candidates get sampled so reruns are repeatable.
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(
    estimator=random_tuned_model,
    param_distributions=param_grid,
    random_state=0,
    verbose=3,
)

In [14]:
# Run the randomized search on the training split; only a sample of the
# parameter combinations is cross-validated (progress is logged via verbose=3).
random_clf.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.883 total time=   0.0s
[CV 2/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.891 total time=   0.0s
[CV 3/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.885 total time=   0.0s
[CV 4/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.886 total time=   0.0s
[CV 5/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.881 total time=   0.0s
[CV 1/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.867 total time=   0.0s
[CV 2/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.878 total time=   0.0s
[CV 3/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.867 total time=   0.0s
[CV 4/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.853 total time=   0.0s
[CV 5/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.864 total time=   0.0s
[CV 1/5] 

RandomizedSearchCV(estimator=KNeighborsClassifier(),
                   param_distributions={'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,...
       430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442,
       443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455,
       456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468,
       469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
       482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
       495, 496, 497, 498, 499]),
   

In [15]:
# Show the parameter combination the randomized search selected for this dataset
best_random_params = random_clf.best_params_
print(best_random_params)

{'weights': 'distance', 'n_neighbors': 19, 'leaf_size': 243}


In [16]:
# Predict labels for the held-out split using the randomized search's selected model
random_tuned_pred = random_clf.predict(X=X_test)

In [17]:
# Build and print the classification report for the randomized-search predictions
random_report = classification_report(
    y_test, random_tuned_pred, target_names=target_names
)
print(random_report)

              precision    recall  f1-score   support

    negative       0.91      0.98      0.94      1006
    positive       0.51      0.18      0.26       125

    accuracy                           0.89      1131
   macro avg       0.71      0.58      0.60      1131
weighted avg       0.86      0.89      0.87      1131



## Interpretations
What were the best settings for the hyperparameters that were tested? How much improvement was made by tuning those hyperparameters?

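**Answer:** The best hyperparameter combination found by the randomized search was `{'weights': 'distance', 'n_neighbors': 19, 'leaf_size': 243}` (the grid search selected `{'leaf_size': 10, 'n_neighbors': 17, 'weights': 'distance'}`). Tuning improved the overall test accuracy of the model from 0.87 to 0.89. However, if recall of the positive class is the metric of interest, the tuned hyperparameters performed worse: positive-class recall dropped from 0.22 for the untuned model to 0.18 for both tuned models.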