In [5]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
import numpy as np
import pandas as pd 
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV


In [None]:
hr = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\human-resources-analytics\HR_comma_sep.csv")
X = hr.drop('left', axis=1)
y = hr['left']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)

In [None]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')

knn = KNeighborsClassifier()

pipe = Pipeline(
    [
        ('CT', ct),
        ('SCL', None),
        ('KNN', knn)
    ]
)

pipe.fit(X_train, y_train)

y_pred_prob = pipe.predict_proba(X_test)
print("Logloss: ", log_loss(y_test, y_pred_prob))

#### K-FOLDS
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)
params = {
    'SCL':[scaler_mm, scaler_std, None],
    'KNN__n_neighbors': np.arange(1, 11, 1)
}

gcv = GridSearchCV(
    pipe,
    param_grid=params,
    scoring='neg_log_loss',     # Best : -0.46380636508556156
    # scoring='accuracy',     # Best : 0.9676558852950983
    cv=kfold, 
    verbose=2
)

gcv.fit(X,y)


Logloss:  0.6644742504323284
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END .............KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.0s
[CV] END .............KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.0s
[CV] END .............KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.0s
[CV] END .............KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.0s
[CV] END .............KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.0s
[CV] END ...........KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.0s
[CV] END ...........KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.1s
[CV] END ...........KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.0s
[CV] END ...........KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.0s
[CV] END ...........KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.0s
[CV] END .......................KNN__n_neighbors=1, SCL=None; total time=   0.0s
[CV] END .........

In [26]:
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)


{'KNN__n_neighbors': 10, 'SCL': StandardScaler()}
-0.46380636508556156
(30, 15)


In [31]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')

knn = KNeighborsClassifier()

pipe = Pipeline(
    [
        ('CT', ct),
        ('SCL', None),
        ('KNN', knn)
    ]
)

pipe.fit(X_train, y_train)

y_pred_prob = pipe.predict_proba(X_test)
print("Logloss: ", log_loss(y_test, y_pred_prob))

#### K-FOLDS
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)
params = {
    'SCL':[scaler_mm, scaler_std, None],
    'KNN__n_neighbors': np.arange(1, 11, 1),    
    'KNN__metric': ['cityblock','haversine', 'manhattan', 'minkowski'] #Distance metrics
}

scoring_metrics = {
    'neg_log_loss': 'neg_log_loss',
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

gcv = GridSearchCV(
    pipe, 
    param_grid=params,
    scoring=scoring_metrics,
    cv=kfold,
    verbose=2,
    refit='neg_log_loss'  # Specify which metric to use for selecting the best model
)

gcv.fit(X,y)

# Print results for all metrics
print("\nBest Parameters:", gcv.best_params_)
print("\nBest neg_log_loss Score:", gcv.best_score_)

# Get mean test scores for each metric
print("\nMean scores for each metric:")
for metric in scoring_metrics.keys():
    mean_score = pd.DataFrame(gcv.cv_results_)[f'mean_test_{metric}'].max()
    print(f"{metric}: {mean_score:.4f}")

    


Logloss:  0.6644742504323284
Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.4s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.5s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.5s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.5s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler(); total time=   0.4s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.5s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.4s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.4s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler(); total time=   0.4s
[CV] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler(); total t

In [30]:
gcv.cv_results_

{'mean_fit_time': array([0.03947673, 0.0383841 , 0.03746562, 0.02587152, 0.03272934,
        0.03121276, 0.03399458, 0.03751521, 0.035252  , 0.03816619,
        0.02943711, 0.02899766, 0.03977289, 0.04280624, 0.03344169,
        0.03694353, 0.04350643, 0.03249159, 0.04457746, 0.03218956,
        0.04075685, 0.03324633, 0.03907313, 0.02551641, 0.04044065,
        0.03564892, 0.03273196, 0.03640718, 0.04459958, 0.03274007]),
 'std_fit_time': array([0.00815499, 0.00981886, 0.00144285, 0.0020454 , 0.00743627,
        0.00845046, 0.01017144, 0.00861403, 0.00789197, 0.01006176,
        0.00389672, 0.0099557 , 0.00749859, 0.00911162, 0.01095619,
        0.00999265, 0.00828031, 0.00935143, 0.00232519, 0.00781821,
        0.00326135, 0.01061027, 0.01023853, 0.00724393, 0.00785402,
        0.00971634, 0.00843953, 0.01028716, 0.0082412 , 0.0098749 ]),
 'mean_score_time': array([0.36243305, 0.37459154, 0.36537523, 0.35879879, 0.38327408,
        0.38725481, 0.38901544, 0.39236655, 0.37507882, 0.39