In [1]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV


In [3]:
hr = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\human-resources-analytics\HR_comma_sep.csv")
X = hr.drop('left', axis=1)
y = hr['left']



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)

In [7]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')

lr = LogisticRegression(random_state=24)

pipe = Pipeline(
    [
        ('CT',ct),
        ('SCL',None),
        ('LR',lr)
    ]
)

pipe.fit(X_train, y_train)


y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

#### K-FOLDS

kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
params = {
    'LR__solver':[
        'lbfgs',
        'liblinear',
        'newton-cg',
        'newton-cholesky',
        'sag','saga'
    ],
    'LR__C':np.linspace(0.001, 10, 20),
    'SCL':[scaler_mm, scaler_std, None]
}


gcv = GridSearchCV(
    pipe, 
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold, 
    verbose=3
)

gcv.fit(X,y)


0.43649681302349613
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV 1/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=-0.518 total time=   0.0s
[CV 2/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=-0.520 total time=   0.0s
[CV 3/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=-0.517 total time=   0.0s
[CV 4/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=-0.518 total time=   0.0s
[CV 5/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=-0.516 total time=   0.0s
[CV 1/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=-0.449 total time=   0.0s
[CV 2/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=-0.457 total time=   0.0s
[CV 3/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=-0.449 total time=   0.0s
[CV 4/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=-0.453 total time=   0.0s
[CV 5/5] END LR__C=0.001, LR__solve

In [6]:
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)


{'LR__C': 0.5272631578947369, 'LR__solver': 'newton-cholesky', 'SCL': None}
0.8201244202657666
(360, 16)
