In [1]:
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, log_loss
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler 
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

df = pd.read_csv('data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv')

y = df['Heart_Disease']
X = df.drop('Heart_Disease', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12, stratify=y)

# Transform Target column to 1's and 0's
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

num_cols = [
    'Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption',
    'Fruit_Consumption', 'Green_Vegetables_Consumption',
    'FriedPotato_Consumption'
]
ord_cols = ['General_Health', 'Checkup', 'Age_Category']
dum_cols = [
    'Exercise', 'Skin_Cancer', 'Depression', 'Arthritis', 'Other_Cancer',
    'Smoking_History', 'Sex', 'Diabetes'
]

health = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
check = [
    'Never', '5 or more years ago', 'Within the past 5 years',
    'Within the past 2 years', 'Within the past year'
]
age = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80+'
]

col_transformer = ColumnTransformer(transformers=[
    ('oe', OrdinalEncoder(categories=[health, check, age]), ord_cols),
    ('ohe', OneHotEncoder(), dum_cols)
],
                                    remainder="passthrough")


def Model_eval(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    score = cross_validate(model,
                           X_train,
                           y_train,
                           scoring=['f1', 'neg_log_loss'],
                           cv=5)
    f1 = score['test_f1'].mean()
    log_loss = -1 * score['test_neg_log_loss'].mean()
    print(f'Accuracy: {acc}')
    print(f'F1 Score: {f1}')
    print(f'Log-loss: {log_loss}')

    y_preds = model.predict(X_test)
    cm = confusion_matrix(y_test, y_preds, normalize='true')
    disp = ConfusionMatrixDisplay(cm)
    fig, ax = plt.subplots(figsize=(3.2, 2.4))
    disp.plot(ax=ax)
    plt.show()


def data_prep(X_train, y_train, X_test, y_test, smote=True, scale=True):
    if smote & scale:
        pipe = ImPipeline(steps=[('ct',col_transformer),
                         ('ss', StandardScaler()),
                         ('sm', SMOTE(random_state=42))])
        X_train_resampled, y_train_resampled = pipe.fit_resample(X_train,y_train)
        X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=pipe.get_feature_names_out())
        X_test_scaled = pd.DataFrame(pipe[:-1].transform(X_test), columns=pipe.get_feature_names_out())
        return X_train_resampled_df, y_train_resampled, X_test_scaled, y_test
    elif smote:
        pipe = ImPipeline(steps=[('ct',col_transformer),
                         ('sm', SMOTE(random_state=42))])
        X_train_resamp, y_train_resamp = pipe.fit_resample(X_train,y_train)
        X_train_resamp_df = pd.DataFrame(X_train_resamp, columns=pipe.get_feature_names_out())
        X_test_transformed = pd.DataFrame(pipe[:-1].transform(X_test), columns=pipe.get_feature_names_out())
        return X_train_resamp_df, y_train_resamp, X_test_transformed, y_test
    elif scale:
        pipe = Pipeline(steps=[('col_transformer', col_transformer),
                               ('ss', StandardScaler())])
        pipe.fit(X_train)
        X_train_scaled = pd.DataFrame(pipe.transform(X_train), 
                                      columns=pipe.get_feature_names_out(),
                                      index=X_train.index)
        X_test_scaled = pd.DataFrame(pipe.transform(X_test), 
                                     columns=pipe.get_feature_names_out(),
                                     index= X_test.index)
        return X_train_scaled, y_train, X_test_scaled, y_test
    else:
        pipe = Pipeline(steps=[('col_transformer', col_transformer)])
        pipe.fit(X_train)
        X_train_transformed = pd.DataFrame(pipe.transform(X_train),
                                columns=pipe.get_feature_names_out(),
                                index=X_train.index)
        X_test_transformed = pd.DataFrame(pipe.transform(X_test),
                                columns=pipe.get_feature_names_out(),
                                index=X_test.index)
        return X_train_transformed, y_train, X_test_transformed, y_test

In [None]:
X_tr, y_tr, X_ts, y_ts = data_prep(X_train, y_train, X_test, y_test, smote=False, scale=True)

classifiers = {
    "DummyClassifier": DummyClassifier(strategy='most_frequent', random_state=12),
    "LogisticRegression": LogisticRegression(random_state=12),  
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=12),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=12),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=12),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=12),
    "XGBClassifier": XGBClassifier(random_state=12)
}

models_list = []

for key in classifiers:
    print('*',key)
    classifier = classifiers[key]
    model = classifier.fit(X_tr, y_tr)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)
    score = cross_validate(model,
                           X_tr,
                           y_tr,
                           scoring=['f1', 'neg_log_loss'],
                           cv=cv,
                           n_jobs=3)
    f1 = score['test_f1'].mean()
    log_loss = -1 * score['test_neg_log_loss'].mean()
    row = [key, f1, log_loss]
    models_list.append(row)
models_df = pd.DataFrame(models_list,columns=['model', 'F1', 'log_loss'])
models_df

In [3]:
X_tr, y_tr, X_ts, y_ts = data_prep(X_train, y_train, X_test, y_test, smote=True, scale=True)

classifiers = {
    "DummyClassifier": DummyClassifier(strategy='most_frequent', random_state=12),
    "LogisticRegression": LogisticRegression(solver = 'lbfgs', max_iter=1000, random_state=12),
    "XGBClassifier": XGBClassifier(random_state=12),
    "KNeighborsClassifier": KNeighborsClassifier(),    
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=12),
    "RandomForestClassifier": RandomForestClassifier(random_state=12),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=12),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=12),
    "GaussianNB": GaussianNB()
}

models_list = []

for key in classifiers:
    print('*',key)
    classifier = classifiers[key]
    model = classifier.fit(X_tr, y_tr)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)
    score = cross_validate(model, X_tr, y_tr, scoring=['recall', 'f1', 'neg_log_loss'], cv=cv, n_jobs=-1)
    recall_train = score['test_recall'].mean()
    f1_train = score['test_f1'].mean()
    log_loss_train = -1 * score['test_neg_log_loss'].mean()
    recall_test = recall_score(y_ts, model.predict(X_ts))
    f1_test = f1_score(y_ts, model.predict(X_ts))
    log_loss_test = log_loss(y_ts, model.predict(X_ts))
    row = [key, recall_train, recall_test, f1_train, f1_test,log_loss_train, log_loss_test]
    models_list.append(row)
models_df = pd.DataFrame(models_list,columns=['model', 'recall_train', 'recall_test', 'f1_train', 'f1_test', 'log_loss_train', 'log_loss_train'])
models_df.sort_values('recall_test', ascending = False)

* DummyClassifier
* LogisticRegression
* XGBClassifier
* KNeighborsClassifier
* DecisionTreeClassifier
* RandomForestClassifier
* AdaBoostClassifier
* GradientBoostingClassifier
* GaussianNB


Unnamed: 0,model,recall_train,recall_test,f1_train,f1_test,log_loss_train,log_loss_train.1
1,LogisticRegression,0.80692,0.788083,0.779034,0.327563,0.48514,9.429401
8,GaussianNB,0.73459,0.725933,0.723177,0.283516,1.399989,10.692568
6,AdaBoostClassifier,0.876714,0.548294,0.866218,0.342163,0.674693,6.144049
3,KNeighborsClassifier,0.99528,0.542047,0.891099,0.268369,1.546776,8.612965
7,GradientBoostingClassifier,0.907722,0.342944,0.919729,0.326074,0.235425,4.131198
4,DecisionTreeClassifier,0.927674,0.24988,0.919613,0.222064,2.92286,5.102146
5,RandomForestClassifier,0.946433,0.172994,0.957542,0.235834,0.15164,3.267148
2,XGBClassifier,0.916073,0.061509,0.953287,0.108952,0.125249,2.931984
0,DummyClassifier,0.4,0.0,0.266665,0.0,18.021996,2.914245


In [34]:
classifiers = {
    "DummyClassifier": DummyClassifier(strategy='most_frequent', random_state=12),
    "LogisticRegression": LogisticRegression(solver = 'lbfgs', max_iter=1000, random_state=12),
    "XGBClassifier": XGBClassifier(random_state=12),
    "KNeighborsClassifier": KNeighborsClassifier(),    
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=12),
    "RandomForestClassifier": RandomForestClassifier(random_state=12),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=12),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=12),
    "GaussianNB": GaussianNB()
}

models_list = []

for key in classifiers:
    print('*',key)
    classifier = classifiers[key]
    model = classifier.fit(X_tr, y_tr)
    score = cross_validate(model, X_tr, y_tr, scoring=['accuracy', 'f1', 'neg_log_loss'], cv=5)
    f1 = score['test_f1'].mean()
    log_loss = -1 * score['test_neg_log_loss'].mean()
    row = [key, f1, log_loss]
    models_list.append(row)
models_df = pd.DataFrame(models_list,columns=['model', 'F1', 'log_loss'])
models_df 

    
    
    

[CV 3/5] END ......................xbg__eta=0.3;, score=0.085 total time= 1.2min
[CV 4/5] END ......................xbg__eta=0.3;, score=0.086 total time=  54.3s
[CV 2/5] END ......................xbg__eta=0.3;, score=0.093 total time= 1.2min
[CV 5/5] END ......................xbg__eta=0.3;, score=0.093 total time=  54.4s


0.9387810165573264

In [39]:
f1_score(y_test,clf.predict(X_ts))


0.10895162434387856

In [40]:
# XGBClassifier 1
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12))])
param_grid = {
    'xbg__eta': [0.3]}
gs = GridSearchCV(xgb_pipe, param_grid=param_grid, scoring='f1', verbose=4, cv=5, n_jobs=3)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_)

TypeError: GridSearchCV.__init__() missing 1 required positional argument: 'param_grid'

In [33]:
from sklearn.metrics import f1_score
f1_score(y_test,gs.predict(X_test))

0.09489367857659481

In [41]:
# XGBClassifier 1
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12, tree_method='hist'))])
param_grid = {
    'xbg__eta': [0.1, 0.2, 0.3], # default=0.3 [0,1]
    'xbg__max_depth': [6], # default=6 [0,∞]
    'xbg__min_child_weight': [1, 2], # default=1 [0,∞]
    'xbg__subsample': [0.5, 0.7], # default=1 (0,1] 
    'xbg__n_estimators': [100],
    'xbg__gamma' : [0], # default=0 [0,∞]
    'sm__k_neighbors': [3, 5, 9]
} #6 x 2 = 12
gs = GridSearchCV(xgb_pipe, param_grid=param_grid, scoring='f1', verbose=4, cv=2, n_jobs=3)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_)
gs_results

Fitting 2 folds for each of 36 candidates, totalling 72 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sm__k_neighbors,param_xbg__eta,param_xbg__gamma,param_xbg__max_depth,param_xbg__min_child_weight,param_xbg__n_estimators,param_xbg__subsample,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,5.337049,0.015654,0.766894,0.009984,3,0.1,0,6,1,100,0.5,"{'sm__k_neighbors': 3, 'xbg__eta': 0.1, 'xbg__...",0.104012,0.101806,0.102909,0.001103,23
1,5.150725,0.172608,0.760175,0.009388,3,0.1,0,6,1,100,0.7,"{'sm__k_neighbors': 3, 'xbg__eta': 0.1, 'xbg__...",0.098178,0.102476,0.100327,0.002149,31
2,5.0626,0.001273,0.782267,0.009507,3,0.1,0,6,2,100,0.5,"{'sm__k_neighbors': 3, 'xbg__eta': 0.1, 'xbg__...",0.100115,0.104145,0.10213,0.002015,25
3,6.486699,0.050761,0.70289,0.025,3,0.1,0,6,2,100,0.7,"{'sm__k_neighbors': 3, 'xbg__eta': 0.1, 'xbg__...",0.100707,0.097678,0.099193,0.001514,34
4,5.625971,0.778103,0.72982,0.022583,3,0.2,0,6,1,100,0.5,"{'sm__k_neighbors': 3, 'xbg__eta': 0.2, 'xbg__...",0.10102,0.106069,0.103544,0.002525,19
5,4.874376,0.017796,0.735708,0.003641,3,0.2,0,6,1,100,0.7,"{'sm__k_neighbors': 3, 'xbg__eta': 0.2, 'xbg__...",0.099525,0.106507,0.103016,0.003491,22
6,5.415961,0.038999,0.697164,0.008693,3,0.2,0,6,2,100,0.5,"{'sm__k_neighbors': 3, 'xbg__eta': 0.2, 'xbg__...",0.103403,0.106725,0.105064,0.001661,18
7,5.023962,0.390769,0.739425,0.033237,3,0.2,0,6,2,100,0.7,"{'sm__k_neighbors': 3, 'xbg__eta': 0.2, 'xbg__...",0.103314,0.099895,0.101604,0.00171,27
8,4.814603,0.04566,0.749843,0.000589,3,0.3,0,6,1,100,0.5,"{'sm__k_neighbors': 3, 'xbg__eta': 0.3, 'xbg__...",0.119814,0.127929,0.123872,0.004058,3
9,4.709905,0.012024,0.751119,0.000737,3,0.3,0,6,1,100,0.7,"{'sm__k_neighbors': 3, 'xbg__eta': 0.3, 'xbg__...",0.116697,0.117604,0.11715,0.000453,9


In [None]:
# XGBClassifier 2
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12, tree_method='hist',verbose=4, eval_metric= 'logloss'))])
param_grid = {
    'xbg__eta': [0.001, 0.01, 0.1], # default=0.3 [0,1]
    'xbg__max_depth': [3, 5, 7, 9, 11], # default=6 [0,∞]
    'xbg__min_child_weight': [1, 2, 3], # default=1 [0,∞]
    'xbg__subsample': [0.3, 0.5, 0.7, 0.9], # default=1 (0,1] 
    'xbg__colsample_bytree': [0.3, 0.5, 0.7, 0.9], # default=1 (0,1] colsample_bytree
    'xbg__n_estimators': [100, 1000],
    'xbg__gamma' : [0, 1, 1000], # default=0 [0,∞]
    'sm__k_neighbors': [3, 5, 9]
} #6 x 2 = 12
gs = GridSearchCV(xgb_pipe, param_grid=param_grid, scoring='f1', cv=5, n_jobs=3)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_)
gs_results

In [27]:
# XGBClassifier 4
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12, 
                                                   tree_method='hist', verbosity=1))])
param_grid = {
    'xbg__n_estimators': np.linspace(75,200,5, dtype=int),
    'xbg__eta': np.linspace(0.01,0.2,5) 
} 
gs = GridSearchCV(xgb_pipe, param_grid=param_grid, scoring=['f1','neg_log_loss'], refit='neg_log_loss' cv=5, n_jobs=3, verbose=3)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
gs_results

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[20:39:42] INFO: /var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b2k4a2hea3/croot/xgboost-split_1675457783214/work/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[20:39:42] INFO: /var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b2k4a2hea3/croot/xgboost-split_1675457783214/work/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[20:39:42] INFO: /var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b2k4a2hea3/croot/xgboost-split_1675457783214/work/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[20:39:42] INFO: /var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b2k4a2hea3/croot/xgboost-split_1675457783214/work/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xbg__eta,param_xbg__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,7.634301,0.226653,0.327896,0.02438,0.1525,100,"{'xbg__eta': 0.15250000000000002, 'xbg__n_esti...",-0.224232,-0.220168,-0.225307,-0.219064,-0.222889,-0.222332,0.002372,1
18,6.334687,0.126209,0.300889,0.020466,0.1525,75,"{'xbg__eta': 0.15250000000000002, 'xbg__n_esti...",-0.224379,-0.220501,-0.225471,-0.219297,-0.222874,-0.222505,0.002313,2
14,8.131389,0.431701,0.326174,0.042692,0.105,100,"{'xbg__eta': 0.105, 'xbg__n_estimators': 100}",-0.224354,-0.220737,-0.225374,-0.219335,-0.222874,-0.222535,0.002234,3
23,6.450535,0.156616,0.338979,0.050124,0.2,75,"{'xbg__eta': 0.2, 'xbg__n_estimators': 75}",-0.224038,-0.220355,-0.225769,-0.219531,-0.223199,-0.222578,0.002321,4
24,9.355729,1.789416,0.463399,0.212206,0.2,100,"{'xbg__eta': 0.2, 'xbg__n_estimators': 100}",-0.224173,-0.220703,-0.225849,-0.21983,-0.223304,-0.222772,0.00222,5
22,6.353914,0.384207,0.31552,0.089614,0.2,50,"{'xbg__eta': 0.2, 'xbg__n_estimators': 50}",-0.224667,-0.220651,-0.225853,-0.219773,-0.223478,-0.222884,0.002324,6
13,6.83463,0.563045,0.324783,0.024331,0.105,75,"{'xbg__eta': 0.105, 'xbg__n_estimators': 75}",-0.225645,-0.221772,-0.226373,-0.220268,-0.22385,-0.223582,0.002297,7
17,5.435168,0.142627,0.313329,0.019207,0.1525,50,"{'xbg__eta': 0.15250000000000002, 'xbg__n_esti...",-0.226067,-0.222152,-0.226847,-0.220308,-0.224084,-0.223892,0.002424,8
9,7.854516,0.257236,0.338049,0.026945,0.0575,100,"{'xbg__eta': 0.0575, 'xbg__n_estimators': 100}",-0.229107,-0.225305,-0.229894,-0.223672,-0.22686,-0.226968,0.002314,9
12,5.257432,0.136845,0.30268,0.020495,0.105,50,"{'xbg__eta': 0.105, 'xbg__n_estimators': 50}",-0.23107,-0.22682,-0.231393,-0.2252,-0.228686,-0.228634,0.002393,10


In [28]:
# XGBClassifier 4
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12, 
                                                   tree_method='hist', verbosity=1))])
param_grid = {
    'xbg__n_estimators': range(50,500,50),
    'xbg__eta': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3] 
} 
gs = GridSearchCV(xgb_pipe, param_grid=param_grid, scoring='f1', cv=5, n_jobs=3, verbose=4)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
gs_results

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 2/5] END xbg__eta=0.0001, xbg__n_estimators=50;, score=0.332 total time=   6.5s
[CV 4/5] END xbg__eta=0.0001, xbg__n_estimators=50;, score=0.324 total time=   5.9s
[CV 2/5] END xbg__eta=0.0001, xbg__n_estimators=100;, score=0.332 total time=   8.2s
[CV 5/5] END xbg__eta=0.0001, xbg__n_estimators=100;, score=0.322 total time=   8.1s
[CV 3/5] END xbg__eta=0.0001, xbg__n_estimators=150;, score=0.320 total time=  10.8s
[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=200;, score=0.318 total time=  14.2s
[CV 4/5] END xbg__eta=0.0001, xbg__n_estimators=200;, score=0.324 total time=  13.3s
[CV 2/5] END xbg__eta=0.0001, xbg__n_estimators=250;, score=0.332 total time=  15.5s
[CV 5/5] END xbg__eta=0.0001, xbg__n_estimators=250;, score=0.322 total time=  16.9s
[CV 3/5] END xbg__eta=0.0001, xbg__n_estimators=300;, score=0.320 total time=  18.5s
[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=350;, score=0.318 total time=  22.8s
[CV 4



[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=50;, score=0.318 total time=   6.6s
[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=100;, score=0.318 total time=   8.2s
[CV 4/5] END xbg__eta=0.0001, xbg__n_estimators=100;, score=0.324 total time=   8.3s
[CV 2/5] END xbg__eta=0.0001, xbg__n_estimators=150;, score=0.332 total time=  10.5s
[CV 5/5] END xbg__eta=0.0001, xbg__n_estimators=150;, score=0.322 total time=  11.4s
[CV 3/5] END xbg__eta=0.0001, xbg__n_estimators=200;, score=0.320 total time=  13.5s
[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=250;, score=0.318 total time=  15.9s
[CV 4/5] END xbg__eta=0.0001, xbg__n_estimators=250;, score=0.324 total time=  15.9s
[CV 2/5] END xbg__eta=0.0001, xbg__n_estimators=300;, score=0.332 total time=  17.9s
[CV 5/5] END xbg__eta=0.0001, xbg__n_estimators=300;, score=0.322 total time=  18.7s
[CV 3/5] END xbg__eta=0.0001, xbg__n_estimators=350;, score=0.320 total time=  22.4s
[CV 1/5] END xbg__eta=0.0001, xbg__n_estimators=400;, score=0.318 

[CV 4/5] END xbg__eta=0.1, xbg__n_estimators=350;, score=0.088 total time=  16.9s
[CV 2/5] END xbg__eta=0.1, xbg__n_estimators=400;, score=0.090 total time=  18.5s
[CV 5/5] END xbg__eta=0.1, xbg__n_estimators=400;, score=0.103 total time=  17.1s
[CV 3/5] END xbg__eta=0.1, xbg__n_estimators=450;, score=0.096 total time=  20.3s
[CV 1/5] END xbg__eta=0.2, xbg__n_estimators=50;, score=0.096 total time=   5.2s
[CV 2/5] END xbg__eta=0.2, xbg__n_estimators=50;, score=0.084 total time=   5.7s
[CV 4/5] END xbg__eta=0.2, xbg__n_estimators=50;, score=0.091 total time=   5.3s
[CV 2/5] END xbg__eta=0.2, xbg__n_estimators=100;, score=0.080 total time=   7.5s
[CV 5/5] END xbg__eta=0.2, xbg__n_estimators=100;, score=0.095 total time=   6.4s
[CV 3/5] END xbg__eta=0.2, xbg__n_estimators=150;, score=0.089 total time=   8.2s
[CV 1/5] END xbg__eta=0.2, xbg__n_estimators=200;, score=0.093 total time=  10.6s
[CV 4/5] END xbg__eta=0.2, xbg__n_estimators=200;, score=0.093 total time=  10.1s
[CV 2/5] END xbg__e

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xbg__eta,param_xbg__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,15.316522,0.502636,0.477179,0.163856,0.001,300,"{'xbg__eta': 0.001, 'xbg__n_estimators': 300}",0.321008,0.337309,0.328076,0.347873,0.329752,0.332804,0.009145,1
12,10.580368,0.094051,0.347161,0.020293,0.001,200,"{'xbg__eta': 0.001, 'xbg__n_estimators': 200}",0.325023,0.34077,0.332032,0.336218,0.328271,0.332463,0.00559,2
13,12.887348,0.351837,0.372334,0.028851,0.001,250,"{'xbg__eta': 0.001, 'xbg__n_estimators': 250}",0.323788,0.336942,0.332141,0.338349,0.32875,0.331994,0.005343,3
17,22.752339,1.501105,0.606483,0.101518,0.001,450,"{'xbg__eta': 0.001, 'xbg__n_estimators': 450}",0.329334,0.311847,0.325719,0.348599,0.3368,0.33046,0.012161,4
15,17.299198,0.430646,0.433528,0.021789,0.001,350,"{'xbg__eta': 0.001, 'xbg__n_estimators': 350}",0.321008,0.312483,0.329372,0.353548,0.335264,0.330335,0.013927,5
19,7.946013,0.609484,0.286514,0.030179,0.01,100,"{'xbg__eta': 0.01, 'xbg__n_estimators': 100}",0.315139,0.330194,0.330125,0.345872,0.327731,0.329812,0.009776,6
16,19.027017,0.289976,0.470367,0.022064,0.001,400,"{'xbg__eta': 0.001, 'xbg__n_estimators': 400}",0.321008,0.312483,0.326531,0.347778,0.337518,0.329063,0.012393,7
18,5.166429,0.408895,0.294548,0.046252,0.01,50,"{'xbg__eta': 0.01, 'xbg__n_estimators': 50}",0.324103,0.311889,0.325051,0.348595,0.33122,0.328171,0.011982,8
11,8.632202,0.124825,0.304783,0.015218,0.001,150,"{'xbg__eta': 0.001, 'xbg__n_estimators': 150}",0.326329,0.340032,0.31955,0.323856,0.328271,0.327608,0.006862,9
10,7.153529,0.797225,0.26451,0.040168,0.001,100,"{'xbg__eta': 0.001, 'xbg__n_estimators': 100}",0.324601,0.338355,0.31955,0.323856,0.322131,0.325699,0.006562,10


In [43]:
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12, 
                                                   tree_method='hist', verbosity=1))])
params = {
        'xbg__min_child_weight': [1, 5, 10],
        'xbg__gamma': [0.5, 1, 1.5, 2, 5],
        'xbg__subsample': [0.6, 0.8, 1.0],
        'xbg__colsample_bytree': [0.6, 0.8, 1.0],
        'xbg__max_depth': [3, 4, 5]
        }
gs = GridSearchCV(xgb_pipe, param_grid=params, scoring='f1', cv=5, n_jobs=3, verbose=4)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
gs_results

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
[CV 2/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.104 total time=   5.0s
[CV 1/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.091 total time=   6.1s
[CV 3/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.101 total time=   4.6s
[CV 1/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.094 total time=   4.5s
[CV 4/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.096 total time=   4.4s
[CV 1/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=5, xbg__subsample=0.6;, score=0.103 total time=   4.6s
[CV 4/5] END xbg__colsample_by



[CV 1/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.101 total time=   5.0s
[CV 5/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.097 total time=   6.2s
[CV 4/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.092 total time=   4.7s
[CV 2/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.101 total time=   4.5s
[CV 5/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.107 total time=   4.4s
[CV 2/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=5, xbg__subsample=0.6;, score=0.096 total time=   4.6s
[CV 5/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weigh

[CV 3/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.103 total time=   5.0s
[CV 4/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.099 total time=   6.2s
[CV 2/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.098 total time=   4.6s
[CV 5/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.109 total time=   4.5s
[CV 3/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.096 total time=   4.7s
[CV 3/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weight=5, xbg__subsample=0.6;, score=0.108 total time=   5.5s
[CV 1/5] END xbg__colsample_bytree=0.6, xbg__gamma=0.5, xbg__max_depth=3, xbg__min_child_weigh

[CV 5/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=4, xbg__min_child_weight=10, xbg__subsample=0.8;, score=0.092 total time=   6.5s
[CV 5/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=4, xbg__min_child_weight=10, xbg__subsample=1.0;, score=0.091 total time=   9.3s
[CV 3/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=5, xbg__min_child_weight=1, xbg__subsample=0.6;, score=0.098 total time=   6.4s
[CV 1/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=5, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.083 total time=   5.8s
[CV 4/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=5, xbg__min_child_weight=1, xbg__subsample=0.8;, score=0.088 total time=   5.7s
[CV 2/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=5, xbg__min_child_weight=1, xbg__subsample=1.0;, score=0.087 total time=   5.3s
[CV 5/5] END xbg__colsample_bytree=0.8, xbg__gamma=1.5, xbg__max_depth=5, xbg__min_child_wei

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xbg__colsample_bytree,param_xbg__gamma,param_xbg__max_depth,param_xbg__min_child_weight,param_xbg__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
116,4.511378,0.127717,0.211886,0.008100,0.6,5,3,10,1.0,"{'xbg__colsample_bytree': 0.6, 'xbg__gamma': 5...",0.107219,0.113073,0.103053,0.102478,0.106731,0.106511,0.003790,1
380,4.261124,0.058447,0.220670,0.019630,1.0,5,3,1,1.0,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 5...",0.102748,0.092511,0.113101,0.111429,0.110687,0.106095,0.007673,2
110,5.223689,0.451164,0.256414,0.068743,0.6,5,3,1,1.0,"{'xbg__colsample_bytree': 0.6, 'xbg__gamma': 5...",0.094964,0.115063,0.101622,0.101589,0.113126,0.105273,0.007625,3
113,4.428662,0.078487,0.230370,0.023426,0.6,5,3,5,1.0,"{'xbg__colsample_bytree': 0.6, 'xbg__gamma': 5...",0.105639,0.104808,0.098157,0.104427,0.112199,0.105046,0.004459,4
386,4.282804,0.021727,0.220261,0.015465,1.0,5,3,10,1.0,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 5...",0.104186,0.094542,0.109817,0.101736,0.109379,0.103932,0.005610,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,4.501939,0.044330,0.215973,0.018427,1.0,2,4,1,1.0,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 2...",0.084205,0.071889,0.078229,0.073322,0.090643,0.079658,0.006984,401
335,4.514480,0.069501,0.220976,0.021751,1.0,1.5,4,1,1.0,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 1...",0.081245,0.074442,0.077527,0.072241,0.092390,0.079569,0.007090,402
280,4.800130,0.020743,0.237151,0.019372,1.0,0.5,4,1,0.8,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 0...",0.082641,0.069808,0.079218,0.080059,0.083619,0.079069,0.004904,403
308,9.744609,1.220711,0.432367,0.033602,1.0,1,4,1,1.0,"{'xbg__colsample_bytree': 1.0, 'xbg__gamma': 1...",0.078125,0.073595,0.075258,0.076676,0.091043,0.078939,0.006235,404


In [15]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [25]:
np.linspace(1,5,2)

array([1., 5.])

In [15]:
list(range(1,20,2))

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

In [12]:
gs_results['param_xbg__n_estimators']

0    10
1    20
2    30
3    40
Name: param_xbg__n_estimators, dtype: object

In [None]:
# XGBClassifier
xgb_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('xbg', XGBClassifier(random_state=12))])
param_grid = {
    'xbg__eta': [0.001, 0.01, 0.1], # default=0.3 [0,1]
    'xbg__max_depth': [3, 5, 7, 9, 11], # default=6 [0,∞]
    'xbg__min_child_weight': [1, 2, 3], # default=1 [0,∞]
    'xbg__subsample': [0.3, 0.5, 0.7, 0.9], # default=1 (0,1] 
    'xbg__colsample_bytree': [0.3, 0.5, 0.7, 0.9], # default=1 (0,1] colsample_bytree
    'xbg__n_estimators': [100, 1000],
    'xbg__gamma' : [0, 1, 1000], # default=0 [0,∞]
    'sm__k_neighbors': [3, 5, 9]
} #72 models x 5 = 360 models
gs = GridSearchCV(xgb_pipe, param_grid, scoring=['f1','neg_log_loss'], cv=5, refit='f1' verbose=4)
gs.fit(X_train, y_train)
gs_results = pd.DataFrame(gs.cv_results_)
gs_results

In [None]:
# RandomForestClassifier 1
rfc_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('rfc', RandomForestClassifier(random_state=12))])

param_grid = {
    'rfc__max_features': [None, 1], # ”sqrt”
} #24 models x 5 = 120 models
gs = GridSearchCV(estimator = rfc_pipe, param_grid=param_grid, scoring='f1', verbose=4, cv=2, n_jobs=3)
gs.fit(X_train, y_train)

In [None]:
pd.DataFrame(gs.cv_results_)

In [15]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [None]:
# RandomForestClassifier 2
rfc_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('rfc', RandomForestClassifier(random_state=12))])

param_grid = {
    'rfc__max_features': [None, 1], # ”sqrt”
} #24 models x 5 = 120 models
gs = GridSearchCV(estimator = rfc_pipe, param_grid=param_grid, scoring='f1', verbose=4, cv=2, n_jobs=3)
gs.fit(X_train, y_train)

In [None]:
# RandomForestClassifier
rfc_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('sm', SMOTE(random_state=12)),
                             ('ss', StandardScaler()),
                             ('rfc', RandomForestClassifier(random_state=12))])

param_grid = {
    'rfc__n_estimators': [5, 100], # 100
    'rfc__criterion': ['gini', 'entropy', 'log_loss'], # ”gini”
    'rfc__max_depth': [None, 2, 5], # None
    'rfc__min_samples_split': [2], # 2
    'rfc__min_samples_leaf': [1, 5, 10], # 1
    'rfc__max_features': [None, 1, 10, 'sqrt'], # ”sqrt”
    'sm__k_neighbors': [3, 5, 9] # 
} #648 models x 5 = 3240 models
gs = GridSearchCV(rfc_pipe, param_grid, scoring=['f1','neg_log_loss'], cv=5, refit='f1' verbose=4, n_jobs=3)
