<h2>Import Libraries</h2>

In [38]:
from scjpnlib.utils.file_io import FileManager
import scjpnlib.utils as scjpnutils
import pickle
import json
from IPython.core.display import HTML, Markdown
import html2text
from bs4 import BeautifulSoup
import pprint

import pandas as pd
import numpy as np

from scjpnlib.utils.skl_transformers import LabelEncodingTransformer

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier 
# import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

K = 3 # num folds for the GridSearchCV cross-validation (see gs_find_best_params)
cross_val_score_K = 5 # num folds for the cross_val_score calls in clf_run_trial
n_jobs = 8 # NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed

LOG_MODEL_TRIALS = True # when True, trial output is also appended to "<ClassifierName>-trials.log"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read Configs for this Experiment

In [45]:
# Project file-I/O helper; reused below for loading/saving JSON and appending to log files.
fm = FileManager()

# Experiment configuration: dask settings, SEED and per-classifier run flags / trials / params.
models_config = fm.load_json('models-config.txt')
models_config

{'dask': {'use': False,
  'is_remote': False,
  'local': {'n_workers': 1, 'n_jobs': 8, 'memory_limit': '16GB'},
  'remote': {'scheduler_address': '35.230.13.87'}},
 'SEED': 42,
 'DecisionTreeClassifier': {'run': False,
  'trials': {'run': False,
   'array': [{'gridsearch': {'run': False,
      'param_grid': {'criterion': ['entropy', 'gini'],
       'splitter': ['best'],
       'max_depth': [10, 50, 75, None],
       'min_samples_split': [2],
       'max_features': ['auto', 'sqrt', 'log2']},
      'last_best': {'criterion': 'entropy',
       'splitter': 'best',
       'max_depth': None,
       'min_samples_split': 2,
       'max_features': 'sqrt'}}}]},
  'params': {'criterion': 'entropy',
   'splitter': 'best',
   'max_depth': None,
   'min_samples_split': 2,
   'max_features': 'sqrt'}},
 'RandomForestClassifier': {'run': False,
  'trials': {'run': False,
   'array': [{'gridsearch': {'run': False,
      'param_grid': {'bootstrap': [True, False],
       'criterion': ['entropy', 'gini'],


In [3]:
# Resolve which data configuration (and its digest) this experiment runs against.
is_data_cached = 'data_cached' in models_config
if is_data_cached:
    # A cached data set carries its own config together with a stored digest.
    data_config = models_config['data_cached']
    digest = data_config['digest']
else:
    # Otherwise read the EDA config and derive the digest from its contents.
    data_config = fm.load_json('eda-config.txt')
    digest = scjpnutils.json_to_md5_hash_digest(data_config)

In [4]:
# Report the digest and short description of the data configuration in use.
print(f"EDA description: {'CACHED ' if is_data_cached else ''}(digest: {digest}) {data_config['eda_desc']['short']}")

EDA description: (digest: 2d7d3126b4f539a5c747b7bed497626b) 0.10 test_ratio, flat insig cat hat handling with threshold 10; location based on ward


In [5]:
# File that the results collected in this notebook are written to by the final cell.
model_results_fname = scjpnutils.get_model_result_fname(data_config, data_kwargs={'is_cached':is_data_cached})
print(f"modeling results will be saved to: {model_results_fname}")

modeling results will be saved to: models-results-2d7d3126b4f539a5c747b7bed497626b.json


In [6]:
# Accumulator for everything this notebook saves: the data digest plus one
# entry per classifier under 'modeling_results' (filled in by clf_run_trial).
model_results = {
    'digest': digest,
    'modeling_results': {},
}

<p><br>
<h2>Load TEST/TRAIN Data</h2>

In [7]:
# Random seed from the config; recorded with the results and passed as
# random_state to the final classifiers below.
SEED = models_config['SEED']
model_results['seed'] = SEED

In [8]:
# Load the training features; the first CSV column becomes the index and rows are sorted by it.
fname = scjpnutils.get_data_fname(data_config, data_kwargs={'is_labels':False,'type':'train','is_cached':is_data_cached})
data_train = pd.read_csv(fname, index_col=0).sort_index()
print(f"loaded {fname}\n")

# NOTE(review): the recorded info() output shows a few columns with nulls
# (e.g. funder, installer, scheme_name) - confirm every enabled classifier accepts NaN.
data_train.info()

loaded wrangled-labeled-data-train-2d7d3126b4f539a5c747b7bed497626b.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 0 to 74247
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         53460 non-null  float64
 1   funder             52583 non-null  float64
 2   gps_height         53460 non-null  int64  
 3   installer          52582 non-null  float64
 4   longitude          53460 non-null  float64
 5   latitude           53460 non-null  float64
 6   basin              53460 non-null  float64
 7   region_code        53459 non-null  float64
 8   district_code      53460 non-null  float64
 9   population         53460 non-null  int64  
 10  public_meeting     53460 non-null  bool   
 11  scheme_management  53460 non-null  float64
 12  scheme_name        52880 non-null  float64
 13  permit             53460 non-null  bool   
 14  extraction_type    53460 non-null  float64
 1

In [9]:
# Inspect the training feature index (compare with the y_train index shown further down).
data_train.index

Int64Index([    0,     1,     2,     3,     5,     7,     8,     9,    11,
               12,
            ...
            74235, 74236, 74237, 74238, 74239, 74240, 74242, 74243, 74246,
            74247],
           dtype='int64', name='id', length=53460)

In [10]:
# Load the training labels, indexed and sorted the same way as data_train.
fname = scjpnutils.get_data_fname(data_config, data_kwargs={'is_labels':True,'type':'train','is_cached':is_data_cached})
y_train = pd.read_csv(fname, index_col=0).sort_index()
print(f"loaded {fname}\n")

y_train.info()

loaded labels-train-2d7d3126b4f539a5c747b7bed497626b.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 0 to 74247
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  53460 non-null  object
dtypes: object(1)
memory usage: 835.3+ KB


In [11]:
# Inspect the training label index (should line up with the data_train index shown above).
y_train.index

Int64Index([    0,     1,     2,     3,     5,     7,     8,     9,    11,
               12,
            ...
            74235, 74236, 74237, 74238, 74239, 74240, 74242, 74243, 74246,
            74247],
           dtype='int64', name='id', length=53460)

In [12]:
# Integer-encode the target column; the fitted transformer is kept in
# let_labels because the test labels are encoded with it further down.
let_labels = LabelEncodingTransformer(['status_group'])
y_train = let_labels.fit_transform(y_train)
y_train.status_group.unique()

array([2, 0, 1])

In [13]:
# Show the class names learned by the label encoder.
let_labels.labelencoder.classes_

array(['functional', 'functional needs repair', 'non functional'],
      dtype=object)

In [14]:
# Class names used as target_names in the training classification report.
classes_train = list(let_labels.labelencoder.classes_)

In [15]:
# Put the encoded target next to the features; join='inner' keeps only index
# values present in both frames.
data_train_with_target = pd.concat([data_train, y_train], axis=1, join='inner')
data_train_with_target.columns = list(data_train.columns) + ['status_group']
data_train_with_target

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,region_code,district_code,population,...,permit,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,pump_age,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.801370,0,0.773171,33.125828,-5.118154,0.918364,1.104651,0.938982,0,...,True,0.620690,0.922653,1.081903,0.894147,0.623053,0.953635,0.705496,0,2
1,0.0,0.429167,1978,0.469828,34.770717,-9.395642,0.674483,0.413474,0.700044,20,...,False,0.664444,0.922604,1.026985,0.791310,0.622992,0.953516,0.705496,3,0
2,0.0,1.006849,0,1.135593,36.115056,-6.279268,0.907618,0.988636,0.700044,0,...,True,1.200070,0.922604,0.598954,0.791310,0.857747,0.976612,1.161586,0,0
3,10.0,0.234811,1639,0.234811,37.147432,-3.187555,0.744938,0.720877,0.817451,25,...,True,0.700661,0.430082,0.598954,0.791310,0.622992,0.680494,0.677816,14,0
5,50.0,0.391156,28,0.440994,39.286124,-6.972403,0.907618,1.025391,1.142857,6922,...,False,0.849675,0.480203,0.598731,0.791271,0.623053,0.976431,1.161259,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74240,0.0,0.702811,1183,0.623209,37.007726,-3.280868,0.744714,0.633807,0.731299,350,...,False,0.700661,0.537885,1.081903,0.791310,0.857615,0.655623,0.677816,1,0
74242,0.0,0.849020,0,0.889418,33.724987,-8.940758,0.674232,0.891332,0.814178,0,...,False,0.700661,0.702026,0.688219,0.791271,0.623053,0.735928,0.677816,0,0
74243,0.0,1.315789,1188,0.835007,33.963539,-1.429477,0.908266,1.070122,0.934446,95,...,False,0.849885,0.922653,1.026907,0.791290,0.623053,1.560209,1.161586,29,2
74246,50.0,1.066667,1428,1.066667,35.630481,-7.710549,0.674232,0.413474,0.819620,1,...,True,0.700586,0.922604,0.598731,0.791310,1.943795,0.680494,0.677781,11,2


In [16]:
# Load the test features; the first CSV column becomes the index and rows are sorted by it.
fname = scjpnutils.get_data_fname(data_config, data_kwargs={'is_labels':False,'type':'test','is_cached':is_data_cached})
data_test = pd.read_csv(fname, index_col=0).sort_index()
print(f"loaded {fname}\n")

data_test.info()

loaded wrangled-labeled-data-test-2d7d3126b4f539a5c747b7bed497626b.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 4 to 74229
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         5940 non-null   float64
 1   funder             5843 non-null   float64
 2   gps_height         5940 non-null   int64  
 3   installer          5844 non-null   float64
 4   longitude          5940 non-null   float64
 5   latitude           5940 non-null   float64
 6   basin              5940 non-null   float64
 7   region_code        5940 non-null   float64
 8   district_code      5940 non-null   float64
 9   population         5940 non-null   int64  
 10  public_meeting     5940 non-null   bool   
 11  scheme_management  5940 non-null   float64
 12  scheme_name        5891 non-null   float64
 13  permit             5940 non-null   bool   
 14  extraction_type    5940 non-null   float64
 15 

In [17]:
# Inspect the test feature index (compare with the y_test index shown further down).
data_test.index

Int64Index([    4,     6,    40,    41,    46,    66,    75,    84,    96,
              105,
            ...
            74140, 74145, 74158, 74166, 74177, 74195, 74199, 74214, 74215,
            74229],
           dtype='int64', name='id', length=5940)

In [18]:
# Load the test labels, indexed and sorted the same way as data_test.
fname = scjpnutils.get_data_fname(data_config, data_kwargs={'is_labels':True,'type':'test','is_cached':is_data_cached})
y_test = pd.read_csv(fname, index_col=0).sort_index()
print(f"loaded {fname}\n")

y_test.info()

loaded labels-test-2d7d3126b4f539a5c747b7bed497626b.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 4 to 74229
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  5940 non-null   object
dtypes: object(1)
memory usage: 92.8+ KB


In [19]:
# Inspect the test label index (should line up with the data_test index shown above).
y_test.index

Int64Index([    4,     6,    40,    41,    46,    66,    75,    84,    96,
              105,
            ...
            74140, 74145, 74158, 74166, 74177, 74195, 74199, 74214, 74215,
            74229],
           dtype='int64', name='id', length=5940)

In [20]:
# Encode the test labels with the encoder that was fitted on the TRAINING
# labels, so both splits share one label -> integer mapping. Re-fitting here
# (fit_transform) would silently produce a different mapping whenever the
# test split is missing a class, making test metrics meaningless.
# NOTE(review): assumes LabelEncodingTransformer exposes the usual sklearn
# `transform` method - confirm in scjpnlib.utils.skl_transformers.
y_test = let_labels.transform(y_test)
y_test.status_group.unique()

array([2, 0, 1])

In [21]:
# Show the class names held by the label encoder after encoding the test labels.
let_labels.labelencoder.classes_

array(['functional', 'functional needs repair', 'non functional'],
      dtype=object)

In [22]:
# Class names used as target_names in the testing classification report.
classes_test = list(let_labels.labelencoder.classes_)

<p><br>
<h2>Build Models (Run Trials)</h2>

<h3>General functions for building Classifiers and running trials</h3>

In [23]:
# utility function to render HTML and optionally log (append) it to file
def render_HTML(the_html, fname=None):
    """Display an HTML snippet in the notebook and optionally log its text.

    Parameters
    ----------
    the_html : str
        HTML markup to render through IPython's rich display.
    fname : str or None, optional
        When not None, the markup is reduced to its text content and that
        text (plus a trailing newline) is appended to this file via
        ``fm.append_text_file``. When None, nothing is written.
    """
    display(HTML(the_html))
    if fname is not None:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and emits a GuessedAtParserWarning,
        # so the logged text could differ between environments.
        fm.append_text_file(BeautifulSoup(the_html, 'html.parser').text + '\n', fname)

In [24]:
def get_trials_log_fname(clf):
    """Return the trial-log file name for ``clf``, or None when logging is off.

    The name is derived from the classifier's class name; logging is
    controlled by the module-level LOG_MODEL_TRIALS flag.
    """
    if not LOG_MODEL_TRIALS:
        return None
    return f"{clf.__class__.__name__}-trials.log"

In [25]:
def gs_find_best_params(clf, param_grid):
    render_HTML(f"<br><br>param_grid for {clf.__class__.__name__} GridSearch:<br><pre>{params}</pre>", fname=get_trials_log_fname(clf))
    grid_clf = GridSearchCV(
        clf, 
        param_grid, 
        cv=K, 
        n_jobs=-1
        , verbose=20
    )
    # with joblib.parallel_backend('dask'):
    #     %time _ = grid_clf.fit(data_train, y_train)
    %time _ = grid_clf.fit(data_train, y_train)
    return grid_clf.best_params_

In [26]:
def clf_fit(clf, data_train, y_train):
    """Fit ``clf`` on the given training data and return it.

    The fit runs under IPython's ``%time`` line magic, so this function only
    works inside an IPython/Jupyter session.

    Parameters
    ----------
    clf : estimator
        Classifier to fit.
    data_train : training features passed to ``clf.fit``.
    y_train : training labels passed to ``clf.fit``.

    Returns
    -------
    The same ``clf`` object that was passed in.
    """
    # Dask-backed variant, currently disabled:
    # with joblib.parallel_backend('dask'):
    #     %time clf.fit(data_train, y_train)
    %time clf.fit(data_train, y_train)
    return clf

In [27]:
def summarize_preds(clf, X, y, preds, dataset_name, classes):
    """Render (and optionally log) accuracy and a classification report.

    Parameters
    ----------
    clf : estimator
        Only used to pick the trial-log file name.
    X : unused; kept for interface compatibility.
    y : true labels.
    preds : predicted labels.
    dataset_name : str
        Label shown in front of the accuracy line (e.g. 'Training').
    classes : list of str
        Passed to classification_report as ``target_names``.

    Returns
    -------
    float
        The accuracy score of ``preds`` against ``y``.
    """
    log_fname = get_trials_log_fname(clf)

    def emit(markup):
        # Every line of the summary goes to the same log target.
        render_HTML(markup, fname=log_fname)

    emit("<p><br>")
    score = accuracy_score(y, preds)
    emit(f"{dataset_name} Accuracy: {round(score*100,4)}")
    emit("<p><br>")
    emit(f"<pre>{classification_report(y, preds, target_names=classes)}</pre>")
    return score

In [28]:
def clf_run_trial(clf, params_to_try, best_parameters_so_far=None, run_trials_gridsearch=False):
    """Run one trial: pick parameters, fit, evaluate on train/test and record results.

    Parameters
    ----------
    clf : estimator
        Classifier instance to configure and fit.
    params_to_try : dict
        With ``run_trials_gridsearch=True``: a grid (name -> list of values)
        to search. Otherwise: concrete parameter values used as-is.
    best_parameters_so_far : dict or None, optional
        Parameters found by earlier trials. They are pinned (as single-value
        lists) during a grid search and updated in place with this trial's
        result. Defaults to a fresh empty dict, which keeps the two-argument
        calls used by the "final model" cells working.
    run_trials_gridsearch : bool, optional
        Whether to grid-search ``params_to_try`` before fitting.

    Returns
    -------
    tuple
        ``(clf, best_parameters_so_far, model_results)`` where ``clf`` is the
        fitted classifier and ``model_results`` is the module-level results
        dict, whose entry for this classifier class is overwritten.

    Notes
    -----
    Reads the module-level ``data_train``, ``y_train``, ``data_test``,
    ``y_test``, ``classes_train`` and ``classes_test``.
    """
    if best_parameters_so_far is None:
        best_parameters_so_far = {}
    log_fname = get_trials_log_fname(clf)

    if run_trials_gridsearch:
        # Search on a copy so pinning earlier results does not rewrite the
        # caller's grid (which usually lives inside models_config).
        param_grid = dict(params_to_try)
        for param_name, param_value in best_parameters_so_far.items():
            param_grid[param_name] = [param_value]
        best_parameters = gs_find_best_params(clf, param_grid)
    else:
        best_parameters = params_to_try
    best_parameters_so_far.update(best_parameters)

    render_HTML("<p><br>", fname=log_fname)
    render_HTML(f"Grid Search {'(previously) ' if not run_trials_gridsearch else ''}found the following optimal parameters: ", fname=log_fname)
    render_HTML(f"<pre>{pprint.pformat(best_parameters_so_far, indent=4)}</pre>", fname=log_fname)

    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Fitting classifier...", fname=log_fname)
    clf = clf.set_params(**best_parameters_so_far)
    clf = clf_fit(clf, data_train, y_train)
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)

    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Predicting labels on training data...", fname=log_fname)
    pred_train = clf.predict(data_train)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)
    _accuracy_train = summarize_preds(clf, data_train, y_train, pred_train, 'Training', classes_train)

    # Cross-val scores are only computed for non-gridsearch runs and are
    # displayed but not written to the trial log.
    if not run_trials_gridsearch:
        render_HTML("<p><br>")
        render_HTML("Computing cross-val score on training data...")
        cv_score_train = cross_val_score(clf, data_train, y_train, cv=cross_val_score_K)
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_train}</pre>")
        render_HTML(f"cross_val_score: {np.mean(cv_score_train)}")

    render_HTML("<p><br>", fname=log_fname)
    render_HTML(f"Predicting labels on testing data...", fname=log_fname)
    pred_test = clf.predict(data_test)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)
    _accuracy_test = summarize_preds(clf, data_test, y_test, pred_test, 'Testing', classes_test)

    if not run_trials_gridsearch:
        # NOTE(review): cross_val_score refits clones of clf on folds of the
        # data it is given, so this score comes from models trained on test
        # folds, not from the classifier fitted above - confirm this is intended.
        render_HTML("<p><br>")
        render_HTML(f"Computing cross-val score on testing data...")
        cv_score_test = cross_val_score(clf, data_test, y_test, cv=cross_val_score_K)
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_test}</pre>")
        render_HTML(f"cross_val_score: {np.mean(cv_score_test)}")

    # Record (overwrite) this classifier's entry in the shared results dict.
    _class_name = clf.__class__.__name__
    model_results['modeling_results'][_class_name] = {}
    model_results['modeling_results'][_class_name]['accuracy'] = {}
    model_results['modeling_results'][_class_name]['accuracy']['train'] = _accuracy_train
    model_results['modeling_results'][_class_name]['accuracy']['test'] = _accuracy_test
    model_results['modeling_results'][_class_name]['feature_importances'] = get_feat_importances(clf)
    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Feature Importances:", fname=log_fname)
    render_HTML(f"<pre>{pprint.pformat(model_results['modeling_results'][_class_name]['feature_importances'], indent=4)}</pre><p><br><br>", fname=log_fname)

    return clf, best_parameters_so_far, model_results

In [29]:
def get_feat_importances(clf, feature_names=None):
    """Pair each feature name with its importance, most important first.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``feature_importances_`` (indexable by position).
    feature_names : iterable of str or None, optional
        Names in the same order as the columns ``clf`` was fitted on.
        Defaults to the columns of the module-level ``data_train``.

    Returns
    -------
    list of (str, number)
        ``(feature, importance)`` tuples sorted by importance, descending.
    """
    if feature_names is None:
        feature_names = data_train.columns
    # Read the attribute once: on tree-based estimators it is a computed
    # property, so indexing it inside the loop recomputes it per feature.
    importances = clf.feature_importances_
    feat_importances = {feat: importances[i] for i, feat in enumerate(feature_names)}
    return sorted(feat_importances.items(), key=lambda item: item[1], reverse=True)

<p><br>
<h2>Initialize Dask-Client (to Dask backend for parallelization) <i>(DISABLED for now)</i></h2>

In [30]:
# Optionally start a Dask client, either against a remote scheduler or a
# local cluster, as selected in models_config['dask'].
if models_config['dask']['use']:
    if models_config['dask']['is_remote']:
        # for Kubernetes dask scheduler/worker cluster in GCP - but this costs money to run the cluster AND requires a lot more work for data parallelization!
        # Outer double quotes: reusing single quotes inside a single-quoted
        # f-string is a SyntaxError before Python 3.12 and would break this
        # whole cell even when dask is disabled.
        dask_client = Client(f"tcp://{models_config['dask']['remote']['scheduler_address']}:8786")
    else:
        # local
        dask_client = Client( #spawns a local cluster
            n_workers=models_config['dask']['local']['n_workers'],
            threads_per_worker=models_config['dask']['local']['n_jobs'],
            memory_limit=models_config['dask']['local']['memory_limit'] # memory_limit is per worker
        )

    # A bare expression inside an `if` block is not auto-displayed by the
    # notebook, so show the client explicitly.
    display(dask_client)

<p><br>
<h3>Decision Tree Classifier</h3>
<h4>Trials</h4>

In [31]:
# Decision tree trials: optionally grid-search each configured trial, carrying
# the best parameters found so far from one trial into the next.
run_dtclf = models_config['DecisionTreeClassifier']['run']
# Report this cell's own flag; the original printed run_rfclf, which is not
# defined until the Random Forest cell and raises NameError on a fresh kernel.
render_HTML(f"models_config['DecisionTreeClassifier']['run']: {run_dtclf}")

if run_dtclf:
    trials = models_config['DecisionTreeClassifier']['trials']

    render_HTML(f"models_config['DecisionTreeClassifier']['trials']['run']: {trials['run']}")
    if trials['run']:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            params = trial['gridsearch']['last_best'] if not trial['gridsearch']['run'] else trial['gridsearch']['param_grid']
            # Store the fitted tree under dtclf (the original assigned it to rfclf).
            dtclf, best_parameters, model_results = clf_run_trial(DecisionTreeClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

    else:
        best_parameters = models_config['DecisionTreeClassifier']['params']

<p><br>
<h4>Build Final Model with best params</h4>

In [32]:
# Fit the final decision tree with the best parameters and a fixed seed.
if run_dtclf:
    best_parameters.update({'random_state': SEED})
    # clf_run_trial takes best_parameters_so_far as its third argument; the
    # original call omitted it. Pass the same dict, as the XGBClassifier cell does.
    dtclf, _, model_results = clf_run_trial(DecisionTreeClassifier(), best_parameters, best_parameters)

<p><br>
<h3>Random Forest Classifier</h3>
<h4>Trials</h4>

In [33]:
# Random forest trials: optionally grid-search each configured trial, carrying
# the best parameters found so far from one trial into the next.
run_rfclf = models_config['RandomForestClassifier']['run']
render_HTML(f"models_config['RandomForestClassifier']['run']: {run_rfclf}")

if run_rfclf:
    trials = models_config['RandomForestClassifier']['trials']

    render_HTML(f"models_config['RandomForestClassifier']['trials']['run']: {trials['run']}")
    if trials['run']:
        trials_list = trials['array']

        # Start from an empty accumulator, as the other classifier cells do;
        # the original neither initialised nor passed best_parameters, so the
        # clf_run_trial call was missing its third argument.
        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            params = trial['gridsearch']['last_best'] if not trial['gridsearch']['run'] else trial['gridsearch']['param_grid']
            # A grid needs list-valued entries; concrete params need scalars.
            if trial['gridsearch']['run']:
                params.update({'n_jobs': [-1]})
            else:
                params.update({'n_jobs':-1})
            rfclf, best_parameters, model_results = clf_run_trial(RandomForestClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

    else:
        best_parameters = models_config['RandomForestClassifier']['params']

<p><br>
<h4>Build Final Model with best params</h4>

In [34]:
# Fit the final random forest with the best parameters and a fixed seed.
if run_rfclf:
    best_parameters.update({'n_jobs':-1, 'verbose':1, 'random_state': SEED})
    # clf_run_trial takes best_parameters_so_far as its third argument; the
    # original call omitted it. Pass the same dict, as the XGBClassifier cell does.
    rfclf, _ , model_results = clf_run_trial(RandomForestClassifier(), best_parameters, best_parameters)

<p><br><br><br>
<h3>XGBClassifier</h3>
<h4>Trials</h4>

In [35]:
# XGBoost trials: optionally grid-search each configured trial, carrying the
# best parameters found so far from one trial into the next.
run_xgbclf = models_config['XGBClassifier']['run']
render_HTML(f"models_config['XGBClassifier']['run']: {run_xgbclf}")

if run_xgbclf:
    trials = models_config['XGBClassifier']['trials']
    render_HTML(f"models_config['XGBClassifier']['trials']['run']: {trials['run']}")

    if not trials['run']:
        # No trials requested: fall back to the parameters stored in the config.
        best_parameters = models_config['XGBClassifier']['params']
    else:
        trials_list = trials['array']
        best_parameters = {}
        for i, trial in enumerate(trials_list):
            gridsearch = trial['gridsearch']
            do_search = gridsearch['run']
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {do_search}<br>")
            params = gridsearch['param_grid'] if do_search else gridsearch['last_best']
            # A grid needs list-valued entries; concrete params need scalars.
            params.update({'n_jobs': [-1] if do_search else -1})
            # note that best_parameters will be set to those used in the last trial
            xgbclf, best_parameters, model_results = clf_run_trial(
                XGBClassifier(),
                params,
                best_parameters,
                run_trials_gridsearch=do_search,
            )

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:   35.5s remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:   35.6s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  2.8min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:  2.9min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  2.9min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  4.0min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:  4.0min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  4.4min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  5.0min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  6.2min remaining: 

CPU times: user 7min 31s, sys: 4.01 s, total: 7min 35s
Wall time: 1min 3s


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  5.5min remaining: 35.7min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  5.5min remaining: 22.0min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  8.1min remaining: 22.4min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  8.3min remaining: 16.6min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  8.3min remaining: 12.5min
[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed: 15.0min remaining: 17.1min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 15.3min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed: 19.9min remaining: 13.3min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 26.9min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed: 27.3min remaining: 

CPU times: user 20min 15s, sys: 16.6 s, total: 20min 31s
Wall time: 3min 4s


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed: 12.5min remaining: 62.3min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed: 13.5min remaining: 40.5min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed: 13.8min remaining: 27.6min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 13.9min remaining: 19.4min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 14.8min remaining: 14.8min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 15.1min remaining: 10.8min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed: 15.1min remaining:  7.6min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 19.9min remaining:  6.6min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 20.6min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 20.8min remaining: 

CPU times: user 19min, sys: 16.1 s, total: 19min 16s
Wall time: 2min 59s


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed: 13.2min remaining: 46.2min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed: 13.5min remaining: 27.0min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed: 13.7min remaining: 17.1min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed: 13.7min remaining: 10.9min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed: 13.7min remaining:  6.8min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 14.0min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 20.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 20.5min finished
CPU times: user 19min 42s, sys: 16.8 s, total: 19min 59s
Wall time: 23min 35s


CPU times: user 19min 33s, sys: 16 s, total: 19min 49s
Wall time: 3min 1s


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:  9.0min remaining: 45.0min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  9.0min remaining: 27.1min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed: 10.4min remaining: 20.8min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 10.5min remaining: 14.7min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 10.6min remaining: 10.6min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 12.1min remaining:  8.6min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed: 12.3min remaining:  6.1min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 17.0min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 18.1min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 19.1min remaining: 

CPU times: user 18min 26s, sys: 14.7 s, total: 18min 40s
Wall time: 2min 45s


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed: 13.6min remaining: 88.4min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed: 13.7min remaining: 54.8min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 14.4min remaining: 39.6min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed: 14.5min remaining: 29.0min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed: 15.1min remaining: 22.6min
[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed: 15.1min remaining: 17.2min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 16.4min remaining: 14.3min
[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed: 26.4min remaining: 17.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 27.5min remaining: 13.7min
[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed: 27.7min remaining: 

CPU times: user 18min 16s, sys: 14.5 s, total: 18min 31s
Wall time: 2min 47s


<p><br>
<h4>Build Final Model with best params</h4>

In [47]:
# Fit the final XGBoost model with the best parameters and a fixed seed.
if run_xgbclf:
    best_parameters.update(n_jobs=-1, verbosity=1, random_state=SEED)
    xgbclf, _ , model_results = clf_run_trial(
        XGBClassifier(),
        best_parameters,
        best_parameters,
    )

CPU times: user 17min 57s, sys: 14.6 s, total: 18min 12s
Wall time: 2min 42s


<p><br><br>
<h2>Save Results to File</h2>

In [48]:
# Write the collected results (digest, seed and per-classifier entries) to the results file.
fm.save_json(model_results, model_results_fname)
print(f"updated {model_results_fname}")

updated models-results-2d7d3126b4f539a5c747b7bed497626b.json
