<h2>Import Libraries</h2>

In [1]:
from scjpnlib.utils.file_io import FileManager
import os 
import scjpnlib.utils as scjpnutils
import pickle
import json
from IPython.core.display import HTML, Markdown
import html2text
from bs4 import BeautifulSoup
import pprint

import pandas as pd
import numpy as np

from scjpnlib.utils.skl_transformers import LabelEncodingTransformer

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier 
# import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

n_jobs = 8

LOG_MODEL_TRIALS = True

## Read Configs for this Experiment

In [2]:
# File helper used throughout the notebook to load JSON configs and to append/save result files.
fm = FileManager()

# Experiment configuration: later cells read its 'logging', 'cross_validation', 'dask', 'SEED'
# and per-classifier sections.
models_config = fm.load_json('models-config.txt')
models_config

{'logging': {'dir': 'model-results'},
 'cross_validation': {'k': {'grid_search': 3, 'score': 5}},
 'dask': {'use': False,
  'is_remote': False,
  'local': {'n_workers': 1, 'n_jobs': 8, 'memory_limit': '16GB'},
  'remote': {'scheduler_address': '35.230.13.87'}},
 'SEED': 42,
 'DecisionTreeClassifier': {'run': True,
  'trials': {'run': True,
   'array': [{'gridsearch': {'run': True,
      'param_grid': {'criterion': ['entropy', 'gini'],
       'splitter': ['best'],
       'max_depth': [10, 50, 75, None],
       'min_samples_split': [2],
       'max_features': ['auto', 'sqrt', 'log2']},
      'last_best': {'criterion': 'entropy',
       'max_depth': 75,
       'max_features': 'sqrt',
       'min_samples_split': 2,
       'splitter': 'best'}}}]},
  'params': {'criterion': 'entropy',
   'splitter': 'best',
   'max_depth': None,
   'min_samples_split': 2,
   'max_features': 'sqrt'}},
 'RandomForestClassifier': {'run': True,
  'trials': {'run': True,
   'array': [{'gridsearch': {'run': True,


In [3]:
# Prefer the data description cached inside models_config; otherwise fall back to the EDA
# config file and derive the digest from that config's contents.
is_data_cached = 'data_cached' in models_config
if is_data_cached:
    data_config = models_config['data_cached']
    digest = data_config['digest']
else:
    data_config = fm.load_json('eda-config.txt')
    digest = scjpnutils.json_to_md5_hash_digest(data_config)

In [4]:
print(f"EDA description: {'CACHED ' if is_data_cached else ''}(digest: {digest}) {data_config['eda_desc']['short']}")

EDA description: (digest: d6a76cbd7fb874262adbd0e4a705fc31) 0.10 test_ratio


In [5]:
# Directory prefixes for the wrangled predictors and for the labels: the configured directory
# plus a trailing "/", or an empty string when no directory is configured.
WRANGLED_DATA_DIR = data_config['wrangled_data']['dir'] + "/" if len(data_config['wrangled_data']['dir']) > 0 else ""
SAVE_LABELS_DIR = data_config['labels']['dir'] + "/" if len(data_config['labels']['dir']) > 0 else ""

# File names of the train split (predictors, labels) ...
fname__train_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(
    data_config,
    data_kwargs=dict(is_labels=False, type='train', is_cached=is_data_cached)
)
fname__train_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(
    data_config,
    data_kwargs=dict(is_labels=True, type='train', is_cached=is_data_cached)
)
# ... and of the validation split, which is stored under type 'test'.
fname__validation_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(
    data_config,
    data_kwargs=dict(is_labels=False, type='test', is_cached=is_data_cached)
)
fname__validation_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(
    data_config,
    data_kwargs=dict(is_labels=True, type='test', is_cached=is_data_cached)
)
# fname__unlabeled_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':False,'type':'unlabeled'})

In [6]:
# Resolve the directory prefix for model-result files, creating the directory when one is configured.
if len(models_config['logging']['dir']) > 0:
    MODEL_RESULTS_DIR = models_config['logging']['dir']
    # exist_ok=True makes re-running this cell a no-op when the directory is already there,
    # while still surfacing real failures (e.g. the path exists but is a regular file).
    os.makedirs(os.path.join(os.getcwd(), MODEL_RESULTS_DIR), exist_ok=True)
    MODEL_RESULTS_DIR += "/"
else:
    MODEL_RESULTS_DIR = ""

# The results file name is derived from the data config (and therefore from the data digest).
model_results_fname = MODEL_RESULTS_DIR + scjpnutils.get_model_result_fname(data_config, data_kwargs={'is_cached':is_data_cached})
print(f"modeling results will be saved to: {model_results_fname}")

modeling results will be saved to: model-results/models-results-d6a76cbd7fb874262adbd0e4a705fc31.json


In [7]:
# Accumulator for everything this run writes to the results file, tagged with the data digest.
model_results = {
    'digest': digest,
    'modeling_results': {},
}

In [8]:
# Random seed from the config; recorded with the results and later passed as `random_state`
# to the final validation models.
SEED = models_config['SEED']
model_results['seed'] = SEED

In [9]:
# Fold counts: K is used by GridSearchCV, cross_val_score_K by cross_val_score.
K = models_config['cross_validation']['k']['grid_search'] # num folds for cross-val
cross_val_score_K = models_config['cross_validation']['k']['score']

<p><br>
<h2>Load TEST/TRAIN Data</h2>

In [10]:
# Load the training predictors; the first CSV column becomes the index and rows are sorted by it.
data_train = pd.read_csv(fname__train_predictors, index_col=0).sort_index()
print(f"loaded {fname__train_predictors}\n")

data_train.info()

loaded preprocessing-results/wrangled-labeled-data-train-d6a76cbd7fb874262adbd0e4a705fc31.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 0 to 74247
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   amount_tsh                        53460 non-null  float64
 1   gps_height                        53460 non-null  int64  
 2   longitude                         53460 non-null  float64
 3   latitude                          53460 non-null  float64
 4   population                        53460 non-null  int64  
 5   public_meeting                    53460 non-null  bool   
 6   permit                            53460 non-null  bool   
 7   pump_age                          53460 non-null  int64  
 8   installer_target_encoded          53460 non-null  float64
 9   basin_target_encoded              53460 non-null  float64
 10  region_code_target_encoded        5

In [11]:
data_train.index

Int64Index([    0,     1,     2,     3,     5,     7,     8,     9,    11,
               12,
            ...
            74235, 74236, 74237, 74238, 74239, 74240, 74242, 74243, 74246,
            74247],
           dtype='int64', name='id', length=53460)

In [12]:
# Load the training labels, indexed and sorted the same way as the training predictors.
y_train = pd.read_csv(fname__train_labels, index_col=0).sort_index()
print(f"loaded {fname__train_labels}\n")

y_train.info()

loaded preprocessing-results/labels-train-d6a76cbd7fb874262adbd0e4a705fc31.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 0 to 74247
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  53460 non-null  object
dtypes: object(1)
memory usage: 835.3+ KB


In [13]:
y_train.index

Int64Index([    0,     1,     2,     3,     5,     7,     8,     9,    11,
               12,
            ...
            74235, 74236, 74237, 74238, 74239, 74240, 74242, 74243, 74246,
            74247],
           dtype='int64', name='id', length=53460)

In [14]:
# Fit the label encoder on the training labels and replace status_group with its integer codes.
# NOTE: y_train is reassigned here, so re-running this cell alone re-encodes already-encoded labels.
let_labels = LabelEncodingTransformer(['status_group'])
y_train = let_labels.fit_transform(y_train)
y_train.status_group.unique()

array([2, 0, 1])

In [15]:
let_labels.labelencoder.classes_

array(['functional', 'functional needs repair', 'non functional'],
      dtype=object)

In [16]:
classes_train = list(let_labels.labelencoder.classes_)

In [17]:
# Column-wise join of training predictors and encoded labels, keeping only ids present in both.
data_train_with_target = pd.concat([data_train, y_train], axis=1, join='inner')
data_train_with_target

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,population,public_meeting,permit,pump_age,installer_target_encoded,basin_target_encoded,...,scheme_management_target_encoded,scheme_name_target_encoded,extraction_type_target_encoded,management_target_encoded,payment_type_target_encoded,water_quality_target_encoded,quantity_target_encoded,source_target_encoded,waterpoint_type_target_encoded,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0,33.125828,-5.118154,0,False,True,0,0.788043,0.922970,...,0.907233,0.898294,0.623434,0.924005,1.086092,0.885089,0.625598,0.953780,0.708085,2
1,0.0,1978,34.770717,-9.395642,20,True,False,3,0.490476,0.679928,...,0.983733,0.898294,0.665860,0.924005,1.028301,0.793384,0.625598,0.953780,0.708085,0
2,0.0,0,36.115056,-6.279268,0,True,True,0,1.140351,0.909414,...,0.907233,0.750000,1.206856,0.924005,0.602198,0.793384,0.861791,0.981335,1.162505,0
3,10.0,1639,37.147432,-3.187555,25,True,True,14,0.237132,0.746142,...,0.471044,0.103704,0.699696,0.435330,0.602198,0.793384,0.625598,0.677102,0.677651,0
5,50.0,28,39.286124,-6.972403,6922,True,False,0,0.441379,0.909414,...,0.614650,0.898294,0.854037,0.477028,0.602198,0.793384,0.625598,0.981335,1.162505,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74240,0.0,1183,37.007726,-3.280868,350,True,False,1,0.619651,0.746142,...,0.528367,0.898294,0.699696,0.537719,1.086092,0.793384,0.861791,0.666342,0.677651,0
74242,0.0,0,33.724987,-8.940758,0,False,False,0,0.867585,0.679928,...,0.907233,0.898294,0.699696,0.704487,0.683778,0.793384,0.625598,0.735053,0.677651,0
74243,0.0,1188,33.963539,-1.429477,95,True,False,29,0.836638,0.914632,...,0.907233,2.000000,0.854037,0.924005,1.028301,0.793384,0.625598,1.588321,1.162505,2
74246,50.0,1428,35.630481,-7.710549,1,True,True,11,1.125000,0.679928,...,0.907233,0.962963,0.699696,0.924005,0.602198,0.793384,1.946499,0.677102,0.677651,2


In [18]:
# Load the validation ('test') predictors; the first CSV column becomes the index and rows are sorted by it.
data_test = pd.read_csv(fname__validation_predictors, index_col=0).sort_index()
print(f"loaded {fname__validation_predictors}\n")

data_test.info()

loaded preprocessing-results/wrangled-labeled-data-test-d6a76cbd7fb874262adbd0e4a705fc31.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 4 to 74229
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   amount_tsh                        5940 non-null   float64
 1   gps_height                        5940 non-null   int64  
 2   longitude                         5940 non-null   float64
 3   latitude                          5940 non-null   float64
 4   population                        5940 non-null   int64  
 5   public_meeting                    5940 non-null   bool   
 6   permit                            5940 non-null   bool   
 7   pump_age                          5940 non-null   int64  
 8   installer_target_encoded          5940 non-null   float64
 9   basin_target_encoded              5940 non-null   float64
 10  region_code_target_encoded        594

In [19]:
data_test.index

Int64Index([    4,     6,    40,    41,    46,    66,    75,    84,    96,
              105,
            ...
            74140, 74145, 74158, 74166, 74177, 74195, 74199, 74214, 74215,
            74229],
           dtype='int64', name='id', length=5940)

In [20]:
# Load the validation ('test') labels, indexed and sorted the same way as the validation predictors.
y_test = pd.read_csv(fname__validation_labels, index_col=0).sort_index()
print(f"loaded {fname__validation_labels}\n")

y_test.info()

loaded preprocessing-results/labels-test-d6a76cbd7fb874262adbd0e4a705fc31.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 4 to 74229
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  5940 non-null   object
dtypes: object(1)
memory usage: 92.8+ KB


In [21]:
y_test.index

Int64Index([    4,     6,    40,    41,    46,    66,    75,    84,    96,
              105,
            ...
            74140, 74145, 74158, 74166, 74177, 74195, 74199, 74214, 74215,
            74229],
           dtype='int64', name='id', length=5940)

In [22]:
# Encode the validation labels with the mapping already fitted on the training labels.
# Re-fitting on the validation labels (fit_transform) could silently produce a different
# label -> integer mapping whenever a class is absent from the validation split.
# NOTE(review): assumes LabelEncodingTransformer follows the scikit-learn transformer API and
# exposes transform() - confirm against scjpnlib.utils.skl_transformers.
y_test = let_labels.transform(y_test)
y_test.status_group.unique()

array([2, 0, 1])

In [23]:
let_labels.labelencoder.classes_

array(['functional', 'functional needs repair', 'non functional'],
      dtype=object)

In [24]:
classes_test = list(let_labels.labelencoder.classes_)

In [25]:
# Column-wise join of validation predictors and encoded labels, keeping only ids present in both.
data_test_with_target = pd.concat([data_test, y_test], axis=1, join='inner')
data_test_with_target

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,population,public_meeting,permit,pump_age,installer_target_encoded,basin_target_encoded,...,scheme_management_target_encoded,scheme_name_target_encoded,extraction_type_target_encoded,management_target_encoded,payment_type_target_encoded,water_quality_target_encoded,quantity_target_encoded,source_target_encoded,waterpoint_type_target_encoded,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0,36.164893,-6.099289,0,True,True,0,0.600000,0.909414,...,0.907233,0.898294,0.594223,0.924005,1.086092,0.793384,1.946499,0.953780,0.708085,2
6,0.0,0,33.229880,-3.852983,0,True,True,0,1.274706,0.776687,...,0.907233,0.898294,1.645411,0.924005,1.028301,0.793384,0.747809,0.953780,1.692281,2
40,0.0,1636,37.579079,-7.082958,560,True,True,7,0.000000,0.909414,...,0.907233,0.333333,0.699696,0.924005,1.028301,0.793384,0.625598,0.735053,0.677651,0
41,50.0,5,39.228066,-6.903605,200,False,False,3,0.860597,0.909414,...,0.614650,0.898294,0.854037,0.477028,0.602198,0.793384,0.625598,0.981335,0.677651,0
46,0.0,0,36.151647,-6.137493,0,True,True,0,1.000000,0.909414,...,0.907233,0.898294,0.594223,0.924005,1.028301,0.793384,0.861791,0.953780,0.708085,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74195,0.0,305,39.013796,-10.500101,30,True,True,20,1.315508,1.177415,...,0.907233,0.898294,0.699696,0.924005,1.028301,0.793384,0.747809,0.677102,0.677651,1
74199,500.0,1267,30.123296,-4.248082,840,True,False,35,1.274706,0.922970,...,0.828901,0.898294,0.995249,0.924005,0.741799,0.793384,0.625598,0.981335,1.162505,2
74214,0.0,0,33.166752,-2.975367,0,False,True,0,0.836638,0.914632,...,0.907233,0.898294,0.623434,0.924005,1.028301,1.709233,1.439161,0.981335,0.708085,2
74215,0.0,0,32.933201,-4.350711,0,False,False,0,0.602210,0.922970,...,0.907233,0.898294,0.594223,0.924005,1.028301,0.793384,0.625598,0.953780,0.708085,0


In [26]:
# Stack the train and validation frames (predictors + status_group) row-wise and sort by index;
# this is the frame clf_build_final_model fits on.
data_ALL_labeled_with_target = pd.concat([data_train_with_target, data_test_with_target], axis=0).sort_index()
data_ALL_labeled_with_target.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 74247
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   amount_tsh                        59400 non-null  float64
 1   gps_height                        59400 non-null  int64  
 2   longitude                         59400 non-null  float64
 3   latitude                          59400 non-null  float64
 4   population                        59400 non-null  int64  
 5   public_meeting                    59400 non-null  bool   
 6   permit                            59400 non-null  bool   
 7   pump_age                          59400 non-null  int64  
 8   installer_target_encoded          59400 non-null  float64
 9   basin_target_encoded              59400 non-null  float64
 10  region_code_target_encoded        59400 non-null  float64
 11  district_code_target_encoded      59400 non-null  float64
 12  sche

In [27]:
data_ALL_labeled_with_target.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            74235, 74236, 74237, 74238, 74239, 74240, 74242, 74243, 74246,
            74247],
           dtype='int64', name='id', length=59400)

<p><br>
<h2>Build Models (Run Trials)</h2>

<h3>General functions for building Classifiers and running trials</h3>

In [28]:
# utility function to render HTML and optionally log (append) it to file
def render_HTML(the_html, fname=None):
    """Display an HTML snippet in the notebook and optionally append its text to a log file.

    Args:
        the_html: HTML markup to render.
        fname: when not None, the markup is stripped of tags and the remaining text
            (plus a newline) is appended to this file via the notebook's FileManager.
    """
    display(HTML(the_html))
    if fname is not None:
        # Name the parser explicitly: without it BeautifulSoup picks whichever parser happens
        # to be installed (and warns), so the logged text could differ between environments.
        fm.append_text_file(BeautifulSoup(the_html, "html.parser").text + '\n', fname)

In [29]:
def get_trials_log_fname(clf):
    """Return the trials log path for `clf`'s class, or None when trial logging is disabled."""
    if not LOG_MODEL_TRIALS:
        return None
    return f"{MODEL_RESULTS_DIR}{clf.__class__.__name__}-trials.log"

In [30]:
def gs_find_best_params(clf, param_grid):
    render_HTML(f"<br><br>param_grid for {clf.__class__.__name__} GridSearch:<br><pre>{params}</pre>", fname=get_trials_log_fname(clf))
    grid_clf = GridSearchCV(
        clf, 
        param_grid, 
        cv=K, 
        n_jobs=-1
        , verbose=20
    )
    # with joblib.parallel_backend('dask'):
    #     %time _ = grid_clf.fit(data_train, y_train)
    %time _ = grid_clf.fit(data_train, y_train)
    return grid_clf.best_params_

In [31]:
def clf_fit(clf, data_train, y_train):
    """Fit `clf` on the given predictors/labels, printing the elapsed time, and return it.

    The `data_train` / `y_train` parameters shadow the notebook globals of the same name,
    so the estimator is fitted on whatever the caller passes in.
    """
    # with joblib.parallel_backend('dask'):
    #     %time clf.fit(data_train, y_train)
    %time clf.fit(data_train, y_train)
    return clf

In [32]:
def summarize_preds(clf, X, y, preds, dataset_name, classes):
    """Render (and log) accuracy and the classification report for one dataset.

    Args:
        clf: classifier that produced `preds`; only used to pick the log file.
        X: predictors (not used by this function).
        y: true labels.
        preds: predicted labels.
        dataset_name: label shown in front of the accuracy line.
        classes: class names passed to classification_report as `target_names`.

    Returns:
        The accuracy score as returned by accuracy_score.
    """
    log_fname = get_trials_log_fname(clf)
    spacer = "<p><br>"

    render_HTML(spacer, fname=log_fname)
    score = accuracy_score(y, preds)
    render_HTML(f"{dataset_name} Accuracy: {round(score*100,4)}", fname=log_fname)

    render_HTML(spacer, fname=log_fname)
    render_HTML(f"<pre>{classification_report(y, preds, target_names=classes)}</pre>", fname=log_fname)

    return score

In [33]:
def get_feat_importances(clf):
    """Return (feature name, importance) pairs for a fitted `clf`, most important first.

    Feature names come from the columns of the notebook-global `data_train`, paired by
    position with `clf.feature_importances_`.
    """
    # Read the attribute once: on ensemble estimators it is a computed property, so looking
    # it up inside the loop recomputed the importances for every feature.
    importances = clf.feature_importances_
    feat_importances = {feat: importances[i] for i, feat in enumerate(data_train.columns)}
    return sorted(feat_importances.items(), key=lambda item: item[1], reverse=True)

In [34]:
def clf_run_trial(clf, params_to_try, best_parameters_so_far, run_trials_gridsearch=False):
    """Run one trial for `clf`: choose parameters, fit on the training data, and score it.

    Args:
        clf: unfitted estimator; it is configured with the chosen parameters and fitted.
        params_to_try: with `run_trials_gridsearch` True, a parameter grid (name -> list of
            values) handed to gs_find_best_params; otherwise a dict of concrete parameter values.
            In the grid-search case it is updated in place with the pinned best-so-far values.
        best_parameters_so_far: dict of parameters accumulated over earlier trials; updated in place.
        run_trials_gridsearch: whether to grid-search `params_to_try` or use it as-is.

    Returns:
        Tuple (fitted clf, best_parameters_so_far, model_results).

    Side effects: reads the notebook globals data_train/y_train/data_test/y_test and
    classes_train/classes_test, and (re)creates the entry for the classifier's class name in
    the global `model_results['modeling_results']`. Cross-validation scores are only computed
    when `run_trials_gridsearch` is False, and their messages are displayed but not written
    to the trials log (no `fname` is passed).
    """
    if run_trials_gridsearch:
        # pin every parameter found so far to a single-value list so the search only varies the new ones
        for param_name, param_value in best_parameters_so_far.items():
            params_to_try.update({param_name: [param_value]})
        best_parameters = gs_find_best_params(clf, params_to_try)
    else:
        best_parameters = params_to_try
    best_parameters_so_far.update(best_parameters)

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML(f"Grid Search {'(previously) ' if not run_trials_gridsearch else ''}found the following optimal parameters: ", fname=get_trials_log_fname(clf))
    render_HTML(f"<pre>{pprint.pformat(best_parameters_so_far, indent=4)}</pre>", fname=get_trials_log_fname(clf))

    # flatten the single-column label frames to 1-d arrays
    _y_train = y_train.status_group.ravel()
    _y_test = y_test.status_group.ravel()
    
    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Fitting classifier...", fname=get_trials_log_fname(clf))
    clf = clf.set_params(**best_parameters_so_far)
    clf = clf_fit(clf, data_train, _y_train)
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))

    # results are keyed by class name, so a later trial of the same classifier replaces the earlier entry
    _class_name = clf.__class__.__name__
    model_results['modeling_results'][_class_name] = {}

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Predicting labels on training data...", fname=get_trials_log_fname(clf))
    pred_train = clf.predict(data_train)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))
    _accuracy_train = summarize_preds(clf, data_train, _y_train, pred_train, 'Training', classes_train)
    model_results['modeling_results'][_class_name]['accuracy'] = {}
    model_results['modeling_results'][_class_name]['accuracy']['train'] = _accuracy_train

    if not run_trials_gridsearch:
        render_HTML("<p><br>")
        render_HTML("Computing cross-val score on training data...")
        model_results['modeling_results'][_class_name]['cv_score'] = {}
        cv_score_train = cross_val_score(clf, data_train, _y_train, cv=cross_val_score_K)
        mean_cv_score_train = np.mean(cv_score_train)
        model_results['modeling_results'][_class_name]['cv_score']['train'] = mean_cv_score_train
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_train}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_train}")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML(f"Predicting labels on testing data...", fname=get_trials_log_fname(clf))
    pred_test = clf.predict(data_test)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))
    _accuracy_test = summarize_preds(clf, data_test, _y_test, pred_test, 'Testing', classes_test)
    model_results['modeling_results'][_class_name]['accuracy']['test'] = _accuracy_test
    model_results['modeling_results'][_class_name]['feature_importances'] = get_feat_importances(clf)
    
    if not run_trials_gridsearch:
        render_HTML("<p><br>")
        render_HTML(f"Computing cross-val score on testing data...")
        # NOTE(review): this cross-validates (i.e. fits) on the test split itself - confirm that is intended
        cv_score_test = cross_val_score(clf, data_test, _y_test, cv=cross_val_score_K)
        mean_cv_score_test = np.mean(cv_score_test)
        model_results['modeling_results'][_class_name]['cv_score']['test'] = mean_cv_score_test
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_test}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_test}")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Feature Importances:", fname=get_trials_log_fname(clf))
    render_HTML(f"<pre>{pprint.pformat(model_results['modeling_results'][_class_name]['feature_importances'], indent=4)}</pre><p><br><br>", fname=get_trials_log_fname(clf))

    return clf, best_parameters_so_far, model_results

In [35]:
def clf_build_final_model(clf, params):
    """Fit `clf` with `params` on all labeled data and predict the unlabeled data.

    Args:
        clf: unfitted estimator.
        params: concrete parameter values applied with `clf.set_params`.

    Returns:
        Tuple (fitted clf, predictions for `data_unlabeled`). The previous version returned
        nothing, which discarded both the model and its predictions.

    Reads the notebook globals `data_ALL_labeled_with_target` and `data_unlabeled`.
    """
    labeled = data_ALL_labeled_with_target

    # The message previously contained an unformatted "{}" placeholder and a typo.
    render_HTML(f"Fitting classifier {clf.__class__.__name__} to ALL LABELED data...")
    # Use the `params` argument; the old code referenced `best_parameters_so_far`, a name that
    # only exists inside clf_run_trial, so this function raised NameError when called.
    clf = clf.set_params(**params)
    # Pass the target as a 1-d Series rather than a one-column frame.
    clf = clf_fit(clf, labeled.drop('status_group', axis=1), labeled['status_group'])
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Predicting labels of UNLABELED data...", fname=get_trials_log_fname(clf))
    # NOTE(review): `data_unlabeled` is not defined in the visible part of the notebook (its
    # file-name cell is commented out) - it must be loaded before this function is called.
    pred_unlabeled = clf.predict(data_unlabeled)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))

    return clf, pred_unlabeled

<p><br>
<h2>Initialize Dask-Client (to Dask backend for parallelization) <i>(DISABLED for now)</i></h2>

In [36]:
# Optionally start a Dask client. With models_config['dask']['use'] set to False (as in the
# config displayed above) this cell does nothing.
if models_config['dask']['use']:
    if models_config['dask']['is_remote']:
        # for Kubernetes dask scheduler/worker cluster in GCP - but this costs money to run the cluster AND requires a lot more work for data parallelization!
        dask_client = Client(f"tcp://{models_config['dask']['remote']['scheduler_address']}:8786")
    else:
        # local
        dask_client = Client( #spawns a local cluster
            n_workers=models_config['dask']['local']['n_workers'], 
            threads_per_worker=models_config['dask']['local']['n_jobs'], 
            memory_limit=models_config['dask']['local']['memory_limit'] # memory_limit is per worker
        )

    # NOTE(review): nested inside the `if`, this bare expression is not the cell's final
    # top-level expression, so it probably displays nothing - confirm intent.
    dask_client

<p><br>
<h3>Decision Tree Classifier</h3>
<h4>Trials</h4>

In [37]:
# Decision Tree trials: run each configured trial (grid search or its recorded 'last_best'
# parameters) and carry the best parameters forward from trial to trial.
run_dtclf = models_config['DecisionTreeClassifier']['run']
render_HTML(f"models_config['DecisionTreeClassifier']['run']: {run_dtclf}")

if run_dtclf:
    trials = models_config['DecisionTreeClassifier']['trials']

    # render_HTML without a file name is exactly display(HTML(...)); using it keeps this cell
    # consistent with the Random Forest and XGB trial cells.
    render_HTML(f"models_config['DecisionTreeClassifier']['trials']['run']: {trials['run']}")
    if trials['run']:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            params = trial['gridsearch']['last_best'] if not trial['gridsearch']['run'] else trial['gridsearch']['param_grid']
            # Bind the fitted tree to dtclf: it was previously assigned to rfclf, a copy/paste
            # slip that stored a DecisionTreeClassifier under the Random Forest variable.
            dtclf, best_parameters, model_results = clf_run_trial(DecisionTreeClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

    else:
        # trials disabled: fall back to the parameters stored in the config
        best_parameters = models_config['DecisionTreeClassifier']['params']

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

CPU times: user 871 ms, sys: 221 ms, total: 1.09 s
Wall time: 6.4 s


CPU times: user 143 ms, sys: 17.7 ms, total: 161 ms
Wall time: 163 ms


<p><br>
<h4>Build Final Model with best params</h4>

In [38]:
if run_dtclf:
    # Pin the seed, then refit and evaluate with grid search off (the default), which also
    # computes the cross-validation scores inside clf_run_trial.
    best_parameters.update({'random_state': SEED})
    dtclf, _, model_results = clf_run_trial(DecisionTreeClassifier(), best_parameters, best_parameters)

CPU times: user 130 ms, sys: 16.6 ms, total: 146 ms
Wall time: 147 ms


<p><br>
<h3>Random Forest Classifier</h3>
<h4>Trials</h4>

In [39]:
# Random Forest trials: run each configured trial (grid search or its recorded 'last_best'
# parameters) and carry the best parameters forward from trial to trial.
run_rfclf = models_config['RandomForestClassifier']['run']
render_HTML(f"models_config['RandomForestClassifier']['run']: {run_rfclf}")

if run_rfclf:
    trials = models_config['RandomForestClassifier']['trials']

    render_HTML(f"models_config['RandomForestClassifier']['trials']['run']: {trials['run']}")
    if not trials['run']:
        # trials disabled: fall back to the parameters stored in the config
        best_parameters = models_config['RandomForestClassifier']['params']
    else:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            params = trial['gridsearch']['param_grid'] if trial['gridsearch']['run'] else trial['gridsearch']['last_best']
            # a grid needs list-valued entries, a plain parameter dict needs the scalar
            params['n_jobs'] = [-1] if trial['gridsearch']['run'] else -1
            # note that best_parameters will be set to those used in the last trial
            rfclf, best_parameters, model_results = clf_run_trial(
                RandomForestClassifier(),
                params,
                best_parameters,
                run_trials_gridsearch=trial['gridsearch']['run']
            )

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   45.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   45.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

CPU times: user 2min 38s, sys: 2.86 s, total: 2min 41s
Wall time: 1h 3min 4s


CPU times: user 2min 34s, sys: 2.17 s, total: 2min 36s
Wall time: 21.4 s


<p><br>
<h4>Build Final Model with best params</h4>

In [40]:
if run_rfclf:
    # Pin the seed and enable parallel, verbose fitting, then refit and evaluate with grid
    # search off (the default), which also computes the cross-validation scores.
    best_parameters.update({'n_jobs':-1, 'verbose':1, 'random_state': SEED})
    rfclf, _ , model_results = clf_run_trial(RandomForestClassifier(), best_parameters, best_parameters)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.4s


CPU times: user 2min 35s, sys: 2.1 s, total: 2min 37s
Wall time: 21.3 s


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   20.9s finished


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.9s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   17.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   17.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elap

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.5s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 

<p><br><br><br>
<h3>XGBClassifier</h3>
<h4>Trials</h4>

In [41]:
# XGBoost trials: run each configured trial (grid search or its recorded 'last_best'
# parameters) and carry the best parameters forward from trial to trial.
run_xgbclf = models_config['XGBClassifier']['run']
render_HTML(f"models_config['XGBClassifier']['run']: {run_xgbclf}")

if run_xgbclf:
    trials = models_config['XGBClassifier']['trials']

    render_HTML(f"models_config['XGBClassifier']['trials']['run']: {trials['run']}")
    if not trials['run']:
        # trials disabled: fall back to the parameters stored in the config
        best_parameters = models_config['XGBClassifier']['params']
    else:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            params = trial['gridsearch']['param_grid'] if trial['gridsearch']['run'] else trial['gridsearch']['last_best']
            # a grid needs list-valued entries, a plain parameter dict needs the scalar
            params['n_jobs'] = [-1] if trial['gridsearch']['run'] else -1
            # note that best_parameters will be set to those used in the last trial
            xgbclf, best_parameters, model_results = clf_run_trial(
                XGBClassifier(),
                params,
                best_parameters,
                run_trials_gridsearch=trial['gridsearch']['run']
            )

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  7

CPU times: user 4min, sys: 2.08 s, total: 4min 2s
Wall time: 7h 27min 53s


CPU times: user 4min 9s, sys: 1.4 s, total: 4min 10s
Wall time: 32.1 s


<p><br>
<h4>Build Final Validation Model with best params</h4>

In [42]:
if run_xgbclf:
    # Pin the seed and set parallelism/verbosity, then refit and evaluate with grid search
    # off (the default), which also computes the cross-validation scores.
    best_parameters.update({'n_jobs':-1, 'verbosity':1, 'random_state': SEED})
    xgbclf, _ , model_results = clf_run_trial(XGBClassifier(), best_parameters, best_parameters)

CPU times: user 3min 56s, sys: 1.35 s, total: 3min 57s
Wall time: 30.6 s


<p><br><br>
<h4>Save Validation Results to File</h4>

In [43]:
# Persist the accumulated results (digest, seed and per-classifier metrics) as JSON.
fm.save_json(model_results, model_results_fname)
print("updated " + model_results_fname)

updated model-results/models-results-d6a76cbd7fb874262adbd0e4a705fc31.json


<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Train + Validation)</h4>