<h2>Import Libraries</h2>

In [1]:
from scjpnlib.utils.file_io import FileManager
import os 
import scjpnlib.utils as scjpnutils
import pickle
import json
from IPython.core.display import HTML, Markdown
import html2text
from bs4 import BeautifulSoup
import pprint

import pandas as pd
import numpy as np

from scjpnlib.utils.skl_transformers import LabelEncodingTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import scjpnlib.utils.preprocessing_strategy_transformers as scjpnpreprocessing

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier 
from xgboost import XGBClassifier
from sklearn import svm
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

# NOTE(review): no cell visible in this notebook reads `n_jobs` (GridSearchCV and the
# estimators are given n_jobs=-1 directly) - confirm whether it is still needed.
n_jobs = 8

# When True, get_trials_log_fname() returns a per-classifier log path, so render_HTML()
# also appends trial output to that file; when False it returns None and nothing is logged.
LOG_MODEL_TRIALS = True

## Read Configs for this Experiment

In [2]:
# `fm` is used throughout the notebook for JSON loads/saves, download validation and log appends.
fm = FileManager()

# Both config files are parsed as JSON (via load_json) despite their .txt extension.
preprocessing_config = fm.load_json('preprocessing-config.txt')
models_config = fm.load_json('models-config.txt')
# Echo the model config so the settings of this run are recorded in the cell output.
models_config

{'logging': {'dir': 'model-results'},
 'cross_validation': {'k': {'grid_search': 3, 'score': 5}},
 'dask': {'use': False,
  'is_remote': False,
  'local': {'n_workers': 1, 'n_jobs': 8, 'memory_limit': '16GB'},
  'remote': {'scheduler_address': '35.230.13.87'}},
 'SEED': 42,
 'DecisionTreeClassifier': {'run': True,
  'trials': {'run': True,
   'array': [{'gridsearch': {'run': True,
      'param_grid': {'criterion': ['entropy', 'gini'],
       'splitter': ['best'],
       'max_depth': [10, 50, 75, None],
       'min_samples_split': [2],
       'max_features': ['auto', 'sqrt', 'log2']},
      'last_best': {'criterion': 'gini',
       'max_depth': 10,
       'max_features': 'auto',
       'min_samples_split': 2,
       'random_state': 42,
       'splitter': 'best'}}}]},
  'params': {'criterion': 'gini',
   'max_depth': 10,
   'max_features': 'auto',
   'min_samples_split': 2,
   'random_state': 42,
   'splitter': 'best'}},
 'RandomForestClassifier': {'run': True,
  'trials': {'run': False,

In [3]:
# A 'data_cached' section in the models config takes precedence over the
# preprocessing config as the source of data locations.
is_data_cached = 'data_cached' in models_config
if is_data_cached:
    data_config = models_config['data_cached']
else:
    data_config = fm.load_json('preprocessing-config.txt')

In [4]:
# Directory prefixes are either "<dir>/" or the empty string (files in the working
# directory), so they can be prepended to a file name unconditionally.
WRANGLED_DATA_DIR = (
    data_config['wrangled_data']['dir'] + "/"
    if len(data_config['wrangled_data']['dir']) > 0
    else ""
)

SAVE_LABELS_DIR = (
    data_config['labels']['dir'] + "/"
    if len(data_config['labels']['dir']) > 0
    else ""
)

In [5]:
# Mapping of feature-group name -> preprocessing option; it drives the transformation
# loop further below. Presumably written by the most recent preprocessing run - confirm.
preprocessing_spec = fm.load_json(WRANGLED_DATA_DIR + "preprocessing-spec-last.json")

In [6]:
# The digest identifies the preprocessing spec; it is embedded in every output file name.
if is_data_cached:
    digest = data_config['digest']
else:
    digest = scjpnutils.json_to_md5_hash_digest(preprocessing_spec)
print(f"digest for last preprocessing spec: {digest}")

digest for last preprocessing spec: 8a48b1bceea8e4f07957e8a9efbe76d8


In [7]:
# Predictor files are addressed relative to the wrangled-data dir, label files relative
# to the labels dir. The 'test' split is what this notebook calls the validation set.
fname__train_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(
    data_config,
    preprocessing_config,
    data_kwargs=dict(is_labels=False, type='train', is_cached=is_data_cached)
)
fname__train_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(
    data_config,
    preprocessing_config,
    data_kwargs=dict(is_labels=True, type='train', is_cached=is_data_cached)
)
fname__validation_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(
    data_config,
    preprocessing_config,
    data_kwargs=dict(is_labels=False, type='test', is_cached=is_data_cached)
)
fname__validation_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(
    data_config,
    preprocessing_config,
    data_kwargs=dict(is_labels=True, type='test', is_cached=is_data_cached)
)
# The unlabeled file name is requested without an 'is_cached' entry.
fname__unlabeled_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(
    data_config,
    preprocessing_config,
    data_kwargs=dict(is_labels=False, type='unlabeled')
)

In [8]:
# Resolve (and create if needed) the directory that receives model results, trial logs
# and prediction CSVs. MODEL_RESULTS_DIR ends up as "<dir>/" or "" (working directory).
if len(models_config['logging']['dir']) > 0:
    MODEL_RESULTS_DIR = models_config['logging']['dir']
    # exist_ok=True tolerates an already-existing directory, but (unlike swallowing
    # FileExistsError) still fails loudly if a regular file is in the way.
    os.makedirs(os.path.join(os.getcwd(), MODEL_RESULTS_DIR), exist_ok=True)
    MODEL_RESULTS_DIR += "/"
else:
    MODEL_RESULTS_DIR = ""

model_results_fname = MODEL_RESULTS_DIR + scjpnutils.get_model_result_fname(data_config, preprocessing_spec, data_kwargs={'is_cached':is_data_cached})
print(f"modeling results will be saved to: {model_results_fname}")

modeling results will be saved to: model-results/models-results-8a48b1bceea8e4f07957e8a9efbe76d8.json


In [9]:
# Accumulator for everything written to the results JSON at the end of the notebook;
# clf_run_trial() fills 'modeling_results' with one entry per classifier class name.
model_results = {
    'digest': digest,
    'modeling_results': {},
}

In [10]:
# One seed from the config is used for the train/test split and for the seeded
# classifiers; it is recorded in the results so a run can be reproduced.
SEED = models_config['SEED']
model_results['seed'] = SEED

In [11]:
# K: number of cross-validation folds used by GridSearchCV in gs_find_best_params().
K = models_config['cross_validation']['k']['grid_search'] # num folds for cross-val
# cross_val_score_K: number of folds used by cross_val_score() in clf_run_trial().
cross_val_score_K = models_config['cross_validation']['k']['score']

<p><br>

## Load Data

In [12]:
fname_unlabeled_predictors = preprocessing_config['official_data']['unlabeled_predictors']['local_fname']
fname_labeled_predictors = preprocessing_config['official_data']['labeled_predictors']['local_fname']
fname_labels = preprocessing_config['official_data']['labels']['local_fname']

# Map each local file to ITS OWN source URL. Previously all three files pointed at the
# unlabeled-predictors URL, so a missing labeled/labels file would be validated or
# fetched against the wrong source.
# NOTE(review): assumes 'labeled_predictors' and 'labels' carry a 'url' key like
# 'unlabeled_predictors' does - confirm against preprocessing-config.txt.
ds_map = {
    fname_unlabeled_predictors: preprocessing_config['official_data']['unlabeled_predictors']['url'],
    fname_labeled_predictors: preprocessing_config['official_data']['labeled_predictors']['url'],
    fname_labels: preprocessing_config['official_data']['labels']['url']
}

fm.validate_download(ds_map)

In [13]:
# Join predictors and labels column-wise on their index (first CSV column); the inner
# join keeps only rows present in both files. Sorting fixes a deterministic row order.
labeled_with_target = pd.concat(
    [pd.read_csv(fname_labeled_predictors, index_col=0), pd.read_csv(fname_labels, index_col=0)],
    axis=1,
    join='inner',
).sort_index()

In [14]:
# Double brackets keep `labels` a one-column DataFrame (not a Series).
labels = labeled_with_target[['status_group']]

In [15]:
# Encode the string target; `labels_encoded.status_group` holds the encoded values.
let_labels = LabelEncodingTransformer(['status_group'])
labels_encoded = let_labels.fit_transform(labels)
# `classes` is indexed by encoded value to recover the original label string
# (the XGBClassifier prediction cell decodes with classes[encoded]).
classes = list(let_labels.labelencoder.classes_)
classes

['functional', 'functional needs repair', 'non functional']

In [16]:
# Adds the encoded target as a new column; note this mutates labeled_with_target in place.
labeled_with_target['status_group_encoded'] = labels_encoded.status_group

In [17]:
# Predictors only: strip both the raw and the encoded target columns.
X_labeled = labeled_with_target.drop(columns=['status_group', 'status_group_encoded'])

In [18]:
# Unlabeled predictors (no target available), indexed by the first CSV column.
data_unlabeled = pd.read_csv(fname_unlabeled_predictors, index_col=0)

## Prepare Test/Train Data

In [19]:
# Seeded hold-out split of the labeled data; y_train / y_test hold the un-encoded labels.
# No `stratify` argument is passed, so class proportions are not enforced per split.
data_train, data_test, y_train, y_test = train_test_split(X_labeled, labels, test_size=preprocessing_config['test_ratio'], random_state=SEED)

In [20]:
# Encoded labels for exactly the training rows, selected by index.
y_train_encoded = labels_encoded.loc[y_train.index]

In [21]:
# Training predictors (still un-preprocessed at this point) plus the encoded target column.
data_train_with_target = pd.concat([data_train, y_train_encoded], axis=1, join='inner')

In [22]:
# Encoded labels for exactly the test (validation) rows, selected by index.
y_test_encoded = labels_encoded.loc[y_test.index]

In [23]:
# Test predictors (still un-preprocessed at this point) plus the encoded target column.
data_test_with_target = pd.concat([data_test, y_test_encoded], axis=1, join='inner')

In [24]:
# All labeled rows (train + test) with the encoded target, in index order.
# NOTE: built from the RAW splits - these features have not been preprocessed.
data_ALL_labeled_with_target = pd.concat([data_train_with_target, data_test_with_target], axis=0).sort_index()

<p><br>

## Apply Preprocessing Transformations to Training Data

In [25]:
# Start from a single placeholder step; the strategy transformers instantiated in the
# preprocessing loop below receive this pipeline and (per their log output) append
# their steps to it, so the same steps can later be replayed on test/unlabeled data.
pipeline_data_preprocessor = Pipeline(steps=[('passthrough', None)], verbose=True)

# Per-feature summary of the raw training data; the trailing ';' suppresses the return value echo.
scjpnutils.analyze_values(data_train, 'data_train BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,96,"[0.0, 1000.0, 250.0, 5000.0, 3000.0, 2400.0, 5...",0.001796,99.82,0,0.0,
1,date_recorded,object,352,"[2013-03-06, 2013-02-05, 2011-07-24, 2012-11-0...",0.006584,99.34,0,0.0,
2,funder,object,1799,"[Kiliwater, Unicef, Danida, Tasaf/tlc, Dhv, Am...",0.033651,96.63,3269,0.061149,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
3,gps_height,int64,2415,"[1424, 1358, 0, 299, -14, 1490, 1421, 915, 891...",0.045174,95.48,0,0.0,
4,installer,object,2035,"[Kiliwater, TWESA, Central government, TASAF/T...",0.038066,96.19,3287,0.061485,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
5,longitude,float64,51753,"[37.61840909, 30.6557619, 33.81319755, 33.1151...",0.96807,3.19,0,0.0,
6,latitude,float64,51755,"[-3.26320247, -3.53625, -9.47660713, -4.835553...",0.968107,3.19,0,0.0,
7,wpt_name,object,34085,"[Kwa Moris Assenga, Bavunja Primary School, Za...",0.637579,36.24,0,0.0,
8,num_private,int64,61,"[0, 34, 65, 32, 1, 8, 41, 6, 15, 3, 698, 1402,...",0.001141,99.89,0,0.0,
9,basin,object,9,"[Pangani, Lake Tanganyika, Lake Nyasa, Rufiji,...",0.000168,99.98,0,0.0,


In [None]:
# Work on a copy so the raw training split is left untouched by this cell.
data_train_preprocessed = data_train.copy()

# Apply the preprocessing strategies in spec order; each one is fit on (and transforms)
# the output of the previous one, so the order of preprocessing_spec matters.
for group_name, preprocessing_option in preprocessing_spec.items():
    composite_transformer = scjpnpreprocessing.instantiate_strategy_transformer(
        preprocessing_option, 
        group_name,
        pipeline_data_preprocessor,
        verbose=True
    )
    # Presumably renders a human-readable description of the strategy - confirm in scjpnlib.
    scjpnpreprocessing.html_prettify_strategy_transformer_description(composite_transformer)
    # The encoded target is passed along with the training data when fitting.
    data_train_preprocessed = composite_transformer.fit_transform(data_train_preprocessed, y_train_encoded.status_group)
    display(HTML("<p><br><br>"))

strategy appended step ['replace "amount_tsh" outliers with mean', <scjpnlib.utils.skl_transformers.SimpleValueTransformer object at 0x7faf82fca910>] to pipeline
strategy "replace "amount_tsh" outliers with mean" transformation is COMPLETE!


strategy appended step ['drop feature: date_recorded', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7faf831291d0>] to pipeline
strategy "drop feature: date_recorded" transformation is COMPLETE!
strategy appended step ['leave feature as is (do nothing): construction_year', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf831359e0>)] to pipeline
strategy "leave feature as is (do nothing): construction_year" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): gps_height', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf7d8a17a0>)] to pipeline
strategy "leave feature as is (do nothing): gps_height" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): latitude', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf7d8a18c0>)] to pipeline
strategy "leave feature as is (do nothing): latitude" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): longitude', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf831357a0>)] to pipeline
strategy "leave feature as is (do nothing): longitude" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): num_private', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf83135680>)] to pipeline
strategy "leave feature as is (do nothing): num_private" transformation is COMPLETE!


** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region_code', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf6959ce60>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'region_code' in X match those that were previously fit
added new feature: region_code_target_encoded
strategy appended step ['drop after target encoding: region_code', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7faf695a0210>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region_code" dropped feature 'region_code' after target encoding
strategy transformation of feature 'region_code' to 'region_code_target_encoded' is COMPL

strategy appended step ['OneHot Encode: district_code', <scjpnlib.utils.skl_transformers.OneHotEncodingTransformer object at 0x7faf83129390>] to pipeline
strategy "OneHot Encode: district_code" transformation is COMPLETE!


strategy appended step ['tfidf normalize string-categorical: ward', FunctionTransformer(func=<function C__tfidf_normalize__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7faf7d8a1a70>)] to pipeline


In [None]:
# Same per-feature summary as before preprocessing, for a before/after comparison.
scjpnutils.analyze_values(data_train_preprocessed, 'data_train AFTER preprocessing');

In [None]:
# From here on `data_train` is the PREPROCESSED training data; the raw split is no
# longer reachable under this name.
data_train = data_train_preprocessed
# True if any null survived preprocessing in the training data; the DecisionTree and
# RandomForest sections are skipped when nulls remain.
null_labeled = data_train.isnull().values.any()

<p><br>

## Apply Preprocessing Transformations (using the `Pipeline`) to Testing Data

In [None]:
# Per-feature summary of the raw test (validation) split.
scjpnutils.analyze_values(data_test, 'data_test BEFORE preprocessing');

In [None]:
# transform() only: the steps collected while preprocessing the training data are
# replayed on the test split without fitting anything on it.
data_test_preprocessed = pipeline_data_preprocessor.transform(data_test)

In [None]:
# Per-feature summary of the preprocessed test (validation) split.
scjpnutils.analyze_values(data_test_preprocessed, 'data_test AFTER preprocessing');

In [None]:
# From here on `data_test` is the PREPROCESSED test (validation) data.
data_test = data_test_preprocessed
# The labeled data contains nulls if EITHER split does. (`and` would only flag the
# case where both splits contain nulls, letting a null-bearing test split through to
# classifiers that cannot handle missing values.)
null_labeled = null_labeled or data_test.isnull().values.any()
# All preprocessed labeled rows in index order; used to fit the final models.
data_ALL_labeled = pd.concat([data_train, data_test], axis=0).sort_index()

<p><br>

## Apply Preprocessing Transformations (using the `Pipeline`) to Unlabeled Data

In [None]:
# Per-feature summary of the raw unlabeled data.
scjpnutils.analyze_values(data_unlabeled, 'data_unlabeled BEFORE preprocessing');

In [None]:
# transform() only: replay the steps collected on the training data; nothing is fit here.
data_unlabeled_preprocessed = pipeline_data_preprocessor.transform(data_unlabeled)

In [None]:
# Per-feature summary of the preprocessed unlabeled data.
scjpnutils.analyze_values(data_unlabeled_preprocessed, 'data_unlabeled AFTER preprocessing');

In [None]:
# From here on `data_unlabeled` is the PREPROCESSED unlabeled data.
data_unlabeled = data_unlabeled_preprocessed
# True if any null survived preprocessing; gates the DecisionTree / RandomForest sections.
null_unlabeled = data_unlabeled.isnull().values.any()

<p><br>
<h2>Build Models (Run Trials)</h2>

<h3>General functions for building Classifiers and running trials</h3>

In [None]:
# utility function to render HTML and optionally log (append) it to file
def render_HTML(the_html, fname=None):
    """Display an HTML fragment in the notebook and optionally append its text to a file.

    Parameters
    ----------
    the_html : str
        HTML markup to render in the cell output.
    fname : str or None
        When given, the markup is stripped of its tags and the remaining text (plus a
        newline) is appended to this file via the notebook-level FileManager `fm`.
        When None, nothing is written.
    """
    display(HTML(the_html))
    if fname is not None:
        # Name the parser explicitly: without it BeautifulSoup picks whichever parser is
        # installed (so results can differ between environments) and warns on every call.
        fm.append_text_file(BeautifulSoup(the_html, 'html.parser').text + '\n', fname)

In [None]:
def get_trials_log_fname(clf):
    """Return the trial-log path for `clf`'s class, or None when trial logging is off.

    The path is built from the results directory, the classifier's class name and the
    preprocessing digest, so every classifier type gets its own log per digest.
    """
    if not LOG_MODEL_TRIALS:
        return None
    return f"{MODEL_RESULTS_DIR}{clf.__class__.__name__}-{digest}-trials.log"

In [None]:
def gs_find_best_params(clf, param_grid):
    render_HTML(f"<br><br>param_grid for {clf.__class__.__name__} GridSearch:<br><pre>{params}</pre>", fname=get_trials_log_fname(clf))
    grid_clf = GridSearchCV(
        clf, 
        param_grid, 
        cv=K, 
        n_jobs=-1
        , verbose=20
    )
    # with joblib.parallel_backend('dask'):
    #     %time _ = grid_clf.fit(data_train, y_train)
    %time _ = grid_clf.fit(data_train, y_train)
    return grid_clf.best_params_

In [None]:
def clf_fit(clf, data_train, y_train):
    """Fit `clf` on the given data (reporting the elapsed time) and return it.

    The `data_train` / `y_train` parameters shadow the notebook-level variables of the
    same names; only the arguments passed in are used.
    """
    # with joblib.parallel_backend('dask'):
    #     %time clf.fit(data_train, y_train)
    # %time is an IPython line magic, so this function only runs inside IPython/Jupyter.
    %time clf.fit(data_train, y_train)
    return clf

In [None]:
def summarize_preds(clf, X, y, preds, dataset_name, classes):
    """Render (and log) the accuracy and classification report for a set of predictions.

    Parameters
    ----------
    clf : estimator
        Only used to pick the trial log file.
    X : unused
        Accepted for call-site symmetry; not read by this function.
    y, preds : array-like
        True and predicted labels.
    dataset_name : str
        Prefix for the accuracy line, e.g. 'Training' or 'Testing'.
    classes : list of str
        Display names for the report rows, in encoded-label order.

    Returns
    -------
    float
        The accuracy score (fraction, not percent).
    """
    log_fname = get_trials_log_fname(clf)
    spacer = "<p><br>"

    render_HTML(spacer, fname=log_fname)
    accuracy = accuracy_score(y, preds)
    render_HTML(f"{dataset_name} Accuracy: {round(accuracy*100,4)}", fname=log_fname)

    render_HTML(spacer, fname=log_fname)
    report = classification_report(y, preds, target_names=classes)
    render_HTML(f"<pre>{report}</pre>", fname=log_fname)

    return accuracy

In [None]:
def get_feat_importances(clf, feature_names=None):
    """Return `clf`'s feature importances as (feature, importance) pairs, largest first.

    Parameters
    ----------
    clf : estimator
        A fitted estimator. Estimators without a `feature_importances_` attribute
        yield None.
    feature_names : sequence of str, optional
        Names matching the positions of `clf.feature_importances_`. Defaults to the
        columns of the notebook-level `data_train`.

    Returns
    -------
    list of (str, float) or None
        Pairs sorted by descending importance, with importances as plain Python floats
        so the result can be stored in the JSON-bound `model_results`.
    """
    if not hasattr(clf, 'feature_importances_'):
        return None

    if feature_names is None:
        feature_names = data_train.columns

    # Read the attribute once: on some estimators it is a computed property, so
    # re-reading it for every feature repeats that work per column.
    importances = clf.feature_importances_
    feat_importances = {
        feat: float(importances[i])  # numpy scalars (e.g. float32) are not JSON-serializable
        for i, feat in enumerate(list(feature_names))
    }
    return sorted(feat_importances.items(), key=lambda item: item[1], reverse=True)

In [None]:
def clf_run_trial(clf, params_to_try, best_parameters_so_far, run_trials_gridsearch=False):
    """Run one trial for `clf`: optionally grid-search, then fit, predict and record metrics.

    Parameters
    ----------
    clf : estimator
        Classifier to configure (via ``set_params``) and fit.
    params_to_try : dict
        With ``run_trials_gridsearch=True``: a GridSearchCV grid (name -> list of
        candidates). Otherwise: concrete parameters (name -> value) used as-is.
        It is not modified by this function.
    best_parameters_so_far : dict
        Parameters carried over from earlier trials. In grid-search mode each of them
        is pinned to its single value in the searched grid. Updated IN PLACE with this
        trial's parameters and returned.
    run_trials_gridsearch : bool, default False
        Whether to grid-search. When False, cross_val_score is additionally computed
        on the training data and on the testing data.

    Returns
    -------
    tuple
        ``(clf, best_parameters_so_far, model_results)`` - the fitted classifier, the
        updated parameters and the notebook-level results dict.

    Side effects
    ------------
    Replaces ``model_results['modeling_results'][<class name>]`` with this trial's
    accuracies, cv scores (non-grid-search mode only) and feature importances, and
    renders progress to the notebook and (where a log file name is passed) to the trial log.
    Reads the notebook-level `data_train`, `data_test`, `y_train_encoded`,
    `y_test_encoded` and `classes`.
    """
    # The log file depends only on the classifier's class, which never changes below.
    log_fname = get_trials_log_fname(clf)

    if run_trials_gridsearch:
        # Search a copy of the grid: pinning earlier winners directly on the caller's
        # dict would silently rewrite the grid held in the config.
        param_grid = dict(params_to_try)
        for param_name, param_value in best_parameters_so_far.items():
            param_grid[param_name] = [param_value]
        best_parameters = gs_find_best_params(clf, param_grid)
    else:
        best_parameters = params_to_try
    best_parameters_so_far.update(best_parameters)

    render_HTML("<p><br>", fname=log_fname)
    render_HTML(f"Grid Search {'(previously) ' if not run_trials_gridsearch else ''}found the following optimal parameters: ", fname=log_fname)
    render_HTML(f"<pre>{pprint.pformat(best_parameters_so_far, indent=4)}</pre>", fname=log_fname)

    # Flat arrays of encoded labels for the estimator and metric calls below.
    _y_train = y_train_encoded.status_group.ravel()
    _y_test = y_test_encoded.status_group.ravel()

    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Fitting classifier...", fname=log_fname)
    clf = clf.set_params(**best_parameters_so_far)
    clf = clf_fit(clf, data_train, _y_train)
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)

    # Results are keyed by class name, so a later trial of the same class overwrites this entry.
    _class_name = clf.__class__.__name__
    model_results['modeling_results'][_class_name] = {}

    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Predicting labels on training data...", fname=log_fname)
    pred_train = clf.predict(data_train)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)
    _accuracy_train = summarize_preds(clf, data_train, _y_train, pred_train, 'Training', classes)
    model_results['modeling_results'][_class_name]['accuracy'] = {}
    model_results['modeling_results'][_class_name]['accuracy']['train'] = _accuracy_train

    if not run_trials_gridsearch:
        # Rendered to the notebook only (no log file name is passed here).
        render_HTML("<p><br>")
        render_HTML("Computing cross-val score on training data...")
        model_results['modeling_results'][_class_name]['cv_score'] = {}
        cv_score_train = cross_val_score(clf, data_train, _y_train, cv=cross_val_score_K)
        mean_cv_score_train = np.mean(cv_score_train)
        model_results['modeling_results'][_class_name]['cv_score']['train'] = mean_cv_score_train
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_train}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_train}")

    render_HTML("<p><br>", fname=log_fname)
    render_HTML("Predicting labels on testing data...", fname=log_fname)
    pred_test = clf.predict(data_test)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=log_fname)
    _accuracy_test = summarize_preds(clf, data_test, _y_test, pred_test, 'Testing', classes)
    model_results['modeling_results'][_class_name]['accuracy']['test'] = _accuracy_test
    model_results['modeling_results'][_class_name]['feature_importances'] = get_feat_importances(clf)

    if not run_trials_gridsearch:
        # Rendered to the notebook only (no log file name is passed here).
        render_HTML("<p><br>")
        render_HTML("Computing cross-val score on testing data...")
        cv_score_test = cross_val_score(clf, data_test, _y_test, cv=cross_val_score_K)
        mean_cv_score_test = np.mean(cv_score_test)
        model_results['modeling_results'][_class_name]['cv_score']['test'] = mean_cv_score_test
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_test}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_test}")

    if hasattr(clf, 'feature_importances_'):
        render_HTML("<p><br>", fname=log_fname)
        render_HTML("Feature Importances:", fname=log_fname)
        render_HTML(f"<pre>{pprint.pformat(model_results['modeling_results'][_class_name]['feature_importances'], indent=4)}</pre><p><br><br>", fname=log_fname)

    return clf, best_parameters_so_far, model_results

In [None]:
def clf_build_final_model(clf, params):
    """Fit `clf` with `params` on ALL labeled data and predict the unlabeled data.

    Parameters
    ----------
    clf : estimator
        Classifier to configure and fit.
    params : dict
        Parameters applied via ``set_params`` before fitting.

    Returns
    -------
    tuple
        ``(clf, pred_unlabeled)`` - the fitted classifier and its (encoded) predictions
        for the notebook-level `data_unlabeled`. (Previously nothing was returned, so
        both were lost.)

    Notes
    -----
    Trains on the PREPROCESSED labeled data (`data_ALL_labeled`), like the per-model
    "final model" cells, because the predictions are made on preprocessed unlabeled
    data; `data_ALL_labeled_with_target` holds raw, un-preprocessed features.
    """
    render_HTML(f"Fitting classifier {clf.__class__.__name__} to ALL LABELED data...")
    # Use the `params` argument (no `best_parameters_so_far` exists in this scope).
    clf = clf.set_params(**params)
    # Select the encoded targets by index so they line up row-for-row with the features.
    y_all_encoded = labels_encoded.status_group.loc[data_ALL_labeled.index]
    clf = clf_fit(clf, data_ALL_labeled, y_all_encoded)
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Predicting labels of UNLABELED data...", fname=get_trials_log_fname(clf))
    pred_unlabeled = clf.predict(data_unlabeled)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))

    return clf, pred_unlabeled

<p><br>
<h2>Initialize Dask Client (connection to the Dask backend for parallelization) <i>(DISABLED for now)</i></h2>

In [None]:
# Only connect to / spawn a Dask cluster when the config asks for it.
if models_config['dask']['use']:
    if models_config['dask']['is_remote']:
        # for Kubernetes dask scheduler/worker cluster in GCP - but this costs money to run the cluster AND requires a lot more work for data parallelization!
        dask_client = Client(f"tcp://{models_config['dask']['remote']['scheduler_address']}:8786")
    else:
        # local
        dask_client = Client( #spawns a local cluster
            n_workers=models_config['dask']['local']['n_workers'],
            threads_per_worker=models_config['dask']['local']['n_jobs'],
            memory_limit=models_config['dask']['local']['memory_limit'] # memory_limit is per worker
        )

    # A bare expression inside an `if` block is never echoed by the notebook, so the
    # client summary has to be displayed explicitly.
    display(dask_client)

<p><br>
<h3>Decision Tree Classifier</h3>
<h4>Trials</h4>

In [None]:
# The decision tree is skipped when disabled in the config or when nulls remain in the data.
run_dtclf = models_config['DecisionTreeClassifier']['run'] and not null_labeled and not null_unlabeled
render_HTML(f"models_config['DecisionTreeClassifier']['run']: {models_config['DecisionTreeClassifier']['run']}; data_ALL_labeled_with_target.isnull().values.any(): {null_labeled}; data_unlabeled.isnull().values.any(): {null_unlabeled}")

if run_dtclf:
    trials = models_config['DecisionTreeClassifier']['trials']

    render_HTML(f"models_config['DecisionTreeClassifier']['trials']['run']: {trials['run']}")
    if trials['run']:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            # Either search the grid or replay the best parameters found by an earlier search.
            params = trial['gridsearch']['last_best'] if not trial['gridsearch']['run'] else trial['gridsearch']['param_grid']
            # Bind the result to `dtclf`: it was previously assigned to `rfclf`, which
            # leaked a DecisionTreeClassifier into the random-forest variable.
            dtclf, best_parameters, model_results = clf_run_trial(DecisionTreeClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

    else:
        # Trials disabled: fall back to the parameters stored in the config.
        best_parameters = models_config['DecisionTreeClassifier']['params']

<p><br>
<h4>Build Final Validation Model with best params</h4>

In [None]:
if run_dtclf:
    # Seed the validation model so its scores are reproducible.
    best_parameters['random_state'] = SEED
    dtclf, _, model_results = clf_run_trial(DecisionTreeClassifier(), best_parameters, best_parameters)

    # Encoded predictions for the unlabeled data (not echoed: this is inside an `if`).
    pred_unlabeled = dtclf.predict(data_unlabeled)
    pred_unlabeled

In [None]:
if run_dtclf:
    # Bag 500 copies of the tuned decision tree. Empty dicts are passed for both parameter
    # arguments, so the ensemble is fit exactly as constructed here (no grid search).
    # Its results are stored under the 'BaggingClassifier' key of model_results.
    bagging_dtclf, _, model_results = clf_run_trial(BaggingClassifier(base_estimator=dtclf, n_estimators=500, random_state=SEED), {}, {})
    pred_unlabeled = bagging_dtclf.predict(data_unlabeled)

<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Train + Validation)</h4>

In [None]:
if run_dtclf:
    dtclf = DecisionTreeClassifier()
    # The final model is built without the validation seed.
    # pop() with a default keeps this cell re-runnable: `del` raised KeyError the second
    # time the cell was executed, because the key was already gone.
    best_parameters.pop('random_state', None)
    dtclf.set_params(**best_parameters)
    # Fit on every preprocessed labeled row (train + validation) with the encoded target.
    dtclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [None]:
if run_dtclf:
    pred_unlabeled = dtclf.predict(data_unlabeled)
    # The model was fit on label-ENCODED targets, so its predictions are encoded values.
    # Decode them through `classes` (as the XGBClassifier cell does) so the
    # 'status_group' column holds the label strings rather than integers.
    df_pred_unlabeled = pd.concat(
        [
            data_unlabeled.reset_index()[['id']],
            pd.DataFrame([classes[sg_encoded] for sg_encoded in pred_unlabeled], columns=['status_group'])
        ],
        axis=1
    ).set_index('id')
    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}DecisionTreeClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<p><br>
<h3>Random Forest Classifier</h3>
<h4>Trials</h4>

In [None]:
# The random forest is skipped when disabled in the config or when nulls remain in the data.
run_rfclf = models_config['RandomForestClassifier']['run'] and not null_labeled and not null_unlabeled
render_HTML(f"models_config['RandomForestClassifier']['run']: {models_config['RandomForestClassifier']['run']}; data_ALL_labeled_with_target.isnull().values.any(): {null_labeled}; data_unlabeled.isnull().values.any(): {null_unlabeled}")

if run_rfclf:
    trials = models_config['RandomForestClassifier']['trials']

    render_HTML(f"models_config['RandomForestClassifier']['trials']['run']: {trials['run']}")
    if not trials['run']:
        # Trials disabled: fall back to the parameters stored in the config.
        best_parameters = models_config['RandomForestClassifier']['params']
    else:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            if trial['gridsearch']['run']:
                # Grid search: every entry, including n_jobs, must be a list of candidates.
                params = trial['gridsearch']['param_grid']
                params['n_jobs'] = [-1]
            else:
                # Replay of an earlier search result: plain parameter values.
                params = trial['gridsearch']['last_best']
                params['n_jobs'] = -1
            rfclf, best_parameters, model_results = clf_run_trial(RandomForestClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

<p><br>
<h4>Build Final Validation Model with best params</h4>

In [None]:
if run_rfclf:
    # Use all cores, report progress, and seed the validation model for reproducibility.
    best_parameters.update(n_jobs=-1, verbose=1, random_state=SEED)
    rfclf, _ , model_results = clf_run_trial(RandomForestClassifier(), best_parameters, best_parameters)

<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Train + Validation)</h4>

In [None]:
if run_rfclf:
    rfclf = RandomForestClassifier()
    # The final model is built without the validation seed.
    # pop() with a default keeps this cell re-runnable: `del` raised KeyError the second
    # time the cell was executed, because the key was already gone.
    best_parameters.pop('random_state', None)
    rfclf.set_params(**best_parameters)
    # Fit on every preprocessed labeled row (train + validation) with the encoded target.
    rfclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [None]:
if run_rfclf:
    pred_unlabeled = rfclf.predict(data_unlabeled)
    # The model was fit on label-ENCODED targets, so its predictions are encoded values.
    # Decode them through `classes` (as the XGBClassifier cell does) so the
    # 'status_group' column holds the label strings rather than integers.
    df_pred_unlabeled = pd.concat(
        [
            data_unlabeled.reset_index()[['id']],
            pd.DataFrame([classes[sg_encoded] for sg_encoded in pred_unlabeled], columns=['status_group'])
        ],
        axis=1
    ).set_index('id')
    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}RandomForestClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<p><br><br><br>
<h3>XGBClassifier</h3>
<h4>Trials</h4>

In [None]:
# Unlike the tree/forest sections, this flag does not depend on the null checks.
run_xgbclf = models_config['XGBClassifier']['run']
render_HTML(f"models_config['XGBClassifier']['run']: {run_xgbclf}")

if run_xgbclf:
    trials = models_config['XGBClassifier']['trials']

    render_HTML(f"models_config['XGBClassifier']['trials']['run']: {trials['run']}")
    if not trials['run']:
        # Trials disabled: fall back to the parameters stored in the config.
        best_parameters = models_config['XGBClassifier']['params']
    else:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            if trial['gridsearch']['run']:
                # Grid search: every entry, including n_jobs, must be a list of candidates.
                params = trial['gridsearch']['param_grid']
                params['n_jobs'] = [-1]
            else:
                # Replay of an earlier search result: plain parameter values.
                params = trial['gridsearch']['last_best']
                params['n_jobs'] = -1
            xgbclf, best_parameters, model_results = clf_run_trial(XGBClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

<p><br>
<h4>Build Final Validation Model with best params</h4>

In [None]:
if run_xgbclf:
    # Use all cores, set the verbosity level, and seed the validation model for reproducibility.
    best_parameters.update(n_jobs=-1, verbosity=1, random_state=SEED)
    xgbclf, _ , model_results = clf_run_trial(XGBClassifier(), best_parameters, best_parameters)

<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Train + Validation)</h4>

In [None]:
if run_xgbclf:
    xgbclf = XGBClassifier()
    # The final model is built without the validation seed.
    # pop() with a default keeps this cell re-runnable: `del` raised KeyError the second
    # time the cell was executed, because the key was already gone.
    best_parameters.pop('random_state', None)
    xgbclf.set_params(**best_parameters)
    # Fit on every preprocessed labeled row (train + validation) with the encoded target.
    xgbclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [None]:
if run_xgbclf:
    pred_unlabeled = xgbclf.predict(data_unlabeled)
    # Predictions are encoded values; translate each one back to its label string
    # before pairing it with the row id.
    decoded_status_groups = [classes[sg_encoded] for sg_encoded in pred_unlabeled]
    df_pred_unlabeled = pd.concat(
        [
            data_unlabeled.reset_index()[['id']],
            pd.DataFrame(decoded_status_groups, columns=['status_group'])
        ],
        axis=1
    ).set_index('id')
    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}XGBClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<p><br><br><br>
<h3>Trying Out Other Classifiers (CatBoost, Bagged SVC, Voting)</h3>
<h4>Trials</h4>

In [None]:
# A single-candidate "grid": every list holds one value, so the grid search only
# cross-validates this one CatBoost configuration.
params = {
    'max_depth': [10], 
    'n_estimators': [1000], 
    'loss_function': ['MultiClass'], 
    'grow_policy': ['Depthwise'], 
    'nan_mode': ['Min'],
    'random_state': [SEED]
}
best_parameters = {}
# This cell is not guarded by a run flag from models_config; it always executes.
cbclf, best_parameters , model_results = clf_run_trial(
    CatBoostClassifier(verbose=False), 
    params, 
    best_parameters,
    run_trials_gridsearch=True
)

In [None]:
# Same configuration as the grid above, as plain values: fit directly (no grid search),
# which also makes clf_run_trial compute the cross-val scores.
params = {   
    'max_depth': 10,
    'grow_policy': 'Depthwise',
    'loss_function': 'MultiClass',
    'n_estimators': 1000,
    'nan_mode': 'Min'
}
# `params` is passed as both the parameters to try and the accumulated best parameters.
cbclf, _ , model_results = clf_run_trial(
    CatBoostClassifier(random_state=SEED, verbose=False), 
    params, 
    params,
    run_trials_gridsearch=False
)

In [None]:
# NOTE(review): same parameters as the previous cell, but the classifier is constructed
# without random_state=SEED and without verbose=False. Running it replaces `cbclf` and
# the 'CatBoostClassifier' entry in model_results with this unseeded run - confirm
# whether this cell is still wanted.
params = {   
    'grow_policy': 'Depthwise',
    'loss_function': 'MultiClass',
    'max_depth': 10,
    'n_estimators': 1000,
    'nan_mode': 'Min'
}
cbclf, _ , model_results = clf_run_trial(
    CatBoostClassifier(), 
    params, 
    params,
    run_trials_gridsearch=False
)

In [None]:
# Disabled experiment: grid search over SVC kernels, kept for reference.
# params = {'kernel': ['poly', 'rbf', 'sigmoid'], 'break_ties': [False, True], 'random_state': [SEED]}
# best_parameters = {}
# svmclf, best_parameters , model_results = clf_run_trial(
#     svm.SVC(), 
#     params, 
#     best_parameters,
#     run_trials_gridsearch=True
# )

# BaggingClassifier(base_estimator=dtclf, n_estimators=500, random_state=SEED), {}, {}
# NOTE(review): this bags SVC estimators but binds the result to `bagging_dtclf`,
# replacing the bagged decision tree from the DecisionTree section. The
# 'BaggingClassifier' entry in model_results is overwritten as well, because results
# are keyed by class name. The cell is not guarded by a run flag or the null checks.
bagging_dtclf, _, model_results = clf_run_trial(BaggingClassifier(base_estimator=svm.SVC(), n_estimators=100, random_state=SEED), {}, {})
# These predictions are not written to file by any cell visible in this notebook.
pred_unlabeled = bagging_dtclf.predict(data_unlabeled)

In [None]:
# Hard-voting ensemble over the random forest, CatBoost and XGBoost models built above;
# this cell therefore requires all three sections to have run.
params = {
    'estimators': [('rf', rfclf), ('cb', cbclf), ('xgb', xgbclf)],
    'voting': 'hard'
}
# The same settings configure the constructor and serve as the trial's parameters.
vclf, _ , model_results = clf_run_trial(VotingClassifier(**params), params, params)

<p><br><br>
<h4>Save All Model Results to File</h4>

In [None]:
# Persist everything the trials recorded (digest, seed, per-classifier results).
fm.save_json(model_results, model_results_fname)
print(f"updated {model_results_fname}")