<h2>Import Libraries</h2>

In [1]:
from scjpnlib.utils.file_io import FileManager
import os 
import scjpnlib.utils as scjpnutils
import pickle
import json
from IPython.core.display import HTML, Markdown
import html2text
from bs4 import BeautifulSoup
import pprint

import pandas as pd
import numpy as np

from scjpnlib.utils.skl_transformers import LabelEncodingTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import scjpnlib.utils.strategy_transformers as scjpnstrattransformers

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier 
# import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

# Degree of parallelism configured for this experiment.
# NOTE(review): the grid search defined further down hard-codes n_jobs=-1 — confirm where this value is consumed.
n_jobs = 8

# When True, get_trials_log_fname() returns a per-classifier log file name, so rendered trial output is also appended to disk.
LOG_MODEL_TRIALS = True

## Read Configs for this Experiment

In [2]:
# Project file helper; used below for loading JSON configs, validating downloads and appending to log files.
fm = FileManager()

# Both experiment configs are loaded as JSON even though the files carry a .txt extension.
preprocessing_config = fm.load_json('preprocessing-config.txt')
models_config = fm.load_json('models-config.txt')
# Echo the model config so the notebook output records the settings in effect for this run.
models_config

{'logging': {'dir': 'model-results'},
 'cross_validation': {'k': {'grid_search': 3, 'score': 5}},
 'dask': {'use': False,
  'is_remote': False,
  'local': {'n_workers': 1, 'n_jobs': 8, 'memory_limit': '16GB'},
  'remote': {'scheduler_address': '35.230.13.87'}},
 'SEED': 42,
 'DecisionTreeClassifier': {'run': True,
  'trials': {'run': True,
   'array': [{'gridsearch': {'run': True,
      'param_grid': {'criterion': ['entropy', 'gini'],
       'splitter': ['best'],
       'max_depth': [10, 50, 75, None],
       'min_samples_split': [2],
       'max_features': ['auto', 'sqrt', 'log2']},
      'last_best': {'criterion': 'gini',
       'max_depth': 10,
       'max_features': 'auto',
       'min_samples_split': 2,
       'random_state': 42,
       'splitter': 'best'}}}]},
  'params': {'criterion': 'gini',
   'max_depth': 10,
   'max_features': 'auto',
   'min_samples_split': 2,
   'random_state': 42,
   'splitter': 'best'}},
 'RandomForestClassifier': {'run': True,
  'trials': {'run': False,

In [3]:
# A 'data_cached' section in the models config takes precedence over the preprocessing config on disk.
is_data_cached = 'data_cached' in models_config
if is_data_cached:
    data_config = models_config['data_cached']
else:
    data_config = fm.load_json('preprocessing-config.txt')

In [4]:
# Directory prefixes (with trailing slash) for wrangled data and saved labels.
# An empty configured directory means "no prefix".
_wrangled_dir = data_config['wrangled_data']['dir']
WRANGLED_DATA_DIR = (_wrangled_dir + "/") if len(_wrangled_dir) > 0 else ""

_labels_dir = data_config['labels']['dir']
SAVE_LABELS_DIR = (_labels_dir + "/") if len(_labels_dir) > 0 else ""

In [5]:
# The last saved preprocessing spec; its items drive which strategy transformers are instantiated later in the notebook.
preprocessing_spec = fm.load_json(WRANGLED_DATA_DIR + "preprocessing-spec-last.json")

In [6]:
# Identify this run by the cached digest when one is configured, otherwise by hashing the preprocessing spec.
if is_data_cached:
    digest = data_config['digest']
else:
    digest = scjpnutils.json_to_md5_hash_digest(preprocessing_spec)
print(f"digest for last preprocessing spec: {digest}")

digest for last preprocessing spec: 69feae104538a094014ae5cec62abdf0


In [7]:
def _data_fname(is_labels, split, **extra_kwargs):
    """Ask scjpnutils for the data file name of one split (predictors or labels)."""
    return scjpnutils.get_data_fname(
        data_config,
        preprocessing_config,
        data_kwargs={'is_labels': is_labels, 'type': split, **extra_kwargs}
    )

# Predictors live under the wrangled-data directory, labels under the labels directory.
# Note that the "validation" files use the 'test' split type.
fname__train_predictors = WRANGLED_DATA_DIR + _data_fname(False, 'train', is_cached=is_data_cached)
fname__train_labels = SAVE_LABELS_DIR + _data_fname(True, 'train', is_cached=is_data_cached)
fname__validation_predictors = WRANGLED_DATA_DIR + _data_fname(False, 'test', is_cached=is_data_cached)
fname__validation_labels = SAVE_LABELS_DIR + _data_fname(True, 'test', is_cached=is_data_cached)
# The unlabeled file name is requested without an 'is_cached' entry.
fname__unlabeled_predictors = WRANGLED_DATA_DIR + _data_fname(False, 'unlabeled')

In [8]:
# Resolve (and create if necessary) the directory that receives modeling results.
if len(models_config['logging']['dir']) > 0:
    MODEL_RESULTS_DIR = models_config['logging']['dir']
    # exist_ok=True tolerates an already-existing directory but still raises
    # if the path exists as a regular file, instead of silently carrying on.
    # os.path.join keeps relative dirs under the cwd and respects absolute ones.
    os.makedirs(os.path.join(os.getcwd(), MODEL_RESULTS_DIR), exist_ok=True)
    MODEL_RESULTS_DIR += "/"
else:
    MODEL_RESULTS_DIR = ""

# File that will hold the results of this modeling run.
model_results_fname = MODEL_RESULTS_DIR + scjpnutils.get_model_result_fname(data_config, preprocessing_spec, data_kwargs={'is_cached':is_data_cached})
print(f"modeling results will be saved to: {model_results_fname}")

modeling results will be saved to: model-results/models-results-69feae104538a094014ae5cec62abdf0.json


In [9]:
# Results container for this run, keyed to the preprocessing digest.
model_results = {
    'digest': digest,
    'modeling_results': {},
}

In [10]:
# Random seed taken from the models config; it is reused for the train/test split below.
SEED = models_config['SEED']
# Store the seed alongside the results so the run can be reproduced.
model_results['seed'] = SEED

In [11]:
# Fold counts from the config: one for grid search, one for cross_val_score.
_cv_folds = models_config['cross_validation']['k']
K = _cv_folds['grid_search']  # num folds for cross-val
cross_val_score_K = _cv_folds['score']

<p><br>

## Load Data

In [12]:
official_data = preprocessing_config['official_data']

# Local file names of the three official data sets.
fname_unlabeled_predictors = official_data['unlabeled_predictors']['local_fname']
fname_labeled_predictors = official_data['labeled_predictors']['local_fname']
fname_labels = official_data['labels']['local_fname']

# Map each local file to ITS OWN source URL. Using the unlabeled-predictors URL
# for all three would associate the labeled predictors and the labels with the wrong data set.
# NOTE(review): assumes 'labeled_predictors' and 'labels' carry a 'url' key like 'unlabeled_predictors' does — confirm against preprocessing-config.txt.
ds_map = {
    fname_unlabeled_predictors: official_data['unlabeled_predictors']['url'],
    fname_labeled_predictors: official_data['labeled_predictors']['url'],
    fname_labels: official_data['labels']['url']
}

# Presumably checks that every mapped file is available locally, fetching it from its URL if not — verify in FileManager.
fm.validate_download(ds_map)

In [13]:
# Read predictors and labels (first CSV column is the index in both files).
labeled_predictors_raw = pd.read_csv(fname_labeled_predictors, index_col=0)
labels_raw = pd.read_csv(fname_labels, index_col=0)

# Join them column-wise on the shared index, keeping only rows present in both, ordered by index.
labeled_with_target = pd.concat(
    [labeled_predictors_raw, labels_raw],
    axis=1,
    join='inner'
).sort_index()

In [14]:
# Select the target as a one-column DataFrame (double brackets); this is the object handed to the label encoder in the next cell.
labels = labeled_with_target[['status_group']]

In [15]:
# Fit the project's label-encoding transformer on the target column and encode it in one step.
let_labels = LabelEncodingTransformer(['status_group'])
labels_encoded = let_labels.fit_transform(labels)
# Class names known to the fitted encoder; echoed as the cell output.
classes = list(let_labels.labelencoder.classes_)
classes

['functional', 'functional needs repair', 'non functional']

In [16]:
# Attach the encoded target next to the original string target (aligned on the shared index).
labeled_with_target['status_group_encoded'] = labels_encoded.status_group

In [17]:
# Predictors only: strip both the raw and the encoded target columns.
X_labeled = labeled_with_target.drop(columns=['status_group', 'status_group_encoded'])

In [72]:
# Unlabeled predictors, read with the first CSV column as the index (same convention as the labeled files).
data_unlabeled = pd.read_csv(fname_unlabeled_predictors, index_col=0)

## Prepare Test/Train Data

In [19]:
# Hold out a test split; the ratio comes from the preprocessing config and the seed makes the split repeatable.
data_train, data_test, y_train, y_test = train_test_split(
    X_labeled,
    labels,
    test_size=preprocessing_config['test_ratio'],
    random_state=SEED
)

In [20]:
# Encoded labels for exactly the rows in the training split, selected by y_train's index.
y_train_encoded = labels_encoded.loc[y_train.index]

In [21]:
# Training predictors plus their encoded target; the inner join keeps only indices present in both frames.
data_train_with_target = pd.concat([data_train, y_train_encoded], axis=1, join='inner')

In [22]:
# Encoded labels for exactly the rows in the test split, selected by y_test's index.
y_test_encoded = labels_encoded.loc[y_test.index]

In [23]:
# Test predictors plus their encoded target; the inner join keeps only indices present in both frames.
data_test_with_target = pd.concat([data_test, y_test_encoded], axis=1, join='inner')

In [24]:
# Stack both splits back into a single labeled frame, ordered by index.
data_ALL_labeled_with_target = pd.concat([data_train_with_target, data_test_with_target], axis=0).sort_index()

<p><br>

## Apply Preprocessing Transformations to Training Data

In [25]:
# Start from a placeholder pipeline; it is handed to every strategy transformer instantiated below,
# and the logged output shows each strategy appending its steps to it.
pipeline_data_preprocessor = Pipeline(steps=[('passthrough', None)], verbose=True)

# Snapshot of the raw training features; the trailing ';' suppresses the echo of the call's return value.
scjpnutils.analyze_values(data_train, 'data_train BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,96,"[0.0, 1000.0, 250.0, 5000.0, 3000.0, 2400.0, 5...",0.001796,99.82,0,0.0,
1,date_recorded,object,352,"[2013-03-06, 2013-02-05, 2011-07-24, 2012-11-0...",0.006584,99.34,0,0.0,
2,funder,object,1799,"[Kiliwater, Unicef, Danida, Tasaf/tlc, Dhv, Am...",0.033651,96.63,3269,0.061149,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
3,gps_height,int64,2415,"[1424, 1358, 0, 299, -14, 1490, 1421, 915, 891...",0.045174,95.48,0,0.0,
4,installer,object,2035,"[Kiliwater, TWESA, Central government, TASAF/T...",0.038066,96.19,3287,0.061485,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
5,longitude,float64,51753,"[37.61840909, 30.6557619, 33.81319755, 33.1151...",0.96807,3.19,0,0.0,
6,latitude,float64,51755,"[-3.26320247, -3.53625, -9.47660713, -4.835553...",0.968107,3.19,0,0.0,
7,wpt_name,object,34085,"[Kwa Moris Assenga, Bavunja Primary School, Za...",0.637579,36.24,0,0.0,
8,num_private,int64,61,"[0, 34, 65, 32, 1, 8, 41, 6, 15, 3, 698, 1402,...",0.001141,99.89,0,0.0,
9,basin,object,9,"[Pangani, Lake Tanganyika, Lake Nyasa, Rufiji,...",0.000168,99.98,0,0.0,


In [26]:
# Work on a copy so the frame bound to `data_train` is not modified while strategies are applied.
data_train_preprocessed = data_train.copy()

# Apply one strategy transformer per entry of the preprocessing spec, in the spec's own order.
for group_name, preprocessing_option in preprocessing_spec.items():
    # The shared pipeline is passed in so the strategy can register its steps on it
    # (see the "strategy appended step ... to pipeline" lines in the output).
    composite_transformer = scjpnstrattransformers.instantiate_strategy_transformer(
        preprocessing_option, 
        group_name,
        pipeline_data_preprocessor
    )
    scjpnstrattransformers.html_prettify_strategy_transformer_description(composite_transformer)
    # Each step transforms the output of the previous one; the encoded target is supplied to every fit.
    data_train_preprocessed = composite_transformer.fit_transform(data_train_preprocessed, y_train_encoded.status_group)
    # Visual spacer between the logs of consecutive strategies.
    display(HTML("<p><br><br>"))

** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: extraction_type', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaaa43eb90>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'extraction_type' in X match those that were previously fit
added new feature: extraction_type_target_encoded
strategy appended step ['drop after target encoding: extraction_type', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb89b90>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: extraction_type" dropped feature 'extraction_type' after target encoding
strategy transformation of feature 'extraction_type' to 'extracti

strategy appended step ['drop feature: extraction_type_group', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaaa43dc10>] to pipeline
strategy "drop feature: extraction_type_group" transformation is COMPLETE!


strategy appended step ['drop feature: extraction_type_class', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaaa43dd90>] to pipeline
strategy "drop feature: extraction_type_class" transformation is COMPLETE!


strategy appended step ['replace "amount_tsh" values (0) with nan', FunctionTransformer(func=<function C__replace_0_with_nan__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaaa43ea70>)] to pipeline
strategy "replace "amount_tsh" values (0) with nan" transformation is COMPLETE!


** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: source', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaaa43ec20>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'source' in X match those that were previously fit
added new feature: source_target_encoded
strategy appended step ['drop after target encoding: source', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcac6238410>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: source" dropped feature 'source' after target encoding
strategy transformation of feature 'source' to 'source_target_encoded' is COMPLETE!


strategy appended step ['drop feature: source_type', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcac6238050>] to pipeline
strategy "drop feature: source_type" transformation is COMPLETE!


strategy appended step ['drop feature: source_class', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb89bd0>] to pipeline
strategy "drop feature: source_class" transformation is COMPLETE!


** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: quantity', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb8bd40>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'quantity' in X match those that were previously fit
added new feature: quantity_target_encoded
strategy appended step ['drop after target encoding: quantity', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94050>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: quantity" dropped feature 'quantity' after target encoding
strategy transformation of feature 'quantity' to 'quantity_target_encoded' is COMPLETE!


strategy appended step ['drop feature: quantity_group', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcac621dfd0>] to pipeline
strategy "drop feature: quantity_group" transformation is COMPLETE!


** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: water_quality', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb8be60>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'water_quality' in X match those that were previously fit
added new feature: water_quality_target_encoded
strategy appended step ['drop after target encoding: water_quality', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb943d0>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: water_quality" dropped feature 'water_quality' after target encoding
strategy transformation of feature 'water_quality' to 'water_quality_target_e

strategy appended step ['drop feature: quality_group', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaaab54250>] to pipeline
strategy "drop feature: quality_group" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): gps_height', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaaa43e8c0>)] to pipeline
strategy "leave feature as is (do nothing): gps_height" transformation is COMPLETE!


strategy appended step ['leave feature as is (do nothing): latitude', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb8bf80>)] to pipeline
strategy "leave feature as is (do nothing): latitude" transformation is COMPLETE!
strategy appended step ['leave feature as is (do nothing): longitude', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb96050>)] to pipeline
strategy "leave feature as is (do nothing): longitude" transformation is COMPLETE!


** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: waterpoint_type', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb8bef0>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'waterpoint_type' in X match those that were previously fit
added new feature: waterpoint_type_target_encoded
strategy appended step ['drop after target encoding: waterpoint_type', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94210>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: waterpoint_type" dropped feature 'waterpoint_type' after target encoding
strategy transformation of feature 'waterpoint_type' to 'waterpoi

strategy appended step ['drop feature: waterpoint_type_group', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaaab54dd0>] to pipeline
strategy "drop feature: waterpoint_type_group" transformation is COMPLETE!


strategy appended step ['drop feature: installer', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb89dd0>] to pipeline
strategy "drop feature: installer" transformation is COMPLETE!


strategy appended step ['replace "population" values (0) with nan', FunctionTransformer(func=<function C__replace_0_with_nan__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaaa43e9e0>)] to pipeline
strategy "replace "population" values (0) with nan" transformation is COMPLETE!


strategy appended step ['drop feature: region_code', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94bd0>] to pipeline
strategy "drop feature: region_code" transformation is COMPLETE!
** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcaacb965f0>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'region' in X match those that were previously fit
added new feature: region_target_encoded
strategy appended step ['drop after target encoding: region', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb99450>] to pipeline
strategy "(prefit) target-encoder (LOO==False, post_encode

strategy appended step ['drop feature: subvillage', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94110>] to pipeline
strategy "drop feature: subvillage" transformation is COMPLETE!


strategy appended step ['drop feature: scheme_management', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94310>] to pipeline
strategy "drop feature: scheme_management" transformation is COMPLETE!
strategy appended step ['drop feature: scheme_name', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcac621d490>] to pipeline
strategy "drop feature: scheme_name" transformation is COMPLETE!


strategy appended step ['drop feature: date_recorded', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94f90>] to pipeline
strategy "drop feature: date_recorded" transformation is COMPLETE!
strategy appended step ['leave feature as is (do nothing): construction_year', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fcabd906b90>)] to pipeline
strategy "leave feature as is (do nothing): construction_year" transformation is COMPLETE!


strategy appended step ['drop feature: payment_type', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94690>] to pipeline
strategy "drop feature: payment_type" transformation is COMPLETE!


strategy appended step ['drop feature: payment', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94d90>] to pipeline
strategy "drop feature: payment" transformation is COMPLETE!


strategy appended step ['drop feature: num_private', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94c10>] to pipeline
strategy "drop feature: num_private" transformation is COMPLETE!


strategy appended step ['drop feature: wpt_name', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94c90>] to pipeline
strategy "drop feature: wpt_name" transformation is COMPLETE!


strategy appended step ['drop feature: basin', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94d50>] to pipeline
strategy "drop feature: basin" transformation is COMPLETE!


strategy appended step ['drop feature: public_meeting', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94d10>] to pipeline
strategy "drop feature: public_meeting" transformation is COMPLETE!


strategy appended step ['drop feature: recorded_by', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94510>] to pipeline
strategy "drop feature: recorded_by" transformation is COMPLETE!


strategy appended step ['drop feature: permit', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94650>] to pipeline
strategy "drop feature: permit" transformation is COMPLETE!


strategy appended step ['drop feature: management', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb94c50>] to pipeline
strategy "drop feature: management" transformation is COMPLETE!


strategy appended step ['drop feature: management_group', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb948d0>] to pipeline
strategy "drop feature: management_group" transformation is COMPLETE!


strategy appended step ['drop feature: funder', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fcaacb999d0>] to pipeline
strategy "drop feature: funder" transformation is COMPLETE!


In [27]:
# Same feature summary as before preprocessing, now on the transformed training frame, for side-by-side comparison.
scjpnutils.analyze_values(data_train_preprocessed, 'data_train AFTER preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,96,"[nan, 1000.0, 250.0, 5000.0, 3000.0, 2400.0, 5...",0.001796,99.82,37461,0.70073,"Int64Index([56146, 38945, 11841, 1186, 57631,..."
1,gps_height,int64,2415,"[1424, 1358, 0, 299, -14, 1490, 1421, 915, 891...",0.045174,95.48,0,0.0,
2,longitude,float64,51753,"[37.61840909, 30.6557619, 33.81319755, 33.1151...",0.96807,3.19,0,0.0,
3,latitude,float64,51755,"[-3.26320247, -3.53625, -9.47660713, -4.835553...",0.968107,3.19,0,0.0,
4,population,float64,1019,"[1.0, 500.0, nan, 263.0, 400.0, 520.0, 350.0, ...",0.019061,98.09,19310,0.361205,"Int64Index([11841, 1186, 36909, 71895, 569,..."
5,construction_year,int64,55,"[2012, 2009, 0, 1996, 2000, 2001, 1998, 2013, ...",0.001029,99.9,0,0.0,
6,extraction_type_target_encoded,float64,18,"[0.699696453074972, 0.6234335839598998, 0.8060...",0.000337,99.97,0,0.0,
7,source_target_encoded,float64,10,"[0.6771024364684307, 0.6663419386264003, 0.953...",0.000187,99.98,0,0.0,
8,quantity_target_encoded,float64,5,"[0.8617910010259416, 1.9464991023339318, 0.625...",9.4e-05,99.99,0,0.0,
9,water_quality_target_encoded,float64,8,"[0.7933839123685419, 1.709232613908873, 1.0379...",0.00015,99.99,0,0.0,


In [28]:
# From here on `data_train` refers to the preprocessed training frame; the raw split is no longer reachable under this name.
data_train = data_train_preprocessed
# True when at least one value in the preprocessed training data is missing.
null_labeled = data_train.isnull().values.any()

<p><br>

## Apply Preprocessing Transformations (using the `Pipeline`) to Testing Data

In [29]:
# Feature summary of the raw test split, before the preprocessing pipeline is applied to it.
scjpnutils.analyze_values(data_test, 'data_test BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,56,"[0.0, 5000.0, 30.0, 8000.0, 50.0, 500.0, 1500....",0.009428,99.06,0,0.0,
1,date_recorded,object,310,"[2013-03-13, 2011-03-15, 2013-02-19, 2013-01-1...",0.052189,94.78,0,0.0,
2,funder,object,622,"[Danida, Kalitasi, Wateraid, Gaica, Lga, W.B, ...",0.104714,89.53,366,0.061616,"Int64Index([11582, 19363, 56691, 5592, 33874,..."
3,gps_height,int64,1717,"[1435, 1271, 1384, 1538, 0, 919, 78, 1266, 324...",0.289057,71.09,0,0.0,
4,installer,object,681,"[DANIDA, DANID, Kalitasi, SEMA, GAICA, DWE, Ca...",0.114646,88.54,368,0.061953,"Int64Index([11582, 19363, 56691, 5592, 33874,..."
5,longitude,float64,5782,"[34.90829229, 35.81058308, 37.99252752, 34.758...",0.973401,2.66,0,0.0,
6,latitude,float64,5782,"[-11.16992903, -7.50962812, -4.23701933, -4.30...",0.973401,2.66,0,0.0,
7,wpt_name,object,4588,"[Kanisani, none, Kwa Mzee Tadei, Kulumbe, Shul...",0.772391,22.76,0,0.0,
8,num_private,int64,33,"[0, 6, 120, 17, 7, 32, 180, 93, 5, 8, 1, 15, 3...",0.005556,99.44,0,0.0,
9,basin,object,9,"[Lake Nyasa, Rufiji, Pangani, Internal, Lake R...",0.001515,99.85,0,0.0,


In [30]:
# Run the test split through the steps accumulated while preprocessing the training data; only transform() is called here.
data_test_preprocessed = pipeline_data_preprocessor.transform(data_test)

** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'extraction_type' in X match those that were previously fit
added new feature: extraction_type_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'source' in X match those that were previously fit
added new feature: source_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'quantity' in X match those that were previously fit
added new feature: quantity_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'water_quality' in X match those that were previously fit
added new feature: water_quality_target_encoded
** TargetEncoder

In [31]:
# Feature summary of the transformed test split, for comparison with the summary taken before preprocessing.
scjpnutils.analyze_values(data_test_preprocessed, 'data_test AFTER preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,56,"[nan, 5000.0, 30.0, 8000.0, 50.0, 500.0, 1500....",0.009428,99.06,4178,0.703367,"Int64Index([ 3746, 28265, 16680, 29110, 73792,..."
1,gps_height,int64,1717,"[1435, 1271, 1384, 1538, 0, 919, 78, 1266, 324...",0.289057,71.09,0,0.0,
2,longitude,float64,5782,"[34.90829229, 35.81058308, 37.99252752, 34.758...",0.973401,2.66,0,0.0,
3,latitude,float64,5782,"[-11.16992903, -7.50962812, -4.23701933, -4.30...",0.973401,2.66,0,0.0,
4,population,float64,388,"[60.0, 1.0, 15.0, 189.0, nan, 200.0, 454.0, 16...",0.06532,93.47,2071,0.348653,"Int64Index([16680, 7097, 26799, 35654, 22940,..."
5,construction_year,int64,54,"[1995, 1990, 2002, 2006, 0, 2009, 2005, 2011, ...",0.009091,99.09,0,0.0,
6,extraction_type_target_encoded,float64,17,"[0.699696453074972, 1.2068562524347488, 0.8540...",0.002862,99.71,0,0.0,
7,source_target_encoded,float64,10,"[0.6771024364684307, 0.9813354626210201, 0.666...",0.001684,99.83,0,0.0,
8,quantity_target_encoded,float64,5,"[1.9464991023339318, 0.6255982863071928, 0.861...",0.000842,99.92,0,0.0,
9,water_quality_target_encoded,float64,8,"[0.7933839123685419, 1.709232613908873, 1.0379...",0.001347,99.87,0,0.0,


In [32]:
# From here on `data_test` refers to the preprocessed test frame.
data_test = data_test_preprocessed
# The labeled data contains nulls if EITHER split does. Combining with `and`
# would report False whenever only one of the two splits has missing values.
null_labeled = null_labeled or data_test.isnull().values.any()
# All preprocessed labeled rows (train + test) in one frame, ordered by index.
data_ALL_labeled = pd.concat([data_train, data_test], axis=0).sort_index()

<p><br>

## Apply Preprocessing Transformations (using the `Pipeline`) to Unlabeled Data

In [74]:
# Feature summary of the raw unlabeled data, before the preprocessing pipeline is applied to it.
scjpnutils.analyze_values(data_unlabeled, 'data_unlabeled BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,68,"[0.0, 500.0, 30.0, 5.0, 1000.0, 1200.0, 200.0,...",0.004579,99.54,0,0.0,
1,date_recorded,object,331,"[2013-02-04, 2013-02-01, 2013-01-22, 2013-03-2...",0.02229,97.77,0,0.0,
2,funder,object,981,"[Dmdd, Government Of Tanzania, nan, Finn Water...",0.066061,93.39,869,0.058519,"Int64Index([17168, 27714, 30331, 15266, 51897,..."
3,gps_height,int64,2157,"[1996, 1569, 1567, 267, 1260, 1685, 550, 234, ...",0.145253,85.47,0,0.0,
4,installer,object,1092,"[DMDD, DWE, nan, FINN WATER, BRUDER, Gover, Fi...",0.073535,92.65,877,0.059057,"Int64Index([17168, 27714, 30331, 15266, 51897,..."
5,longitude,float64,14390,"[35.2907992, 36.65670893, 34.76786315, 38.0580...",0.969024,3.1,0,0.0,
6,latitude,float64,14390,"[-4.05969643, -3.30921425, -5.00434437, -9.418...",0.969024,3.1,0,0.0,
7,wpt_name,object,10840,"[Dinamu Secondary School, Kimnyak, Puma Second...",0.729966,27.0,0,0.0,
8,num_private,int64,36,"[0, 3, 93, 1, 22, 102, 6, 5, 16, 669, 11, 7, 2...",0.002424,99.76,0,0.0,
9,basin,object,9,"[Internal, Pangani, Ruvuma / Southern Coast, R...",0.000606,99.94,0,0.0,


In [75]:
# Run the unlabeled data through the same pipeline steps used for the test split; only transform() is called here.
data_unlabeled_preprocessed = pipeline_data_preprocessor.transform(data_unlabeled)

** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'extraction_type' in X match those that were previously fit
added new feature: extraction_type_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'source' in X match those that were previously fit
added new feature: source_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'quantity' in X match those that were previously fit
added new feature: quantity_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'water_quality' in X match those that were previously fit
added new feature: water_quality_target_encoded
** TargetEncoder

In [76]:
# Feature summary of the transformed unlabeled data, for comparison with the summary taken before preprocessing.
scjpnutils.analyze_values(data_unlabeled_preprocessed, 'data_unlabeled AFTER preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,68,"[nan, 500.0, 30.0, 5.0, 1000.0, 1200.0, 200.0,...",0.004579,99.54,10410,0.70101,"Int64Index([50785, 51630, 17168, 45559, 52449,..."
1,gps_height,int64,2157,"[1996, 1569, 1567, 267, 1260, 1685, 550, 234, ...",0.145253,85.47,0,0.0,
2,longitude,float64,14390,"[35.2907992, 36.65670893, 34.76786315, 38.0580...",0.969024,3.1,0,0.0,
3,latitude,float64,14390,"[-4.05969643, -3.30921425, -5.00434437, -9.418...",0.969024,3.1,0,0.0,
4,population,float64,637,"[321.0, 300.0, 500.0, 250.0, 60.0, 200.0, 600....",0.042896,95.71,5453,0.367205,"Int64Index([ 653, 14017, 40228, 28330, 70970,..."
5,construction_year,int64,55,"[2012, 2000, 2010, 1987, 1990, 2007, 1982, 199...",0.003704,99.63,0,0.0,
6,extraction_type_target_encoded,float64,17,"[1.6454107940873153, 0.699696453074972, 0.7667...",0.001145,99.89,0,0.0,
7,source_target_encoded,float64,10,"[0.6663419386264003, 0.6771024364684307, 0.953...",0.000673,99.93,0,0.0,
8,quantity_target_encoded,float64,5,"[0.7478094194961665, 0.8617910010259416, 1.946...",0.000337,99.97,0,0.0,
9,water_quality_target_encoded,float64,8,"[0.7933839123685419, 1.0379949645227742, 0.885...",0.000539,99.95,0,0.0,


In [77]:
# From here on `data_unlabeled` refers to the preprocessed unlabeled frame.
data_unlabeled = data_unlabeled_preprocessed
# True when at least one value in the preprocessed unlabeled data is missing.
null_unlabeled = data_unlabeled.isnull().values.any()

<p><br>
<h2>Build Models (Run Trials)</h2>

<h3>General functions for building Classifiers and running trials</h3>

In [37]:
# utility function to render HTML and optionally log (append) it to file
def render_HTML(the_html, fname=None):
    """Display an HTML snippet in the notebook and optionally log its text.

    Args:
        the_html: HTML markup to render in the cell output.
        fname: when not None, the markup's plain text (tags stripped) plus a
            trailing newline is appended to this file.
    """
    display(HTML(the_html))
    if fname is not None:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so the logged text could
        # differ between environments. 'html.parser' ships with Python.
        plain_text = BeautifulSoup(the_html, 'html.parser').text
        fm.append_text_file(plain_text + '\n', fname)

In [38]:
def get_trials_log_fname(clf):
    """Return the trials log file name for `clf`, or None when trial logging is off.

    The name combines the results directory prefix, the classifier's class
    name and the preprocessing digest.
    """
    if not LOG_MODEL_TRIALS:
        return None
    clf_name = clf.__class__.__name__
    return f"{MODEL_RESULTS_DIR}{clf_name}-{digest}-trials.log"

In [39]:
def gs_find_best_params(clf, param_grid):
    render_HTML(f"<br><br>param_grid for {clf.__class__.__name__} GridSearch:<br><pre>{params}</pre>", fname=get_trials_log_fname(clf))
    grid_clf = GridSearchCV(
        clf, 
        param_grid, 
        cv=K, 
        n_jobs=-1
        , verbose=20
    )
    # with joblib.parallel_backend('dask'):
    #     %time _ = grid_clf.fit(data_train, y_train)
    %time _ = grid_clf.fit(data_train, y_train)
    return grid_clf.best_params_

In [40]:
def clf_fit(clf, data_train, y_train):
    # with joblib.parallel_backend('dask'):
    #     %time clf.fit(data_train, y_train)
    %time clf.fit(data_train, y_train)
    return clf

In [41]:
def summarize_preds(clf, X, y, preds, dataset_name, classes):
    """Render (and log) the accuracy and classification report of `preds` against `y`.

    `dataset_name` labels the accuracy line and `classes` supplies the report's target names.
    `X` is accepted but not used. Returns the accuracy score.
    """
    log_fname = get_trials_log_fname(clf)
    spacer = "<p><br>"

    render_HTML(spacer, fname=log_fname)
    _accuracy = accuracy_score(y, preds)
    render_HTML(f"{dataset_name} Accuracy: {round(_accuracy*100,4)}", fname=log_fname)

    render_HTML(spacer, fname=log_fname)
    report = classification_report(y, preds, target_names=classes)
    render_HTML(f"<pre>{report}</pre>", fname=log_fname)

    return _accuracy

In [42]:
def get_feat_importances(clf):
    """Pair each column of the global `data_train` with `clf.feature_importances_` at the same position.

    Returns a list of (feature, importance) tuples sorted by importance, largest first.
    """
    importance_by_feature = {
        column: clf.feature_importances_[position]
        for position, column in enumerate(data_train.columns)
    }
    return sorted(importance_by_feature.items(), key=lambda pair: pair[1], reverse=True)

In [43]:
def clf_run_trial(clf, params_to_try, best_parameters_so_far, run_trials_gridsearch=False):
    """Run one trial for `clf`: pick parameters, fit, predict on train/test, and record the results.

    Parameters
    ----------
    clf : estimator
        Classifier to configure and fit.
    params_to_try : dict
        With `run_trials_gridsearch=True`, a parameter grid for gs_find_best_params;
        otherwise the parameters to use as-is. In grid-search mode this dict is
        mutated in place (see below).
    best_parameters_so_far : dict
        Parameters accumulated by earlier trials. Updated in place and also returned.
    run_trials_gridsearch : bool, default False
        Whether to grid-search `params_to_try` before fitting.

    Returns
    -------
    tuple
        (fitted clf, best_parameters_so_far, model_results)

    Notes
    -----
    Reads the notebook-level names `data_train`, `data_test`, `y_train_encoded`,
    `y_test_encoded`, `classes` and `cross_val_score_K`, and writes into the
    notebook-level dict `model_results` under
    model_results['modeling_results'][<classifier class name>]; that entry is
    reset on every call.
    """
    if run_trials_gridsearch:
        # Pin every parameter chosen so far to a single-value grid entry so the search only
        # varies the remaining ones. This mutates the caller's params_to_try dict.
        for param_name, param_value in best_parameters_so_far.items():
            params_to_try.update({param_name: [param_value]})
        best_parameters = gs_find_best_params(clf, params_to_try)
    else:
        best_parameters = params_to_try
    best_parameters_so_far.update(best_parameters)

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML(f"Grid Search {'(previously) ' if not run_trials_gridsearch else ''}found the following optimal parameters: ", fname=get_trials_log_fname(clf))
    render_HTML(f"<pre>{pprint.pformat(best_parameters_so_far, indent=4)}</pre>", fname=get_trials_log_fname(clf))

    # Flatten the encoded target column of each split.
    _y_train = y_train_encoded.status_group.ravel()
    _y_test = y_test_encoded.status_group.ravel()
    
    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Fitting classifier...", fname=get_trials_log_fname(clf))
    clf = clf.set_params(**best_parameters_so_far)
    clf = clf_fit(clf, data_train, _y_train)
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))

    # Start a fresh results entry for this classifier class (any earlier entry is discarded).
    _class_name = clf.__class__.__name__
    model_results['modeling_results'][_class_name] = {}

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Predicting labels on training data...", fname=get_trials_log_fname(clf))
    pred_train = clf.predict(data_train)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))
    _accuracy_train = summarize_preds(clf, data_train, _y_train, pred_train, 'Training', classes)
    model_results['modeling_results'][_class_name]['accuracy'] = {}
    model_results['modeling_results'][_class_name]['accuracy']['train'] = _accuracy_train

    # Cross-validation only runs when no grid search was done in this call. These messages
    # pass no fname, so they are displayed but not appended to the trials log.
    if not run_trials_gridsearch:
        render_HTML("<p><br>")
        render_HTML("Computing cross-val score on training data...")
        model_results['modeling_results'][_class_name]['cv_score'] = {}
        cv_score_train = cross_val_score(clf, data_train, _y_train, cv=cross_val_score_K)
        mean_cv_score_train = np.mean(cv_score_train)
        model_results['modeling_results'][_class_name]['cv_score']['train'] = mean_cv_score_train
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_train}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_train}")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML(f"Predicting labels on testing data...", fname=get_trials_log_fname(clf))
    pred_test = clf.predict(data_test)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))
    _accuracy_test = summarize_preds(clf, data_test, _y_test, pred_test, 'Testing', classes)
    model_results['modeling_results'][_class_name]['accuracy']['test'] = _accuracy_test
    model_results['modeling_results'][_class_name]['feature_importances'] = get_feat_importances(clf)
    
    # Same guard as above; here the cross-validation is run on the test split itself.
    if not run_trials_gridsearch:
        render_HTML("<p><br>")
        render_HTML(f"Computing cross-val score on testing data...")
        cv_score_test = cross_val_score(clf, data_test, _y_test, cv=cross_val_score_K)
        mean_cv_score_test = np.mean(cv_score_test)
        model_results['modeling_results'][_class_name]['cv_score']['test'] = mean_cv_score_test
        render_HTML(f"<pre>{s_all_done} scores: {cv_score_test}</pre>")
        render_HTML(f"cross_val_score: {mean_cv_score_test}")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Feature Importances:", fname=get_trials_log_fname(clf))
    render_HTML(f"<pre>{pprint.pformat(model_results['modeling_results'][_class_name]['feature_importances'], indent=4)}</pre><p><br><br>", fname=get_trials_log_fname(clf))

    return clf, best_parameters_so_far, model_results

In [44]:
def clf_build_final_model(clf, params):
    """Fit `clf` with `params` on all labeled data and predict the unlabeled data.

    Parameters
    ----------
    clf : estimator
        Classifier to configure and fit.
    params : dict
        Keyword parameters applied through `clf.set_params`.

    Returns
    -------
    tuple
        (fitted clf, predictions for the global `data_unlabeled`).

    Notes
    -----
    Reads the notebook-level names `data_ALL_labeled_with_target` (predictors plus the
    'status_group' target column) and `data_unlabeled`.
    """
    render_HTML(f"Fitting classifier {clf.__class__.__name__} to ALL LABELED data...")
    # Apply the parameters passed by the caller; no other parameter dict is in scope here.
    clf = clf.set_params(**params)
    clf = clf_fit(clf, data_ALL_labeled_with_target.drop('status_group', axis=1), data_ALL_labeled_with_target[['status_group']])
    s_all_done = "\tALL DONE!"
    render_HTML(f"<pre>{s_all_done}</pre>")

    render_HTML("<p><br>", fname=get_trials_log_fname(clf))
    render_HTML("Predicting labels of UNLABELED data...", fname=get_trials_log_fname(clf))
    pred_unlabeled = clf.predict(data_unlabeled)
    render_HTML(f"<pre>{s_all_done}</pre>", fname=get_trials_log_fname(clf))

    # Hand the model and its predictions back instead of discarding them.
    return clf, pred_unlabeled

<p><br>
<h2>Initialize Dask Client (connects to the Dask backend for parallelization) <i>(DISABLED for now)</i></h2>

In [45]:
# Create a Dask client only when the models config enables it: either connect to a remote
# scheduler or spawn a local cluster sized from the config.
if models_config['dask']['use']:
    if models_config['dask']['is_remote']:
        # for Kubernetes dask scheduler/worker cluster in GCP - but this costs money to run the cluster AND requires a lot more work for data parallelization!
        dask_client = Client(f"tcp://{models_config['dask']['remote']['scheduler_address']}:8786")
    else:
        # local
        dask_client = Client( #spawns a local cluster
            n_workers=models_config['dask']['local']['n_workers'], 
            threads_per_worker=models_config['dask']['local']['n_jobs'], 
            memory_limit=models_config['dask']['local']['memory_limit'] # memory_limit is per worker
        )

    # A bare expression nested inside `if` is not echoed by the notebook, so show the client explicitly.
    display(dask_client)

<p><br>
<h3>Decision Tree Classifier</h3>
<h4>Trials</h4>

In [46]:
# The decision tree is only trained when the config enables it and neither the labeled nor
# the unlabeled data contains nulls.
run_dtclf = models_config['DecisionTreeClassifier']['run'] and not null_labeled and not null_unlabeled
render_HTML(f"models_config['DecisionTreeClassifier']['run']: {models_config['DecisionTreeClassifier']['run']}; data_ALL_labeled_with_target.isnull().values.any(): {null_labeled}; data_unlabeled.isnull().values.any(): {null_unlabeled}")

if run_dtclf:
    trials = models_config['DecisionTreeClassifier']['trials']

    render_HTML(f"models_config['DecisionTreeClassifier']['trials']['run']: {trials['run']}")
    if trials['run']:
        trials_list = trials['array']

        best_parameters = {}
        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            # Grid-search the configured grid, or reuse the stored best parameters when the search is disabled.
            params = trial['gridsearch']['last_best'] if not trial['gridsearch']['run'] else trial['gridsearch']['param_grid']
            # Keep the fitted tree under its own name (dtclf) so it cannot clobber the random-forest variable.
            dtclf, best_parameters, model_results = clf_run_trial(DecisionTreeClassifier(), params, best_parameters, run_trials_gridsearch=trial['gridsearch']['run']) # note that best_parameters will be set to those used in the last trial

    else:
        best_parameters = models_config['DecisionTreeClassifier']['params']

<p><br>
<h4>Build Validation Final Model with best params</h4>

In [47]:
if run_dtclf:
    # Seed the validation model, then run one non-grid-search trial with the best parameters.
    best_parameters['random_state'] = SEED
    dtclf, _, model_results = clf_run_trial(DecisionTreeClassifier(), best_parameters, best_parameters)

    # Predictions of the validation model on the unlabeled predictors.
    pred_unlabeled = dtclf.predict(data_unlabeled)

<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Test + Validation)</h4>

In [48]:
if run_dtclf:
    dtclf = DecisionTreeClassifier()
    # Drop the validation seed before fitting the final model. pop() with a default keeps the
    # cell re-runnable: `del` raises KeyError once the key has already been removed.
    best_parameters.pop('random_state', None)
    dtclf.set_params(**best_parameters)
    dtclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [49]:
if run_dtclf:
    pred_unlabeled = dtclf.predict(data_unlabeled)
    # Pair every unlabeled row id with its (encoded) prediction, indexed by id.
    df_pred_unlabeled = pd.concat([data_unlabeled.reset_index()[['id']], pd.DataFrame(pred_unlabeled, columns=['status_group_encoded'])], axis=1).set_index('id')
    # The model was fit on labels_encoded.status_group, so map predictions back through `classes`
    # before writing them out - the same decoding the XGBClassifier predictions cell applies.
    df_pred_unlabeled['status_group'] = df_pred_unlabeled['status_group_encoded'].apply(lambda sg_encoded: classes[sg_encoded])
    df_pred_unlabeled = df_pred_unlabeled.drop('status_group_encoded', axis=1)
    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}DecisionTreeClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<p><br>
<h3>Random Forest Classifier</h3>
<h4>Trials</h4>

In [50]:
# The random forest is only trained when the config enables it and neither data set contains nulls.
run_rfclf = models_config['RandomForestClassifier']['run'] and not (null_labeled or null_unlabeled)
render_HTML(f"models_config['RandomForestClassifier']['run']: {models_config['RandomForestClassifier']['run']}; data_ALL_labeled_with_target.isnull().values.any(): {null_labeled}; data_unlabeled.isnull().values.any(): {null_unlabeled}")

if run_rfclf:
    trials = models_config['RandomForestClassifier']['trials']
    render_HTML(f"models_config['RandomForestClassifier']['trials']['run']: {trials['run']}")

    if not trials['run']:
        # Trials disabled: take the parameters straight from the config.
        best_parameters = models_config['RandomForestClassifier']['params']
    else:
        trials_list = trials['array']
        best_parameters = {}

        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            if trial['gridsearch']['run']:
                # Grid search: every grid entry, n_jobs included, must be a list of candidates.
                params = trial['gridsearch']['param_grid']
                params['n_jobs'] = [-1]
            else:
                # No search: reuse the stored best parameters as plain values.
                params = trial['gridsearch']['last_best']
                params['n_jobs'] = -1
            # note that best_parameters will be set to those used in the last trial
            rfclf, best_parameters, model_results = clf_run_trial(
                RandomForestClassifier(),
                params,
                best_parameters,
                run_trials_gridsearch=trial['gridsearch']['run'],
            )

<p><br>
<h4>Build Final Validation Model with best params</h4>

In [51]:
if run_rfclf:
    # RandomForestClassifier's logging parameter is `verbose` (`verbosity` belongs to XGBoost);
    # passing an unknown name makes set_params raise ValueError inside clf_run_trial.
    best_parameters.update({'n_jobs':-1, 'verbose':1, 'random_state': SEED})
    rfclf, _ , model_results = clf_run_trial(RandomForestClassifier(), best_parameters, best_parameters)

<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Test + Validation)</h4>

In [52]:
if run_rfclf:
    rfclf = RandomForestClassifier()
    # Drop the validation seed before fitting the final model. pop() with a default keeps the
    # cell re-runnable: `del` raises KeyError once the key has already been removed.
    best_parameters.pop('random_state', None)
    rfclf.set_params(**best_parameters)
    rfclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [53]:
if run_rfclf:
    pred_unlabeled = rfclf.predict(data_unlabeled)
    # Pair every unlabeled row id with its (encoded) prediction, indexed by id.
    df_pred_unlabeled = pd.concat([data_unlabeled.reset_index()[['id']], pd.DataFrame(pred_unlabeled, columns=['status_group_encoded'])], axis=1).set_index('id')
    # The model was fit on labels_encoded.status_group, so map predictions back through `classes`
    # before writing them out - the same decoding the XGBClassifier predictions cell applies.
    df_pred_unlabeled['status_group'] = df_pred_unlabeled['status_group_encoded'].apply(lambda sg_encoded: classes[sg_encoded])
    df_pred_unlabeled = df_pred_unlabeled.drop('status_group_encoded', axis=1)
    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}RandomForestClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<p><br><br><br>
<h3>XGBClassifier</h3>
<h4>Trials</h4>

In [54]:
# XGBoost runs whenever the config enables it.
run_xgbclf = models_config['XGBClassifier']['run']
render_HTML(f"models_config['XGBClassifier']['run']: {run_xgbclf}")

if run_xgbclf:
    trials = models_config['XGBClassifier']['trials']
    render_HTML(f"models_config['XGBClassifier']['trials']['run']: {trials['run']}")

    if not trials['run']:
        # Trials disabled: take the parameters straight from the config.
        best_parameters = models_config['XGBClassifier']['params']
    else:
        trials_list = trials['array']
        best_parameters = {}

        for i, trial in enumerate(trials_list):
            render_HTML(f"<p><br>trial[{i}]['gridsearch']['run']: {trial['gridsearch']['run']}<br>")
            if trial['gridsearch']['run']:
                # Grid search: every grid entry, n_jobs included, must be a list of candidates.
                params = trial['gridsearch']['param_grid']
                params['n_jobs'] = [-1]
            else:
                # No search: reuse the stored best parameters as plain values.
                params = trial['gridsearch']['last_best']
                params['n_jobs'] = -1
            # note that best_parameters will be set to those used in the last trial
            xgbclf, best_parameters, model_results = clf_run_trial(
                XGBClassifier(),
                params,
                best_parameters,
                run_trials_gridsearch=trial['gridsearch']['run'],
            )

<p><br>
<h4>Build Final Validation Model with best params</h4>

In [55]:
if run_xgbclf:
    # Fix the parallelism, verbosity and seed, then run one non-grid-search trial with the result.
    best_parameters.update(n_jobs=-1, verbosity=1, random_state=SEED)
    xgbclf, _, model_results = clf_run_trial(XGBClassifier(), best_parameters, best_parameters)

CPU times: user 6min 58s, sys: 4.17 s, total: 7min 2s
Wall time: 59.2 s


<p><br><br>
<h4>Build Final Model with ALL Labeled Data (Test + Validation)</h4>

In [56]:
if run_xgbclf:
    xgbclf = XGBClassifier()
    # Drop the validation seed before fitting the final model. pop() with a default keeps the
    # cell re-runnable: `del` raises KeyError once the key has already been removed.
    best_parameters.pop('random_state', None)
    xgbclf.set_params(**best_parameters)
    xgbclf.fit(data_ALL_labeled, labels_encoded.status_group)

<p><br><br>
<h4>Make Predictions with Final Model on Unlabeled Predictors</h4>

In [78]:
if run_xgbclf:
    pred_unlabeled = xgbclf.predict(data_unlabeled)

    # Pair every unlabeled row id with its encoded prediction, indexed by id.
    unlabeled_ids = data_unlabeled.reset_index()[['id']]
    encoded_preds = pd.DataFrame(pred_unlabeled, columns=['status_group_encoded'])
    df_pred_unlabeled = pd.concat([unlabeled_ids, encoded_preds], axis=1).set_index('id')

    # Look each encoded value up in `classes`, then discard the encoded column.
    df_pred_unlabeled['status_group'] = df_pred_unlabeled['status_group_encoded'].apply(lambda code: classes[code])
    df_pred_unlabeled = df_pred_unlabeled.drop(columns='status_group_encoded')

    df_pred_unlabeled.info()
    fname__preds_unlabeled = f"{MODEL_RESULTS_DIR}XGBClassifier-preds-{digest}.csv"
    df_pred_unlabeled.to_csv(fname__preds_unlabeled, sep=',')
    print(f"updated {fname__preds_unlabeled}")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14850 entries, 50785 to 68707
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  14850 non-null  object
dtypes: object(1)
memory usage: 232.0+ KB
updated model-results/XGBClassifier-preds-69feae104538a094014ae5cec62abdf0.csv


<p><br><br>
<h4>Save Validation Results to File</h4>

In [60]:
# Write the accumulated model_results out as JSON and report which file was written.
fm.save_json(model_results, f"{model_results_fname}")
print("updated {}".format(model_results_fname))

updated model-results/models-results-69feae104538a094014ae5cec62abdf0.json
