<h2>Import Libraries</h2>

In [1]:
from scjpnlib.utils.file_io import FileManager
import scjpnlib.utils as scjpnutils
from IPython.core.display import HTML, Markdown

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

# Number of folds used for cross-validation (GridSearchCV and cross_val_score).
K = 3
# Thread count for the local Dask client (only referenced by the commented-out Dask setup).
n_jobs = 8

## Read Configs for this Experiment

In [2]:
# Project file helper; used below to read this experiment's JSON config files.
fm = FileManager()

# EDA configuration; passed to scjpnutils.get_data_fname further down to resolve
# the names of the wrangled train/test data and label files.
eda_config = fm.load_json('eda-config.txt')

{'official_data': {'unlabeled_predictors': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv',
   'local_fname': 'Taarifa-Unlabeled-Predictors.csv'},
  'labeled_predictors': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv',
   'local_fname': 'Taarifa-Labeled-Predictors.csv'},
  'labels': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv',
   'local_fname': 'Taarifa-Labels.csv'}},
 'test_ratio': 0.1,
 'SEED': 42,
 'insig_cat_handling': {'installer': {'strategy': 'flat',
   'threshold': 10,
   'map_to': 'other'},
  'scheme_name': {'strategy': 'flat', 'threshold': 10, 'map_to': 'other'}},
 'wrangled_data': {'fname_ext': 'csv',
  'train': {'fname_prefix': 'wrangled-labeled-data-train'},
  'test': {'fname_prefix': 'wrangled-labeled-data-test'},
  'unlabeled': {'fname_prefix': 'wrangled-unlabeled-data'}},
 'labels': {'fname_ext

In [3]:
# Model configuration: the shared SEED plus, per classifier, the trial definitions
# (grid-search grids and last-best parameters) and the stored final parameters.
models_config = fm.load_json('models-config.txt')
# Bare expression so the loaded configuration is shown as the cell output.
models_config

{'SEED': 42,
 'RandomForestClassifier': {'trials': {'run': False,
   'array': [{'gridsearch': {'run': False,
      'param_grid': {'bootstrap': [True, False],
       'criterion': ['entropy', 'gini'],
       'max_features': ['auto', 'sqrt', 'log2'],
       'max_depth': [10, 50, 75, None],
       'n_estimators': [100, 500, 1000]},
      'last_best': {'bootstrap': True,
       'criterion': 'entropy',
       'max_depth': 75,
       'max_features': 'auto',
       'n_estimators': 1000}}}]},
  'params': {'bootstrap': True,
   'criterion': 'entropy',
   'max_depth': 75,
   'max_features': 'auto',
   'n_estimators': 1000}},
 'XGBClassifier': {'trials': {'run': True,
   'array': [{'gridsearch': {'run': False,
      'param_grid': {'learning_rate': [0.1],
       'max_depth': [3],
       'min_child_weight': [1],
       'subsample': [1],
       'gamma': [0, 1, 2, 5, 10],
       'n_estimators': [100]},
      'last_best': {'learning_rate': 0.1,
       'max_depth': 3,
       'min_child_weight': 1,
     

<p><br>
<h2>Load TEST/TRAIN Data</h2>

In [4]:
# Random seed for this experiment, read from the models config so it is defined in one place.
SEED = models_config['SEED']

In [5]:
# Resolve the wrangled TRAIN predictors file from the EDA config, then load it
# using the first CSV column as the index.
fname = scjpnutils.get_data_fname(
    eda_config,
    data_kwargs={'is_labels': False, 'type': 'train'},
)
data_train = pd.read_csv(fname, index_col=0)
print(f"loaded {fname}\n")
# Column/dtype summary of the loaded frame.
data_train.info()

loaded wrangled-labeled-data-train-1d7f80e43261a7589d64368eab8a2853.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 44928 to 56422
Data columns (total 89 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   installer_rwe                              53460 non-null  float64
 1   installer_other                            53460 non-null  float64
 2   installer_distr                            53460 non-null  float64
 3   installer_danid                            53460 non-null  float64
 4   installer_hesaw                            53460 non-null  float64
 5   installer_world                            53460 non-null  float64
 6   installer_gover                            53460 non-null  float64
 7   installer_none                             53460 non-null  float64
 8   installer_commu                            53460 non-null  float64
 9   installer_dwe    

In [6]:
# Resolve the TRAIN labels file from the EDA config, then load it
# using the first CSV column as the index.
fname = scjpnutils.get_data_fname(
    eda_config,
    data_kwargs={'is_labels': True, 'type': 'train'},
)
y_train = pd.read_csv(fname, index_col=0)
print(f"loaded {fname}\n")
# Column/dtype summary of the loaded labels.
y_train.info()

loaded labels-train-1d7f80e43261a7589d64368eab8a2853.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53460 entries, 44928 to 56422
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  53460 non-null  object
dtypes: object(1)
memory usage: 835.3+ KB


In [7]:
# Class names handed to classification_report as `target_names`.
# `unique()` yields labels in order of first appearance, whereas scikit-learn pairs
# `target_names` positionally with the *sorted* labels; sorting here keeps every
# report row attached to the correct class name.
classes = np.sort(y_train.status_group.unique())

In [8]:
# Resolve the wrangled TEST predictors file from the EDA config, then load it
# using the first CSV column as the index.
fname = scjpnutils.get_data_fname(
    eda_config,
    data_kwargs={'is_labels': False, 'type': 'test'},
)
data_test = pd.read_csv(fname, index_col=0)
print(f"loaded {fname}\n")
# Column/dtype summary of the loaded frame.
data_test.info()

loaded wrangled-labeled-data-test-1d7f80e43261a7589d64368eab8a2853.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 2980 to 26085
Data columns (total 89 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   installer_rwe                              5940 non-null   float64
 1   installer_other                            5940 non-null   float64
 2   installer_distr                            5940 non-null   float64
 3   installer_danid                            5940 non-null   float64
 4   installer_hesaw                            5940 non-null   float64
 5   installer_world                            5940 non-null   float64
 6   installer_gover                            5940 non-null   float64
 7   installer_none                             5940 non-null   float64
 8   installer_commu                            5940 non-null   float64
 9   installer_dwe       

In [9]:
# Resolve the TEST labels file from the EDA config, then load it
# using the first CSV column as the index.
fname = scjpnutils.get_data_fname(
    eda_config,
    data_kwargs={'is_labels': True, 'type': 'test'},
)
y_test = pd.read_csv(fname, index_col=0)
print(f"loaded {fname}\n")
# Column/dtype summary of the loaded labels.
y_test.info()

loaded labels-test-1d7f80e43261a7589d64368eab8a2853.csv

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5940 entries, 2980 to 26085
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  5940 non-null   object
dtypes: object(1)
memory usage: 92.8+ KB


<p><br>
<h2>Models</h2>

<h3>General functions for building Classifiers and running trials</h3>

In [10]:
def gs_find_best_params(clf, param_grid):
    display(HTML(f"param_grid for {type(clf)} GridSearch:<br><pre>{params}</pre>"))
    grid_clf = GridSearchCV(
        clf, 
        param_grid, 
        cv=K, 
        n_jobs=-1
        , verbose=20
    )
    # with joblib.parallel_backend('dask'):
    #     %time _ = grid_clf.fit(data_train, y_train)
    %time _ = grid_clf.fit(data_train, y_train)
    return grid_clf.best_params_

In [11]:
def clf_fit(clf, data_train, y_train):
    """Fit ``clf`` on the given training data, printing the fit time, and return it.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)``
        Classifier to fit.
    data_train : training predictors passed to ``clf.fit``.
    y_train : training labels passed to ``clf.fit``.

    Returns
    -------
    The ``clf`` object that was passed in.
    """
    # Dask-backed variant, kept for reference while the Dask client is disabled:
    # with joblib.parallel_backend('dask'):
    #     %time clf.fit(data_train, y_train)
    # IPython line magic: runs the fit and prints CPU/wall time for it.
    %time clf.fit(data_train, y_train)
    return clf

In [12]:
def summarize_preds(X, y, preds, dataset_name, classes):
    """Display the accuracy and a per-class classification report for a set of predictions.

    Parameters
    ----------
    X : any
        Predictors the predictions were made from. Not read by this function;
        kept so existing call sites do not change.
    y : array-like
        True labels.
    preds : array-like
        Predicted labels, aligned with ``y``.
    dataset_name : str
        Name shown in front of the accuracy figure (e.g. 'Training', 'Testing').
    classes : sequence
        The label values to report on. They are passed as both ``labels`` and
        ``target_names`` so each report row is guaranteed to carry the name of
        the label it describes, whatever order ``classes`` is in.
    """
    display(HTML("<p><br>"))
    _accuracy = accuracy_score(y, preds)
    display(HTML(f"{dataset_name} Accuracy: {round(_accuracy*100,4)}"))
    display(HTML("<p><br>"))
    # Without an explicit `labels`, classification_report matches `target_names`
    # positionally against the *sorted* labels, which mislabels rows when
    # `classes` is not already sorted (e.g. when it comes from `Series.unique()`).
    report_labels = list(classes)
    report = classification_report(
        y,
        preds,
        labels=report_labels,
        target_names=[str(label) for label in report_labels],
    )
    display(HTML(f"<pre>{report}</pre>"))

In [13]:
def clf_run_trial(clf, params, run_trials_gridsearch):
    """Run one trial: pick parameters, fit ``clf``, and report train/test performance.

    When ``run_trials_gridsearch`` is true, ``params`` is treated as a parameter
    grid and the best combination is found with ``gs_find_best_params``; otherwise
    ``params`` is used directly as the estimator parameters. The classifier is
    then fitted on the notebook-level ``data_train`` / ``y_train`` and, for both
    the training and the testing data, predictions are summarized and a K-fold
    ``cross_val_score`` mean is displayed.

    Parameters
    ----------
    clf : estimator
        Classifier to configure, fit and evaluate.
    params : dict
        Either a grid-search parameter grid or a ready-to-use parameter dict.
    run_trials_gridsearch : bool
        Whether to run the grid search over ``params``.

    Returns
    -------
    dict
        The parameters the classifier was fitted with.
    """
    def show(markup):
        # Every message in this function is rendered through an HTML display object.
        display(HTML(markup))

    def spacer():
        show("<p><br>")

    def all_done():
        show("<pre>\tALL DONE!</pre>")

    def evaluate(fitted_clf, X, y, dataset_name):
        # Predict + summarize, then a K-fold cross-val score on the same data.
        lowered = dataset_name.lower()
        spacer()
        show(f"Predicting labels on {lowered} data...")
        predictions = fitted_clf.predict(X)
        all_done()
        summarize_preds(X, y, predictions, dataset_name, classes)
        spacer()
        show(f"Computing cross-val score on {lowered} data...")
        fold_scores = cross_val_score(fitted_clf, X, y, cv=K)
        all_done()
        show(f"cross_val_score: {np.mean(fold_scores)}")

    if run_trials_gridsearch:
        best_parameters = gs_find_best_params(clf, params)
        provenance = ''
    else:
        best_parameters = params
        provenance = '(previously) '

    spacer()
    show(f"Grid Search {provenance}found the following optimal parameters: ")
    listing = "".join(f"\t{name}: {value}\n" for name, value in best_parameters.items())
    show(f"<pre>{listing}</pre>")

    spacer()
    show("Fitting classifer...")
    clf = clf_fit(clf.set_params(**best_parameters), data_train, y_train)
    all_done()

    evaluate(clf, data_train, y_train, 'Training')
    evaluate(clf, data_test, y_test, 'Testing')

    return best_parameters

<p><br>
<h2>Initialize Dask Client (connects to the Dask backend for parallelization) <i>(DISABLED for now)</i></h2>

In [14]:
# # local
# # dask_client = Client(n_workers=2, threads_per_worker=8, memory_limit='8GB') #spawns a local cluster; memory_limit is per worker
# dask_client = Client(n_workers=1, threads_per_worker=n_jobs, memory_limit='16GB') #spawns a local cluster; memory_limit is per worker

# # for Kubernetes dask scheduler/worker cluster in GCP - but this costs money to run the cluster AND requires a lot more work for data parallelization!
# # scheduler_address = '35.230.13.87'
# # dask_client = Client(f'tcp://{scheduler_address}:8786')

# dask_client

<p><br>
<h3>Random Forest Classifier</h3>
<h4>Trials</h4>

In [15]:
# Run the configured RandomForestClassifier trials (if enabled) to determine
# `best_parameters`; otherwise fall back to the parameters stored in the config.
trials = models_config['RandomForestClassifier']['trials']

display(HTML(f"models_config['RandomForestClassifier']['trials']['run']: {trials['run']}"))
if trials['run']:
    trials_list = trials['array']

    for i, trial in enumerate(trials_list):
        gridsearch = trial['gridsearch']
        run_gridsearch = gridsearch['run']
        display(HTML(f"trial[{i}]['gridsearch']['run']: {run_gridsearch}"))
        # Work on a copy so adding `n_jobs` does not modify models_config.
        params = dict(gridsearch['param_grid'] if run_gridsearch else gridsearch['last_best'])
        # A grid expects a list of candidate values; a plain parameter dict expects the value itself.
        params['n_jobs'] = [-1] if run_gridsearch else -1
        # Seed the forest so repeated runs of this cell give the same results.
        # Note that best_parameters will be set to those used in the last trial.
        best_parameters = clf_run_trial(RandomForestClassifier(random_state=SEED), params, run_gridsearch)

else:
    # Copy so later cells can extend these parameters without touching models_config.
    best_parameters = dict(models_config['RandomForestClassifier']['params'])

<p><br>
<h4>Build Final Model with best params</h4>

In [16]:
# Build the final RandomForest settings on a copy, so that re-running this cell
# (or any other cell that reads `best_parameters` / models_config) sees unchanged inputs.
final_rf_params = {**best_parameters, 'n_jobs': -1, 'verbose': 1}
# Seed the forest so the reported scores are reproducible.
clf_run_trial(RandomForestClassifier(random_state=SEED), final_rf_params, run_trials_gridsearch=False);

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   29.0s
CPU times: user 3min 49s, sys: 4.07 s, total: 3min 53s
Wall time: 39.3 s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   38.6s finished


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    6.0s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   31.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.6s
[Parall

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.8s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 

<p><br><br><br>
<h3>XGBClassifier</h3>
<h4>Trials</h4>

In [17]:
# Run the configured XGBClassifier trials (if enabled) to determine
# `best_parameters`; otherwise fall back to the parameters stored in the config.
trials = models_config['XGBClassifier']['trials']

display(HTML(f"models_config['XGBClassifier']['trials']['run']: {trials['run']}"))
if trials['run']:
    trials_list = trials['array']

    for i, trial in enumerate(trials_list):
        gridsearch = trial['gridsearch']
        run_gridsearch = gridsearch['run']
        display(HTML(f"trial[{i}]['gridsearch']['run']: {run_gridsearch}"))
        # Work on a copy so adding `n_jobs` does not modify models_config.
        params = dict(gridsearch['param_grid'] if run_gridsearch else gridsearch['last_best'])
        # A grid expects a list of candidate values; a plain parameter dict expects the value itself.
        params['n_jobs'] = [-1] if run_gridsearch else -1
        # Seed the booster so repeated runs of this cell give the same results.
        # Note that best_parameters will be set to those used in the last trial.
        best_parameters = clf_run_trial(XGBClassifier(random_state=SEED), params, run_gridsearch)

else:
    # Use the XGBClassifier section of the config; the RandomForestClassifier
    # parameters (bootstrap, criterion, max_features, ...) do not apply to XGBoost.
    # NOTE(review): assumes models_config['XGBClassifier'] has a 'params' entry like
    # the RandomForestClassifier section does — confirm in models-config.txt.
    best_parameters = dict(models_config['XGBClassifier']['params'])

CPU times: user 2min 1s, sys: 1.14 s, total: 2min 2s
Wall time: 19.5 s


CPU times: user 2min 5s, sys: 1.1 s, total: 2min 6s
Wall time: 19.3 s


CPU times: user 5min 12s, sys: 3.11 s, total: 5min 15s
Wall time: 48.3 s


CPU times: user 5min 14s, sys: 3.3 s, total: 5min 18s
Wall time: 51.6 s


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:  3.0min remaining: 14.8min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  3.0min remaining:  8.9min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  3.7min remaining:  7.5min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:  3.7min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  3.7min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  4.5min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:  4.5min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  5.8min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  5.9min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  6.3min remaining: 

CPU times: user 5min 12s, sys: 2.37 s, total: 5min 14s
Wall time: 46.1 s


CPU times: user 5min 8s, sys: 3.39 s, total: 5min 11s
Wall time: 57.5 s


<p><br>
<h4>Build Final Model with best params</h4>

In [18]:
# Build the final XGBoost settings on a copy, so that re-running this cell
# (or any other cell that reads `best_parameters` / models_config) sees unchanged inputs.
# NOTE(review): XGBClassifier's logging parameter appears to be `verbosity`, not
# `verbose` — confirm against the installed xgboost version before changing it.
final_xgb_params = {**best_parameters, 'n_jobs': -1, 'verbose': 1}
# Seed the booster so the reported scores are reproducible.
clf_run_trial(XGBClassifier(random_state=SEED), final_xgb_params, run_trials_gridsearch=False);

CPU times: user 5min 17s, sys: 4.5 s, total: 5min 22s
Wall time: 1min 10s
