# Intro

In [1]:
import scjpnlib.utils as scjpnutils
import os
import pandas as pd
import numpy as np
from scjpnlib.utils.file_io import FileManager
from IPython.core.display import HTML, Markdown
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import scjpnlib.utils.skl_transformers as scjpnskltransformers
import scjpnlib.utils.strategy_transformers as scjpnstrattransformers
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import StratifiedKFold

from scjpnlib.utils import submodels as scjpnsubmodels

# leave these in until port is complete 
from scjpnlib.utils.skl_transformers import DropColumnsTransformer, LambdaTransformer, SimpleValueTransformer, OneHotEncodingTransformer, LabelEncodingTransformer, TargetEncoderLOOTransformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the EDA configuration that drives the rest of the notebook
# (data source URLs, test ratio, seed, per-feature preprocessing strategies).
fm = FileManager()
eda_config = fm.load_json('eda-config.txt')
# Bare expression: echo the loaded configuration as the cell output.
eda_config

{'official_data': {'unlabeled_predictors': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv',
   'local_fname': 'Taarifa-Unlabeled-Predictors.csv'},
  'labeled_predictors': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv',
   'local_fname': 'Taarifa-Labeled-Predictors.csv'},
  'labels': {'url': 'https://s3.amazonaws.com/drivendata-prod/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv',
   'local_fname': 'Taarifa-Labels.csv'}},
 'eda_desc': {'short': '0.10 test_ratio'},
 'test_ratio': 0.1,
 'SEED': 42,
 'missing_string_value_replacement': 'none',
 'feature_groupings': {'amount_tsh': {'description': {'type': 'provided',
    'description': "'Total static head (amount water available to waterpoint)'"},
   'features': ['amount_tsh'],
   'preprocessing_strategy': [['amount_tsh',
     'C__drop_it__StrategyTransformer']]},
  'pump_age_at_observation_date': {'description': {'ty

In [3]:
def _ensure_output_dir(dir_name):
    """Create `dir_name` below the current working directory and return it as a filename prefix.

    Parameters
    ----------
    dir_name : str
        Directory name taken from the EDA configuration; may be empty.

    Returns
    -------
    str
        "" when `dir_name` is empty (files then go to the working directory),
        otherwise `dir_name` followed by "/".
    """
    if len(dir_name) == 0:
        return ""
    # exist_ok=True makes the call idempotent on re-runs without a try/except.
    os.makedirs(f"{os.getcwd()}/{dir_name}", exist_ok=True)
    return dir_name + "/"


# Directory prefixes for the wrangled predictors and for the partitioned labels.
WRANGLED_DATA_DIR = _ensure_output_dir(eda_config['wrangled_data']['dir'])
SAVE_LABELS_DIR = _ensure_output_dir(eda_config['labels']['dir'])

# Output filenames are derived from the EDA configuration by get_data_fname.
fname__train_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':False,'type':'train'})
print(f"wrangled/preprocessed training predictors will be saved to: {fname__train_predictors}")
fname__train_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':True,'type':'train'})
print(f"training labels will be saved to: {fname__train_labels}")

# The validation partition uses the 'test' type key in get_data_fname.
fname__validation_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':False,'type':'test'})
print(f"wrangled/preprocessed validation predictors will be saved to: {fname__validation_predictors}")
fname__validation_labels = SAVE_LABELS_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':True,'type':'test'})
print(f"validation labels will be saved to: {fname__validation_labels}")

fname__unlabeled_predictors = WRANGLED_DATA_DIR + scjpnutils.get_data_fname(eda_config, data_kwargs={'is_labels':False,'type':'unlabeled'})
print(f"wrangled/preprocessed unlabeled predictors will be saved to: {fname__unlabeled_predictors}")

wrangled/preprocessed training predictors will be saved to: preprocessing-results/wrangled-labeled-data-train-7359ee5d2cafabcb07b89ffc61b02a2f.csv
training labels will be saved to: preprocessing-results/labels-train-7359ee5d2cafabcb07b89ffc61b02a2f.csv
wrangled/preprocessed validation predictors will be saved to: preprocessing-results/wrangled-labeled-data-test-7359ee5d2cafabcb07b89ffc61b02a2f.csv
validation labels will be saved to: preprocessing-results/labels-test-7359ee5d2cafabcb07b89ffc61b02a2f.csv
wrangled/preprocessed unlabeled predictors will be saved to: preprocessing-results/wrangled-unlabeled-data-7359ee5d2cafabcb07b89ffc61b02a2f.csv


In [4]:
# Show the MD5 digest of the EDA configuration; per the message below it is the
# filename suffix that identifies the output files produced for this configuration.
display(HTML(f"<b>NOTE: <i>for the above EDA configuration, wrangled/preprocessed and partitioned label output files will be identified (filename suffix) by: hash digest: {scjpnutils.json_to_md5_hash_digest(eda_config)}</i></b>"))

In [5]:
# Local filenames of the three official data files, as named in the EDA configuration.
fname_unlabeled_predictors = eda_config['official_data']['unlabeled_predictors']['local_fname']
fname_labeled_predictors = eda_config['official_data']['labeled_predictors']['local_fname']
fname_labels = eda_config['official_data']['labels']['local_fname']

# Map each local file to ITS OWN source URL. Previously all three entries pointed
# at the unlabeled-predictors URL, so the labeled predictors and the labels
# would have been associated with the wrong remote file.
ds_map = {
    fname_unlabeled_predictors: eda_config['official_data']['unlabeled_predictors']['url'],
    fname_labeled_predictors: eda_config['official_data']['labeled_predictors']['url'],
    fname_labels: eda_config['official_data']['labels']['url']
}

# NOTE(review): presumably fetches any file that is missing locally - confirm in FileManager.
fm.validate_download(ds_map)

In [6]:
SEED = eda_config['SEED'] # for random_state reproducibility

# Start with a single placeholder step; the strategy transformers instantiated
# later in the notebook receive this pipeline and report appending their steps to it.
pipeline_data_preprocessor = Pipeline(steps=[('passthrough', None)], verbose=True)

In [7]:
# Render the configured feature groupings together with their preprocessing strategies.
scjpnutils.display_pretty_feature_groupings(eda_config['feature_groupings'], include_preprocessing=True)

In [8]:
# Read the labeled predictors and their labels (first CSV column as index),
# join them column-wise on the rows present in both, and order by index.
labeled_with_target = pd.concat(
    [
        pd.read_csv(csv_fname, index_col=0)
        for csv_fname in (fname_labeled_predictors, fname_labels)
    ],
    axis=1,
    join='inner',
).sort_index()

In [9]:
# Double brackets keep the target as a one-column DataFrame rather than a Series.
labels = labeled_with_target[['status_group']]

In [10]:
# Fit the label encoder on the target column and encode it in one pass.
# NOTE(review): labels_encoded is later accessed via `.status_group` and `.loc[...]`,
# so it appears to be a DataFrame with a `status_group` column - confirm in LabelEncodingTransformer.
let_labels = scjpnskltransformers.LabelEncodingTransformer(['status_group'])
labels_encoded = let_labels.fit_transform(labels)

In [11]:
# Class names known to the fitted encoder.
# NOTE(review): presumably position == encoded integer, as with sklearn's LabelEncoder - confirm.
classes = list(let_labels.labelencoder.classes_)

In [12]:
# Attach the encoded target next to the raw one (this mutates labeled_with_target in place).
labeled_with_target['status_group_encoded'] = labels_encoded.status_group

In [13]:
# Predictors only: strip both the raw and the encoded target columns.
X_labeled = labeled_with_target.drop(columns=['status_group', 'status_group_encoded'])
# Unlabeled predictors, indexed by the first CSV column and ordered by index.
X_unlabeled = pd.read_csv(
    fname_unlabeled_predictors,
    index_col=0,
).sort_index()

In [14]:
# Hold out `test_ratio` of the labeled rows for validation; SEED fixes the shuffle.
# No `stratify` argument is passed, so class proportions are not explicitly preserved.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_labeled.sort_index(),
    labels.sort_index(),
    test_size=eda_config['test_ratio'],
    random_state=SEED,
)

In [15]:
# Pick the encoded labels for exactly the rows that landed in the training partition.
y_train_encoded = labels_encoded.loc[y_train.index]

In [16]:
# Training predictors + raw target + encoded target, joined on index.
data_train = pd.concat(
    [X_train, y_train, y_train_encoded],
    axis=1,
    join='inner',
).sort_index()
# Relabel the columns so the encoded target gets its own distinct name.
data_train.columns = [*X_train.columns, 'status_group', 'status_group_encoded']

In [17]:
# Pick the encoded labels for exactly the rows that landed in the validation partition.
y_validation_encoded = labels_encoded.loc[y_validation.index]

In [18]:
# Validation predictors + raw target + encoded target, joined on index.
data_validation = pd.concat(
    [X_validation, y_validation, y_validation_encoded],
    axis=1,
    join='inner',
).sort_index()
# Relabel the columns so the encoded target gets its own distinct name.
data_validation.columns = [*X_validation.columns, 'status_group', 'status_group_encoded']

In [19]:
# Per-feature summary of the raw training predictors (baseline before any preprocessing).
# The trailing semicolon suppresses the echo of the call's return value.
scjpnutils.analyze_values(X_train, 'X_train BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,96,"[0.0, 1000.0, 250.0, 5000.0, 3000.0, 2400.0, 5...",0.001796,99.82,0,0.0,
1,date_recorded,object,352,"[2013-03-06, 2013-02-05, 2011-07-24, 2012-11-0...",0.006584,99.34,0,0.0,
2,funder,object,1799,"[Kiliwater, Unicef, Danida, Tasaf/tlc, Dhv, Am...",0.033651,96.63,3269,0.061149,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
3,gps_height,int64,2415,"[1424, 1358, 0, 299, -14, 1490, 1421, 915, 891...",0.045174,95.48,0,0.0,
4,installer,object,2035,"[Kiliwater, TWESA, Central government, TASAF/T...",0.038066,96.19,3287,0.061485,"Int64Index([ 569, 35243, 7826, 7576, 71341,..."
5,longitude,float64,51753,"[37.61840909, 30.6557619, 33.81319755, 33.1151...",0.96807,3.19,0,0.0,
6,latitude,float64,51755,"[-3.26320247, -3.53625, -9.47660713, -4.835553...",0.968107,3.19,0,0.0,
7,wpt_name,object,34085,"[Kwa Moris Assenga, Bavunja Primary School, Za...",0.637579,36.24,0,0.0,
8,num_private,int64,61,"[0, 34, 65, 32, 1, 8, 41, 6, 15, 3, 698, 1402,...",0.001141,99.89,0,0.0,
9,basin,object,9,"[Pangani, Lake Tanganyika, Lake Nyasa, Rufiji,...",0.000168,99.98,0,0.0,


In [20]:
# Work on a copy so X_train itself is left as loaded.
X_train_preprocessed = X_train.copy()

# Walk the feature groupings in configuration order. Each strategy transformer is
# handed the shared pipeline and (per its log output) appends its steps to it while
# being fit on the training data, so the pipeline can later be applied to other partitions.
# NOTE(review): re-running this cell without recreating the pipeline presumably appends
# the same steps a second time - confirm before re-executing out of order.
for feat_group, feat_group_config in eda_config['feature_groupings'].items():
    strat_transformer = scjpnstrattransformers.instantiate_strategy_transformer(
        feat_group_config['preprocessing_strategy'],
        feat_group,
        pipeline_data_preprocessor,
    )
    scjpnstrattransformers.html_prettify_strategy_transformer_description(strat_transformer)
    X_train_preprocessed = strat_transformer.fit_transform(
        X_train_preprocessed,
        y_train_encoded.status_group,
    )
    # Visual spacer between the per-group log sections.
    display(HTML("<p><br><br>"))

strategy "drop feature: amount_tsh" appended step ['drop feature: amount_tsh', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fdba61e2b90>] to pipeline
strategy "drop feature: amount_tsh" transformation for feature "amount_tsh" is COMPLETE!


strategy "convert (from string date format '%Y-%m-%d') to datetime type: date_recorded" appended step ["convert (from string date format '%Y-%m-%d') to datetime type: date_recorded", FunctionTransformer(func=<function C__convert_string_date_to_datetime__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdbb5495830>)] to pipeline
strategy "convert (from string date format '%Y-%m-%d') to datetime type: date_recorded" transformation for feature "date_recorded" is COMPLETE!
strategy "replace 0 with date_recorded value: construction_year" appended step ['replace 0 with date_recorded value: construction_year', FunctionTransformer(func=<function C__replace_0_construction_year_with_date_recorded__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdbb5495560>)] to pipeline
strategy "replace 0 with date_recorded value: construction_year" transformation for feature "construction_year" is COMPLETE!
strategy "convert (from string date format '%Y') to datetime type: construction_

strategy "drop feature: funder" appended step ['drop feature: funder', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fdba3ce0a10>] to pipeline
strategy "drop feature: funder" transformation for feature "funder" is COMPLETE!


strategy "leave feature as is (do nothing): gps_height" appended step ['leave feature as is (do nothing): gps_height', FunctionTransformer(func=<function C__leave_it_as_is__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdba38839e0>)] to pipeline
strategy "leave feature as is (do nothing): gps_height" transformation for feature "gps_height" is COMPLETE!


strategy "impute lower-case transform: installer" appended step ['impute lower-case transform: installer', FunctionTransformer(func=<function C__impute_lcase__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdba3cee830>)] to pipeline
strategy "impute lower-case transform: installer" transformation for feature "installer" is COMPLETE!
strategy "replace values for feature: installer" replacement_rules:
{
    "installer": [
        {
            "missing_values": NaN,
            "strategy": "constant",
            "fill_value": "none"
        },
        {
            "missing_values": "0",
            "strategy": "constant",
            "fill_value": "none"
        },
        {
            "missing_values": "-",
            "strategy": "constant",
            "fill_value": "none"
        }
    ]
}
strategy "replace values for feature: installer" appended step ['replace values for feature: installer', <scjpnlib.utils.skl_transformers.SimpleValueTransformer object at 0x7fdba388

strategy "replace values for feature: latitude" replacement_rules:
{
    "latitude": [
        {
            "missing_values": -2e-08,
            "strategy": "constant",
            "fill_value": 0.0
        }
    ]
}
strategy "replace values for feature: latitude" appended step ['replace values for feature: latitude', <scjpnlib.utils.skl_transformers.SimpleValueTransformer object at 0x7fdba41357d0>] to pipeline
strategy "replace values for feature: latitude" transformation for feature "latitude" is COMPLETE!


strategy "drop feature: wpt_name" appended step ['drop feature: wpt_name', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fdba61e2650>] to pipeline
strategy "drop feature: wpt_name" transformation for feature "wpt_name" is COMPLETE!


strategy "drop feature: num_private" appended step ['drop feature: num_private', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fdba61e2250>] to pipeline
strategy "drop feature: num_private" transformation for feature "num_private" is COMPLETE!


strategy "impute lower-case transform: basin" appended step ['impute lower-case transform: basin', FunctionTransformer(func=<function C__impute_lcase__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdba3cee710>)] to pipeline
strategy "impute lower-case transform: basin" transformation for feature "basin" is COMPLETE!
** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: basin" appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: basin', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdba3886680>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'basin' in X match those that were previously fit
added new feature: basin_target_encoded
stra

** TargetEncoderLOOTransformer FIT INFO **: transformer has been fit to X
strategy "(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region_code" appended step ['(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region_code', FunctionTransformer(func=<function C__target_encode__StrategyTransformer.get_transformer.<locals>.<lambda> at 0x7fdba38867a0>)] to pipeline
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'region_code' in X match those that were previously fit
added new feature: region_code_target_encoded
strategy '(prefit) target-encoder (LOO==False, post_encode_null_to_global_mean==True) transform: region_code' appended step ['drop after target encoding: region_code', <scjpnlib.utils.skl_transformers.DropColumnsTransformer object at 0x7fdba3885b10>] to pipeline
strategy '(prefit) target-encoder (LOO==False, po

In [21]:
# Same per-feature summary, now on the preprocessed training predictors, for comparison
# with the BEFORE summary. The trailing semicolon suppresses the return-value echo.
scjpnutils.analyze_values(X_train_preprocessed, 'X_train AFTER preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,gps_height,int64,2415,"[1424, 1358, 0, 299, -14, 1490, 1421, 915, 891...",0.045174,95.48,0,0.0,
1,longitude,float64,51753,"[37.61840909, 30.6557619, 33.81319755, 33.1151...",0.96807,3.19,0,0.0,
2,latitude,float64,51755,"[-3.26320247, -3.53625, -9.47660713, -4.835553...",0.968107,3.19,0,0.0,
3,population,int64,1019,"[1, 500, 0, 263, 400, 520, 350, 60, 300, 40, 1...",0.019061,98.09,0,0.0,
4,public_meeting,object,3,"[True, False, nan]",5.6e-05,99.99,3006,0.056229,"Int64Index([32908, 12197, 64825, 50761, 52163,..."
5,recorded_by,object,1,[GeoData Consultants Ltd],1.9e-05,100.0,0,0.0,
6,scheme_management,object,13,"[Company, VWC, nan, Water authority, Water Boa...",0.000243,99.98,3503,0.065526,"Int64Index([36420, 7651, 41429, 14391, 64446,..."
7,scheme_name,object,2636,"[Kitukuni water supply, nan, Sinyanga water s...",0.049308,95.07,25363,0.474429,"Int64Index([38945, 1186, 36420, 57631, 51349,..."
8,permit,object,3,"[True, False, nan]",5.6e-05,99.99,2733,0.051122,"Int64Index([54872, 29018, 63991, 60976, 61749,..."
9,extraction_type,object,18,"[gravity, afridev, swn 80, other, mono, submer...",0.000337,99.97,0,0.0,


In [22]:
# Per-feature summary of the raw validation predictors (baseline before preprocessing).
# The trailing semicolon suppresses the return-value echo.
scjpnutils.analyze_values(X_validation, 'X_validation BEFORE preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,amount_tsh,float64,56,"[0.0, 5000.0, 30.0, 8000.0, 50.0, 500.0, 1500....",0.009428,99.06,0,0.0,
1,date_recorded,object,310,"[2013-03-13, 2011-03-15, 2013-02-19, 2013-01-1...",0.052189,94.78,0,0.0,
2,funder,object,622,"[Danida, Kalitasi, Wateraid, Gaica, Lga, W.B, ...",0.104714,89.53,366,0.061616,"Int64Index([11582, 19363, 56691, 5592, 33874,..."
3,gps_height,int64,1717,"[1435, 1271, 1384, 1538, 0, 919, 78, 1266, 324...",0.289057,71.09,0,0.0,
4,installer,object,681,"[DANIDA, DANID, Kalitasi, SEMA, GAICA, DWE, Ca...",0.114646,88.54,368,0.061953,"Int64Index([11582, 19363, 56691, 5592, 33874,..."
5,longitude,float64,5782,"[34.90829229, 35.81058308, 37.99252752, 34.758...",0.973401,2.66,0,0.0,
6,latitude,float64,5782,"[-11.16992903, -7.50962812, -4.23701933, -4.30...",0.973401,2.66,0,0.0,
7,wpt_name,object,4588,"[Kanisani, none, Kwa Mzee Tadei, Kulumbe, Shul...",0.772391,22.76,0,0.0,
8,num_private,int64,33,"[0, 6, 120, 17, 7, 32, 180, 93, 5, 8, 1, 15, 3...",0.005556,99.44,0,0.0,
9,basin,object,9,"[Lake Nyasa, Rufiji, Pangani, Internal, Lake R...",0.001515,99.85,0,0.0,


In [23]:
# Apply the pipeline assembled during the training-set loop to the validation predictors.
# Only transform() is called here - nothing is fit on the validation data.
X_validation_preprocessed = pipeline_data_preprocessor.transform(X_validation)

** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
added new feature: installer_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'basin' in X match those that were previously fit
added new feature: basin_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'region_code' in X match those that were previously fit
added new feature: region_code_target_encoded
** TargetEncoderLOOTransformer TRANSFORM INFO **: NOT using Leave-One-Out
** TargetEncoderLOOTransformer TRANSFORM INFO **: unique categories of 'district_code' in X match those that were previously fit
added new feature: district_code_target_encoded


In [24]:
# Per-feature summary of the preprocessed validation predictors, for comparison with
# the BEFORE summary. The trailing semicolon suppresses the return-value echo.
scjpnutils.analyze_values(X_validation_preprocessed, 'X_validation AFTER preprocessing');

Unnamed: 0,feature,dtype,n_unique,unique_vals,n_unique_ratio,p_cat,n_null,n_null_ratio,null_index
0,gps_height,int64,1717,"[1435, 1271, 1384, 1538, 0, 919, 78, 1266, 324...",0.289057,71.09,0,0.0,
1,longitude,float64,5782,"[34.90829229, 35.81058308, 37.99252752, 34.758...",0.973401,2.66,0,0.0,
2,latitude,float64,5782,"[-11.16992903, -7.50962812, -4.23701933, -4.30...",0.973401,2.66,0,0.0,
3,population,int64,388,"[60, 1, 15, 189, 0, 200, 454, 168, 260, 270, 7...",0.06532,93.47,0,0.0,
4,public_meeting,object,3,"[nan, True, False]",0.000505,99.95,328,0.055219,"Int64Index([ 3746, 16680, 70311, 74214, 43963,..."
5,recorded_by,object,1,[GeoData Consultants Ltd],0.000168,99.98,0,0.0,
6,scheme_management,object,12,"[VWC, WUG, Parastatal, WUA, nan, Water authori...",0.00202,99.8,374,0.062963,"Int64Index([ 8649, 25311, 70311, 37230, 19565,..."
7,scheme_name,object,1171,"[Government, Kihoro, Mvango Water Supply, Mwan...",0.197138,80.29,2803,0.471886,"Int64Index([16680, 61612, 10609, 7097, 8649,..."
8,permit,object,3,"[True, False, nan]",0.000505,99.95,323,0.054377,"Int64Index([26755, 10610, 49966, 56721, 42625,..."
9,extraction_type,object,17,"[gravity, mono, submersible, ksb, other, nira/...",0.002862,99.71,0,0.0,
