# Setup

### Imports

In [1]:
# built-ins
import os
import json
import math
import time
import pickle
import traceback
from os import path
from pathlib import Path
from datetime import datetime

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# misc
from IPython.display import display, clear_output, Markdown
from termcolor import colored

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.decomposition import PCA

# metrics
from sklearn.model_selection import ShuffleSplit, GridSearchCV

### Initial tasks

In [2]:
# allow multiple outputs
# (with "all", every top-level expression of a cell is echoed, not only the last one;
# later cells rely on this to show several frames from a single cell)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# suppress warnings
# NOTE(review): filterwarnings('ignore') and simplefilter('ignore') both install a
# blanket "ignore" filter, so one of the two calls looks redundant — confirm before removing.
import sys, os, warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# also suppress warnings of parallel processes such as grid search cv
# (only done when sys.warnoptions is empty, so an explicit warning configuration
# passed to the interpreter is not overridden)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses
    
# configure pandas
# (None lifts the limit on the number of columns shown when a frame is displayed)
pd.set_option("display.max_columns", None)

### Utils / Helpers

In [3]:
def cprint(text, color):
    """Print ``text`` in bold, colored with the termcolor color name ``color``."""
    styled = colored(text, color, attrs=['bold'])
    print(styled)
    
def print_red(text):
    """Print ``text`` in bold red (thin wrapper around ``cprint``)."""
    cprint(text, color='red')

def print_blue(text):
    """Print ``text`` in bold blue (thin wrapper around ``cprint``)."""
    cprint(text, color='blue')

### Path Definitions

In [4]:
# Local project layout, resolved relative to the notebook's working directory.
# NOTE(review): the same names are assigned again in a later "Path Definitions"
# cell (which also handles Kaggle), so this cell looks redundant — confirm.
path_root = '.'
path_models = path.join(path_root, 'models')
path_dataset = path.join(path_root, 'dataset')
path_csv = path.join(path_dataset, 'csv')
path_csv_output = path_csv  # CSV outputs are written next to the CSV inputs

# Make sure the models directory exists (no error if it is already there).
Path(path_models).mkdir(parents=True, exist_ok=True)

In [5]:
class PrintDuration(object):
    """Context manager that renders a single, self-updating progress line.

    Entering the context returns the bound ``tick`` method. Call it with the
    fraction of work completed so far (``0.0`` .. ``1.0``; it is multiplied by
    100 for display) to refresh the line with the percentage, the estimated
    remaining time and the average time between ticks. Leaving the context
    replaces the line with the total run time.

    Example::

        with PrintDuration() as tick:
            for i, item in enumerate(items):
                work(item)
                tick((i + 1) / len(items))
    """

    class printer(str):
        """``str`` subclass whose ``repr`` is the text itself (no quotes)."""

        def __repr__(self):
            return self

    def __enter__(self):
        """Reset all counters and hand out ``tick`` as the context value."""
        self.start_time = datetime.now()
        self.last_tick = self.start_time
        self.tick_count = 0
        self.tick_times = 0

        self.completed = False
        self.progress = 0
        self.ert = 0   # estimated remaining time, in seconds
        self.att = 0   # average tick time, in seconds
        self.out = None  # display handle, created on the first render

        return self.tick

    def __exit__(self, exc_type, exc_value, tb):
        """Print the traceback of a failure (if any) and render the final line.

        Nothing truthy is returned, so an exception raised inside the ``with``
        body still propagates to the caller after being printed here.
        """
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        # NOTE(review): the final line reads "100% completed" even when the
        # body raised — confirm whether that is the intended wording.
        self.completed = True
        self.render()

    def tdformat(self, seconds):
        """Format a duration in seconds as ``HH:MM:SS`` (fractions truncated)."""
        hours, remainder = divmod(seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{:02}:{:02}:{:02}'.format(int(hours), int(minutes), int(seconds))

    def render(self):
        """Create the output line on first use, update it in place afterwards."""
        output = ''

        if self.completed:
            complete_time = (datetime.now() - self.start_time).total_seconds()
            complete_time = self.tdformat(complete_time)
            output = f'100% completed, total run time = {complete_time}'
        else:
            percent = round(self.progress * 100)
            att = self.tdformat(self.att)
            ert = self.tdformat(self.ert)
            output = f'{percent}% completed, remaining time = {ert}, avg ticktime = {att}'

        output = self.printer(output)

        if self.out is None:
            self.out = display(output, display_id=True)
        else:
            self.out.update(output)

    def tick(self, progress):
        """Record one unit of progress and refresh the displayed line.

        Args:
            progress: fraction of the total work finished so far. With a
                value ``<= 0`` no remaining-time estimate is made (it is
                reported as 0).
        """
        now = datetime.now()

        # Time since the PREVIOUS tick. The marker must be advanced on every
        # call, otherwise each tick would add the whole elapsed time again and
        # inflate the average.
        tick_time = (now - self.last_tick).total_seconds()
        self.last_tick = now

        self.tick_count += 1
        self.tick_times += tick_time
        # True division: flooring would report 0 for any sub-second average.
        avg_tick_time = self.tick_times / self.tick_count

        if progress > 0:
            total_ticks = self.tick_count / progress
            # Clamp so a progress value above 1 cannot yield a negative estimate.
            remained_ticks = max(total_ticks - self.tick_count, 0)
            est_remain_time = avg_tick_time * remained_ticks
        else:
            est_remain_time = 0

        # set
        self.progress = progress
        self.att = avg_tick_time
        self.ert = est_remain_time

        # render
        self.render()

### Detect Env

In [6]:
# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
ENV_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

### Path Definitions

In [7]:
# Root folders: fixed Kaggle locations when ENV_KAGGLE is set, otherwise
# paths relative to the notebook's working directory.
if ENV_KAGGLE:
    path_root, path_dataset = '/kaggle/working', '/kaggle'
else:
    path_root = '.'
    path_dataset = path.join(path_root, 'dataset')

# Derived locations share the same layout in both environments.
path_csv = path.join(path_dataset, 'csv')
path_models = path.join(path_root, 'models')

# On Kaggle CSV outputs go to the root folder; locally they sit next to the inputs.
path_csv_output = path_root if ENV_KAGGLE else path_csv

# Create directories.
Path(path_models).mkdir(parents=True, exist_ok=True)

### Configs

In [8]:
# Switched on only when running on Kaggle.
# (presumably controls whether trained models are written to path_models — the
# consumer is not in the visible cells; TODO confirm)
cfg_autosave_models = True if ENV_KAGGLE else False

# Off in both environments.
# (presumably forces retraining instead of reusing saved models — TODO confirm)
cfg_force_train = False

# Hyperparameters

In [9]:
# NOTE(review): none of these constants is referenced in the visible cells;
# the meanings below are inferred from the names — TODO confirm.
hp_seed: int = 7908        # presumably the random seed for splits / models
hp_cv_splits: int = 10     # presumably the number of cross-validation splits
hp_test_size: float = 0.2  # presumably the fraction of rows held out for testing

# Preprocessing

In [10]:
# Load data.csv from the CSV input folder into the working frame `df`.
df = pd.read_csv(
    filepath_or_buffer=path.join(path_csv, 'data.csv'),
    encoding='utf-8',
)

### Simple EDA

In [11]:
# Dimensions of the frame, then per-column dtypes and non-null counts.
# df.shape is echoed although it is not the last expression because
# ast_node_interactivity is set to "all" in the "Initial tasks" cell.
df.shape
df.info()

(319795, 18)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [12]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [13]:
# Summary statistics, transposed so that each described column is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


In [14]:
# Number of missing values per column.
df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

# Pipeline Setup

In [25]:
class OutlierRemover:
    """Drop rows whose values lie outside per-column IQR bands.

    For every configured column ``fit`` learns the band
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` and ``transform`` keeps only the rows
    that are inside the band for all configured columns.
    """

    @staticmethod
    def numeric(data):
        """Return a remover covering every ``float64``/``int64`` column of ``data``."""
        cols = data.select_dtypes(include=['float64', 'int64']).columns.to_list()
        return OutlierRemover(cols)

    def __init__(self, cols):
        """
        Args:
            cols: names of the columns to check for outliers.
        """
        self.cols = cols
        self.bands = {}  # column name -> (lower_band, upper_band), filled by fit()

    def fit(self, data):
        """Learn the bands from ``data`` and return ``self``.

        The quantiles are taken from the frame that is passed in — never from
        a notebook-level variable — so the result depends only on the argument.
        """
        for col in self.cols:
            q1 = data[col].quantile(0.25)
            q3 = data[col].quantile(0.75)
            iqr = q3 - q1
            self.bands[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        return self

    def transform(self, data):
        """Return the rows of ``data`` that are inside every learned band.

        ``data`` itself is left untouched. Values that compare neither below
        nor above a band (e.g. NaN) are kept.

        Raises:
            KeyError: if a configured column has no band yet (``fit`` not called).
        """
        keep = np.ones(len(data), dtype=bool)
        for col in self.cols:
            lower_band, upper_band = self.bands[col]
            outside = (data[col] < lower_band) | (data[col] > upper_band)
            keep &= ~outside.to_numpy()

        return data[keep]

    def fit_transform(self, data):
        """Learn the bands from ``data`` and return ``data`` without its outliers."""
        return self.fit(data).transform(data)
    
class MultiLabelEncoder():
    """Apply one independent ``LabelEncoder`` to each of several columns."""

    @staticmethod
    def binary(data):
        """Return an encoder for every column of ``data`` with exactly two distinct values."""
        cols = [col for col in data.columns if data[col].nunique() == 2]
        return MultiLabelEncoder(cols)

    def __init__(self, cols):
        """
        Args:
            cols: names of the columns to encode.
        """
        self.cols = cols
        self.encoders = {col: LabelEncoder() for col in cols}

    def fit(self, data):
        """Fit each column's encoder on that column of ``data`` and return ``self``."""
        for col in self.cols:
            self.encoders[col].fit(data[col])

        return self

    def transform(self, data):
        """Return a copy of ``data`` with the configured columns label-encoded.

        The encoded values are written to a copy, so the caller's frame (which
        may itself be a slice of a larger frame) is never modified in place.
        """
        encoded = data.copy()
        for col in self.cols:
            encoded[col] = self.encoders[col].transform(data[col])

        return encoded

    def fit_transform(self, data):
        """Fit on ``data`` and return its encoded copy."""
        return self.fit(data).transform(data)

class Preprocessor:
    def __init__(self, data, test_index, train_index, options={}):
        self.data = data
        self.test_index = np.array(test_index)
        self.train_index = np.array(train_index)
        self.options = options
    
    def test(self):
        return self.data.loc[self.test_index]
    
    def train(self):
        return self.data.loc[self.train_index]
    
    def get_data(self):
        return self.test(), self.train()
    
    def apply(self):
        data = self.data

        # remove outliers
        outlier_strategy = self.options.get('outlier_strategy', 'train_only')
        outlier_remover = OutlierRemover.numeric(data)
        match outlier_strategy:
            case 'train_only':
                data.loc[self.train_index] = outlier_remover.fit_transform(self.train())
            case 'include_test':
                outlier_remover.fit(self.train())
                data = outlier_remover.transform(data)
            case 'all':
                data = outlier_remover.fit_transform(data)
        
        self.train_index = self.train_index[np.isin(self.train_index, data.index.values)]
        self.test_index = self.test_index[np.isin(self.test_index, data.index.values)]
        
        # encode labels
        encode_labels = self.options.get('encode_labels', True)
        if encode_labels:
            data = MultiLabelEncoder.binary(data).fit_transform(data)
            
        onehot_encoding = self.options.get('onehot_encoding', None)
        if onehot_encoding is not None:
            cols = onehot_encoding
            data = pd.get_dummies(data, columns=cols, prefix=cols)
        
        # ordinal encoding
        ordinal_encoding = self.options.get('ordinal_encoding', None)
        if ordinal_encoding is not None:
            for col, ordinals in ordinal_encoding.items():
                encoder = OrdinalEncoder(categories=[ordinals])
                data[[col]] = encoder.fit_transform(data[[col]])
        
        # scaler
        scale = self.options.get('scale', True)
        if scale:
            pass
        
        # set and return data.
        self.data = data
        
        # return test and train data separately
        return self.get_data()

# remove
# NOTE(review): scratch cell (flagged "remove" above) that tries Preprocessor on a single split.
# NOTE(review): the split parameters are hard-coded; the seed differs from hp_seed (7908)
# defined under "Hyperparameters" — confirm which values are intended.
split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
# only the first of the generated splits is used
train_index, test_index = next(iter(split.split(df)))

# raw (unprocessed) rows, selected by position
test = df.iloc[test_index]
train = df.iloc[train_index]

# before
print(test_index.shape, train_index.shape)
print(test.shape, train.shape)
print('train')
train.sort_index().head()
print('test')
test.sort_index().head()

options = {
    'outlier_strategy': 'all',
    'encode_labels': True,
    'onehot_encoding': ['Race', 'Diabetic'],
    # categories are listed from lowest to highest rank
    'ordinal_encoding': {
        'GenHealth': ['Poor', 'Fair', 'Good', 'Very good','Excellent'],
        'AgeCategory': ['18-24', '25-29','30-34', '35-39', '40-44', '45-49', '50-54',
                        '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']
    }
}

# Preprocessor selects rows by label (.loc), while the indices above are positions;
# the two coincide here because df carries a default RangeIndex (see df.info() output).
# Argument order is (data, test_index, train_index, options).
pp = Preprocessor(df, test_index, train_index, options)
print(pp.options)

# apply() returns (test, train); this rebinds `test` / `train` from the raw frames
# selected above to the processed ones.
test, train = pp.apply()

# after
print(test.shape, train.shape)
print('train')
train.sort_index().head()
print('test')
test.sort_index().head()

(63959,) (255836,)
(63959, 18) (255836, 18)
train


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No


test


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
16,No,26.26,No,No,No,5.0,2.0,No,Female,70-74,White,No,No,Very good,10.0,No,No,No


{'outlier_strategy': 'all', 'encode_labels': True, 'onehot_encoding': ['Race', 'Diabetic'], 'ordinal_encoding': {'GenHealth': ['Poor', 'Fair', 'Good', 'Very good', 'Excellent'], 'AgeCategory': ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']}}
(46283, 26) (184999, 26)
train


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
1,0,20.34,0,0,1,0.0,0.0,0,0,12.0,1,3.0,7.0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11.0,0,2.0,6.0,0,0,1,0,0,0,0,0,1,1,0,0,0
8,0,26.45,0,0,0,0.0,0.0,0,0,12.0,0,1.0,5.0,0,1,0,0,0,0,0,0,1,0,1,0,0
9,0,40.69,0,0,0,0.0,0.0,1,1,9.0,1,2.0,10.0,0,0,0,0,0,0,0,0,1,1,0,0,0
11,0,28.71,1,0,0,0.0,0.0,0,0,7.0,1,3.0,5.0,0,0,0,0,0,0,0,0,1,1,0,0,0


test


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
7,0,31.64,1,0,0,5.0,0.0,1,0,12.0,0,2.0,9.0,1,0,0,0,0,0,0,0,1,0,0,1,0
16,0,26.26,0,0,0,5.0,2.0,0,0,10.0,0,3.0,10.0,0,0,0,0,0,0,0,0,1,1,0,0,0
24,0,25.75,0,0,0,0.0,0.0,0,0,12.0,1,3.0,6.0,0,0,1,0,0,0,0,0,1,1,0,0,0
30,0,25.84,1,0,0,5.0,0.0,0,1,10.0,1,2.0,8.0,0,0,0,0,0,1,0,0,0,1,0,0,0
41,0,28.13,0,0,0,0.0,0.0,0,1,8.0,1,4.0,8.0,0,0,0,0,0,0,0,0,1,1,0,0,0
