# Setup

### Imports

In [None]:
# built-ins
import os
import json
import math
import time
import pickle
import traceback
from os import path
from pathlib import Path
from datetime import datetime

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# misc
from IPython.display import display, clear_output, Markdown
from termcolor import colored

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA

# metrics
from sklearn.model_selection import ShuffleSplit, GridSearchCV

### Initial tasks

In [None]:
# Let a cell echo the value of every top-level expression, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# Silence warnings in the current process.
import os
import sys
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# When the interpreter was started without explicit -W options, also export the
# setting through the environment so that child processes (e.g. the workers of
# a parallel grid search) inherit it.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

### Utils / Helpers

In [None]:
def cprint(text, color):
    """Print ``text`` in bold, coloured with the termcolor colour name ``color``."""
    styled = colored(text, color, attrs=['bold'])
    print(styled)
    
def print_red(text):
    """Print ``text`` in bold red (delegates to ``cprint``)."""
    cprint(text, color='red')

def print_blue(text):
    """Print ``text`` in bold blue (delegates to ``cprint``)."""
    cprint(text, color='blue')

### Path Definitions

In [None]:
# Directory layout relative to the current working directory.
# NOTE: the later "Path Definitions" cell assigns these same names again.
path_root = '.'
path_models = path.join(path_root, 'models')
path_dataset = path.join(path_root, 'dataset')
path_csv = path.join(path_dataset, 'csv')
path_csv_output = path_csv

# Make sure the models directory exists (no error when it already does).
Path(path_models).mkdir(exist_ok=True, parents=True)

In [None]:
class PrintDuration(object):
    """Context manager that displays live progress and timing for a loop.

    Entering the context returns the bound ``tick`` method::

        with PrintDuration() as tick:
            for i, item in enumerate(items):
                work(item)
                tick((i + 1) / len(items))

    Every ``tick(progress)`` call refreshes a single display line with the
    completed percentage, the estimated remaining time and the average time
    per tick.  Leaving the context replaces that line with the total run time.
    """

    class printer(str):
        """``str`` subclass whose ``repr`` is the text itself (no quotes)."""

        def __repr__(self):
            return self

    def __enter__(self):
        self.start_time = datetime.now()
        self.last_tick = self.start_time
        self.tick_count = 0
        self.tick_times = 0

        self.completed = False
        self.progress = 0
        self.ert = 0   # estimated remaining time, in seconds
        self.att = 0   # average tick time, in seconds
        self.out = None  # display handle, created on the first render

        return self.tick

    def __exit__(self, exc_type, exc_value, tb):
        # Print the traceback of an exception raised inside the with-body.
        # The exception is not suppressed: this method returns None.
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        self.completed = True
        self.render()

    def tdformat(self, seconds):
        """Format a duration given in seconds as ``HH:MM:SS``."""
        hours, remainder = divmod(seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{:02}:{:02}:{:02}'.format(int(hours), int(minutes), int(seconds))

    def render(self):
        """Create the status line on first use, update it in place afterwards."""
        output = ''

        if self.completed:
            complete_time = (datetime.now() - self.start_time).total_seconds()
            complete_time = self.tdformat(complete_time)
            output = f'100% completed, total run time = {complete_time}'
        else:
            percent = round(self.progress * 100)
            att = self.tdformat(self.att)
            ert = self.tdformat(self.ert)
            output = f'{percent}% completed, remaining time = {ert}, avg ticktime = {att}'

        output = self.printer(output)

        if self.out is None:
            self.out = display(output, display_id=True)
        else:
            self.out.update(output)

    def _advance(self, progress, now):
        """Update the timing statistics for a tick that happened at ``now``."""
        tick_time = (now - self.last_tick).total_seconds()
        # Move the reference point forward; otherwise every tick would add the
        # whole elapsed time since start to the running total.
        self.last_tick = now
        self.tick_count += 1
        self.tick_times += tick_time

        # True division: flooring would turn sub-second averages into 0 and
        # make the remaining-time estimate collapse to zero.
        avg_tick_time = self.tick_times / self.tick_count

        if progress > 0:
            total_ticks = self.tick_count / progress
            # Clamp so a progress value above 1 cannot yield a negative estimate.
            remained_ticks = max(total_ticks - self.tick_count, 0)
            est_remain_time = avg_tick_time * remained_ticks
        else:
            est_remain_time = 0

        self.progress = progress
        self.att = avg_tick_time
        self.ert = est_remain_time

    def tick(self, progress):
        """Record one finished unit of work and refresh the status line.

        ``progress`` is the completed fraction of the whole job (0 to 1).
        """
        self._advance(progress, datetime.now())
        self.render()

### Detect Env

In [None]:
# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is present.
ENV_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

### Path Definitions

In [None]:
# Pick the root, dataset and CSV-output locations for the detected environment.
if ENV_KAGGLE:
    path_root = '/kaggle/working'
    # NOTE(review): this makes the CSV directory '/kaggle/csv' — confirm that
    # this is where the dataset is mounted.
    path_dataset = '/kaggle'
    path_csv_output = path_root
else:
    path_root = '.'
    path_dataset = path.join(path_root, 'dataset')
    path_csv_output = path.join(path_dataset, 'csv')

# These two are derived the same way in both environments.
path_csv = path.join(path_dataset, 'csv')
path_models = path.join(path_root, 'models')

# Make sure the models directory exists (no error when it already does).
Path(path_models).mkdir(exist_ok=True, parents=True)

### Configs

In [None]:
# Auto-saving of models is switched on only on Kaggle; forced training stays
# off in both environments.
cfg_autosave_models = True if ENV_KAGGLE else False
cfg_force_train = False

# Hyperparameters

In [None]:
# NOTE(review): the ShuffleSplit call in the last cell of this section passes
# literal values (n_splits=10, test_size=0.2, random_state=42) rather than
# these names — confirm which values are intended.
hp_seed: int = 7908  # presumably the random seed — TODO confirm where it is consumed
hp_cv_splits: int = 10  # presumably the number of cross-validation splits
hp_test_size: float = 0.2  # presumably the fraction of rows held out for testing

# Preprocessing

In [None]:
# Load <path_csv>/data.csv (decoded as UTF-8) into the DataFrame ``df`` that the
# following cells inspect and preprocess.
df = pd.read_csv(path.join(path_csv, 'data.csv'), encoding='utf-8')

### Simple EDA

In [None]:
# Dimensions as (rows, columns), then the per-column summary printed by info().
df.shape
df.info()

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Pipeline Setup

In [None]:
class OutlierRemover:
    """Row filter based on per-column interquartile-range (IQR) fences.

    ``fit`` stores, for every configured column, the band
    ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)``.  ``transform`` drops each row that
    has a value strictly below or above the band of any configured column;
    missing values compare as neither and are therefore kept.
    """

    def __init__(self, cols):
        self.cols = cols
        # column name -> (lower fence, upper fence), filled in by fit()
        self.bands = {}

    @staticmethod
    def numeric(data):
        """Build a remover for the ``float64`` / ``int64`` columns of ``data``."""
        numeric_part = data.select_dtypes(include=['float64', 'int64'])
        return OutlierRemover(numeric_part.columns.to_list())

    def fit(self, data):
        """Compute and store the IQR band of every configured column."""
        for name in self.cols:
            first_quartile, third_quartile = data[name].quantile([0.25, 0.75])
            spread = third_quartile - first_quartile
            self.bands[name] = (
                first_quartile - 1.5 * spread,
                third_quartile + 1.5 * spread,
            )

    def transform(self, data):
        """Return ``data`` without the rows that fall outside any stored band."""
        for name in self.cols:
            low, high = self.bands[name]
            values = data[name]
            outside = (values < low) | (values > high)
            data = data[~outside]

        return data

    def fit_transform(self, data):
        """Fit the bands on ``data`` and filter that same frame."""
        self.fit(data)
        return self.transform(data)
    
class MultiLabelEncoder():
    """Apply one independent ``LabelEncoder`` to each of several columns."""

    @staticmethod
    def binary(data):
        """Build an encoder for every column of ``data`` with exactly two distinct values."""
        cols = [col for col in data.columns if data[col].nunique() == 2]
        return MultiLabelEncoder(cols)

    def __init__(self, cols):
        self.cols = cols
        # One encoder per column so each column gets its own label mapping.
        self.encoders = {col: LabelEncoder() for col in cols}

    def fit(self, data):
        """Fit each column's encoder on that column of ``data``."""
        for col in self.cols:
            self.encoders[col].fit(data[col])

    def transform(self, data):
        """Return a copy of ``data`` with the configured columns label-encoded.

        The input frame is left untouched: encoding works on a copy, so the
        caller's DataFrame (or a slice of it) is never overwritten.
        """
        encoded = data.copy()
        for col in self.cols:
            encoded[col] = self.encoders[col].transform(data[col])

        return encoded

    def fit_transform(self, data):
        """Fit the encoders on ``data`` and return its encoded copy."""
        self.fit(data)
        return self.transform(data)

class Preprocessor:
    @staticmethod
    def process(*args, **kwargs):
        processor = Preprocessor(*args, **kwargs)
        processor.apply()
        return processor
    
    def __init__(self, data, test_index, train_index, options={}):
        self.data = data
        self.test_index = np.array(test_index)
        self.train_index = np.array(train_index)
        self.options = options
    
    @property
    def test(self):
        return self.data.loc[self.test_index]
    
    @test.setter
    def test(self, value):
        self.data.loc[self.test_index] = value
    
    @property
    def train(self):
        return self.data.loc[self.train_index]
    
    @train.setter
    def train(self, value):
        self.data.loc[self.train_index] = value

    def x(self, dframe):
        return dframe.drop(self.options['target'], axis=1).to_numpy()
    
    def y(self, dframe):
        return dframe[self.options['target']].to_numpy()
        
    @property
    def x_test(self):
        return self.x(self.test)
        
    @property
    def x_train(self):
        return self.x(self.train)
    
    @property
    def y_test(self):
        return self.y(self.test)
        
    @property
    def y_train(self):
        return self.y(self.train)
    
    def chop(self):
        return self.x_test, self.y_test, self.x_train, self.y_train
    
    def apply(self):
        # remove outliers
        outlier_strategy = self.options.get('outlier_strategy', 'train_only')
        outlier_remover = OutlierRemover.numeric(self.data)
        match outlier_strategy:
            case 'train_only':
                self.train = outlier_remover.fit_transform(self.train)
            case 'include_test':
                outlier_remover.fit(self.train())
                self.data = outlier_remover.transform(self.data)
            case 'all':
                self.data = outlier_remover.fit_transform(self.data)
        
        # update removed indexes.
        self.train_index = self.train_index[np.isin(self.train_index, self.data.index.values)]
        self.test_index = self.test_index[np.isin(self.test_index, self.data.index.values)]
        
        # encode labels
        encode_labels = self.options.get('encode_labels', True)
        if encode_labels:
            self.data = MultiLabelEncoder.binary(self.data).fit_transform(self.data)
            
        onehot_encoding = self.options.get('onehot_encoding', None)
        if onehot_encoding is not None:
            cols = onehot_encoding
            self.data = pd.get_dummies(self.data, columns=cols, prefix=cols)
        
        # ordinal encoding
        ordinal_encoding = self.options.get('ordinal_encoding', None)
        if ordinal_encoding is not None:
            for col, ordinals in ordinal_encoding.items():
                encoder = OrdinalEncoder(categories=[ordinals])
                self.data[[col]] = encoder.fit_transform(self.data[[col]])
        
        # scaler
        scale = self.options.get('scale', True)
        if scale:
            scaler = StandardScaler()
            self.train = scaler.fit_transform(self.train)
            self.test = scaler.transform(self.test)

# Scratch check of the preprocessing pipeline on a single shuffle split
# (this cell was marked "remove" by its author).
# NOTE(review): the split parameters are literals here, not the hp_* values
# defined in the Hyperparameters cell — confirm which ones are intended.
split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df))

options = {
    'target': 'HeartDisease',
    'outlier_strategy': 'all',
    'encode_labels': True,
    'onehot_encoding': ['Race', 'Diabetic'],
    'ordinal_encoding': {
        # Categories are listed from lowest to highest rank.
        'GenHealth': ['Poor', 'Fair', 'Good', 'Very good', 'Excellent'],
        'AgeCategory': [
            '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
            '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
        ],
    },
}

# Preprocessor takes the test index before the train index.
preprocessor = Preprocessor.process(df, test_index, train_index, options)
X_test, Y_test, X_train, Y_train = preprocessor.chop()
print(*(part.shape for part in (X_test, Y_test, X_train, Y_train)))
