# Setup

### Initial tasks

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

### Imports

In [2]:
# built-ins
import os
import json
import math
import time
import traceback
from os import path
from pathlib import Path
from datetime import datetime

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# misc
from IPython.display import display, clear_output

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, GridSearchCV

# training
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

### Utils / Helpers

In [3]:
def load_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [4]:
class PrintDuration(object):
    def __enter__(self):
        self.start_time = datetime.now()
        self.last_tick = self.start_time
        self.tick_count = 0
        self.tick_times = 0
        
        return self.tick
  
    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        clear_output(wait=True)
    
    class printer(str):
        def __repr__(self):
            return self
        
    def tdformat(self, seconds):
        hours, remainder = divmod(seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{:02}:{:02}:{:02}'.format(int(hours), int(minutes), int(seconds))
    
    def tick(self, progress):
        now = datetime.now()
        
        # calculate
        work_time = (now - self.start_time).total_seconds()
        tick_time = (now - self.last_tick).total_seconds()
        self.tick_count += 1
        self.tick_times += tick_time

        avg_tick_time = self.tick_times // self.tick_count
        
        if progress > 0:
            total_ticks = self.tick_count // progress
            remained_ticks = total_ticks - self.tick_count
            est_remain_time = avg_tick_time * remained_ticks
        else:
            est_remain_time = 0
            
        # format
        percent = round(progress*100)
        att = self.tdformat(avg_tick_time)
        ert = self.tdformat(est_remain_time)
        
        output = f'{percent}% completed, remaining time = {ert}, average tick time = {att}'
        
        # print
        clear_output(wait=True)
        display(self.printer(output), display_id=True)

### Detect Env

In [5]:
ENV_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None

### Path Definitions

In [6]:
path_root = '.'
path_dataset = path.join(path_root, 'dataset')
path_csv = path.join(path_dataset, 'csv')
path_csv_output =  path_csv
path_models = path.join(path_root, 'models')

if ENV_KAGGLE:
    path_root = '/kaggle/working'
    path_dataset = '/kaggle/input/aihw2'
    path_csv = path.join(path_dataset, 'csv')
    path_csv_output = path_root
    path_models = path.join(path_root, 'models')
    
# Create directories.
Path(path_models).mkdir(parents=True, exist_ok=True)

### Configs

In [7]:
cfg_autosave_models = False
cfg_force_train = False

if ENV_KAGGLE:
    cfg_autosave_models = True
    cfg_force_train = False

# Hyperparameters

In [8]:
hp_seed = 7908
hp_cv_splits = 10
hp_test_size = 0.2

# Preprocessing

In [9]:
def map_employement_duration(entry):
    entry = str(entry).lower()
    split = entry.split(' ')
    num = split[0]
    output = entry
    
    if "-" in num:
        num = num.split("-")[1]
    
    if "weeks" in entry:
        output = float(num) / 52
    elif ("month" in entry) or ("ay" in entry):
        output = float(num) / 12
    elif ("years" in entry) or ("sene" in entry) or ("yıl" in entry):
        output = float(num)
    else:
        try:
            output = float(num)
        except:
            output = 0
        
    output = round(output, 3)
    return output

In [10]:
# read encodings
encodings = load_json(path.join(path_dataset, 'encodings.json'))

# read csvs
csv_en = pd.read_csv(path.join(path_csv, 'english.csv'), dtype=str, encoding='utf-8')
csv_tr = pd.read_csv(path.join(path_csv, 'turkish.csv'), dtype=str, encoding='utf-8')

# drop columns
csv_en.drop('Timestamp', axis=1, inplace=True)
csv_tr.drop('Timestamp', axis=1, inplace=True)

# rename columns
csv_en.rename(columns=encodings['columns']['en'], inplace=True)
csv_tr.rename(columns=encodings['columns']['tr'], inplace=True)

# encode columns
csv_en.replace(encodings['values']['en'], inplace=True)
csv_tr.replace(encodings['values']['tr'], inplace=True)

# concat csvs
df = pd.concat([csv_en, csv_tr], axis=0).reset_index(drop=True)

# fix NaNs
df.fillna(0, inplace=True)

# convert types
df['age'] = df['age'].apply(lambda x: int(x))
df['weight'] = df['weight'].apply(lambda x: int(float(x.replace(',', '.'))))
df['height'] = df['height'].apply(lambda x: int(x.translate({ord(x): '' for x in [',', '.', ' ']})))
df['employment_duration'] = df['employment_duration'].apply(map_employement_duration)

# save csv
df.to_csv(path.join(path_csv_output, 'data.csv'), index=None, header=True, encoding='utf-8-sig')
df.info()

# separate data and labels
data = df.drop('weight', axis=1).to_numpy()
labels = df['weight'].to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  322 non-null    int64  
 1   height               322 non-null    int64  
 2   weight               322 non-null    int64  
 3   gender               322 non-null    int64  
 4   smoking              322 non-null    int64  
 5   drinking             322 non-null    int64  
 6   exercise             322 non-null    int64  
 7   married              322 non-null    int64  
 8   children             322 non-null    int64  
 9   student              322 non-null    int64  
 10  employed             322 non-null    int64  
 11  employment_duration  322 non-null    float64
dtypes: float64(1), int64(11)
memory usage: 30.3 KB


In [11]:
df.head()

(322, 11)


Unnamed: 0,age,height,weight,gender,smoking,drinking,exercise,married,children,student,employed,employment_duration
0,36,175,82,1,0,3,1,0,0,0,1,12.0
1,22,178,77,1,0,2,1,0,0,1,0,0.0
2,32,174,65,1,0,2,1,0,0,1,2,0.038
3,21,156,65,0,0,0,2,0,0,1,0,0.0
4,24,176,94,1,0,1,1,0,0,1,0,0.0


# Training

In [12]:
class Trainer:
    def __init__(self, estimator, data, labels, n_splits, test_size, seed, hp_grid=None):
        self.estimator = estimator
        self.data = data
        self.labels = labels
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.hp_grid = hp_grid
        
        self.stats = []
        self.best_stats = None
        self.best_estimator = None
    
    def split(self):
        split = ShuffleSplit(n_splits=self.n_splits, test_size=self.test_size, random_state=self.seed)
        
        for train_index, test_index in split.split(self.data):
            train_data = (self.data[train_index], self.labels[train_index])
            test_data = (self.data[test_index], self.labels[test_index])

            yield(train_data, test_data)
    
    def train(self, tick=None):
        for split_index, (train_data, test_data) in enumerate(self.split()):
            if tick is not None:
                tick(split_index/self.n_splits)
            
            X_train, Y_train = train_data
            X_test, Y_test = test_data
            
            # standardize age weight and employment duration.
            cols = [0, 1, 10]
            scaler = preprocessing.StandardScaler()
            scaler.fit(X_train[:, cols])
            X_train[:, cols] = scaler.transform(X_train[:, cols])
            X_test[:, cols] = scaler.transform(X_test[:, cols])
            
            # default values
            best_params = None
            best_estimator = self.estimator
            
            # fit estimator
            if self.hp_grid is not None:
                cv = GridSearchCV(self.estimator, self.hp_grid, cv=self.n_splits)
                cv.fit(X_train, Y_train)
                Y_pred = cv.best_estimator_.predict(X_test)
                
                best_params = cv.best_params_
                best_estimator = cv.best_estimator_
            else:
                self.estimator.fit(X_train, Y_train)
                Y_pred = self.estimator.predict(X_test)
            
            rsme = round(np.sqrt(mean_squared_error(Y_test, Y_pred)), 2)
            result = dict(y_true=Y_test, y_pred=Y_pred, best_params=best_params, rsme=rsme,
                          seed=self.seed, best_estimator=self.estimator)
            
            self.stats.append(result)
        
    def collect_best_stats(self):
        best_rsme = math.inf
        total_rsme = 0
        best_stats = None
        
        for stats in self.stats:
            rsme = stats['rsme']

            total_rsme += rsme
            if rsme < best_rsme:
                best_rsme = rsme
                best_stats = stats
        
        self.best_stats = best_stats
        self.best_estimator = best_stats['best_estimator']
        self.mean_rsme = total_rsme / len(self.stats)

In [13]:
class SetTrainer:   
    def __init__(self, data, labels, n_splits, test_size, seed):
        self.data = data
        self.labels = labels
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.estimators = {}
    
    def set_estimator(self, name, estimator, hp_grid=None):
        self.estimators[name] = (name, estimator, hp_grid)
    
    def set_estimator_grid(self, name, new_grid):
        estimator = self.estimators[name][1]
        self.set_estimator(name, estimator, new_grid)
    
    def train_all(self):
        for name in self.estimators.keys():
            self.train_estimator(name)
        
    def train_estimator(self, name, seed=None, save=True):
        if seed is None:
            seed = self.seed
            
        name, estimator, hp_grid = self.estimators[name]
        trainer = Trainer(estimator(), self.data, self.labels, self.n_splits,
                              self.test_size, seed, hp_grid)
        
        with PrintDuration() as tick:
            trainer.train(tick)
        
        trainer.collect_best_stats()
        
        if save:
            setattr(self, name, trainer)
            
        return trainer
        
    def search_best_seed(self, name, seed_range=100):
        best_rsme = math.inf
        best_seed = 0

        for seed in range(seed_range):
            estimator = self.train_estimator(name, seed, save=False)
            rsme = estimator.best_stats["rsme"]

            if rsme < best_rsme:
                best_rsme = rsme
                best_seed = seed
                print(f'{seed} -> {rsme} - {estimator.mean_rsme}')
        
        print(f'Best seed found as {best_seed}')
        return best_seed
    
    def get_results_dataframe(self, name, shuffle=False, ascending=False):
        trainer = getattr(self, name)

        true = trainer.best_stats['y_true'].reshape(-1)
        pred = trainer.best_stats['y_pred'].reshape(-1)
        
        df = pd.DataFrame(data={
            'true': true,
            'prediction': pred,
            'diff': np.absolute(true - pred)
        })
    
        if shuffle:
            df = df.sample(frac=1)
        else:
            df = df.sort_values('diff', ascending=ascending)
        
        return df
    
    def print_stats(self, name):
        trainer = getattr(self, name)
        print('best_rsme', trainer.best_stats['rsme'])
        print('mean_rsme', trainer.mean_rsme)
        print('best_params', trainer.best_stats['best_params'])
        
    # Method alias
    add_estimator = set_estimator

In [14]:
set_trainer = SetTrainer(data, labels, hp_cv_splits, hp_test_size, hp_seed)

## Linear Regression 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [15]:
# Add estimator
set_trainer.add_estimator('linear', LinearRegression)

# Search for best seed.
best_seed = 7908 # set_trainer.search_best_seed('linear', 10000)

# Train with best seed.
_ = set_trainer.train_estimator('linear', best_seed);

# Show stats.
set_trainer.print_stats('linear')

# Show predicts.
set_trainer.get_results_dataframe('linear', ascending=True).head()

best_rsme 6.56
mean_rsme 11.323
best_params None


Unnamed: 0,true,prediction,diff
50,66,66.070456,0.070456
42,82,82.15262,0.15262
10,90,90.222599,0.222599
18,82,81.67739,0.32261
23,51,51.561429,0.561429


## Support Vector Regression

[docs](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)

In [16]:
# Add estimator
set_trainer.add_estimator('svr', SVR, {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [1.5, 10],
    'gamma': [1e-7, 1e-4],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
})

# Train with best seed.
_ = set_trainer.train_estimator('svr');

# Show stats.
set_trainer.print_stats('svr')

# Show predicts.
set_trainer.get_results_dataframe('svr', ascending=True).head()

best_rsme 7.4
mean_rsme 11.400000000000002
best_params {'C': 10, 'epsilon': 0.1, 'gamma': 1e-07, 'kernel': 'linear'}


Unnamed: 0,true,prediction,diff
0,74,74.189789,0.189789
61,64,64.526977,0.526977
58,70,70.599672,0.599672
4,65,64.343367,0.656633
18,82,82.810229,0.810229


## Bayesian Ridge

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html)

In [17]:
# Add estimator
set_trainer.add_estimator('br', BayesianRidge, {
    'n_iter': [300, 500, 700, 1000]
})

# Train with best seed.
_ = set_trainer.train_estimator('br');

# Show stats.
set_trainer.print_stats('br')

# Show predicts.
set_trainer.get_results_dataframe('br', ascending=True).head()

best_rsme 6.75
mean_rsme 11.25
best_params {'n_iter': 300}


Unnamed: 0,true,prediction,diff
27,59,59.283629,0.283629
50,66,66.291115,0.291115
18,82,81.383847,0.616153
58,70,69.042758,0.957242
10,90,88.900119,1.099881


## kNN

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

In [18]:
# Add estimator
set_trainer.add_estimator('knn', KNeighborsRegressor, {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
})

# Train with best seed.
_ = set_trainer.train_estimator('knn');

# Show stats.
set_trainer.print_stats('knn')

# Show predicts.
set_trainer.get_results_dataframe('knn', ascending=True).head()

best_rsme 8.72
mean_rsme 12.211
best_params {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}


Unnamed: 0,true,prediction,diff
13,72,72.0,0.0
51,75,75.5,0.5
0,74,75.3,1.3
50,66,64.4,1.6
47,66,67.7,1.7


## Decision Tree

[docs](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

In [19]:
# Add estimator
set_trainer.add_estimator('dt', DecisionTreeRegressor, {
    'criterion': ['squared_error', 'absolute_error'],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2]
})

# Train with best seed.
_ = set_trainer.train_estimator('dt');

# Show stats.
set_trainer.print_stats('dt')

# Show predicts.
set_trainer.get_results_dataframe('dt', ascending=True).head()

best_rsme 10.31
mean_rsme 14.039000000000001
best_params {'criterion': 'absolute_error', 'min_samples_leaf': 2, 'min_samples_split': 8}


Unnamed: 0,true,prediction,diff
51,75,75.0,0.0
23,51,51.0,0.0
37,83,83.0,0.0
57,84,84.5,0.5
39,50,49.0,1.0


## Bagging

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)

In [None]:
# Add estimator
set_trainer.add_estimator('bag', BaggingRegressor, {
    'max_features': [3, 5, 7, 9, 11],
    'n_estimators': [10, 20, 50, 100],
})

# Train with best seed.
_ = set_trainer.train_estimator('bag');

# Show stats.
set_trainer.print_stats('bag')

# Show predicts.
set_trainer.get_results_dataframe('bag', ascending=True).head()

20% completed, remaining time = 00:03:51, average tick time = 00:00:21

## Random Forest

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

In [None]:
# Add estimator
set_trainer.add_estimator('rf', RandomForestRegressor, {
    'max_features': [3, 5, 7, 9, 11],
    'n_estimators': [100, 200, 500, 1000]
})

# Train with best seed.
_ = set_trainer.train_estimator('rf');

# Show stats.
set_trainer.print_stats('rf')

# Show predicts.
set_trainer.get_results_dataframe('rf', ascending=True).head()

## XGBoost

[doc](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [None]:
# Add estimator
set_trainer.add_estimator('xgb', XGBRegressor, {
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.5, 0.75, 1],
})

# Train with best seed.
_ = set_trainer.train_estimator('xgb');

# Show stats.
set_trainer.print_stats('xgb')

# Show predicts.
set_trainer.get_results_dataframe('xgb', ascending=True).head()

## AdaBoost

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html)

In [None]:
# Add estimator
set_trainer.add_estimator('ada', AdaBoostClassifier, {
    'loss': ['linear', 'square', 'exponential'],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 500, 1000],
})

# Train with best seed.
_ = set_trainer.train_estimator('ada');

# Show stats.
set_trainer.print_stats('ada')

# Show predicts.
set_trainer.get_results_dataframe('ada', ascending=True).head()

## LGBM

[doc](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html)

In [None]:
# Add estimator
set_trainer.add_estimator('lgbm', LGBMRegressor, {
    'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 500, 1000],
})

# Train with best seed.
_ = set_trainer.train_estimator('lgbm');

# Show stats.
set_trainer.print_stats('lgbm')

# Show predicts.
set_trainer.get_results_dataframe('lgbm', ascending=True).head()

## MLP

[doc](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)

In [None]:
# Add estimator
set_trainer.add_estimator('mlp', MLPRegressor, {
    'early_stopping': [True],
    'activation': ['relu'],
    'solver': ['sgd', 'adam'],
    'hidden_layer_sizes': [(8,8), (16,16), (64,64)],
    'batch_size': ['auto', 8, 16, 32, 64],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.003, 0.01],
})

# Train with best seed.
_ = set_trainer.train_estimator('mlp');

# Show stats.
set_trainer.print_stats('mlp')

# Show predicts.
set_trainer.get_results_dataframe('mlp', ascending=True).head()