# Setup

### Initial tasks

In [123]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used further down.
import warnings
warnings.filterwarnings('ignore')

### Imports

In [153]:
# built-ins
import os
import json
from os import path

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from skompiler import skompile

# training
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    BaggingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
)
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

### Utils / Helpers

In [125]:
def load_json(path):
    """Read a JSON file and return the decoded Python object.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so non-ASCII keys and values are read identically on every
    operating system.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)

### Path Definitions

In [126]:
# Anchor all dataset locations to the directory the notebook is started from.
root = path.abspath(os.curdir)
dataset_root = path.join(root, 'dataset')

# Hyperparameters

In [127]:
# Experiment-wide settings handed to the trainers further down:
#   hp_seed       seed for the random train/test splitting
#   hp_cv_splits  number of splits
#   hp_test_size  fraction of the rows held out for testing in each split
hp_seed, hp_cv_splits, hp_test_size = 42, 10, 0.2

# Preprocessing

In [128]:
# Unit keywords (English and Turkish) paired with how many of that unit make up
# one year. Matching is by substring so suffixed forms such as "months" or
# "aydır" are recognised as well.
_DURATION_UNITS_PER_YEAR = (
    ("week", 52.0), ("hafta", 52.0),
    ("day", 365.0), ("gün", 365.0),
    ("month", 12.0), ("ay", 12.0),
    ("year", 1.0), ("sene", 1.0), ("yıl", 1.0),
)


def map_employement_duration(entry):
    """Convert a free-text employment duration into a number of years.

    The first whitespace-separated token is taken as the amount. A range such
    as ``"1-2"`` resolves to its upper bound and a decimal comma is accepted.
    The amount is interpreted in the unit whose keyword appears *earliest* in
    the text; this keeps the short Turkish keyword ``"ay"`` (month) from
    matching inside "days" or inside a later word such as "çalışmaktayım".
    Text without a recognised unit is read as years.

    (The misspelled function name is kept because callers use it.)

    Parameters
    ----------
    entry : object
        Raw answer; it is converted with ``str`` and lower-cased first.

    Returns
    -------
    float
        Duration in years rounded to three decimals. ``0.0`` is returned when
        no finite number can be parsed from the first token.
    """
    text = str(entry).lower()
    tokens = text.split()
    amount = tokens[0] if tokens else ""

    # "1-2 years" -> use the upper bound of the range.
    if "-" in amount:
        amount = amount.split("-")[1]

    try:
        value = float(amount.replace(",", "."))
    except ValueError:
        # Unparseable answers count as "no employment history".
        return 0.0
    if not np.isfinite(value):
        # str(NaN) parses as a float; treat it like any other missing answer.
        return 0.0

    # Choose the unit mentioned first, i.e. the one closest to the amount.
    per_year = 1.0
    first_pos = len(text)
    for keyword, units_in_year in _DURATION_UNITS_PER_YEAR:
        pos = text.find(keyword)
        if 0 <= pos < first_pos:
            first_pos, per_year = pos, units_in_year

    return round(value / per_year, 3)

def remove_outliers(dframe, columns):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns. Rows whose
    value is missing in a column are never removed by that column. The input
    frame itself is not modified.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame to filter.
    columns : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    for name in columns:
        values = dframe[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        whisker = 1.5 * (q3 - q1)

        too_low = values < q1 - whisker
        too_high = values > q3 + whisker
        dframe = dframe.loc[~(too_low | too_high)]

    return dframe

In [136]:
# Mappings used below to rename columns and recode answer values per language.
encodings = load_json(path.join(dataset_root, 'encodings.json'))

# Load each survey export as raw strings, discard the submission timestamp,
# then apply the language-specific column renames and value replacements.
csv_en = (
    pd.read_csv(path.join(dataset_root, 'csv/english.csv'), dtype=str)
    .drop('Timestamp', axis=1)
    .rename(columns=encodings['columns']['en'])
    .replace(encodings['values']['en'])
)
csv_tr = (
    pd.read_csv(path.join(dataset_root, 'csv/turkish.csv'), dtype=str)
    .drop('Timestamp', axis=1)
    .rename(columns=encodings['columns']['tr'])
    .replace(encodings['values']['tr'])
)

# Stack both languages. reset_index() keeps the per-file row number as an
# extra leading 'index' column; missing answers become 0.
df = pd.concat([csv_en, csv_tr], axis=0).reset_index().fillna(0)

# Parse the free-text numeric answers.
df['age'] = df['age'].apply(int)
# weight: accept a decimal comma, then truncate to a whole number
df['weight'] = df['weight'].apply(lambda raw: int(float(raw.replace(',', '.'))))
# height: strip commas, dots and spaces before converting
df['height'] = df['height'].apply(lambda raw: int(raw.translate(str.maketrans('', '', ',. '))))
df['employment_duration'] = df['employment_duration'].apply(map_employement_duration)

# Persist the cleaned table and print a structural summary.
df.to_csv(path.join(dataset_root, 'csv/data.csv'), index=None, header=True, encoding='utf-8-sig')
df.info()

# Target vector, and the feature matrix without the target; the [:, 1:] slice
# removes the leading 'index' column introduced by reset_index().
labels = df['weight'].to_numpy()
data = df.drop('weight', axis=1).to_numpy()[:, 1:]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                322 non-null    int64  
 1   age                  322 non-null    int64  
 2   height               322 non-null    int64  
 3   weight               322 non-null    int64  
 4   gender               322 non-null    int64  
 5   smoking              322 non-null    int64  
 6   drinking             322 non-null    int64  
 7   exercise             322 non-null    int64  
 8   married              322 non-null    int64  
 9   children             322 non-null    int64  
 10  student              322 non-null    int64  
 11  employed             322 non-null    int64  
 12  employment_duration  322 non-null    float64
dtypes: float64(1), int64(12)
memory usage: 32.8 KB


# Exploratory Data Analysis

In [130]:
# Preview the first rows of the combined, type-converted frame.
df.head()

Unnamed: 0,index,age,height,weight,gender,smoking,drinking,exercise,married,children,student,employed,employment_duration
0,0,36,175,82,1,0,3,1,0,0,0,1,12.0
1,1,22,178,77,1,0,2,1,0,0,1,0,0.0
2,2,32,174,65,1,0,2,1,0,0,1,2,0.038
3,3,21,156,65,0,0,0,2,0,0,1,0,0.0
4,4,24,176,94,1,0,1,1,0,0,1,0,0.0


# Training

In [166]:
class Trainer:
    """Fit and score one estimator over repeated random train/test splits.

    ``train`` draws ``n_splits`` shuffled splits. For every split it
    standardises the selected feature columns with statistics taken from the
    training part only, fits the estimator (through a grid search when
    ``hp_grid`` is given), predicts the test part and appends one result dict
    to ``self.stats``.

    Parameters
    ----------
    estimator : object
        Regressor exposing ``fit(X, y)`` and ``predict(X)``.
    data : numpy.ndarray
        2-D feature matrix, one row per sample.
    labels : numpy.ndarray
        1-D target vector aligned with ``data``.
    n_splits : int
        Number of random splits; also used as the fold count of the inner
        grid search.
    test_size : float
        Fraction of rows held out in each split.
    seed : int
        Random state of the splitter.
    hp_grid : dict, optional
        Parameter grid for ``GridSearchCV``. ``None`` fits the estimator as is.
    scale_cols : sequence of int, optional
        Indices of the feature columns to standardise. With the feature layout
        built in the preprocessing cell the default ``(0, 1, 10)`` selects
        age, height and employment_duration. Pass an empty sequence to skip
        scaling.
    """

    def __init__(self, estimator, data, labels, n_splits, test_size, seed, hp_grid=None,
                 scale_cols=(0, 1, 10)):
        self.estimator = estimator
        self.data = data
        self.labels = labels
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.hp_grid = hp_grid
        self.scale_cols = tuple(scale_cols)
        # One dict per evaluated split: pred, best_params, rsme, seed.
        self.stats = []

    @staticmethod
    def _as_float(features):
        """Return a float copy of integer/boolean arrays; leave other dtypes untouched."""
        # Writing standardised values into an integer array would silently
        # truncate them, so such arrays are widened first.
        if features.dtype.kind in 'biu':
            return features.astype(float)
        return features

    def split(self):
        """Yield ``((X_train, y_train), (X_test, y_test))`` for every random split."""
        splitter = ShuffleSplit(n_splits=self.n_splits, test_size=self.test_size,
                                random_state=self.seed)

        for train_index, test_index in splitter.split(self.data):
            # Indexing with index arrays copies, so later in-place scaling
            # never touches self.data.
            train_data = (self.data[train_index], self.labels[train_index])
            test_data = (self.data[test_index], self.labels[test_index])

            yield train_data, test_data

    def train(self):
        """Evaluate the estimator on every split and append the results to ``self.stats``.

        Each appended dict holds the test predictions (``pred``), the best grid
        parameters or ``None`` (``best_params``), the root mean squared error
        rounded to two decimals (stored under the existing key ``rsme``) and
        the splitter seed (``seed``).
        """
        cols = list(self.scale_cols)

        for train_data, test_data in self.split():
            X_train, Y_train = train_data
            X_test, Y_test = test_data

            if cols:
                X_train = self._as_float(X_train)
                X_test = self._as_float(X_test)

                # Fit the scaler on the training rows only and reuse it for the
                # test rows, so no test-set statistics leak into training.
                scaler = preprocessing.StandardScaler()
                scaler.fit(X_train[:, cols])
                X_train[:, cols] = scaler.transform(X_train[:, cols])
                X_test[:, cols] = scaler.transform(X_test[:, cols])

            # Stays None unless a grid search selects parameters.
            best_params = None

            if self.hp_grid is not None:
                cv = GridSearchCV(self.estimator, self.hp_grid, cv=self.n_splits)
                cv.fit(X_train, Y_train)
                Y_pred = cv.best_estimator_.predict(X_test)
                best_params = cv.best_params_
            else:
                self.estimator.fit(X_train, Y_train)
                Y_pred = self.estimator.predict(X_test)

            rsme = round(np.sqrt(mean_squared_error(Y_test, Y_pred)), 2)
            result = dict(pred=Y_pred, best_params=best_params, rsme=rsme, seed=self.seed)

            self.stats.append(result)

In [177]:
class SetTrainer:
    """Registry that evaluates several estimators on the same data and split settings.

    Estimators are registered under a name with ``add_estimator``. Training an
    estimator builds a ``Trainer`` for it, runs it, and stores that trainer on
    this object as an attribute named after the estimator.
    """

    def __init__(self, data, labels, n_splits, test_size, seed):
        # name -> (name, estimator, hp_grid)
        self.estimators = {}
        self.data, self.labels = data, labels
        self.n_splits, self.test_size = n_splits, test_size
        self.seed = seed

    def add_estimator(self, name, estimator, hp_grid=None):
        """Register ``estimator`` under ``name``, replacing any earlier entry with that name."""
        entry = (name, estimator, hp_grid)
        self.estimators[name] = entry

    def train_all(self):
        """Train every registered estimator in registration order."""
        for registered_name in self.estimators:
            self.train_estimator(registered_name)

    def train_estimator(self, name):
        """Train the estimator registered as ``name`` and expose its trainer as an attribute."""
        attr_name, model, grid = self.estimators[name]

        runner = Trainer(model, self.data, self.labels, self.n_splits,
                         self.test_size, self.seed, grid)
        runner.train()

        setattr(self, attr_name, runner)

In [181]:
# Benchmark a panel of regressors on identical data and split settings.
# Every estimator with a random component is seeded with hp_seed so a fresh
# "Restart & Run All" reproduces the same scores.
set_trainer = SetTrainer(data, labels, hp_cv_splits, hp_test_size, hp_seed)

# Linear regression
set_trainer.add_estimator('linear', LinearRegression())

# Support Vector Regression (hyperparameters chosen by grid search)
set_trainer.add_estimator('svr', SVR(), {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [1.5, 10],
    'gamma': [1e-7, 1e-4],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
})

# Decision Tree
set_trainer.add_estimator('dt', DecisionTreeRegressor(random_state=hp_seed))

# Random Forest
set_trainer.add_estimator('rf', RandomForestRegressor(random_state=hp_seed))

# XGBoost
set_trainer.add_estimator('xgb', XGBRegressor(random_state=hp_seed))

# AdaBoost -- the target (weight) is continuous, so the regressor variant is
# required; the classifier would treat every distinct weight as its own class.
set_trainer.add_estimator('ada', AdaBoostRegressor(random_state=hp_seed))

# LGBM
set_trainer.add_estimator('lgbm', LGBMRegressor(random_state=hp_seed))

# Bayesian Ridge Regression
set_trainer.add_estimator('bayesian', BayesianRidge())

# Bagging
set_trainer.add_estimator('bag', BaggingRegressor(random_state=hp_seed))

# MLP (layer layout chosen by grid search; the key must match the
# MLPRegressor parameter name exactly or GridSearchCV rejects it)
set_trainer.add_estimator('mlp', MLPRegressor(random_state=hp_seed), {
    'hidden_layer_sizes': [(8, 8), (16, 16), (64, 64)]
})

# kNN
set_trainer.add_estimator('knn', KNeighborsRegressor())

# Train all
set_trainer.train_all()