# Setup

### Initial tasks

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

### Imports

In [2]:
# built-ins
import os
import json
import math
from os import path

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, GridSearchCV

# training
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

### Utils / Helpers

In [3]:
def load_json(path):
    with open(path) as f:
        return json.load(f)

### Path Definitions

In [4]:
root = path.abspath(os.getcwd())
dataset_root = path.join(root, 'dataset')

# Hyperparameters

In [5]:
hp_seed = 7908
hp_cv_splits = 10
hp_test_size = 0.2

# Preprocessing

In [6]:
def map_employement_duration(entry):
    entry = str(entry).lower()
    split = entry.split(' ')
    num = split[0]
    output = entry
    
    if "-" in num:
        num = num.split("-")[1]
    
    if "weeks" in entry:
        output = float(num) / 52
    elif ("month" in entry) or ("ay" in entry):
        output = float(num) / 12
    elif ("years" in entry) or ("sene" in entry) or ("yıl" in entry):
        output = float(num)
    else:
        try:
            output = float(num)
        except:
            output = 0
        
    output = round(output, 3)
    return output

def remove_outliers(dframe, columns):
    for column in columns:
        Q1 = dframe[column].quantile(0.25)
        Q3 = dframe[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_band = Q1 - 1.5*IQR
        upper_band = Q3 + 1.5*IQR
        
        normals = ~((dframe[column] < lower_band) | (dframe[column] > upper_band))
        dframe = dframe.loc[normals]
    
    return dframe

In [7]:
# read encodings
encodings = load_json(path.join(dataset_root, 'encodings.json'))

# read csvs
csv_en = pd.read_csv(path.join(dataset_root, 'csv/english.csv'), dtype=str)
csv_tr = pd.read_csv(path.join(dataset_root, 'csv/turkish.csv'), dtype=str)

# drop columns
csv_en.drop('Timestamp', axis=1, inplace=True)
csv_tr.drop('Timestamp', axis=1, inplace=True)

# rename columns
csv_en.rename(columns=encodings['columns']['en'], inplace=True)
csv_tr.rename(columns=encodings['columns']['tr'], inplace=True)

# encode columns
csv_en.replace(encodings['values']['en'], inplace=True)
csv_tr.replace(encodings['values']['tr'], inplace=True)

# concat csvs
df = pd.concat([csv_en, csv_tr], axis=0).reset_index()

# fix NaNs
df.fillna(0, inplace=True)

# convert types
df['age'] = df['age'].apply(lambda x: int(x))
df['weight'] = df['weight'].apply(lambda x: int(float(x.replace(',', '.'))))
df['height'] = df['height'].apply(lambda x: int(x.translate({ord(x): '' for x in [',', '.', ' ']})))
df['employment_duration'] = df['employment_duration'].apply(map_employement_duration)

# save csv
df.to_csv(path.join(dataset_root, 'csv/data.csv'), index=None, header=True, encoding='utf-8-sig')
df.info()

# separate data and labels
data = df.drop('weight', axis=1).to_numpy()[:, 1:]
labels = df['weight'].to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                322 non-null    int64  
 1   age                  322 non-null    int64  
 2   height               322 non-null    int64  
 3   weight               322 non-null    int64  
 4   gender               322 non-null    int64  
 5   smoking              322 non-null    int64  
 6   drinking             322 non-null    int64  
 7   exercise             322 non-null    int64  
 8   married              322 non-null    int64  
 9   children             322 non-null    int64  
 10  student              322 non-null    int64  
 11  employed             322 non-null    int64  
 12  employment_duration  322 non-null    float64
dtypes: float64(1), int64(12)
memory usage: 32.8 KB


# Explatory Data Analysis

In [8]:
df.head()

Unnamed: 0,index,age,height,weight,gender,smoking,drinking,exercise,married,children,student,employed,employment_duration
0,0,36,175,82,1,0,3,1,0,0,0,1,12.0
1,1,22,178,77,1,0,2,1,0,0,1,0,0.0
2,2,32,174,65,1,0,2,1,0,0,1,2,0.038
3,3,21,156,65,0,0,0,2,0,0,1,0,0.0
4,4,24,176,94,1,0,1,1,0,0,1,0,0.0


# Training

In [9]:
class Trainer:
    def __init__(self, estimator, data, labels, n_splits, test_size, seed, hp_grid=None):
        self.estimator = estimator
        self.data = data
        self.labels = labels
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.hp_grid = hp_grid
        
        self.stats = []
        self.best_stats = None
        self.best_estimator = None
    
    def split(self):
        split = ShuffleSplit(n_splits=self.n_splits, test_size=self.test_size, random_state=self.seed)
        
        for train_index, test_index in split.split(self.data):
            train_data = (self.data[train_index], self.labels[train_index])
            test_data = (self.data[test_index], self.labels[test_index])

            yield(train_data, test_data)
    
    def train(self):
        for split_index, (train_data, test_data) in enumerate(self.split()):
            X_train, Y_train = train_data
            X_test, Y_test = test_data
            
            # standardize age weight and employment duration.
            cols = [0, 1, 10]
            scaler = preprocessing.StandardScaler()
            scaler.fit(X_train[:, cols])
            X_train[:, cols] = scaler.transform(X_train[:, cols])
            X_test[:, cols] = scaler.transform(X_test[:, cols])
            
            # default values
            best_params = None
            best_estimator = self.estimator
            
            # fit estimator
            if self.hp_grid is not None:
                cv = GridSearchCV(self.estimator, self.hp_grid, cv=self.n_splits)
                cv.fit(X_train, Y_train)
                Y_pred = cv.best_estimator_.predict(X_test)
                
                best_params = cv.best_params_
                best_estimator = cv.best_estimator_
            else:
                self.estimator.fit(X_train, Y_train)
                Y_pred = self.estimator.predict(X_test)
            
            rsme = round(np.sqrt(mean_squared_error(Y_test, Y_pred)), 2)
            result = dict(y_true=Y_test, y_pred=Y_pred, best_params=best_params, rsme=rsme,
                          seed=self.seed, best_estimator=self.estimator)
            
            self.stats.append(result)
        
    def collect_best_stats(self):
        best_rsme = math.inf
        total_rsme = 0
        best_stats = None
        
        for stats in self.stats:
            rsme = stats['rsme']

            total_rsme += rsme
            if rsme < best_rsme:
                best_rsme = rsme
                best_stats = stats
        
        self.best_stats = best_stats
        self.best_estimator = best_stats['best_estimator']
        self.mean_rsme = total_rsme / len(self.stats)

In [10]:
class SetTrainer:
    def __init__(self, data, labels, n_splits, test_size, seed):
        self.data = data
        self.labels = labels
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.estimators = {}
    
    def add_estimator(self, name, estimator, hp_grid=None):
        self.estimators[name] = (name, estimator, hp_grid)
    
    def train_all(self):
        for name in self.estimators.keys():
            self.train_estimator(name)
        
    def train_estimator(self, name, seed=None, save=True):
        if seed is None:
            seed = self.seed
            
        name, estimator, hp_grid = self.estimators[name]
        trainer = Trainer(estimator(), self.data, self.labels, self.n_splits,
                              self.test_size, seed, hp_grid)
        
        trainer.train()
        trainer.collect_best_stats()
        
        if save:
            setattr(self, name, trainer)
            
        return trainer
        
    def search_best_seed(self, name, seed_range=100):
        best_rsme = math.inf
        best_seed = 0

        for seed in range(seed_range):
            estimator = self.train_estimator(name, seed, save=False)
            rsme = estimator.best_stats["rsme"]

            if rsme < best_rsme:
                best_rsme = rsme
                best_seed = seed
                print(f'{seed} -> {rsme} - {estimator.mean_rsme}')
        
        print(f'Best seed found as {best_seed}')
        return best_seed
    
    def get_results_dataframe(self, name, shuffle=False, ascending=False):
        trainer = getattr(self, name)

        true = trainer.best_stats['y_true'].reshape(-1)
        pred = trainer.best_stats['y_pred'].reshape(-1)
        
        df = pd.DataFrame(data={
            'true': true,
            'prediction': pred,
            'diff': np.absolute(true - pred)
        })
    
        if shuffle:
            df = df.sample(frac=1)
        else:
            df = df.sort_values('diff', ascending=ascending)
        
        return df
    
    def print_stats(self, name):
        trainer = getattr(self, name)
        print('best_rsme', trainer.best_stats['rsme'])
        print('mean_rsme', trainer.mean_rsme)
        print('best_params', trainer.best_stats['best_params'])

In [11]:
set_trainer = SetTrainer(data, labels, hp_cv_splits, hp_test_size, hp_seed)

## Linear Regression 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [12]:
# Add estimator
set_trainer.add_estimator('linear', LinearRegression)

# Search for best seed.
best_seed = 7908 # set_trainer.search_best_seed('linear', 10000)

# Train with best seed.
_ = set_trainer.train_estimator('linear', best_seed);

# Show stats.
set_trainer.print_stats('linear')

# Show predicts.
set_trainer.get_results_dataframe('linear', ascending=True).head()

best_rsme 6.56
mean_rsme 11.323
best_params None


Unnamed: 0,true,prediction,diff
50,66,66.070456,0.070456
42,82,82.15262,0.15262
10,90,90.222599,0.222599
18,82,81.67739,0.32261
23,51,51.561429,0.561429


## Support Vector Regression

[docs](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)

In [315]:
# Add estimator
set_trainer.add_estimator('svr', SVR, {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [1.5, 10],
    'gamma': [1e-7, 1e-4],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
})

# Search for best seed.
best_seed = 7908 # set_trainer.search_best_seed('linear', 10000)

# Train with best seed.
_ = set_trainer.train_estimator('svr', best_seed);

# Show stats.
set_trainer.print_stats('svr')

# Show predicts.
set_trainer.get_results_dataframe('svr', ascending=True).head()

best_rsme 7.4
mean_rsme 11.755999999999998
best_params {'C': 10, 'epsilon': 0.1, 'gamma': 1e-07, 'kernel': 'linear'}


Unnamed: 0,true,prediction,diff
0,74,74.212319,0.212319
58,70,70.596413,0.596413
61,64,64.622556,0.622556
4,65,64.314926,0.685074
14,71,71.812731,0.812731


## Decision Tree

[docs](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

In [318]:
# Add estimator
set_trainer.add_estimator('dt', DecisionTreeRegressor, {
    'criterion': ['squared_error', 'absolute_error'],
    'min_samples_split': range(2, 100),
    'max_leafs_nodes': range(2, 10)
})

# Search for best seed.
best_seed = 7908 # set_trainer.search_best_seed('linear', 10000)

# Train with best seed.
_ = set_trainer.train_estimator('dt', best_seed);

# Show stats.
set_trainer.print_stats('dt')

# Show predicts.
set_trainer.get_results_dataframe('dt', ascending=True).head()

ValueError: Invalid parameter max_leafs_nodes for estimator DecisionTreeRegressor(). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
# Decision Tree
set_trainer.add_estimator('dt', DecisionTreeRegressor())

# Random Forest
set_trainer.add_estimator('rf', RandomForestRegressor())

# XGBoost
set_trainer.add_estimator('xgb', XGBRegressor())

# AdaBoost
set_trainer.add_estimator('ada', AdaBoostClassifier())

# LGBM
set_trainer.add_estimator('lgbm', LGBMRegressor())

# Bayesian Ridge Regression
set_trainer.add_estimator('bayesian', BayesianRidge())

# Bagging 
set_trainer.add_estimator('bag', BaggingRegressor())

# MLP
set_trainer.add_estimator('mlp', MLPRegressor(), {
    'hidder_layer_sizes': [(8,8), (16,16), (64,64)]
})

# kNN
set_trainer.add_estimator('knn', KNeighborsRegressor())

# Train all
#set_trainer.train_all()