# Setup

### Initial tasks

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Imports

In [36]:
# built-ins
import os
import json
from os import path

# common
import pandas as pd
import numpy as np

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# training
from sklearn.linear_model import LinearRegression

In [3]:
'''
import seaborn as sns 
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
'''

'\nimport seaborn as sns \nfrom sklearn.ensemble import RandomForestClassifier\n\nfrom lightgbm import LGBMClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_val_score\n'

### Utils / Helpers

In [4]:
def load_json(path):
    with open(path) as f:
        return json.load(f)

### Path Definitions

In [5]:
root = path.abspath(os.getcwd())
dataset_root = path.join(root, 'dataset')

# Read Dataset

In [6]:
def map_employement_duration(entry):
    entry = str(entry).lower()
    split = entry.split(' ')
    num = split[0]
    output = entry
    
    if "-" in num:
        num = num.split("-")[1]
    
    if "weeks" in entry:
        output = float(num) / 52
    elif ("month" in entry) or ("ay" in entry):
        output = float(num) / 12
    elif ("years" in entry) or ("sene" in entry) or ("yıl" in entry):
        output = float(num)
    else:
        try:
            output = float(num)
        except:
            output = 0
        
    output = round(output, 3)
    return output

In [7]:
# read encodings
encodings = load_json(path.join(dataset_root, 'encodings.json'))

# read csvs
csv_en = pd.read_csv(path.join(dataset_root, 'csv/english.csv'), dtype=str)
csv_tr = pd.read_csv(path.join(dataset_root, 'csv/turkish.csv'), dtype=str)

# drop columns
csv_en.drop('Timestamp', axis=1, inplace=True)
csv_tr.drop('Timestamp', axis=1, inplace=True)

# rename columns
csv_en.rename(columns=encodings['columns']['en'], inplace=True)
csv_tr.rename(columns=encodings['columns']['tr'], inplace=True)

# encode columns
csv_en.replace(encodings['values']['en'], inplace=True)
csv_tr.replace(encodings['values']['tr'], inplace=True)

# concat csvs
df = pd.concat([csv_en, csv_tr], axis=0)

# fix NaNs
df.fillna(0, inplace=True)

# convert types
df['age'] = df['age'].apply(lambda x: int(x))
df['weight'] = df['weight'].apply(lambda x: int(float(x.replace(',', '.'))))
df['height'] = df['height'].apply(lambda x: int(x.translate({ord(x): '' for x in [',', '.', ' ']})))
df['employment_duration'] = df['employment_duration'].apply(map_employement_duration)

# save csv
df.to_csv(path.join(dataset_root, 'csv/data.csv'), index=None, header=True, encoding='utf-8-sig')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323 entries, 0 to 310
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  323 non-null    int64  
 1   height               323 non-null    int64  
 2   weight               323 non-null    int64  
 3   gender               323 non-null    int64  
 4   smoking              323 non-null    int64  
 5   drinking             323 non-null    int64  
 6   exercise             323 non-null    int64  
 7   married              323 non-null    int64  
 8   children             323 non-null    int64  
 9   student              323 non-null    int64  
 10  employed             323 non-null    int64  
 11  employment_duration  323 non-null    float64
dtypes: float64(1), int64(11)
memory usage: 32.8 KB


# Explatory Data Analysis

# Hyperparameters

In [8]:
hp_test_size = 0.2
hp_val_size = 0.1
hp_split_seed = 42

# Preprocessing

### Helpers

In [9]:
def remove_outliers(dframe, columns):
    for column in columns:
        Q1 = dframe[column].quantile(0.25)
        Q3 = dframe[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_band = Q1 - 1.5*IQR
        upper_band = Q3 + 1.5*IQR
        
        normals = ~((dframe[column] < lower_band) | (dframe[column] > upper_band))
        dframe = dframe.loc[normals]
    
    return dframe

### Pipeline

In [36]:
# Split into test, val and train.
rest_df, test_df = train_test_split(df, test_size=hp_test_size, random_state=hp_split_seed)
train_df, val_df = train_test_split(rest_df, test_size=hp_val_size, random_state=hp_split_seed)

# Remove outliers from train.
train_df = remove_outliers(train_df, ['age', 'height', 'weight'])
print(train_df.shape)

# Apply standardization
cols = ['age', 'height', 'employment_duration']
std_scaler = preprocessing.StandardScaler().fit(train_df[cols])
test_df[cols] = std_scaler.transform(test_df[cols])
val_df[cols] = std_scaler.transform(val_df[cols])
train_df[cols] = std_scaler.transform(train_df[cols])

# Separate data and labels.
X_test, Y_test = test_df.drop('weight', axis=1), test_df['weight'].to_frame()
X_val, Y_val = val_df.drop('weight', axis=1), val_df['weight'].to_frame()
X_train, Y_train = train_df.drop('weight', axis=1), train_df['weight'].to_frame()

(206, 12)


# Training

### Helpers

In [47]:
def create_pred_df(true, pred):
    if type(true) is pd.DataFrame:
        true = true.to_numpy()
        
    if type(pred) is pd.DataFrame:
        pred = pred.to_numpy()
        
    return pd.DataFrame(data={
        'true': true.reshape(-1),
        'prediction': pred.reshape(-1)
    })

## Linear Regression

In [46]:
reg = LinearRegression().fit(X_train, Y_train)
reg.score(X_train, Y_train)
Y_pred = reg.predict(X_test)

create_pred_df(Y_test, Y_pred).head(10)

0.38718719283026737

Unnamed: 0,true,prediction
0,65,69.219688
1,71,75.76798
2,82,91.980959
3,73,66.138671
4,62,67.810096
5,76,72.753085
6,54,71.929837
7,74,77.22002
8,88,74.905459
9,74,83.311389
