# Setup

### Initial tasks

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Imports

In [2]:
import os
import json
from os import path

import pandas as pd
import numpy as np



In [None]:
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

### Utils / Helpers

In [3]:
def load_json(path):
    with open(path) as f:
        return json.load(f)

### Path Definitions

In [4]:
root = path.abspath(os.getcwd())
dataset_root = path.join(root, 'dataset')

# Read Dataset

In [5]:
def map_employement_duration(entry):
    entry = str(entry).lower()
    split = entry.split(' ')
    num = split[0]
    output = entry
    
    if "-" in num:
        num = num.split("-")[1]
    
    if "weeks" in entry:
        output = float(num) / 52
    elif ("month" in entry) or ("ay" in entry):
        output = float(num) / 12
    elif ("years" in entry) or ("sene" in entry) or ("yıl" in entry):
        output = float(num)
    else:
        try:
            output = float(num)
        except:
            output = 0
        
    output = round(output, 3)
    return output

In [6]:
# read encodings
encodings = load_json(path.join(dataset_root, 'encodings.json'))

# read csvs
csv_en = pd.read_csv(path.join(dataset_root, 'csv/english.csv'), dtype=str)
csv_tr = pd.read_csv(path.join(dataset_root, 'csv/turkish.csv'), dtype=str)

# drop columns
csv_en.drop('Timestamp', axis=1, inplace=True)
csv_tr.drop('Timestamp', axis=1, inplace=True)

# rename columns
csv_en.rename(columns=encodings['columns']['en'], inplace=True)
csv_tr.rename(columns=encodings['columns']['tr'], inplace=True)

# encode columns
csv_en.replace(encodings['values']['en'], inplace=True)
csv_tr.replace(encodings['values']['tr'], inplace=True)

# concat csvs
df = pd.concat([csv_en, csv_tr], axis=0)

# fix NaNs
df.fillna(0, inplace=True)

# convert types
df['age'] = df['age'].apply(lambda x: int(x))
df['weight'] = df['weight'].apply(lambda x: int(float(x.replace(',', '.'))))
df['height'] = df['height'].apply(lambda x: int(x.translate({ord(x): '' for x in [',', '.', ' ']})))
df['employement_duration'] = df['employement_duration'].apply(map_employement_duration)

# save csv
df.to_csv(path.join(dataset_root, 'csv/data.csv'), index=None, header=True, encoding='utf-8-sig')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323 entries, 0 to 310
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   323 non-null    int64  
 1   height                323 non-null    int64  
 2   weight                323 non-null    int64  
 3   gender                323 non-null    int64  
 4   smoking               323 non-null    int64  
 5   drinking              323 non-null    int64  
 6   exercise              323 non-null    int64  
 7   married               323 non-null    int64  
 8   children              323 non-null    int64  
 9   student               323 non-null    int64  
 10  employed              323 non-null    int64  
 11  employement_duration  323 non-null    float64
dtypes: float64(1), int64(11)
memory usage: 32.8 KB


# Explatory Data Analysis

# Preprocessing

In [9]:
def remove_outliers(dframe, columns):
    for column in columns:
        Q1 = dframe[column].quantile(0.25)
        Q3 = dframe[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_band = Q1 - 1.5*IQR
        upper_band = Q3 + 1.5*IQR
        
        normals = ~((dframe[column] < lower_band) | (dframe[column] > upper_band))
        dframe = dframe.loc[normals]
    
    return dframe

In [10]:
train = remove_outliers(df, ['age', 'height', 'weight'])
print(train.shape)

(288, 12)
