## Feature Engineering

This notebook contains the feature engineering steps that are applied before running the models, together with some optional steps that are defined here but not used in the final pipeline.
This includes:
- dropping rows in which all feature values are missing
- filling missing values
- outlier treatment
- train/test split
- normalization
- data transformation
- oversampling technique

### 1. Imports

In [1]:
# package imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from ipynb.fs.full.prep_datasets import pipeline_preparation

In [2]:
# library imports
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import power_transform
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split



### 2. Loading and preparing the data

In [3]:
# read the raw CSV and hand it straight to pipeline_preparation (imported from prep_datasets)
df, df_2016 = pipeline_preparation(pd.read_csv("final_df_dec.csv"))
df.head()

Unnamed: 0,oprev_turn_2016,oprev_turn_2015,oprev_turn_2014,oprev_turn_2013,oprev_turn_2012,netinc_2016,netinc_2015,netinc_2014,netinc_2013,netinc_2012,...,wc_2015,wc_2014,wc_2013,wc_2012,ebit_2016,ebit_2015,ebit_2014,ebit_2013,ebit_2012,bankrupt
0,-766.35,-7906.91,5816.03,18821.76,57891.48,-1812.06,-9470.22,868.65,7543.16,43498.62,...,,,,,-5824.87,-13714.53,-574.51,10786.05,31216.17,0
1,-38066.54,6595.85,6538.46,6033.35,-16398.63,6635.53,-117844.28,-12043.21,-65075.98,-62549.21,...,,,,,-42123.12,5689.15,5772.95,4640.32,-17129.81,0
2,-73110.61,-73994.18,-219163.04,-90426.31,-65.87,-145848.42,-141530.59,-280881.84,-203076.12,-146.32,...,,,,,-145182.9,-140783.81,-280590.36,-146308.52,-146.32,0
3,-28482.5,-32000.2,2570.26,9133.93,6622.08,-32358.51,-34233.79,118.54,5814.55,5723.77,...,,,,,-30769.77,-32322.32,2278.59,8806.28,6533.03,0
4,-2578.17,7294.13,0.01,4826.16,0.0,-40568.9,-31819.79,-37698.85,-77461.0,-11.65,...,,,,,-12652.25,-14650.9,-17555.11,-55605.31,-11.65,0


### 3. Feature engineering steps

In [4]:
# dropping all rows in which every feature value is missing
def drop_cols(df):
    """Drop the rows whose feature columns are all missing.

    Every column except the last one is treated as a feature; the last
    column is ignored when deciding whether a row is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame without the all-missing rows. ``df`` itself is not
        modified and the remaining rows keep their index labels.
    """
    cols = df.columns[:-1]
    if len(cols) == 0:
        # Without any feature column there is nothing to base the drop on.
        return df
    # dropna is not in-place: its result has to be returned to take effect.
    return df.dropna(axis=0, subset=cols, how="all")

In [5]:
# replacing infinite values with NaN and then filling every missing value with the column median
def fill_nan(df):
    """Replace +/-inf with NaN and fill all NaN with the column median.

    Infinite values are removed *before* the medians are computed, so they
    cannot distort the fill value. All numeric columns are filled, including
    the last one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain NaN and +/-inf values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the values filled; ``df`` itself is not modified.
        A column that holds no finite value at all has no median and is
        therefore left as NaN.
    """
    df = df.replace([np.inf, -np.inf], np.nan)
    # fillna with a Series fills each column with the value stored under
    # that column's name, i.e. with its own median.
    return df.fillna(df.median(numeric_only=True))

In [6]:
# a possible outlier capping method; NOT used
def outlier_cap(df):
    """Cap every column at its 5th and 95th percentile.

    Values above the 95th percentile are set to that percentile and values
    below the 5th percentile are set to the 5th percentile. Both bounds are
    computed from the uncapped column and ignore NaN, so missing values do
    not disable the capping; NaN entries themselves stay NaN.

    Every column of ``df`` is capped, so pass only the columns that should
    be treated.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to cap.

    Returns
    -------
    pandas.DataFrame
        A capped copy; ``df`` itself is not modified.
    """
    capped = df.copy()
    for column_name in capped.columns:
        lower, upper = np.nanpercentile(capped[column_name], [5, 95])
        # clip treats a NaN bound (all-NaN column) as "no bound".
        capped[column_name] = capped[column_name].clip(lower=lower, upper=upper)
    return capped

In [7]:
# a possible outlier dropping method; NOT used
def outlier_drop(df):
    """Drop, for every column, the row holding its maximum and its minimum.

    The columns are processed one after another on the already reduced
    frame: first the row with the column's largest value is removed, then
    the row with the smallest remaining value. Rows are identified by their
    index label, so the function also works when the index is not
    0..n-1. Every column of ``df`` is processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped rows; ``df`` itself is not modified.
    """
    for column_name in df.columns:
        # idxmax/idxmin return index labels, which is what drop expects.
        # The notna check skips columns that have no value left to rank.
        if df[column_name].notna().any():
            df = df.drop(index=df[column_name].idxmax())
        if df[column_name].notna().any():
            df = df.drop(index=df[column_name].idxmin())
    return df

In [8]:
# train-test-split
def data_split(df, test_size=0.2, random_state=30, stratify=False):
    """Split ``df`` into train and test features and targets.

    The last column of ``df`` is used as the target ``y``; all other columns
    form the feature matrix ``X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the target.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 30
        Seed passed to ``train_test_split``.
    stratify : bool, default False
        If True, the split is stratified on the target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

In [9]:
# standardising the features, as the different variables are displayed in different units
def normalization(X_train, X_test):
    """Standardise both feature sets with a scaler fitted on ``X_train``.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to the training and the test features. Both results are rounded
    to 6 decimals.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after scaling and rounding.
    """
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train).round(6)
    scaled_test = scaler.transform(X_test).round(6)
    return scaled_train, scaled_test

In [10]:
# transforming the data since it is skewed
def transform(X_train, X_test):
    """Apply a Yeo-Johnson power transform fitted on ``X_train``.

    The transform parameters are estimated from the training features only
    and the same fitted transformer is then applied to the test features, so
    both sets are mapped with identical parameters. Both results are rounded
    to 6 decimals. The inputs are not modified.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after transformation and rounding.
    """
    # Imported here because the notebook's import cell only brings in the
    # stateless power_transform helper, which cannot reuse a fit.
    from sklearn.preprocessing import PowerTransformer

    transformer = PowerTransformer(method="yeo-johnson")
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)
    return X_train.round(6), X_test.round(6)

In [11]:
# oversampling technique in order to train the model on a balanced dataset
def smote(X_train, y_train):
    """Oversample the training data with SMOTE.

    Uses ``SMOTE(random_state=2)`` and resamples the training features and
    targets together.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` after resampling.
    """
    sm = SMOTE(random_state=2)
    # fit_resample is the current name of the former fit_sample method.
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

### 4. Feature engineering pipeline

In [12]:
# final feature engineering pipeline
def pipeline_feature_eng(df):
    """Run the feature engineering steps in their fixed order.

    Calls, in this order: ``drop_cols``, ``fill_nan``, ``data_split``,
    ``normalization``, ``transform`` and ``smote``. Only the training data is
    passed to ``smote``.

    Returns
    -------
    tuple
        ``(df, X_train, X_test, y_train, y_test)`` where ``df`` is the frame
        returned by ``fill_nan``.
    """
    cleaned = fill_nan(drop_cols(df))
    train_X, test_X, train_y, test_y = data_split(cleaned)
    train_X, test_X = normalization(train_X, test_X)
    train_X, test_X = transform(train_X, test_X)
    train_X, train_y = smote(train_X, train_y)
    return cleaned, train_X, test_X, train_y, test_y

In [13]:
# calling the feature engineering pipeline; df is rebound to the frame the pipeline returns,
# and the train/test features and targets are kept as notebook-level variables
df, X_train, X_test, y_train, y_test = pipeline_feature_eng(df)