In [16]:
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from helper_functions.preprocess import custom_imputer, time_tranformer

In [17]:
import pandas as pd

In [18]:
numerical_features = ['account_amount_added_12_24m',
                      'account_days_in_dc_12_24m',
                      'account_days_in_rem_12_24m',
                      'account_days_in_term_12_24m',
                      'account_incoming_debt_vs_paid_0_24m',
                      'age',
                      'avg_payment_span_0_12m',
                      'avg_payment_span_0_3m',
                      'max_paid_inv_0_12m',
                      'max_paid_inv_0_24m',
                      'num_active_div_by_paid_inv_0_12m',
                      'num_active_inv',
                      'num_arch_dc_0_12m',
                      'num_arch_dc_12_24m',
                      'num_arch_ok_0_12m',
                      'num_arch_ok_12_24m',
                      'num_arch_rem_0_12m',
                      'num_arch_written_off_0_12m',
                      'num_arch_written_off_12_24m',
                      'num_unpaid_bills',
                      'recovery_debt',
                      'sum_capital_paid_account_0_12m',
                      'sum_capital_paid_account_12_24m',
                      'sum_paid_inv_0_12m']

ordinal_features = ['account_status',
                    'account_worst_status_0_3m',
                    'account_worst_status_12_24m',
                    'account_worst_status_3_6m',
                    'account_worst_status_6_12m',
                    'status_last_archived_0_24m',
                    'status_2nd_last_archived_0_24m',
                    'status_3rd_last_archived_0_24m',
                    'status_max_archived_0_6_months',
                    'status_max_archived_0_12_months',
                    'status_max_archived_0_24_months',
                    'worst_status_active_inv']

nominal_features = ['merchant_category', 'merchant_group', 'has_paid', 'name_in_email']

time_features = ['time_hours']

In [26]:
def pipeline(numerical_features, ordinal_features, nominal_features, time_features):
    preproc_ordinal = make_pipeline(
    custom_imputer(),
    MinMaxScaler())

    preproc_nominal = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", drop='if_binary', sparse_output=False))

    preproc_numerical = make_pipeline(
        custom_imputer(),
        MinMaxScaler(),
        # KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
    )

    preproc_time = make_pipeline(
        time_tranformer())

    preproc_selector_multi = SelectFromModel(
        RandomForestClassifier(),
        threshold = "median", # drop all multivariate features lower than the median correlation
    )

    preproc_pipeline = ColumnTransformer(
        [('numerical', preproc_numerical, numerical_features),
        ('ordinal', preproc_ordinal, ordinal_features),
        ('ohe', preproc_nominal, nominal_features),
        ('time', preproc_time, time_features)],
        remainder="drop")

    final_preproc_pipeline = imbPipeline(steps=[('pipe',preproc_pipeline),
                                            ('smote',SMOTE(sampling_strategy=0.1)),
                                            ('RandomUnderSampler', RandomUnderSampler(sampling_strategy=0.5)),
                                            ('feature selecter', preproc_selector_multi)
                                            ])
    return final_preproc_pipeline

In [27]:
pipeline(numerical_features, ordinal_features, nominal_features, time_features)

In [28]:
# import training dataset
train_dataset = pd.read_csv('raw_data/train_dataset.csv')

# set target and variables
X_train = train_dataset.drop(columns=['default'])
y_train = train_dataset.default

# preprocessing pipeline
preproc_pipeline = pipeline(numerical_features, ordinal_features, nominal_features, time_features)


print(type(X_train))

# transform training data
preproc_data = preproc_pipeline.fit_transform(X_train, y_train)

print(type(preproc_data))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [31]:
preproc_data.shape

(18624, 58)

In [32]:
y_train.shape

(62983,)

In [33]:
preproc_pipeline.fit_transform(X_train, y_train)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.3975483 ,  0.91758125],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.99482914, -0.10156273],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.79648595,  0.60465704],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.88471158,  0.4132791 ],
       [ 0.00701742,  0.        ,  0.        , ...,  0.        ,
        -0.72886544,  0.64583547],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.32978803, -0.94323995]])