In [20]:
# Kedro IPython line magic. Presumably this is what (re)creates the `catalog`
# object used by the cells below, since the notebook never defines it itself —
# TODO confirm the Kedro IPython extension is loaded in this kernel.
%reload_kedro

In [2]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point is silenced.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import missingno as msno


In [3]:
from src.ocp7_scoring_model_cloud.pipelines.preprocessing.nodes import (
    preprocess_application_train,
    preprocess_bureau_and_balance,
    preprocess_previous_applications,
    preprocess_pos_cash,
    preprocess_installments_payments,
    preprocess_credit_card_balance,
    join_datasets,
) 

In [4]:
def preprocess_merge_datasets(
    base_df,
    bureau_df,
    bureau_balance_df,
    previous_application_df,
    pos_cash_df,
    installments_payments_df,
    credit_card_balance_df,
):
    """Preprocess every raw table and join the results into a single table.

    Each input is handed to its table-specific preprocessing node, then all
    node outputs are passed to ``join_datasets``. The nodes run in the order
    the arguments are listed below (arguments are evaluated left to right).

    Args:
        base_df: Main application table, given to ``preprocess_application_train``.
        bureau_df: Bureau table, given to ``preprocess_bureau_and_balance``.
        bureau_balance_df: Bureau balance table, given to the same node as ``bureau_df``.
        previous_application_df: Given to ``preprocess_previous_applications``.
        pos_cash_df: Given to ``preprocess_pos_cash``.
        installments_payments_df: Given to ``preprocess_installments_payments``.
        credit_card_balance_df: Given to ``preprocess_credit_card_balance``.

    Returns:
        The value returned by ``join_datasets`` for the six preprocessed pieces.
    """
    return join_datasets(
        preprocess_application_train(base_df),
        preprocess_bureau_and_balance(bureau_df, bureau_balance_df),
        preprocess_previous_applications(previous_application_df),
        preprocess_pos_cash(pos_cash_df),
        preprocess_installments_payments(installments_payments_df),
        preprocess_credit_card_balance(credit_card_balance_df),
    )

In [5]:
# Suffix appended to every catalog entry name; set to "_debug" to load the debug dataset.
dataset_type = ""

# Load the application table and the six auxiliary tables from the catalog.
df_test = catalog.load(f"application_test{dataset_type}")
bureau_df = catalog.load(f"bureau{dataset_type}")
bureau_balance_df = catalog.load(f"bureau_balance{dataset_type}")
previous_application_df = catalog.load(f"previous_application{dataset_type}")
pos_cash_df = catalog.load(f"pos_cash_balance{dataset_type}")
installments_payments_df = catalog.load(f"installments_payments{dataset_type}")
credit_card_balance_df = catalog.load(f"credit_card_balance{dataset_type}")

# Preprocess each table and join everything into one table.
preprocessed_df = preprocess_merge_datasets(
    base_df=df_test,
    bureau_df=bureau_df,
    bureau_balance_df=bureau_balance_df,
    previous_application_df=previous_application_df,
    pos_cash_df=pos_cash_df,
    installments_payments_df=installments_payments_df,
    credit_card_balance_df=credit_card_balance_df,
)

Train samples: 48744


In [8]:
# Persist the joined table built above under the "preprocessed_test_df" catalog entry.
catalog.save("preprocessed_test_df", preprocessed_df)

In [22]:
# Read the "preprocessed_test_df" entry back from the catalog; the remaining
# cells work from this reloaded copy rather than from `preprocessed_df`.
preprocessed_test_df = catalog.load("preprocessed_test_df")

In [23]:
# Display the dimensions of the reloaded table as a quick sanity check.
preprocessed_test_df.shape

[1m([0m[1;36m48744[0m, [1;36m794[0m[1m)[0m

In [17]:
from src.ocp7_scoring_model_cloud.pipelines.feature_processing.nodes import get_clean_features, process_features_for_ml

In [24]:
# Load the "full_df_train" catalog entry; the visible cells only read its column names.
full_df_train = catalog.load("full_df_train")

In [26]:
# Column names of full_df_train, in their original order, minus "SK_ID_CURR" and "TARGET".
features_list = [x for x in full_df_train.columns.tolist() if x not in ["SK_ID_CURR", "TARGET"]]

In [28]:
# Keep "SK_ID_CURR" followed by the columns named in features_list, in that order.
features_test_df = preprocessed_test_df[["SK_ID_CURR"]+features_list]

In [12]:
# NOTE(review): this rebinds `features_test_df` from the full `preprocessed_test_df`,
# discarding the column selection based on `features_list` that the preceding cell
# assigned to the same name. Run top to bottom, that selection is dead code; the
# out-of-order execution counts suggest the two cells are alternatives.
# TODO confirm which one is intended and remove the other.
features_test_df = get_clean_features(preprocessed_test_df)

In [18]:
# Run the project's feature-processing node on `features_test_df`. The shape line
# printed under this cell comes from that call (this cell contains no other statement).
features_df = process_features_for_ml(features_test_df)

Training Features shape:  (48744, 481)


In [19]:
# Preview the first five rows of the processed features.
features_df.head(5)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT,SK_ID_CURR
0,0.0,0.0,0.0,0.0,0.024654,0.238037,0.102453,0.184049,0.25738,0.333427,...,0.001256,0.004364,0.005031,0.006113,0.00126,0.442999,0.249229,0.968196,0.01791,100001
1,1.0,0.0,0.0,0.0,0.01644,0.080785,0.084558,0.06135,0.491855,0.399339,...,0.001717,0.005316,0.005107,0.006488,0.001718,0.839439,0.792747,0.988644,0.023881,100005
2,1.0,1.0,0.0,0.0,0.040054,0.280965,0.378515,0.265849,0.260854,0.288794,...,0.051995,7e-06,0.103948,0.010187,0.046184,0.995549,0.53613,0.564267,0.459701,100013
3,0.0,0.0,0.0,0.1,0.065721,0.695297,0.262078,0.695297,0.36176,0.628269,...,0.017311,1e-06,0.011314,0.004498,0.01506,0.990414,0.707388,0.799184,0.334328,100028
4,1.0,1.0,1.0,0.05,0.03492,0.263804,0.166995,0.263804,0.13534,0.680685,...,0.004089,0.012257,0.003199,0.011624,0.004075,0.840808,0.784282,0.984246,0.032836,100038


In [21]:
# Store the processed features under the "full_df_test" catalog entry.
catalog.save("full_df_test", features_df)