In [1]:
from attrdict import AttrDict
import numpy as np
import pandas as pd
from scipy.stats import gmean
from deepsense import neptune
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from src import pipeline_config  as cfg
from src.pipelines import PIPELINES
from src.utils import init_logger, read_params, set_seed, create_submission, verify_submission, calculate_rank

neptune: Executing in Offline Mode.
neptune: Executing in Offline Mode.


In [2]:
# Create the Neptune context and load the run parameters through the project's
# read_params helper, passing 'neptune.yaml' as the fallback file.
# `params` is read as a global by _read_data and by the split cell below.
ctx = neptune.Context()
params = read_params(ctx, fallback_file='neptune.yaml')

def _read_data(dev_mode, read_train=True, read_test=False):
    """Load the raw CSV tables and return them as an ``AttrDict`` keyed by table name.

    File locations are taken from the module-level ``params`` object.

    Args:
        dev_mode: when truthy, every CSV (including the auxiliary tables) is read
            with ``nrows=cfg.DEV_SAMPLE_SIZE``; otherwise files are read in full.
        read_train: when truthy, include ``application_train``
            (``params.train_filepath``).
        read_test: when truthy, include ``application_test``
            (``params.test_filepath``).

    Returns:
        AttrDict mapping each table name to the DataFrame produced by
        ``pd.read_csv``. The six auxiliary tables are always loaded.
    """
    row_limit = cfg.DEV_SAMPLE_SIZE if dev_mode else None

    # (table name, name of the ``params`` attribute holding its path), in load order.
    sources = []
    if read_train:
        sources.append(('application_train', 'train_filepath'))
    if read_test:
        sources.append(('application_test', 'test_filepath'))
    sources.extend([
        ('bureau', 'bureau_filepath'),
        ('credit_card_balance', 'credit_card_balance_filepath'),
        ('installments_payments', 'installments_payments_filepath'),
        ('pos_cash_balance', 'POS_CASH_balance_filepath'),
        ('previous_application', 'previous_application_filepath'),
        ('bureau_balance', 'bureau_balance_filepath'),
    ])

    loaded = {}
    for table_name, path_attr in sources:
        # Resolve the path lazily so files are looked up and read one at a time.
        loaded[table_name] = pd.read_csv(getattr(params, path_attr), nrows=row_limit)

    return AttrDict(loaded)

neptune: Executing in Offline Mode.


# Train the model

In [29]:
dev_mode = False

# Load the training application table plus all auxiliary tables (no test set).
tables = _read_data(dev_mode, read_train=True, read_test=False)

# Hold out part of the application table for validation; split size, seed and
# shuffling all come from the run parameters / pipeline config.
train_data_split, valid_data_split = train_test_split(
    tables.application_train,
    test_size=params.validation_size,
    random_state=cfg.RANDOM_SEED,
    shuffle=params.shuffle,
)

# Main table: features (target columns dropped) and the flattened target,
# for both the training and the validation split.
train_data = {
    'application': {
        'X': train_data_split.drop(cfg.TARGET_COLUMNS, axis=1),
        'y': train_data_split[cfg.TARGET_COLUMNS].values.reshape(-1),
        'X_valid': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1),
        'y_valid': valid_data_split[cfg.TARGET_COLUMNS].values.reshape(-1),
    },
}

# Auxiliary tables are passed through unsplit, each under an 'X' key.
train_data.update({
    table_name: {'X': getattr(tables, table_name)}
    for table_name in ('bureau_balance',
                       'bureau',
                       'credit_card_balance',
                       'installments_payments',
                       'pos_cash_balance',
                       'previous_application')
})


In [30]:
# Sanity check: shape of the training-split feature frame (target columns already dropped).
train_data['application']['X'].shape


(246008, 121)

In [34]:

# Build the 'lightGBM' pipeline from the project's solution config, in training mode.
pipeline = PIPELINES['lightGBM'](config=cfg.SOLUTION_CONFIG, train_mode=True)
# clean_cache() is called both before and after fitting — presumably to drop
# cached step outputs from earlier runs; confirm against the pipeline implementation.
pipeline.clean_cache()
# Fit on the training tables; later cells read the 'data', 'estimator' and
# 'prediction' entries of the returned mapping.
output = pipeline.fit_transform(train_data)
pipeline.clean_cache()


2020-05-07 19:45:39 steppy >>> initializing Step application_cleaning...
2020-05-07 19:45:39 steppy >>> initializing experiment directories under .\WORKDIR
2020-05-07 19:45:39 steppy >>> done: initializing experiment directories
2020-05-07 19:45:39 steppy >>> Step application_cleaning initialized
2020-05-07 19:45:39 steppy >>> initializing Step application_cleaning_valid...
2020-05-07 19:45:39 steppy >>> initializing experiment directories under .\WORKDIR
2020-05-07 19:45:39 steppy >>> done: initializing experiment directories
2020-05-07 19:45:39 steppy >>> Step application_cleaning_valid initialized
2020-05-07 19:45:39 steppy >>> initializing Step application_hand_crafted...
2020-05-07 19:45:39 steppy >>> initializing experiment directories under .\WORKDIR
2020-05-07 19:45:39 steppy >>> done: initializing experiment directories
2020-05-07 19:45:39 steppy >>> Step application_hand_crafted initialized
2020-05-07 19:45:39 steppy >>> initializing Step application_hand_crafted_valid...
202

  r = func(a, **kwargs)
  lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]
  lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]


In [39]:
# Take the pipeline's 'data' output and make it fully finite: +/-inf values are
# first turned into NaN, then every NaN is replaced by 0.
# NOTE(review): presumably needed because the LIME explainer below is fed
# data.values directly — confirm that 0 is an acceptable fill value for the model.
data = (
    output['data']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
# The 'estimator' and 'prediction' entries of the pipeline output.
model = output['estimator']
predictions = output['prediction']

In [40]:
# Preview only the first rows instead of using the whole frame as the cell
# output; this keeps the saved notebook small and the display explicit.
data.head(10)

Unnamed: 0,annuity_income_percentage,car_to_birth_ratio,car_to_employ_ratio,children_ratio,credit_to_annuity_ratio,credit_to_goods_ratio,credit_to_income_ratio,days_employed_percentage,income_credit_percentage,income_per_child,...,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START
0,0.115160,0.000000,0.000000,0.000000,21.455713,1.211196,2.470840,0.132175,0.404721,112500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.069237,0.000000,0.000000,0.000000,22.454580,1.158392,1.554684,0.000000,0.643217,171000.0,...,0.0,0.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,1.0
2,0.169600,0.000000,0.000000,0.000000,30.866745,1.396000,5.235000,0.604434,0.191022,180000.0,...,1.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,-1.0,2.0
3,0.169143,0.000000,0.000000,0.000000,20.459459,1.211200,3.460572,0.368695,0.288970,157500.0,...,2.0,0.0,3.0,3.0,1.0,1.0,1.0,1.0,2.0,2.0
4,0.218533,0.000000,0.000000,0.000000,17.266626,1.132000,3.773333,0.000000,0.265018,67500.0,...,0.0,0.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,3.0
5,0.093046,0.000000,0.000000,0.000000,10.521121,1.066000,0.978952,0.000000,1.021501,122503.5,...,0.0,0.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,1.0
6,0.150000,-0.001198,-0.008264,0.333333,20.000000,1.000000,3.000000,0.144923,0.333333,33750.0,...,0.0,0.0,3.0,4.0,1.0,1.0,1.0,1.0,0.0,2.0
7,0.066667,0.000000,0.000000,0.333333,20.000000,1.000000,1.333333,0.059596,0.750000,101250.0,...,3.0,1.0,4.0,5.0,1.0,1.0,1.0,1.0,0.0,4.0
8,0.494167,0.000000,0.000000,0.333333,18.733221,1.110880,9.257334,0.170864,0.108022,67500.0,...,0.0,0.0,-1.0,6.0,1.0,0.0,1.0,1.0,0.0,5.0
9,0.275240,0.000000,0.000000,0.000000,25.588867,1.158401,7.043080,0.214796,0.141983,112500.0,...,4.0,0.0,3.0,7.0,1.0,1.0,1.0,1.0,0.0,5.0


In [41]:
import lime
import lime.lime_tabular

# Build a LIME explainer over the cleaned feature matrix, with feature names
# taken from the fitted model.
# LIME explains a prediction by randomly perturbing the instance, so without a
# seed the explanation changes on every run. Reuse the project-wide seed (the
# same one used for the train/validation split) to make results reproducible.
# NOTE(review): mode='regression' is presumably chosen because model.predict
# (passed to explain_instance below) returns one score per row — confirm.
explainer = lime.lime_tabular.LimeTabularExplainer(data.values,
                                                   feature_names=model.feature_name(),
                                                   mode='regression',
                                                   random_state=cfg.RANDOM_SEED)


In [42]:
import matplotlib  # not referenced in this cell

# Index label (looked up with data.loc) of the row whose prediction is explained.
i = 1
# Explain model.predict for that row, asking LIME for 5 features (num_features=5).
exp = explainer.explain_instance(data.loc[i].values, model.predict, num_features=5)



In [43]:
# Show the explanation as a list of (feature condition, weight) pairs.
exp.as_list()

[('EXT_SOURCE_1 <= 0.00', 0.013445695257227245),
 ('bureau_SK_ID_CURR_mean_AMT_CREDIT_MAX_OVERDUE <= 0.00',
  -0.01080727111725329),
 ('previous_application_SK_ID_CURR_mean_CNT_PAYMENT > 18.00',
  0.01064185330657766),
 ('external_sources_weighted <= 0.00', 0.009853332141494545),
 ('EXT_SOURCE_2 <= 0.39', 0.009776212630987846)]