In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import gc
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# for local
# from utils.storage import get_storage
import optuna

In [None]:
class Home_Credit:
    """Pipeline for the Home Credit Default Risk data set.

    The methods read the competition CSV files, aggregate the auxiliary tables
    per ``SK_ID_CURR``, tune a classifier with Optuna and write
    ``./submission.csv``.
    """
    # Locations of the competition CSV files, relative to the working directory.
    APPLICATION_TRAIN_PATH = '../input/home-credit-default-risk/application_train.csv'
    APPLICATION_TEST_PATH = '../input/home-credit-default-risk/application_test.csv'
    BUREAU_PATH = '../input/home-credit-default-risk/bureau.csv'
    BUREAU_BALANCE_PATH = '../input/home-credit-default-risk/bureau_balance.csv'
    PREVIOUS_APPLICATION_PATH = '../input/home-credit-default-risk/previous_application.csv'
    CREDIT_CARD_PATH = '../input/home-credit-default-risk/credit_card_balance.csv'
    INSTALLMENTS_PAYMENTS_PATH = '../input/home-credit-default-risk/installments_payments.csv'
    POS_CASH_BALANCE_PATH = '../input/home-credit-default-risk/POS_CASH_balance.csv'

    def __init__(self, debug=False):
        self.debug = debug
        self.nrows = 10000 if debug == True else None
        self.var_list_to_clear = []
    
    def clear_memory(self):
        for variable in self.var_list_to_clear:
            del variable
        self.var_list_to_clear = []
        gc.collect()
    
    def one_hot_encoding(self, df):
        cols = df.columns.tolist()
        cat_cols = [col for col in df.columns if df[col].dtype == 'object']
        df = pd.get_dummies(df, columns=cat_cols, dummy_na=True)
        # new_cols = [new_col for new_col in df.columns if new_col not in cols]
        return df #, new_cols
    
    def fill_zero_num_cols(self, df):
        num_cols = [col for col in df.columns if df[col].dtype != 'object']
        df.loc[:, num_cols] = df[num_cols].fillna(value=0)
        return df
    
    def convert_float64_to_float32(self, df):
        num_cols = [col for col in df.columns if df[col].dtype == 'float64']
        df.loc[:, num_cols] = df[num_cols].astype('float32')
        return df
    
    def get_num_columns_list(self, df, add_list=[]):
        num_cols = [col for col in df.columns if df[col].dtype != 'object']
        for elem in add_list:
            num_cols.append(elem)
        return num_cols

    def preprocess_aplication(self):
        train_df = pd.read_csv(self.APPLICATION_TRAIN_PATH, nrows=self.nrows)
        test_df = pd.read_csv(self.APPLICATION_TEST_PATH, nrows=self.nrows)
        all_df = train_df.append(test_df)
        self.var_list_to_clear.extend([train_df, test_df])
        all_df = all_df[all_df['CODE_GENDER'] != 'XNA']
        for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
            all_df[bin_feature], uniques = pd.factorize(all_df[bin_feature])

        encoded_df = self.one_hot_encoding(all_df)
        self.var_list_to_clear.extend([all_df])
        encoded_df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
        encoded_df['DAYS_EMPLOYED_PERC'] = encoded_df['DAYS_EMPLOYED'] / encoded_df['DAYS_BIRTH']
        encoded_df['INCOME_CREDIT_PERC'] = encoded_df['AMT_INCOME_TOTAL'] / encoded_df['AMT_CREDIT']
        encoded_df['INCOME_PER_PERSON'] = encoded_df['AMT_INCOME_TOTAL'] / encoded_df['CNT_FAM_MEMBERS']
        encoded_df['ANNUITY_INCOME_PERC'] = encoded_df['AMT_ANNUITY'] / encoded_df['AMT_INCOME_TOTAL']
        encoded_df['PAYMENT_RATE'] = encoded_df['AMT_ANNUITY'] / encoded_df['AMT_CREDIT']
        self.clear_memory()
        return encoded_df
    
    def preprocess_breau(self):
        bureau_df = pd.read_csv(self.BUREAU_PATH, nrows=self.nrows)
        bureau_balance_df = pd.read_csv(self.BUREAU_BALANCE_PATH, nrows=self.nrows)
        bureau_num_columns_list = self.get_num_columns_list(bureau_df)
        encoded_bureau_df = self.one_hot_encoding(bureau_df)
        encoded_bb_df = self.one_hot_encoding(bureau_balance_df)
        self.var_list_to_clear.extend([bureau_df, bureau_balance_df])
        # bureau_balance aggregate
        bureau_balance_agg_df = encoded_bb_df.groupby('SK_ID_BUREAU').agg('mean')
        bureau_merged_df = encoded_bureau_df.merge(bureau_balance_agg_df, how='left', on='SK_ID_BUREAU')
        bureau_merged_df = bureau_merged_df.drop('SK_ID_BUREAU', axis='columns')
        bureau_agg_df = bureau_merged_df.groupby('SK_ID_CURR').agg('mean')
        self.var_list_to_clear.extend([bureau_balance_agg_df, bureau_merged_df])
        # active
        active = encoded_bureau_df[encoded_bureau_df['CREDIT_ACTIVE_Active'] == 1]
        active_agg = active[bureau_num_columns_list].groupby('SK_ID_CURR').agg('mean')
        active_agg = active_agg.drop('SK_ID_BUREAU', axis='columns')
        active_agg.columns = pd.Index(['ACTIVE_' + col for col in active_agg.columns.tolist()])
        bureau_agg_df = pd.merge(bureau_agg_df, active_agg, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([active, active_agg])
        # closed
        closed = encoded_bureau_df[encoded_bureau_df['CREDIT_ACTIVE_Closed'] == 1]
        closed_agg = closed[bureau_num_columns_list].groupby('SK_ID_CURR').agg('mean')
        closed_agg = closed_agg.drop('SK_ID_BUREAU', axis='columns')
        closed_agg.columns = pd.Index(['CLOSED_' + col for col in closed_agg.columns.tolist()])
        bureau_merged_df = pd.merge(bureau_merged_df, closed_agg, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([closed, closed_agg])
        self.clear_memory()
        return bureau_agg_df

    def preprocess_prev_application(self):
        prev_df = pd.read_csv(self.PREVIOUS_APPLICATION_PATH, nrows=self.nrows)
        prev_num_columns_list = self.get_num_columns_list(prev_df)
        encoded_prev_df = self.one_hot_encoding(prev_df)
        self.var_list_to_clear.extend([prev_df])
        encoded_prev_df['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
        encoded_prev_df['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
        encoded_prev_df['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
        encoded_prev_df['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
        encoded_prev_df['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
        # Add feature: value ask / value received percentage
        encoded_prev_df['APP_CREDIT_PERC'] = encoded_prev_df['AMT_APPLICATION'] / encoded_prev_df['AMT_CREDIT']
        prev_agg_df = encoded_prev_df.groupby('SK_ID_CURR').agg('mean')
        self.var_list_to_clear.extend([encoded_prev_df])
        prev_agg_df = prev_agg_df.drop('SK_ID_PREV', axis='columns')
        # approved
        approved = encoded_prev_df[encoded_prev_df['NAME_CONTRACT_STATUS_Approved'] == 1]
        approved_agg = approved[prev_num_columns_list].groupby('SK_ID_CURR').agg('mean')
        approved_agg = approved_agg.drop('SK_ID_PREV', axis='columns')
        approved_agg.columns = pd.Index(['APPROVED_' + col for col in approved_agg.columns.tolist()])
        prev_agg_df = pd.merge(prev_agg_df, approved_agg, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([approved, approved_agg])
        # refused
        refused = encoded_prev_df[encoded_prev_df['NAME_CONTRACT_STATUS_Refused'] == 1]
        refused_agg = refused[prev_num_columns_list].groupby('SK_ID_CURR').agg('mean')
        refused_agg = refused_agg.drop('SK_ID_PREV', axis='columns')
        refused_agg.columns = pd.Index(['REFUSED_' + col for col in refused_agg.columns.tolist()])
        encoded_prev_df = pd.merge(encoded_prev_df, refused_agg, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([refused, refused_agg])
        
        self.clear_memory()
        return prev_agg_df

    def preprocess_pos_cash(self):
        pos_df = pd.read_csv(self.POS_CASH_BALANCE_PATH, nrows=self.nrows)
        encoded_pos_df = self.one_hot_encoding(pos_df)
        self.var_list_to_clear.extend([pos_df])
        pos_agg_df = encoded_pos_df.groupby('SK_ID_CURR').agg('mean')
        pos_agg_df['POS_COUNT'] = encoded_pos_df.groupby('SK_ID_CURR').size()
        pos_agg_df = pos_agg_df.drop('SK_ID_PREV', axis='columns')
        self.var_list_to_clear.extend([encoded_pos_df])
        self.clear_memory()
        return pos_agg_df

    def preprocess_installments_df(self):
        install_df = pd.read_csv(self.INSTALLMENTS_PAYMENTS_PATH, nrows=self.nrows)
        encoded_install_df = self.one_hot_encoding(install_df)
        self.var_list_to_clear.extend([install_df])
        install_agg_df = encoded_install_df.groupby('SK_ID_CURR').agg('mean')
        install_agg_df['INSTALL_COUNT'] = encoded_install_df.groupby('SK_ID_CURR').size()
        install_agg_df = install_agg_df.drop('SK_ID_PREV', axis='columns')
        self.var_list_to_clear.extend([encoded_install_df])
        self.clear_memory()
        return install_agg_df

    def preprocess_credit_card_df(self):
        credit_card_df = pd.read_csv(self.CREDIT_CARD_PATH, nrows=self.nrows)
        encoded_credit_card_df = self.one_hot_encoding(credit_card_df)
        self.var_list_to_clear.extend([credit_card_df])
        credit_card_agg_df = encoded_credit_card_df.groupby('SK_ID_CURR').agg('mean')
        credit_card_agg_df['CREDIT_COUNT'] = encoded_credit_card_df.groupby('SK_ID_CURR').size()
        credit_card_agg_df = credit_card_agg_df.drop('SK_ID_PREV', axis='columns')
        self.var_list_to_clear.extend([encoded_credit_card_df])
        self.clear_memory()
        return credit_card_agg_df

    def objective_multi_classifiers(self, trial):
        # search better model from RandomForestRegressor, XGBRegressor
        classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost', 'LGBM'])
        # search better max_depth from 2 to 16
        max_depth = trial.suggest_int('max_depth', 2, 16)
        # search better n_estimators from 50 to 4000
        n_estimators = trial.suggest_int('n_estimators', 50, 7000)
        if classifier_name == 'RandomForest':
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=1234)
        elif classifier_name == 'XGBoost':
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, objective='binary:logistic', random_state=1234)
        else:
            model = LGBMClassifier(boosting_type='goss',n_estimators=n_estimators, max_depth=max_depth, objective='binary', num_leaves=34, random_state=1234)
        
        error_list = cross_val_score(model, self.X, self.y, cv=3, scoring='roc_auc')
        gc.collect()
        return error_list.mean()
    
    def objective_LGBM(self, trial):
        # hyper parameters for tuning
        n_estimators = trial.suggest_int('n_estimators', 50, 10000)
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5)
        max_depth = trial.suggest_int('max_depth', 2, 16)
        num_leaves = trial.suggest_int('num_leaves', 10, 50)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1)
        subsample = trial.suggest_float('subsample', 0.01, 1)
        reg_alpha = trial.suggest_float('reg_alpha', 0.001, 0.1)
        reg_lambda = trial.suggest_float('reg_lambda', 0.001, 0.1)
        min_split_gain = trial.suggest_float('min_split_gain', 0.001, 0.1)
        min_child_weight = trial.suggest_float('min_child_weight', 0.001, 50)
        
        model = LGBMClassifier(
            boosting_type='goss',
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            num_leaves=num_leaves,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            min_split_gain=min_split_gain,
            min_child_weight=min_child_weight,
            objective='binary',
            random_state=1234)
        
        error_list = cross_val_score(model, self.X, self.y, cv=3, scoring='roc_auc')
        gc.collect()
        return error_list.mean()

    def preprocess_data(self):
        # application
        all_df = self.preprocess_aplication()
        # bureau
        bureau_df = self.preprocess_breau()
        all_df = pd.merge(all_df, bureau_df, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([bureau_df])
        # previous application
        prev_application_df = self.preprocess_prev_application()
        all_df = pd.merge(all_df, prev_application_df, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([prev_application_df])
        # POS CASH
        pos_cash_df = self.preprocess_credit_card_df()
        all_df = pd.merge(all_df, pos_cash_df, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([pos_cash_df])
        # installments
        installments_df = self.preprocess_installments_df()
        all_df = pd.merge(all_df, installments_df, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([installments_df])
        # credit card
        credit_card_df = self.preprocess_credit_card_df()
        all_df = pd.merge(all_df, credit_card_df, how='left', on='SK_ID_CURR')
        self.var_list_to_clear.extend([credit_card_df])
        
        all_df = all_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        train_df = all_df[all_df['TARGET'].notnull()]
        test_df = all_df[all_df['TARGET'].isnull()]
        self.var_list_to_clear.extend([all_df])
        
        filled_train_df = self.fill_zero_num_cols(train_df)
        filled_test_df = self.fill_zero_num_cols(test_df)
        self.var_list_to_clear.extend([train_df, test_df])
        
        converted_train_df = self.convert_float64_to_float32(filled_train_df)
        converted_test_df = self.convert_float64_to_float32(filled_test_df)
        self.var_list_to_clear.extend([filled_train_df, filled_test_df])
        
        self.y = converted_train_df['TARGET']
        self.predict = converted_test_df[['SK_ID_CURR']]
        features = [feature for feature in converted_train_df.columns if feature not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]
        self.X = converted_train_df[features]
        self.test_X = converted_test_df[features]
        print(f'shape of X: {self.X.shape}')
        print(f'shape of test_X: {self.test_X.shape}')
        self.clear_memory()

    def get_objective(self, objective_name):
        objective_dict = {
            "multi_classifiers": self.objective_multi_classifiers,
            "LGBM": self.objective_LGBM
        }
        return objective_dict[objective_name]
        
    def parameter_tuning_with_storage(self, study_name ,objective_name):
        study = optuna.create_study(direction='maximize', study_name=study_name, storage=get_storage(), load_if_exists=True)  # Create a new study.
        study.optimize(self.get_objective(objective_name), n_trials=10)
        
    def parameter_tuning(self, objective_name):
        study = optuna.create_study(direction='maximize')  # Create a new study.
        study.optimize(self.get_objective(objective_name), n_trials=10)
        self.study = study
    
    def parameter_tuning_test(self, objective_name):
        study = optuna.create_study(direction='maximize')  # Create a new study.
        study.optimize(self.get_objective(objective_name), n_trials=4)
        
    def predict_with_best_parameter(self, study_name=None):
        if study_name is not None:
            loaded_study = optuna.load_study(study_name=study_name, storage=get_storage())
        else:
            loaded_study = self.study
        best_params = loaded_study.best_trial.params
        model = LGBMClassifier(
            boosting_type='goss',
            n_estimators=best_params['n_estimators'],
            learning_rate=best_params['learning_rate'],
            max_depth=best_params['max_depth'],
            num_leaves=best_params['num_leaves'],
            colsample_bytree=best_params['colsample_bytree'],
            subsample=best_params['subsample'],
            reg_alpha=best_params['reg_alpha'],
            reg_lambda=best_params['reg_lambda'],
            min_split_gain=best_params['min_split_gain'],
            min_child_weight=best_params['min_child_weight'],
            objective='binary',
            random_state=1234)
        model.fit(self.X, self.y)
        probability = model.predict_proba(self.test_X)[:, 1]
        self.predict['TARGET'] = probability
        self.predict.to_csv('./submission.csv', index=False)

In [None]:
# Default debug=False: nrows is None, so every CSV file is read in full.
home_credit = Home_Credit()
# Load, aggregate and merge all tables; sets X, y, test_X and predict on the instance.
home_credit.preprocess_data()
# Tune the LightGBM objective with Optuna and keep the study on the instance.
home_credit.parameter_tuning('LGBM')
# Refit LightGBM with the best trial's parameters and write ./submission.csv.
home_credit.predict_with_best_parameter()