In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory|
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing the necessary libraries
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder




In [None]:
#function for one hot encoding of the categorical columns
def one_hot_encoding_dataframe(df):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; ``pd.get_dummies`` returns a new frame.

    Returns
    -------
    tuple
        ``(encoded_df, new_added_columns, all_columns)`` where ``encoded_df`` is the
        encoded frame, ``new_added_columns`` is the list of dummy columns created by
        the encoding, and ``all_columns`` is ``encoded_df.columns``.
    """
    original_columns = set(df.columns)
    cat_columns = [x for x in df.columns if df[x].dtype == 'object']
    df = pd.get_dummies(df, columns=cat_columns, dummy_na=False)
    # Walk the encoded frame instead of taking a set difference: the result then
    # follows the frame's column order and is identical from run to run.
    new_added_columns = [col for col in df.columns if col not in original_columns]
    return df, new_added_columns, df.columns

In [None]:
#loading the main test and train data
# main_df: application_train.csv, test_df: application_test.csv.
main_df=pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')
test_df=pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')
# Keep the label together with the applicant id for the training rows, and a
# separate copy of the applicant ids of the test rows.
train_target = main_df[['TARGET', 'SK_ID_CURR']]
test_target = test_df['SK_ID_CURR'].copy()


### We have chosen the stacking ensemble technique: an L0 model is trained on each table of the dataset to produce predictions, and these predictions are then combined as the input to an L1 model, which gives the final output to be submitted.

# **L0 MODELS**

### We start by selecting and extracting features from each table, then train it on the L0 models to get the prediction
Since we are still students with very little domain knowledge in finance, we do not have enough expertise to come up with the features on our own, and the model performance depends heavily on the feature engineering. The aggregates and features have therefore been inspired by various existing solutions to the competition.

Here is the list of solutions which inspired our feature selection
1. https://www.kaggle.com/hikmetsezen/micro-model-174-features-0-8-auc-on-home-credit
2. https://www.kaggle.com/yakupkaplan/home-credit-default-risk-prediction-model
3. https://github.com/wins999/Home_Credit_Loan_Prediction
4. https://www.kaggle.com/jamesdellinger/home-credit-putting-all-the-steps-together


# 1) Credit card balance

In [None]:
def credit_card_bal(df):
    """Aggregate credit-card balance rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the credit-card balance table. Must contain ``SK_ID_CURR`` and
        every column named as a key of ``aggs`` below that is not a derived ratio.
        The frame is not modified, so the same object can be passed again for the
        test pass.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Columns are named
        ``<column>_<STAT>_(CREDIT_CARD)`` plus ``CC_COUNT`` (rows per applicant).
    """
    # Derived ratios are attached to a new frame (DataFrame.assign) instead of
    # being written into the caller's frame. A zero denominator yields +/-inf;
    # the notebook replaces inf with NaN before imputing.
    df = df.assign(
        ratio_ab_acl=df['AMT_BALANCE'] / df['AMT_CREDIT_LIMIT_ACTUAL'],
        ratio_tc_tr=df['AMT_PAYMENT_TOTAL_CURRENT'] / df['AMT_TOTAL_RECEIVABLE'],
        ratio_pc_ar=df['AMT_PAYMENT_CURRENT'] / df['AMT_RECIVABLE'],
        # Despite the name this is a ratio (the inverse of ratio_tc_tr), not a
        # difference; the name is kept because it becomes a feature name.
        diff_tr_tc=df['AMT_TOTAL_RECEIVABLE'] / df['AMT_PAYMENT_TOTAL_CURRENT'],
        ratio_pc_ptc=df['AMT_PAYMENT_CURRENT'] / df['AMT_PAYMENT_TOTAL_CURRENT'],
    )

    # Statistics computed per applicant for each column.
    aggs = {
        'MONTHS_BALANCE': ['min', 'max', 'size'],
        'CNT_DRAWINGS_ATM_CURRENT': ['max'],
        'CNT_DRAWINGS_CURRENT': ['max'],
        'CNT_DRAWINGS_POS_CURRENT': ['max'],
        'CNT_INSTALMENT_MATURE_CUM': ['mean', 'sum'],
        'AMT_BALANCE': ['min', 'max', 'mean', 'sum'],
        'AMT_CREDIT_LIMIT_ACTUAL': ['max', 'mean', 'var'],
        'AMT_DRAWINGS_ATM_CURRENT': ['max'],
        'AMT_DRAWINGS_CURRENT': ['max'],
        'AMT_DRAWINGS_POS_CURRENT': ['max'],
        'AMT_PAYMENT_CURRENT': ['max'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['max'],
        'AMT_RECEIVABLE_PRINCIPAL': ['mean', 'sum'],
        'AMT_RECIVABLE': ['mean', 'sum'],
        'AMT_TOTAL_RECEIVABLE': ['mean'],

        # Derived ratios
        'ratio_ab_acl': ['min', 'max', 'mean'],
        'ratio_tc_tr': ['min', 'max', 'mean'],
        'ratio_pc_ar': ['min', 'max', 'mean'],
        'diff_tr_tc': ['min', 'max', 'mean'],
        'ratio_pc_ptc': ['min', 'max', 'mean']
    }

    per_applicant = df.groupby('SK_ID_CURR')
    cc_aggs = per_applicant.agg(aggs)
    # Flatten the (column, stat) MultiIndex into single-level feature names.
    cc_aggs.columns = pd.Index(
        [col + "_" + stat.upper() + '_(CREDIT_CARD)' for col, stat in cc_aggs.columns.tolist()]
    )
    cc_aggs['CC_COUNT'] = per_applicant.size()

    return cc_aggs

In [None]:
# Read the raw credit-card balance table and reduce it to one feature row per applicant.
credit_data = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv')
df_credit=credit_card_bal(credit_data)


The features of the credit table are merged with the main table to get all the indexes on the main table and so that we can use it to train the model. However, only columns corresponding to the credit table were used to train. 
Once the features are created, the next step is to impute missing values and normalise the data. All missing values were imputed with the column median, and min-max scaling was used to normalise the data.

In [None]:
# Left-join the credit-card aggregates onto the application table so every
# training applicant keeps a row, even without credit-card history.
df_credit =main_df.merge(df_credit, how='left', on='SK_ID_CURR')
df_credit=df_credit[df_credit['TARGET'].notnull()]

y_train=df_credit['TARGET']
# Only the columns contributed by the credit-card table are used as features.
# An ordered list is used rather than a set: a set has no defined order and
# pandas 2.x rejects sets as column indexers.
train_column=[col for col in df_credit.columns if col not in main_df.columns]
X_train=df_credit[train_column]

# Remember the feature order so the test matrix can be built identically.
train_column=X_train.columns
# Ratios with a zero denominator are +/-inf; treat them as missing.
X_train=X_train.replace([np.inf, -np.inf],np.nan)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train=imputer.fit_transform(X_train)
scaler = MinMaxScaler(feature_range = (0, 1))
X_train=scaler.fit_transform(X_train)


After preprocessing, the data is used to train the classifier. We chose LightGBM as our model because our literature review and other solutions indicated that LightGBM trains faster and gives a better AUC score than logistic regression, decision trees or neural networks.

**goss** or gradient based one sided sampling was used as the number of train instances was very high and downsampling will improve the speed of the model while keeping the accuracy good. It is usually used for ensemble models

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.utils.class_weight` is loaded.
import sklearn.utils.class_weight
classes_zero = main_df[main_df['TARGET'] == 0]
classes_one = main_df[main_df['TARGET'] == 1]

# Convert parts into NumPy arrays for weight computation
zero_numpy = classes_zero['TARGET'].to_numpy()
one_numpy = classes_one['TARGET'].to_numpy()
all_together = np.concatenate((zero_numpy, one_numpy))
unique_classes = np.unique(all_together)

# Compute weights. `classes` and `y` are passed by keyword because current
# scikit-learn releases accept them as keyword-only arguments.
weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced', classes=unique_classes, y=all_together)
weights

In [None]:
# Ratio of TARGET == 0 rows to TARGET == 1 rows in the training table; it is
# passed to the LightGBM classifiers below as scale_pos_weight.
poswt = len(classes_zero) / len(classes_one)
poswt

In [None]:
# L0 classifier for the credit-card table: LightGBM with GOSS boosting, 10000
# estimators at learning rate 0.005, and scale_pos_weight set to the
# negative/positive row ratio `poswt`.
clf=LGBMClassifier(boosting_type= 'goss',n_estimators= 10000,
    learning_rate= 0.005,
    num_leaves= 30,
    max_depth= 6,
    subsample_for_bin= 24000,
    reg_alpha= 0.436193,
    reg_lambda= 0.479169,
    min_split_gain= 0.024766,
    subsample= 1,
    scale_pos_weight=poswt)

In [None]:
# Fit on the full training matrix and report ROC AUC on that same data
# (an in-sample score; it does not measure generalisation).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no effect — confirm.
clf.fit(X_train,y_train,eval_metric='auc')
print("ROCAUC Score :",roc_auc_score(y_train,clf.predict_proba(X_train)[:,1]))

After training the model, the next step is to get its predictions on the train data; these L0 predictions are later used to train the L1 model.

In [None]:
# Positive-class probabilities of the credit-card L0 model on the rows it was fitted on.
# NOTE(review): these are in-sample predictions; stacking usually feeds out-of-fold
# predictions to the L1 model — confirm this is intended.
cred_bal=clf.predict_proba(X_train)[:,1]

Similarly, the L0 model is used to predict values on the test data as well. These values will be used to check the score of the individual table and as input for the L1 model.

In [None]:
# Rebuild the credit-card aggregates and attach them to the test applicants.
df_credit2=credit_card_bal(credit_data)
df_credit2 =test_df.merge(df_credit2, how='left', on='SK_ID_CURR')
# Select the features in exactly the order used for training (`train_column`
# was captured from X_train). Building a fresh set here gave no ordering
# guarantee, and the imputer, scaler and model all match columns by position.
test_col=list(train_column)
X_test=df_credit2[test_col]


In [None]:
# Apply the imputer and scaler fitted on the credit-card training features
# to the test features (transform only, no refit).
df_test=X_test.replace([np.inf, -np.inf],np.nan)
df_test=imputer.transform(df_test)
df_test=scaler.transform(df_test)

In [None]:
# Positive-class probabilities of the credit-card L0 model for the test applicants.
# NOTE(review): `yp` is reassigned in every table's section, so this value is lost
# once the next section runs unless it is saved elsewhere.
yp=clf.predict_proba(df_test)[:,1]

# 2) Installments payments balance

### Feature engineering

In [None]:
def install_bal(inst_df):
    """Aggregate instalment-payment rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    inst_df : pandas.DataFrame
        Rows of the instalments table. Must contain ``SK_ID_CURR``,
        ``SK_ID_PREV``, ``DAYS_INSTALMENT``, ``DAYS_ENTRY_PAYMENT``,
        ``AMT_INSTALMENT`` and ``AMT_PAYMENT``. The frame is not modified, so
        repeated calls on the same object give the same result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Contains ``INST_<column>_<STAT>`` aggregates,
        ``cnt_inst`` and the ``<window>_ltp_flag_*`` / ``<window>_lsp_flag_*``
        statistics restricted to recent payments.
    """
    # Work on a private copy: the caller reuses the raw frame for the test pass.
    inst_df = inst_df.copy()

    # 365243 is treated as a missing-value sentinel in DAYS_* columns. It is
    # replaced before the derived features are computed so that they never see
    # the sentinel. Plain assignment is used because a chained
    # `df[col].replace(..., inplace=True)` is not reliable under pandas
    # copy-on-write.
    for col in inst_df.columns:
        if col.startswith('DAYS'):
            inst_df[col] = inst_df[col].replace(365243, np.nan)

    inst_df['late_pay'] = inst_df['DAYS_INSTALMENT'] - inst_df['DAYS_ENTRY_PAYMENT']
    inst_df['less_pay'] = inst_df['AMT_INSTALMENT'] - inst_df['AMT_PAYMENT']
    inst_df['late_lp'] = 0.5 * inst_df['late_pay'] + 0.5 * inst_df['less_pay']
    inst_df['ltp_flag'] = (inst_df['late_pay'] > 0).astype(int)
    inst_df['lsp_flag'] = (inst_df['less_pay'] > 0).astype(int)

    inst_df, installments_payments_cat_columns, all_columns = one_hot_encoding_dataframe(inst_df)

    # Every non-id column gets its mean; a few columns get four statistics.
    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    four_stat_columns = ('late_pay', 'less_pay', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER')
    inst_df_agg = {}
    for col in inst_df.columns:
        if col in id_columns:
            continue
        if col in four_stat_columns:
            inst_df_agg[col] = ['mean', 'sum', 'max', 'min']
        else:
            inst_df_agg[col] = ['mean']

    per_applicant = inst_df.groupby('SK_ID_CURR')
    inst_agg = per_applicant.agg(inst_df_agg)
    inst_agg.columns = ["INST_" + name + "_" + stat.upper() for name, stat in inst_agg.columns]
    inst_agg['cnt_inst'] = per_applicant['SK_ID_PREV'].count()

    # Statistics over payments entered within a recent window
    # (DAYS_ENTRY_PAYMENT >= cutoff). Each entry is:
    # (name prefix, cutoff, value column, tag, statistics, add "<tag>_flag" count?)
    # Names and order reproduce the original feature set exactly.
    window_specs = [
        ('3365', -365 * 3, 'late_pay', 'ltp', ('mean', 'min', 'max'), True),
        ('2365', -365 * 2, 'late_pay', 'ltp', ('mean', 'max'), True),
        ('365', -365, 'late_pay', 'ltp', ('mean', 'max'), True),
        ('180', -180, 'late_pay', 'ltp', ('mean', 'max'), True),
        ('90', -90, 'late_pay', 'ltp', ('mean', 'min', 'max'), False),
        ('2365', -365 * 2, 'less_pay', 'lsp', ('mean', 'min', 'max'), True),
        ('365', -365, 'less_pay', 'lsp', ('mean', 'min', 'max'), True),
        ('180', -180, 'less_pay', 'lsp', ('mean', 'min', 'max'), False),
        ('90', -90, 'less_pay', 'lsp', ('mean', 'min', 'max'), False),
    ]
    for prefix, cutoff, value_col, tag, stats, with_count in window_specs:
        recent = inst_df[inst_df.DAYS_ENTRY_PAYMENT >= cutoff].groupby('SK_ID_CURR')
        for stat in stats:
            inst_agg[prefix + '_' + tag + '_flag_' + stat] = recent[value_col].agg(stat)
        if with_count:
            inst_agg[prefix + '_' + tag + '_flag_cnt'] = recent[tag + '_flag'].sum()

    return inst_agg

### Normalising and training the model
We have used the same L0 models for all the tables using the same parameters

In [None]:
inst_data = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv')

# Aggregate per applicant, then left-join onto the application table so every
# training applicant keeps a row.
df_inst=install_bal(inst_data)
df_inst =main_df.merge(df_inst, how='left', on='SK_ID_CURR')
df_inst=df_inst[df_inst['TARGET'].notnull()]

y_train=df_inst['TARGET']
# Only the columns contributed by the instalments table are used as features.
# An ordered list is used rather than a set: a set has no defined order and
# pandas 2.x rejects sets as column indexers.
train_column=[col for col in df_inst.columns if col not in main_df.columns]
X_train=df_inst[train_column]

# Remember the feature order so the test matrix can be built identically.
train_column=X_train.columns
X_train=X_train.replace([np.inf, -np.inf],np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train=imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range = (0, 1))
X_train=scaler.fit_transform(X_train)

In [None]:
# L0 classifier for the instalments table; same hyper-parameters as `clf`.
clf2=LGBMClassifier(boosting_type= 'goss',n_estimators= 10000,
    learning_rate= 0.005,
    num_leaves= 30,
    max_depth= 6,
    subsample_for_bin= 24000,
    reg_alpha= 0.436193,
    reg_lambda= 0.479169,
    min_split_gain= 0.024766,
    subsample= 1,
    scale_pos_weight=poswt)

In [None]:
# Fit on the full training matrix and report ROC AUC on that same data (in-sample score).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no effect — confirm.
clf2.fit(X_train,y_train,eval_metric='auc')
print("ROCAUC Score :",roc_auc_score(y_train,clf2.predict_proba(X_train)[:,1]))

In [None]:
# Repeats the in-sample ROC AUC already printed by the previous cell.
print("ROCAUC Score :",roc_auc_score(y_train,clf2.predict_proba(X_train)[:,1]))

Predictions on train data

In [None]:
# Positive-class probabilities of the instalments L0 model on the rows it was fitted on.
# NOTE(review): in-sample predictions; stacking usually uses out-of-fold predictions — confirm.
inst=clf2.predict_proba(X_train)[:,1]

In [None]:
# Rebuild the instalment aggregates and attach them to the test applicants.
df_inst2=install_bal(inst_data)
df_inst2 =test_df.merge(df_inst2, how='left', on='SK_ID_CURR')
# Select the features in exactly the order used for training (`train_column`
# was captured from X_train). Building a fresh set here gave no ordering
# guarantee, and the imputer, scaler and model all match columns by position.
test_col=list(train_column)
X_test=df_inst2[test_col]

In [None]:
# Apply the imputer and scaler fitted on the instalments training features
# to the test features (transform only, no refit).
df_test=X_test.replace([np.inf, -np.inf],np.nan)
df_test=imputer1.transform(df_test)
df_test=scaler.transform(df_test)

In [None]:
# Display the (rows, columns) shape of the transformed test matrix.
df_test.shape

Getting the target values for test data and making a submission file using it

In [None]:
# Positive-class probabilities of the instalments L0 model for the test applicants.
# NOTE(review): `yp` is reassigned in every table's section; save it if it is needed later.
yp=clf2.predict_proba(df_test)[:,1]

# 3) POS application

### Feature engineering

In [None]:
def pos_appl(pos_df):
    """Aggregate POS/cash balance rows into one feature row per ``SK_ID_CURR``.

    Rows whose ``NAME_CONTRACT_STATUS`` is ``'XNA'`` are dropped, the remaining
    object columns are one-hot encoded, and the result is grouped by applicant.

    Returns a DataFrame indexed by ``SK_ID_CURR`` holding ``POS_<column>_<STAT>``
    aggregates, ``pos_cnt``, statistics of ``CNT_INSTALMENT_FUTURE`` over the
    last 24 and 12 months, and mean/max statistics restricted to rows flagged
    Active or Completed.
    """
    keep_rows = pos_df['NAME_CONTRACT_STATUS'] != 'XNA'
    pos_df = pos_df[keep_rows]
    pos_df, pos_data_cat_columns, all_columns = one_hot_encoding_dataframe(pos_df)

    # Mean of every non-id column; MONTHS_BALANCE gets three statistics instead.
    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    pos_data_agg = {name: ['mean'] for name in pos_df.columns if name not in id_columns}
    if 'MONTHS_BALANCE' in pos_df.columns:
        pos_data_agg['MONTHS_BALANCE'] = ['sum', 'mean', 'max']

    per_applicant = pos_df.groupby('SK_ID_CURR')
    pos_agg = per_applicant.agg(pos_data_agg)
    pos_agg.columns = ["POS_" + name + "_" + stat.upper() for name, stat in pos_agg.columns]
    pos_agg['pos_cnt'] = per_applicant['SK_ID_PREV'].count()

    # Remaining instalments, restricted to rows with MONTHS_BALANCE >= -24 / -12.
    for months_back in (24, 12):
        recent_rows = pos_df[pos_df.MONTHS_BALANCE >= -months_back]
        future_inst = recent_rows.groupby('SK_ID_CURR')['CNT_INSTALMENT_FUTURE']
        pos_agg[str(months_back) + '_c_inst_mean'] = future_inst.mean()
        pos_agg[str(months_back) + '_c_inst_max'] = future_inst.max()

    # (status dummy column, ((feature prefix, source column), ...)); each pair
    # yields "<prefix>_mean" followed by "<prefix>_max".
    status_features = (
        ('NAME_CONTRACT_STATUS_Active', (
            ('active_inst', 'CNT_INSTALMENT'),
            ('active_dpd', 'SK_DPD'),
            ('active_inst_fut', 'CNT_INSTALMENT_FUTURE'),
            ('active_dpd_def', 'SK_DPD_DEF'),
        )),
        ('NAME_CONTRACT_STATUS_Completed', (
            ('com_dpd', 'SK_DPD'),
            ('com_dpd_def', 'SK_DPD_DEF'),
            ('com_inst_fut', 'CNT_INSTALMENT_FUTURE'),
            ('com_inst', 'CNT_INSTALMENT'),
        )),
    )
    for flag_column, features in status_features:
        subset = pos_df[pos_df[flag_column] == 1].groupby('SK_ID_CURR')
        for prefix, source in features:
            pos_agg[prefix + '_mean'] = subset[source].mean()
            pos_agg[prefix + '_max'] = subset[source].max()

    return pos_agg


In [None]:
pos_data = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv')

# Aggregate per applicant, then left-join onto the application table so every
# training applicant keeps a row.
df_pos=pos_appl(pos_data)
df_pos =main_df.merge(df_pos, how='left', on='SK_ID_CURR')
df_pos=df_pos[df_pos['TARGET'].notnull()]

y_train=df_pos['TARGET']
# Only the columns contributed by the POS table are used as features.
# An ordered list is used rather than a set: a set has no defined order and
# pandas 2.x rejects sets as column indexers.
train_column=[col for col in df_pos.columns if col not in main_df.columns]
X_train=df_pos[train_column]

# Remember the feature order so the test matrix can be built identically.
train_column=X_train.columns
X_train=X_train.replace([np.inf, -np.inf],np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train=imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range = (0, 1))
X_train=scaler.fit_transform(X_train)

In [None]:
# L0 classifier for the POS table; same hyper-parameters as `clf`.
clf3=LGBMClassifier(boosting_type= 'goss',n_estimators= 10000,
    learning_rate= 0.005,
    num_leaves= 30,
    max_depth= 6,
    subsample_for_bin= 24000,
    reg_alpha= 0.436193,
    reg_lambda= 0.479169,
    min_split_gain= 0.024766,
    subsample= 1,
    scale_pos_weight=poswt)

In [None]:
# Fit on the full training matrix and report ROC AUC on that same data (in-sample score).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no effect — confirm.
clf3.fit(X_train,y_train,eval_metric='auc')
print("ROCAUC Score :",roc_auc_score(y_train,clf3.predict_proba(X_train)[:,1]))

Target values for train data

In [None]:
# Positive-class probabilities of the POS L0 model on the rows it was fitted on.
# NOTE(review): in-sample predictions; stacking usually uses out-of-fold predictions — confirm.
pos=clf3.predict_proba(X_train)[:,1]

In [None]:
# Rebuild the POS aggregates and attach them to the test applicants.
df_pos2=pos_appl(pos_data)
df_pos2 =test_df.merge(df_pos2, how='left', on='SK_ID_CURR')
# Select the features in exactly the order used for training (`train_column`
# was captured from X_train). Building a fresh set here gave no ordering
# guarantee, and the imputer, scaler and model all match columns by position.
test_col=list(train_column)
X_test=df_pos2[test_col]

In [None]:
# Apply the imputer and scaler fitted on the POS training features
# to the test features (transform only, no refit).
df_test=X_test.replace([np.inf, -np.inf],np.nan)
df_test=imputer1.transform(df_test)
df_test=scaler.transform(df_test)

Target values for test data

In [None]:
# Positive-class probabilities of the POS L0 model for the test applicants.
# NOTE(review): `yp` is reassigned in every table's section; save it if it is needed later.
yp=clf3.predict_proba(df_test)[:,1]

# 4) bureau balance data

### Feature engineering
This uses 2 tables - **Bureau balance and Bureau data** . Both the tables have been merged and then aggregated to extract features

In [None]:
def bureau_bal(bureau_balance_data,bureau_data):
    """Aggregate the bureau and bureau-balance tables per ``SK_ID_CURR``.

    The balance rows are one-hot encoded and aggregated per ``SK_ID_BUREAU``.
    That result is joined onto the one-hot encoded bureau rows, which are then
    aggregated per applicant.

    Parameters
    ----------
    bureau_balance_data : pandas.DataFrame
        Balance rows; must contain ``SK_ID_BUREAU`` and ``MONTHS_BALANCE``.
    bureau_data : pandas.DataFrame
        Bureau rows; must contain ``SK_ID_CURR``, ``SK_ID_BUREAU``, the amount
        and day columns used below, and object columns that encode to the
        ``CREDIT_TYPE_*`` / ``CREDIT_ACTIVE_*`` dummies referenced here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``; one row of features per applicant.
    """
    bureau_balance_data, bureau_balance_data_cat_columns, all_columns = one_hot_encoding_dataframe(bureau_balance_data)

    # Per-credit aggregation of the balance history: range and sum of
    # MONTHS_BALANCE, and the mean of every dummy column.
    bureau_balance_agg = {'MONTHS_BALANCE': ['min', 'max', 'sum']}
    for col in bureau_balance_data_cat_columns:
        bureau_balance_agg[col] = ['mean']

    bal_agg = bureau_balance_data.groupby(['SK_ID_BUREAU']).agg(bureau_balance_agg)
    bal_agg.columns = [name + "_" + stat.upper() for name, stat in bal_agg.columns]

    bureau_data, bureau_data_cat_columns, all_columns = one_hot_encoding_dataframe(bureau_data)

    # Row-wise counts over two groups of CREDIT_TYPE dummies.
    secured_types = ['CREDIT_TYPE_Car loan','CREDIT_TYPE_Loan for the purchase of equipment','CREDIT_TYPE_Mortgage','CREDIT_TYPE_Real estate loan','CREDIT_TYPE_Loan for purchase of shares (margin lending)']
    unsecured_types = ['CREDIT_TYPE_Another type of loan',
       'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit',
       'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit',
       'CREDIT_TYPE_Loan for business development',
       'CREDIT_TYPE_Loan for working capital replenishment',
       'CREDIT_TYPE_Microloan', 'CREDIT_TYPE_Mobile operator loan',
       'CREDIT_TYPE_Unknown type of loan']
    bureau_data['SEC_LOAN_COUNT'] = (bureau_data[secured_types] == 1).sum(axis=1)
    bureau_data['UNSEC_LOAN_COUNT'] = (bureau_data[unsecured_types] == 1).sum(axis=1)

    bureau_data['ex_pay'] = bureau_data['AMT_ANNUITY'] - bureau_data['AMT_CREDIT_SUM']
    bureau_data['debt'] = bureau_data['AMT_CREDIT_SUM_DEBT'] / bureau_data['AMT_CREDIT_SUM']
    bureau_data['annu_per_cred'] = bureau_data['AMT_ANNUITY'] / bureau_data['AMT_CREDIT_SUM']

    # 365243 is treated as a missing-value sentinel in DAYS_* columns. Plain
    # assignment is used because a chained `df[col].replace(..., inplace=True)`
    # is not reliable under pandas copy-on-write.
    for col in bureau_data.columns:
        if col.startswith('DAYS'):
            bureau_data[col] = bureau_data[col].replace(365243, np.nan)

    bureau_data = bureau_data.join(bal_agg, how='left', on=['SK_ID_BUREAU'])

    # Per-applicant aggregation. The id columns are skipped: the previous
    # condition (`col != A or col != B`) was always true, so their means were
    # computed and then dropped again at the end.
    id_columns = ('SK_ID_CURR', 'SK_ID_BUREAU')
    summed_columns = ('AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE', 'UNSEC_LOAN_COUNT', 'SEC_LOAN_COUNT')
    bureau_data_agg = {}
    for col in bureau_data.columns:
        if col in id_columns:
            continue
        if col in summed_columns:
            bureau_data_agg[col] = ['sum']
        elif col == 'DAYS_CREDIT':
            bureau_data_agg[col] = ['min']
        else:
            bureau_data_agg[col] = ['mean']

    per_applicant = bureau_data.groupby('SK_ID_CURR')
    bureau_agg = per_applicant.agg(bureau_data_agg)
    bureau_agg.columns = [name + "_" + stat.upper() for name, stat in bureau_agg.columns]

    bureau_agg['abs_cre_max'] = abs(bureau_agg['DAYS_CREDIT_MIN'] / 365)
    bureau_agg['cre_max_od_max'] = per_applicant['AMT_CREDIT_MAX_OVERDUE'].max()
    bureau_agg['cnt'] = per_applicant['SK_ID_BUREAU'].count()

    # (status dummy column, ((feature prefix, source column), ...)); each pair
    # yields "<prefix>_mean" followed by "<prefix>_max".
    status_features = (
        ('CREDIT_ACTIVE_Active', (
            ('active_cred', 'AMT_CREDIT_SUM'),
            ('cred_od', 'AMT_CREDIT_MAX_OVERDUE'),
            ('active_cred_debt', 'AMT_CREDIT_SUM_DEBT'),
            ('active_cred_limit', 'AMT_CREDIT_SUM_LIMIT'),
            ('cred_days', 'DAYS_CREDIT'),
            ('cred_ed', 'DAYS_CREDIT_ENDDATE'),
        )),
        ('CREDIT_ACTIVE_Closed', (
            ('cls_cred', 'AMT_CREDIT_SUM'),
            ('B_CLO_AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE'),
            ('cls_cred_d', 'AMT_CREDIT_SUM_DEBT'),
            # Previously written to 'cls_cred_d_*' as well, which silently
            # overwrote the debt features above; it now has its own name.
            ('cls_cred_limit', 'AMT_CREDIT_SUM_LIMIT'),
            ('cls_credd_ed', 'DAYS_CREDIT_ENDDATE'),
            ('cls_credd', 'DAYS_CREDIT'),
        )),
    )
    for flag_column, features in status_features:
        subset = bureau_data[bureau_data[flag_column] == 1].groupby('SK_ID_CURR')
        for prefix, source in features:
            bureau_agg[prefix + '_mean'] = subset[source].mean()
            bureau_agg[prefix + '_max'] = subset[source].max()

    return bureau_agg

In [None]:
bureau_bal_data = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau_balance.csv')
bureau_data = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau.csv')

# Aggregate per applicant, then left-join onto the application table so every
# training applicant keeps a row.
df_bureau=bureau_bal(bureau_bal_data,bureau_data)
df_bureau =main_df.merge(df_bureau, how='left', on='SK_ID_CURR')
df_bureau=df_bureau[df_bureau['TARGET'].notnull()]

y_train=df_bureau['TARGET']
# Only the columns contributed by the bureau tables are used as features.
# An ordered list is used rather than a set: a set has no defined order and
# pandas 2.x rejects sets as column indexers.
train_column=[col for col in df_bureau.columns if col not in main_df.columns]
X_train=df_bureau[train_column]

# Remember the feature order so the test matrix can be built identically.
train_column=X_train.columns
X_train=X_train.replace([np.inf, -np.inf],np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train=imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range = (0, 1))
X_train=scaler.fit_transform(X_train)


In [None]:
# L0 classifier for the bureau tables; same hyper-parameters as `clf`.
clf4=LGBMClassifier(boosting_type= 'goss',n_estimators= 10000,
    learning_rate= 0.005,
    num_leaves= 30,
    max_depth= 6,
    subsample_for_bin= 24000,
    reg_alpha= 0.436193,
    reg_lambda= 0.479169,
    min_split_gain= 0.024766,
    subsample= 1,
    scale_pos_weight=poswt)

In [None]:
# Fit on the full training matrix and report ROC AUC on that same data (in-sample score).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no effect — confirm.
clf4.fit(X_train,y_train,eval_metric='auc')
print("ROCAUC Score :",roc_auc_score(y_train,clf4.predict_proba(X_train)[:,1]))

Target values on train data

In [None]:
# Positive-class probabilities of the bureau L0 model on the rows it was fitted on.
# NOTE(review): in-sample predictions; stacking usually uses out-of-fold predictions — confirm.
bureau=clf4.predict_proba(X_train)[:,1]

In [None]:
# Rebuild the bureau aggregates and attach them to the test applicants.
df_bureau2=bureau_bal(bureau_bal_data,bureau_data)
df_bureau2 =test_df.merge(df_bureau2, how='left', on='SK_ID_CURR')
# Select the features in exactly the order used for training (`train_column`
# was captured from X_train). Building a fresh set here gave no ordering
# guarantee, and the imputer, scaler and model all match columns by position.
test_col=list(train_column)
X_test=df_bureau2[test_col]

In [None]:
# Apply the imputer and scaler fitted on the bureau training features
# to the test features (transform only, no refit).
df_test=X_test.replace([np.inf, -np.inf],np.nan)
df_test=imputer1.transform(df_test)
df_test=scaler.transform(df_test)

Target values on test data

In [None]:
# Positive-class probabilities of the bureau L0 model for the test applicants.
# NOTE(review): `yp` is reassigned in every table's section; save it if it is needed later.
yp=clf4.predict_proba(df_test)[:,1]

# 5) Previous application

### Feature engineering

In [None]:
def prev_appl(prev_df):
    """Aggregate previous-application rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    prev_df : pandas.DataFrame
        Rows of the previous-application table. Must contain ``SK_ID_CURR``,
        ``SK_ID_PREV``, ``CNT_PAYMENT``, ``AMT_ANNUITY``, ``AMT_CREDIT``,
        ``AMT_DOWN_PAYMENT``, ``CODE_REJECT_REASON`` and a
        ``NAME_CONTRACT_STATUS`` column that encodes to the Refused / Canceled /
        Approved dummies. The frame is not modified, so repeated calls on the
        same object give the same result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Contains ``PREV_<column>_<STAT>`` aggregates
        plus ``cr_*`` (refused or canceled) and ``va_*`` (approved) statistics.
    """
    # Work on a private copy: the caller reuses the raw frame for the test pass.
    prev_df = prev_df.copy()

    # 365243 is treated as a missing-value sentinel in DAYS_* columns. Plain
    # assignment is used because a chained `df[col].replace(..., inplace=True)`
    # is not reliable under pandas copy-on-write.
    for col in prev_df.columns:
        if col.startswith('DAYS'):
            prev_df[col] = prev_df[col].replace(365243, np.nan)

    # CNT_PAYMENT * AMT_ANNUITY is shared by three of the derived features.
    total_repaid = prev_df['CNT_PAYMENT'] * prev_df['AMT_ANNUITY']
    prev_df['extra_paid'] = total_repaid - prev_df['AMT_CREDIT']
    prev_df['to_pay'] = total_repaid - prev_df['AMT_DOWN_PAYMENT']
    prev_df['roi'] = (1 / prev_df['CNT_PAYMENT']) * ((total_repaid / prev_df['AMT_CREDIT']) - 1)
    prev_df['si'] = (prev_df['AMT_CREDIT'] * prev_df['roi'] * prev_df['CNT_PAYMENT']) / 100
    prev_df['xap'] = (prev_df['CODE_REJECT_REASON'] == 'XAP').astype(int)
    prev_df['dp'] = (prev_df['AMT_DOWN_PAYMENT'] <= (0.40 * prev_df['AMT_CREDIT'])).astype(int)
    prev_df, prev_df_cat_columns, all_columns = one_hot_encoding_dataframe(prev_df)

    # Mean of every non-id column; a few columns also get their max.
    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    max_mean_columns = ('DAYS_TERMINATION', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE', 'AMT_CREDIT',
                        'AMT_ANNUITY', 'AMT_DOWN_PAYMENT', 'DAYS_LAST_DUE_1ST_VERSION',
                        'HOUR_APPR_PROCESS_START')
    prev_df_agg = {}
    for col in prev_df.columns:
        if col in id_columns:
            continue
        prev_df_agg[col] = ['max', 'mean'] if col in max_mean_columns else ['mean']

    prev_agg = prev_df.groupby('SK_ID_CURR').agg(prev_df_agg)
    prev_agg.columns = ["PREV_" + name + "_" + stat.upper() for name, stat in prev_agg.columns]

    # Statistics over refused or canceled applications.
    ref_canc = prev_df[(prev_df['NAME_CONTRACT_STATUS_Refused'] == 1) | (prev_df['NAME_CONTRACT_STATUS_Canceled'] == 1)].groupby('SK_ID_CURR')
    for prefix, source in (('cr_cred', 'AMT_CREDIT'), ('cr_int', 'roi'), ('cr_annu', 'AMT_ANNUITY'),
                           ('cr_dp', 'AMT_DOWN_PAYMENT'), ('cr_lp', 'to_pay')):
        prev_agg[prefix + '_mean'] = ref_canc[source].mean()
        prev_agg[prefix + '_max'] = ref_canc[source].max()

    # Statistics over approved applications. Listed explicitly because the
    # original column order is not a regular mean/max alternation.
    appr = prev_df[prev_df['NAME_CONTRACT_STATUS_Approved'] == 1].groupby('SK_ID_CURR')
    approved_features = (
        ('va_cred_mean', 'AMT_CREDIT', 'mean'),
        ('va_cred_max', 'AMT_CREDIT', 'max'),
        ('va_annu_mean', 'AMT_ANNUITY', 'mean'),
        ('va_annu_max', 'AMT_ANNUITY', 'max'),
        ('va_dp_mean', 'AMT_DOWN_PAYMENT', 'mean'),
        ('va_int_mean', 'roi', 'mean'),
        ('va_int_max', 'roi', 'max'),
        ('va_dp_max', 'AMT_DOWN_PAYMENT', 'max'),
        ('va_lp_mean', 'to_pay', 'mean'),
        ('va_lp_max', 'to_pay', 'max'),
    )
    for feature_name, source, stat in approved_features:
        prev_agg[feature_name] = appr[source].agg(stat)

    return prev_agg

In [None]:
prev_data = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv')

# Aggregate per applicant, then left-join onto the application table so every
# training applicant keeps a row.
df_prevapp=prev_appl(prev_data)
df_prevapp =main_df.merge(df_prevapp, how='left', on='SK_ID_CURR')
df_prevapp=df_prevapp[df_prevapp['TARGET'].notnull()]

y_train=df_prevapp['TARGET']
# Only the columns contributed by the previous-application table are used as
# features. An ordered list is used rather than a set: a set has no defined
# order and pandas 2.x rejects sets as column indexers.
train_column=[col for col in df_prevapp.columns if col not in main_df.columns]
X_train=df_prevapp[train_column]

# Remember the feature order so the test matrix can be built identically.
train_column=X_train.columns
X_train=X_train.replace([np.inf, -np.inf],np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train=imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range = (0, 1))
X_train=scaler.fit_transform(X_train)


In [None]:
# L0 classifier for the previous-application table; same hyper-parameters as `clf`.
clf5=LGBMClassifier(boosting_type= 'goss',n_estimators= 10000,
    learning_rate= 0.005,
    num_leaves= 30,
    max_depth= 6,
    subsample_for_bin= 24000,
    reg_alpha= 0.436193,
    reg_lambda= 0.479169,
    min_split_gain= 0.024766,
    subsample= 1,
    scale_pos_weight=poswt)

In [None]:
# Fit on the full training matrix and report ROC AUC on that same data (in-sample score).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no effect — confirm.
clf5.fit(X_train,y_train,eval_metric='auc')
#yp=lgbm_clf.predict(X_train)
print("ROCAUC Score :",roc_auc_score(y_train,clf5.predict_proba(X_train)[:,1]))

Target values on train data

In [None]:
# Previous-application input for the L1 model: clf5's predicted probability
# (second predict_proba column) for every training row. These are in-sample
# predictions, made on the same rows clf5 was fitted on.
prevapp=clf5.predict_proba(X_train)[:,1]

In [None]:
# Build the previous-application feature matrix for the test applications.
df_prevapp2 = prev_appl(prev_data)

df_prevapp2 = test_df.merge(df_prevapp2, how='left', on='SK_ID_CURR')
# Select the features in exactly the order used for training (train_column was
# captured from the training matrix). A set is not a valid DataFrame indexer
# (TypeError from pandas 2.0), and an independently built set may iterate in a
# different order than the training columns, which would silently misalign the
# features fed to the imputer, scaler and model.
test_col = list(train_column)
X_test = df_prevapp2[test_col]

In [None]:
# Apply the training-time preprocessing to the test features:
# inf -> NaN, median imputation, then min-max scaling (both already fitted).
df_test = scaler.transform(
    imputer1.transform(
        X_test.replace([np.inf, -np.inf], np.nan)
    )
)

Output on test data

In [None]:
# Predicted probability (second predict_proba column) from the
# previous-application L0 model for every test application.
yp=clf5.predict_proba(df_test)[:,1]

# 6) Application train data

### Feature engineering

Here the main application (train) table and the test table are combined for feature engineering, after which they are separated back into train and test datasets.
Then the model is trained on the train data and used to predict on the test data.

In [None]:
def appl_train_test(app_train,app_test):
    """
    Stack the train and test application tables and add engineered features.

    Parameters
    ----------
    app_train : pandas.DataFrame
        Rows of application_train.csv.
    app_test : pandas.DataFrame
        Rows of application_test.csv (presumably without a TARGET column, so
        its rows end up with NaN TARGET after stacking -- the caller relies on
        that to split the frames again).

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with an extra 'index' column holding
        each row's position in its original frame, the engineered columns
        below, and every object-typed column one-hot encoded.
    """
    bureau_request_cols = [
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    ]
    social_circle_cols = [
        'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    ]
    contact_flag_cols = [
        'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    ]
    ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
    document_flag_cols = ['FLAG_DOCUMENT_{}'.format(i) for i in range(2, 22)]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([app_train, app_test]).reset_index()

    # 365243 is an outlier/sentinel value in DAYS_EMPLOYED; treat it as missing.
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column, which does not modify df under pandas copy-on-write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
    # Total number of credit-bureau enquiries over all look-back windows.
    df['hdwmqy'] = df[bureau_request_cols].sum(axis=1)

    # Using domain knowledge
    # time spent in work relative to age
    df['days_work'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['days_unemp'] = df['DAYS_BIRTH'].abs() - df['DAYS_EMPLOYED'].abs()
    # income, credit and annuity relative to the price of the goods
    df['inc_per_price'] = df['AMT_INCOME_TOTAL'] / df['AMT_GOODS_PRICE']
    df['cred_per_price'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['ann_per_price'] = df['AMT_ANNUITY'] / df['AMT_GOODS_PRICE']
    # income divided by (family members + 1)
    df['inc_per'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1)
    # income per credit
    df['inc_per_cred'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['emp_per_cred'] = df['DAYS_EMPLOYED'] / df['AMT_CREDIT']

    # Annually paid amount to amount credited
    df['pay_rate'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['loan_pay'] = df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']
    # Floor of the average of the four social-circle counts.
    # NOTE(review): '//' floors the result -- confirm a plain mean was not intended.
    df['soc_cir'] = df[social_circle_cols].sum(axis=1) // 4
    df['mean_eq'] = df[bureau_request_cols].mean(axis=1)
    # Number of contact channels flagged for the applicant.
    df['contact'] = df[contact_flag_cols].sum(axis=1)

    # Summary statistics over the three external scores
    df['ext_mean'] = df[ext_source_cols].mean(axis=1)
    df['ext_med'] = df[ext_source_cols].median(axis=1)
    df['ext_min'] = df[ext_source_cols].min(axis=1)
    df['ext_max'] = df[ext_source_cols].max(axis=1)

    # Number of document flags equal to 1 (column name kept as in the original).
    df['DOCUMNNET_COUNT'] = (df[document_flag_cols] == 1).sum(axis=1)

    # Only the encoded frame is needed; the helper's other outputs are unused.
    df, _, _ = one_hot_encoding_dataframe(df)

    return df

In [None]:
# Reload the raw application tables and run the combined feature engineering.
dtrain, dtest = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/home-credit-default-risk/application_train.csv',
        '/kaggle/input/home-credit-default-risk/application_test.csv',
    )
)

df = appl_train_test(app_train=dtrain, app_test=dtest)


Splitting back to test and train data

In [None]:
# Rows with a known TARGET are the training rows; every column except the
# label, the id and the leftover 'index' column is a feature.
train_data_df = df.loc[df['TARGET'].notna()]
train_column = set(train_data_df.columns).difference({'TARGET', 'index', 'SK_ID_CURR'})

In [None]:
# Rows without a TARGET are the test rows; same feature selection as for train.
test_data_df = df.loc[df['TARGET'].isna()]
test_column = set(test_data_df.columns).difference({'SK_ID_CURR', 'TARGET', 'index'})

In [None]:
# Labels for the application-table L0 model; show how many training rows there are.
y_train = train_data_df.loc[:, 'TARGET']
y_train.shape[0]

In [None]:
# Feature matrix for the application-table L0 model.
# train_column is a set, which is not a valid DataFrame indexer (deprecated in
# pandas 1.5, TypeError from pandas 2.0), so turn it into a list first.
X_train = train_data_df[list(train_column)]

# SimpleImputer is already imported in the notebook's import cell.
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)

# Scale every feature to [0, 1]; the fitted scaler is reused for the test rows.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)



### The main table has almost 200 columns after feature engineering. The other tables have fewer features and could each be trained in about 20 minutes. However, since training on this table would take much longer, we decided to use LightGBM with default parameters.

In [None]:
# Application-table L0 model: LightGBM defaults apart from GOSS boosting,
# a fixed seed and the positive-class weight poswt from an earlier cell.
lgbm_clf = LGBMClassifier(
    scale_pos_weight=poswt,
    random_state=42,
    boosting_type='goss',
)
lgbm_clf.fit(X=X_train, y=y_train)

In [None]:
# AUC on the training rows themselves (in-sample, so optimistic).
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgbm_clf.predict_proba(X_train)[:, 1]),
)

prediction on train data

In [None]:

# Application-table input for the L1 model: lgbm_clf's predicted probability
# (second predict_proba column) on the rows it was trained on.
atrain=lgbm_clf.predict_proba(X_train)[:,1]

 prediction on test data

In [None]:
# Test features for the application-table L0 model.
# Index with a list built from train_column rather than with the test_column
# set: a set is not a valid DataFrame indexer (TypeError from pandas 2.0), and
# reusing the training columns guarantees the same feature order that the
# imputer, scaler and model were fitted on.
X_test = test_data_df[list(train_column)]
X_test = imputer1.transform(X_test)
X_test = scaler.transform(X_test)


In [None]:
# Predicted probability (second predict_proba column) from the
# application-table L0 model for every test application.
yp=lgbm_clf.predict_proba(X_test)[:,1]

## Creating the final dataset to train the L1 model using the predictions of target on train data from each table

In [None]:
# Empty L1 training frame, one row per training application, keyed by SK_ID_CURR.
final_df = pd.DataFrame(index=main_df['SK_ID_CURR'])

In [None]:
# One column per L0 model: its predicted probability on the training rows.
final_df = final_df.assign(
    bal=cred_bal,
    inst=inst,
    pos=pos,
    bureau=bureau,
    prevapp=prevapp,
    train=atrain,
)

In [None]:
# Attach the true labels, taken positionally from application_train.csv.
dtrain = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')
ls = dtrain['TARGET'].to_numpy()
final_df['TARGET'] = ls

In [None]:
# Persist the L1 training set (SK_ID_CURR index included) for later reuse.
final_df.to_csv('final-ds.csv', index=True)

In [None]:
# Missing values per column of the L1 training set.
final_df.isna().sum()

# **Level 1 model**

### For the L1 model we have chosen a vanilla artificial neural network (ANN): the predictions from the different tables need different weights to reflect how much each table's prediction contributes to the final target, and an ANN learns such a weight for each input during training.

In [None]:
# Load the saved L1 training set and split it into features (x) and label (y).
df = pd.read_csv('../input/finaldata/final-train-ds.csv')
x = df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df.loc[:, 'TARGET']

Importing modules for the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

**Model building** - We have used only 1 hidden layer to reduce the model complexity and reduce overfitting. The 6 features in each input are mapped to 6 neurons in the hidden layer, and weights are assigned to them. The activation function used is ReLU, which is a common choice for hidden layers. The output layer has 1 neuron as we need 1 probability value. Its activation function is sigmoid as the class is binary.

In [None]:
# 6 inputs -> 6 ReLU units -> 1 sigmoid output.
model = Sequential([
    Dense(6, input_dim=6, activation='relu'),
    Dense(1, activation='sigmoid'),
])

**Model compiling** - Stochastic gradient descent is the optimiser used, the loss is binary cross-entropy as the output class is binary, and the performance metric is the AUC score, the same metric used to judge submissions in the competition.

In [None]:
# SGD optimiser, binary cross-entropy loss, AUC tracked as the metric.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC()],
    optimizer='sgd',
)


### It has to be mentioned that the distribution of the output classes is not uniform. The LightGBM model can assign weights to classes through its parameters; similarly, class weights need to be assigned to the neural network model too.

In [None]:
# Import the submodule explicitly: `import sklearn` alone does not guarantee
# that sklearn.utils.class_weight is loaded. This still binds the name `sklearn`.
import sklearn.utils.class_weight

classes_zero = df[df['TARGET'] == 0]
classes_one = df[df['TARGET'] == 1]

# Convert parts into NumPy arrays for weight computation
zero_numpy = classes_zero['TARGET'].to_numpy()
one_numpy = classes_one['TARGET'].to_numpy()
all_together = np.concatenate((zero_numpy, one_numpy))
unique_classes = np.unique(all_together)

# Compute weights. `classes` and `y` are keyword-only in current scikit-learn;
# passing them positionally raises TypeError there.
weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced',
    classes=unique_classes,
    y=all_together,
)

In [None]:
# Display the computed class weights, ordered like unique_classes.
weights

There are more instances of class=0, hence it gets a smaller weight than class=1.

In [None]:
# Class label -> weight mapping, the form passed to model.fit(class_weight=...).
wt = {0: weights[0], 1: weights[1]}

**Model training** - Based on brute-force trials, more epochs caused the model to overfit: increasing the number of epochs beyond 4 improved the AUC score on the train data while the validation AUC dropped.

In [None]:
# Train the L1 network; the last 20% of the rows are held out for validation.
train = model.fit(
    x,
    y,
    batch_size=50,
    epochs=3,
    validation_split=0.2,
    class_weight=wt,
)

Checking the auc score with test split of the train data

### Preparing the dataset for predicting on test data

In [None]:
# Saved L0 test-set predictions, one file per source table.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/finaltestds/main-appl.csv',
        '../input/finaltestds/bureau-bal.csv',
        '../input/finaltestds2/cred-b.csv',
        '../input/finaltestds/install.csv',
        '../input/finaltestds/posapp.csv',
        '../input/finaltestds/prev-appl.csv',
    )
)

Merging the target predictions from all files

In [None]:
# Join the per-table predictions on SK_ID_CURR, starting from the
# credit-balance file and adding the others in this fixed order.
ftest = (
    df3
    .merge(df4, how='left', on='SK_ID_CURR')
    .merge(df5, how='left', on='SK_ID_CURR')
    .merge(df2, how='left', on='SK_ID_CURR')
    .merge(df6, how='left', on='SK_ID_CURR')
    .merge(df1, how='left', on='SK_ID_CURR')
)

In [None]:
# The id is not a model input; keep only the prediction columns.
X_test = ftest.drop(columns=['SK_ID_CURR'])

In [None]:
# L1 network output for every test row (the sigmoid output of the final Dense(1) layer).
yp=model.predict(X_test)

# Making submissions

### After getting the target values, it is written into a file to be submitted and get the table's score. 
**It needs to be noted that the next 2 cells will be used for each table as well as the final ensemble model**

In [None]:
# Raw test applications; create_submission takes the SK_ID_CURR ids from this frame.
dtest=pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')

In [None]:
def create_submission(x_test, y_test, target):
    """
    Write the predictions to 'final-new.csv' and return the file re-read from disk.

    x_test is a dataframe whose 'SK_ID_CURR' column supplies the row ids
    y_test is an array with target values (1-D, or 2-D with a single column)
    target is the name of the prediction column

    Returns a DataFrame indexed by SK_ID_CURR with the single column `target`.
    """
    output_path = 'final-new.csv'
    # Take the ids from the frame that was passed in; fall back to the
    # notebook-level dtest only when x_test carries no SK_ID_CURR column.
    if isinstance(x_test, pd.DataFrame) and 'SK_ID_CURR' in x_test.columns:
        ids = x_test['SK_ID_CURR']
    else:
        ids = dtest['SK_ID_CURR']
    submission = pd.DataFrame()
    submission.index = ids
    # Flatten so (n, 1) arrays (e.g. Keras predict output) become one column,
    # and assign positionally rather than by index alignment.
    submission[target] = np.asarray(y_test).ravel()
    submission.to_csv(output_path)
    print("Finished writing to " + output_path)
    return pd.read_csv(output_path, index_col=0)

# Write the current predictions (yp) as the TARGET column and display the saved file.
create_submission(dtest,yp, 'TARGET')

# Thus the final values for TARGET are obtained, and a submission is made

## Similarly LightGBM was also tried as the L1 model but it did not give a good score

In [None]:
# Alternative L1 model (LightGBM). It is only constructed here; the fit and
# scoring calls below are left commented out.
fin = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005134,
    # tree shape
    num_leaves=54,
    max_depth=10,
    min_split_gain=0.024766,
    # sampling / binning
    subsample=1,
    subsample_for_bin=240000,
    colsample_bytree=0.508716,
    # regularisation
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    is_unbalance=False,
)
#fin.fit(x,y)
#print("ROCAUC Score :",roc_auc_score(y_test,fin.predict_proba(x_test)[:,1]))

In [None]:
#yp=fin.predict_proba(X_test)[:,1]