# Score: 0.784

<a id="toc"></a>

# <u>Table of Contents</u>
1.) [TODO](#todo)  
2.) [Imports](#imports)  
3.) [Load data](#load)  
4.) [Bureau Balance](#bureau_bal)  
&nbsp;&nbsp;&nbsp;&nbsp; 4.1.) [Merge into Bureau](#merge_bureau_bal)  
5.) [POS CASH balance](#pos_cash)  
&nbsp;&nbsp;&nbsp;&nbsp; 5.1.) [Missing values](#pos_nan)  
&nbsp;&nbsp;&nbsp;&nbsp; 5.2.) [Merge into Previous Application](#merge_pos_cash)  
6.) [Installment Payments](#install_pay)  
&nbsp;&nbsp;&nbsp;&nbsp; 6.1.) [Missing values](#install_nan)  
&nbsp;&nbsp;&nbsp;&nbsp; 6.2.) [Merge into Previous Application](#merge_install_pay)  
7.) [Credit Card Balance](#credit)  
&nbsp;&nbsp;&nbsp;&nbsp; 7.1.) [Missing values](#credit_nan)  
&nbsp;&nbsp;&nbsp;&nbsp; 7.2.) [Merge into Previous Application](#merge_credit)  
8.) [Final Data Prep](#final_merge)  
&nbsp;&nbsp;&nbsp;&nbsp; 8.1.) [Missing values](#final_nan)  
9.) [Modeling](#models)  
10.) [Save file to CSV](#save)  

<a id="todo"></a>

# [^](#toc) <u>TODO</u>

- Fix skew on columns
- Tinker with the best way to replace missing values (dropping cols?)
- Look for outliers
- Merge db together
- Include timeline relatoinships like MONTHS_BALANCE
- Tune model parameters
- Address [this](https://www.kaggle.com/c/home-credit-default-risk/discussion/57248)

---
<a id="imports"></a>

# [^](#toc) <u>Imports</u>

In [1]:
### Standard imports
import pandas as pd
import numpy as np

# Time keeper
import time

# Progress bar
from tqdm import tqdm

### Removes warnings from output
import warnings
warnings.filterwarnings('ignore')

### Helper functions

In [2]:
# function to create dummy variables of categorical features
def get_dummies(df, cats):
    for col in cats:
        df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
    return df 

def fillna_num(df):
    missing_cols = [col for col in df.columns if any(df[col].isnull()) and df[col].dtype != object]
    for col in missing_cols:
        df[col] = df[col].fillna(df[col].median())
    return df

def fillna_cat(df):
    for col in [col for col in df if df[col].dtype==object]:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df

def factorize_df(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

---
<a id="load"></a>

# [^](#toc) <u>Load data</u>

In [3]:
DATA_PATH = "../data/home_default/"

bureau   = pd.read_csv(DATA_PATH + "bureau.csv")
prev_app = pd.read_csv(DATA_PATH + "previous_application.csv")

print("Shape of bureau:",    bureau.shape)
print("Shape of prev_app:",  prev_app.shape)

Shape of bureau: (1716428, 17)
Shape of prev_app: (1670214, 37)


### Missing Values

In [4]:
bureau   = fillna_num(bureau)
bureau   = fillna_cat(bureau)

prev_app = fillna_num(prev_app)
prev_app = fillna_cat(prev_app)

<a id="bureau_bal"></a>

# [^](#toc) <u>Bureau Balance</u>

In [5]:
bureau_balance = pd.read_csv(DATA_PATH + "bureau_balance.csv")
print("Shape of bureau_balance:",  bureau_balance.shape)

print("\nColumns of bureau_balance:")
print(" --- ".join(bureau_balance.columns.values))

Shape of bureau_balance: (27299925, 3)

Columns of bureau_balance:
SK_ID_BUREAU --- MONTHS_BALANCE --- STATUS


<a id="merge_bureau_bal"></a>

### [^](#toc) <u>Merge into Bureau</u>

In [7]:
merge_df = get_dummies(bureau_balance, ["STATUS"])
merge_df = merge_df.drop(["MONTHS_BALANCE", "STATUS"], axis=1)

# prep for merge
merge_df = merge_df.groupby("SK_ID_BUREAU").sum().reset_index()

### Add the median of the rest of the columns
right    = bureau_balance.groupby("SK_ID_BUREAU").median().reset_index()
merge_df = merge_df.merge(right=right, how="left", on="SK_ID_BUREAU").set_index("SK_ID_BUREAU")

### Remember added columns
merged_cols = ['bur_bal_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

# Merge
bureau = bureau.merge(right=merge_df.reset_index(), how='left', on='SK_ID_BUREAU')

### Fill in new missing values

In [8]:
bureau["no_bureau_bal"] = bureau[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)
bureau[merged_cols]     = bureau[merged_cols].fillna(0)

---
<a id="pos_cash"></a>

# [^](#toc) <u>POS CASH balance</u>

In [9]:
pcb = pd.read_csv(DATA_PATH + "POS_CASH_balance.csv")
print("Shape of pcb:",  pcb.shape)

print("\nColumns of pcb:")
print(" --- ".join(pcb.columns.values))

Shape of pcb: (10001358, 8)

Columns of pcb:
SK_ID_PREV --- SK_ID_CURR --- MONTHS_BALANCE --- CNT_INSTALMENT --- CNT_INSTALMENT_FUTURE --- NAME_CONTRACT_STATUS --- SK_DPD --- SK_DPD_DEF


<a id="pos_nan"></a>

### [^](#toc) Missing Values

In [10]:
for col in ("CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE"):
    pcb[col] = pcb[col].transform(lambda x: x.fillna(x.median()))

### Remove Outliers

In [11]:
pcb = pcb.drop(pcb[pcb.NAME_CONTRACT_STATUS.isin(["XNA", "Canceled"])].index)

### Get Dummies

In [12]:
merge_df = pcb[["SK_ID_PREV", "NAME_CONTRACT_STATUS"]]

merge_df = get_dummies(merge_df, ["NAME_CONTRACT_STATUS"])
merge_df = merge_df.drop("NAME_CONTRACT_STATUS", axis=1)

<a id="merge_pos_cash"></a>

### [^](#toc) <u>Merge into Previous Application</u>

In [13]:
# prep for merge
count    = merge_df.groupby("SK_ID_PREV").count()
merge_df = merge_df.groupby("SK_ID_PREV").sum().reset_index()
merge_df["N"] = list(count.iloc[:,0])

right    = pcb.drop("SK_ID_CURR", axis=1).groupby("SK_ID_PREV").median().reset_index()
merge_df = merge_df.merge(right=right, how="left", on="SK_ID_PREV").set_index("SK_ID_PREV")

merged_cols = ['pos_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

# Merge
prev_app = prev_app.merge(right=merge_df.reset_index(), how='left', on='SK_ID_PREV')

### Fill in missing values

In [14]:
prev_app["no_pcb"] = prev_app[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)

for col in tqdm(merged_cols):
    not_null      = prev_app[col].notnull()
    mode          = prev_app[not_null][col].mode()
    prev_app[col] = prev_app[col].fillna(mode)    

100%|██████████| 13/13 [00:08<00:00,  1.53it/s]


---
<a id="install_pay"></a>

# [^](#toc) <u>Installment Payments</u>

In [15]:
install_pay = pd.read_csv(DATA_PATH + "installments_payments.csv")
print("Shape of install_pay:",  install_pay.shape)

print("\nColumns of install_pay:")
print(" --- ".join(install_pay.columns.values))

Shape of install_pay: (13605401, 8)

Columns of install_pay:
SK_ID_PREV --- SK_ID_CURR --- NUM_INSTALMENT_VERSION --- NUM_INSTALMENT_NUMBER --- DAYS_INSTALMENT --- DAYS_ENTRY_PAYMENT --- AMT_INSTALMENT --- AMT_PAYMENT


<a id="install_nan"></a>

### [^](#toc) <u>Missing values</u>

In [16]:
for col in ("DAYS_ENTRY_PAYMENT", "AMT_PAYMENT"):
    install_pay[col + "_nan"] = install_pay[col].map(lambda x: 1 if np.isnan(x) else 0)
    install_pay[col] = install_pay[col].fillna(0)

### Setup for merge

In [17]:
install_pay["AMT_MISSING"] = install_pay["AMT_INSTALMENT"] - install_pay["AMT_PAYMENT"]
temp = install_pay.groupby("SK_ID_PREV")["AMT_MISSING"]

merge_df = pd.DataFrame({
    "INSTALL_missing_max": temp.max(),
    "INSTALL_missing_min": temp.min(),
    "INSTALL_missing_med": temp.median(),
    "INSTALL_payment_nan": install_pay.groupby("SK_ID_PREV")["AMT_PAYMENT_nan"].sum(),
    "INSTALL_N":           temp.count()
})

### Add the rest of the columns

In [18]:
right = install_pay.drop("SK_ID_CURR", axis=1).groupby("SK_ID_PREV").median().reset_index()
merge_df = merge_df.reset_index()

merge_df = merge_df.merge(right=right, how="left", on="SK_ID_PREV").set_index("SK_ID_PREV")
merged_cols = merge_df.columns

<a id="merge_install_pay"></a>

### [^](#toc) <u>Merge into Previous Application</u>

In [19]:
# Merge
prev_app = prev_app.merge(right=merge_df.reset_index(), how='left', on='SK_ID_PREV')

### Fill in missing values

In [20]:
prev_app["no_install"] = prev_app[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)

for col in tqdm(merged_cols):
    not_null      = prev_app[col].notnull()
    mode          = prev_app[not_null][col].mode()
    prev_app[col] = prev_app[col].fillna(mode)    

100%|██████████| 14/14 [00:13<00:00,  1.00it/s]


---
<a id="credit"></a>

# [^](#toc) <u>Credit Card Balance</u>

In [21]:
credit_card = pd.read_csv(DATA_PATH + "credit_card_balance.csv")
print("Shape of credit_card:",  credit_card.shape)

print("\nColumns of credit_card:")
print(" --- ".join(credit_card.columns.values))

Shape of credit_card: (3840312, 23)

Columns of credit_card:
SK_ID_PREV --- SK_ID_CURR --- MONTHS_BALANCE --- AMT_BALANCE --- AMT_CREDIT_LIMIT_ACTUAL --- AMT_DRAWINGS_ATM_CURRENT --- AMT_DRAWINGS_CURRENT --- AMT_DRAWINGS_OTHER_CURRENT --- AMT_DRAWINGS_POS_CURRENT --- AMT_INST_MIN_REGULARITY --- AMT_PAYMENT_CURRENT --- AMT_PAYMENT_TOTAL_CURRENT --- AMT_RECEIVABLE_PRINCIPAL --- AMT_RECIVABLE --- AMT_TOTAL_RECEIVABLE --- CNT_DRAWINGS_ATM_CURRENT --- CNT_DRAWINGS_CURRENT --- CNT_DRAWINGS_OTHER_CURRENT --- CNT_DRAWINGS_POS_CURRENT --- CNT_INSTALMENT_MATURE_CUM --- NAME_CONTRACT_STATUS --- SK_DPD --- SK_DPD_DEF


<a id="credit_nan"></a>

### [^](#toc) <u>Missing Values and Outliers</u>

In [22]:
# ------------------------------
### Remove outliers
# Gets indices with outlier values
temp = credit_card[credit_card.NAME_CONTRACT_STATUS.isin(["Refused", "Approved"])].index

# Drops outlier values
credit_card = credit_card.drop(temp, axis=0)

# ------------------------------
#### Fill in missing values
cols = [
        "AMT_DRAWINGS_ATM_CURRENT", "AMT_DRAWINGS_OTHER_CURRENT", "AMT_DRAWINGS_POS_CURRENT", 
        "AMT_INST_MIN_REGULARITY", "AMT_PAYMENT_CURRENT", "CNT_DRAWINGS_ATM_CURRENT", 
        "CNT_DRAWINGS_OTHER_CURRENT", "CNT_DRAWINGS_POS_CURRENT", "CNT_INSTALMENT_MATURE_CUM"
]
for col in tqdm(cols):
    not_null = credit_card[col].notnull()
    mode = float(credit_card[not_null][col].mode())
    credit_card[col] = credit_card[col].fillna(mode)

100%|██████████| 9/9 [00:09<00:00,  1.09s/it]


### Setup Categorical column

In [23]:
temp = credit_card[["SK_ID_PREV", "NAME_CONTRACT_STATUS"]]

temp = get_dummies(temp, ["NAME_CONTRACT_STATUS"])
temp = temp.drop("NAME_CONTRACT_STATUS", axis=1)
temp = temp.groupby("SK_ID_PREV").sum()

### Select columns

In [24]:
merge_df = pd.DataFrame({
    "AMT_BALANCE": credit_card.groupby("SK_ID_PREV").AMT_BALANCE.mean(),
    "SK_DPD":      credit_card.groupby("SK_ID_PREV").SK_DPD.max(),
    "SK_DPD_DEF":  credit_card.groupby("SK_ID_PREV").SK_DPD_DEF.max(),
    "N":           credit_card.groupby("SK_ID_PREV").count().iloc[:,0]
})

merge_df = temp.join(merge_df)
del temp

### Add the rest of the columns

In [25]:
right = credit_card.drop("SK_ID_CURR", axis=1).groupby("SK_ID_PREV").median().reset_index()
merge_df = merge_df.reset_index()
merge_df = merge_df.merge(right=right, how="left", on="SK_ID_PREV").set_index("SK_ID_PREV")

<a id="merge_credit"></a>

### [^](#toc) <u>Merge into Previous Application</u>

In [26]:
# Merge
merged_cols = ['credit_' + col for col in merge_df.columns]
merge_df.columns = merged_cols
prev_app = prev_app.merge(right=merge_df.reset_index(), how='left', on='SK_ID_PREV')

### Fill in new NaN values

In [27]:
prev_app["no_credit"] = prev_app[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)

for col in tqdm(merged_cols):
    not_null = prev_app[col].notnull()
    median = prev_app[not_null][col].median()
    prev_app[col] = prev_app[col].fillna(median)    

100%|██████████| 29/29 [00:07<00:00,  3.70it/s]


---

### Misc clean up

In [28]:
### Drop unneeded ID columns
prev_app = prev_app.drop("SK_ID_PREV", axis=1)
bureau   = bureau.drop("SK_ID_BUREAU", axis=1)

print("Number of null in prev_app:", sum(prev_app.isnull().sum()))
print("Number of null in bureau:  ", sum(bureau.isnull().sum()))

Number of null in prev_app: 19985382
Number of null in bureau:   0


---
<a id="final_merge"></a>

# [^](#toc) <u>Final Data Prep</u>

In [29]:
train = pd.read_csv(DATA_PATH + "train.csv")
test  = pd.read_csv(DATA_PATH + "test.csv")

print("Shape of train:", train.shape)
print("Shape of test:",  test.shape)

Shape of train: (307511, 122)
Shape of test: (48744, 121)


### Split into predictors, target, and id

In [30]:
train_y = train.TARGET
train_x = train.drop(["TARGET"], axis=1)

test_id = test.SK_ID_CURR
test_x  = test

### Merge train and test data

In [31]:
full    = pd.concat([train_x, test_x])
train_N = len(train_x)

<a id="final_nan"></a>

### [^](#toc) <u>Missing values</u>

In [32]:
full = fillna_cat(full)
full = fillna_num(full)
sum(full.isnull().sum())

0

### Lump together values with low counts

In [33]:
##### Bureau

# CREDIT_CURRENCY
cols = ["currency 3", "currency 4"]
bureau.CREDIT_CURRENCY = bureau.CREDIT_CURRENCY.map(lambda x: "MISC" if x in cols else x)

# CREDIT_TYPE
cols = ["Cash loan (non-earmarked)", "Real estate loan", "Loan for the purchase of equipment",
        "Loan for purchase of shares (margin lending)", "Interbank credit", "Mobile operator loan"]
bureau.CREDIT_TYPE = bureau.CREDIT_TYPE.map(lambda x: "MISC" if x in cols else x)

##### Previous Application

# NAME_GOODS_CATEGORY
prev_app.NAME_GOODS_CATEGORY = prev_app.NAME_GOODS_CATEGORY.map(
    lambda x: "MISC" if x in ["Weapon", "Insurance"] else x)

# NAME_CASH_LOAN_PURPOSE
prev_app.NAME_CASH_LOAN_PURPOSE = prev_app.NAME_CASH_LOAN_PURPOSE.map(
    lambda x: "MISC" if x in ["Buying a garage", "Misc"] else x)

### Factorize

In [34]:
# Get categorical features
data_cats = [col for col in full.columns if full[col].dtype == 'object']

# Factorize the dataframe
full = factorize_df(full, data_cats)

### Merge Previous Application with full

In [None]:
# cat_cols = [
#         "NAME_CONTRACT_TYPE", "WEEKDAY_APPR_PROCESS_START",
#         "FLAG_LAST_APPL_PER_CONTRACT", "NAME_CASH_LOAN_PURPOSE",
#         "NAME_CONTRACT_STATUS", "NAME_PAYMENT_TYPE",
#         "CODE_REJECT_REASON", "NAME_TYPE_SUITE", "NAME_CLIENT_TYPE",
#         "NAME_GOODS_CATEGORY", "NAME_PORTFOLIO", "NAME_PRODUCT_TYPE",
#         "CHANNEL_TYPE", "NAME_SELLER_INDUSTRY", "NAME_YIELD_GROUP",
#         "PRODUCT_COMBINATION", "SK_ID_CURR"]
# num_cols = [col for col in prev_app.columns if col not in cat_cols]
# num_cols.append("SK_ID_CURR")

# # Numeric columns
# merge_df      = prev_app[num_cols].groupby('SK_ID_CURR').mean()
# merge_df["N"] = prev_app.groupby('SK_ID_CURR').count().iloc[:,0]

# # Categorical columns
# right = prev_app[cat_cols].set_index("SK_ID_CURR")
# right = pd.get_dummies(right).reset_index()
# right = right.groupby("SK_ID_CURR").sum().reset_index()

# merge_df = merge_df.reset_index()
# merge_df = merge_df.merge(right=right, how="left", on="SK_ID_CURR").set_index("SK_ID_CURR")

# merged_cols   = ['p_' + col for col in merge_df.columns]
# merge_df.columns = merged_cols

# full = full.merge(right=merge_df.reset_index(), how='left', on='SK_ID_CURR')

### NEW!  Adding way more features

In [36]:
cat_cols = [
        "NAME_CONTRACT_TYPE", "WEEKDAY_APPR_PROCESS_START",
        "FLAG_LAST_APPL_PER_CONTRACT", "NAME_CASH_LOAN_PURPOSE",
        "NAME_CONTRACT_STATUS", "NAME_PAYMENT_TYPE",
        "CODE_REJECT_REASON", "NAME_TYPE_SUITE", "NAME_CLIENT_TYPE",
        "NAME_GOODS_CATEGORY", "NAME_PORTFOLIO", "NAME_PRODUCT_TYPE",
        "CHANNEL_TYPE", "NAME_SELLER_INDUSTRY", "NAME_YIELD_GROUP",
        "PRODUCT_COMBINATION", "SK_ID_CURR"]
num_cols = [col for col in prev_app.columns if col not in cat_cols]
num_cols.append("SK_ID_CURR")

############## Numeric columns ##############
### Mean
merge_df         = prev_app[num_cols].groupby('SK_ID_CURR').mean()
merged_cols      = ['avg_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

### Max
right         = prev_app[num_cols].groupby('SK_ID_CURR').max()
right_cols    = ['max_' + col for col in right.columns]
right.columns = right_cols

### Merge Mean and Max
merge_df = merge_df.reset_index()
merge_df = merge_df.merge(right=right.reset_index(), how="left", on="SK_ID_CURR").set_index("SK_ID_CURR")

############# Categorical columns #############
right = prev_app[cat_cols].set_index("SK_ID_CURR")
right = pd.get_dummies(right).reset_index()
right = right.groupby("SK_ID_CURR").sum().reset_index()

merge_df = merge_df.reset_index()
merge_df = merge_df.merge(right=right, how="left", on="SK_ID_CURR").set_index("SK_ID_CURR")

merge_df["N"] = prev_app.groupby('SK_ID_CURR').count().iloc[:,0]
merged_cols   = ['p_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

full = full.merge(right=merge_df.reset_index(), how='left', on='SK_ID_CURR')

#### Fill NaN values

In [37]:
full["no_prev_app"] = full[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)

for col in tqdm(merged_cols):
    not_null  = full[col].notnull()
    median    = full[not_null][col].median()
    full[col] = full[col].fillna(median)    
    
sum(full.isnull().sum())

100%|██████████| 299/299 [04:54<00:00,  1.01it/s]


0

### Merge Bureau with full

In [38]:
cat_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE', 'SK_ID_CURR']
num_cols = [col for col in bureau.columns if col not in cat_cols]
num_cols.append("SK_ID_CURR")

############## Numeric columns ##############
### Mean
merge_df         = bureau[num_cols].groupby('SK_ID_CURR').mean()
merged_cols      = ['avg_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

### Max
right         = bureau[num_cols].groupby('SK_ID_CURR').max()
right_cols    = ['max_' + col for col in right.columns]
right.columns = right_cols

### Merge Mean and Max
merge_df = merge_df.reset_index()
merge_df = merge_df.merge(right=right.reset_index(), how="left", on="SK_ID_CURR").set_index("SK_ID_CURR")

############# Categorical columns #############
right = bureau[cat_cols].set_index("SK_ID_CURR")
right = pd.get_dummies(right).reset_index()
right = right.groupby("SK_ID_CURR").sum().reset_index()

merge_df = merge_df.reset_index()
merge_df = merge_df.merge(right=right, how="left", on="SK_ID_CURR").set_index("SK_ID_CURR")

merge_df["N"] = bureau.groupby('SK_ID_CURR').count().iloc[:,0]
merged_cols   = ['b_' + col for col in merge_df.columns]
merge_df.columns = merged_cols

full = full.merge(right=merge_df.reset_index(), how='left', on='SK_ID_CURR')

#### Fill NaN values

In [39]:
full["no_bureau"] = full[merged_cols[0]].map(lambda x: 1 if np.isnan(x) else 0)

for col in tqdm(merged_cols):
    not_null  = full[col].notnull()
    median    = full[not_null][col].median()
    full[col] = full[col].fillna(median)    

sum(full.isnull().sum())

100%|██████████| 62/62 [01:06<00:00,  1.07s/it]


0

### Delete unneeded columns

In [40]:
# full = full.drop(['APARTMENTS_MODE', 'BASEMENTAREA_MODE',
#        'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE',
#        'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
#        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE',
#        'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
#        'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI',
#        'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
#        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
#        'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI',
#        'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
#        'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI'], axis=1)

full = full.drop("SK_ID_CURR", axis=1)

### Split full back into train and test

In [41]:
train_x = full[:train_N]
test_x = full[train_N:]

### Processed data look

In [42]:
train_x.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,b_CREDIT_TYPE_Consumer credit,b_CREDIT_TYPE_Credit card,b_CREDIT_TYPE_Loan for business development,b_CREDIT_TYPE_Loan for working capital replenishment,b_CREDIT_TYPE_MISC,b_CREDIT_TYPE_Microloan,b_CREDIT_TYPE_Mortgage,b_CREDIT_TYPE_Unknown type of loan,b_N,no_bureau
0,0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0,...,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0
1,0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,1,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0
2,1,0,1,0,0,67500.0,135000.0,6750.0,135000.0,0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0
3,0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0,...,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1
4,0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


<a id="models"></a>

# [^](#toc) <u>Models </u>

### sban's method

<div hidden>

tried

## learning rate


</div>

In [43]:
from sklearn.model_selection import train_test_split 
import lightgbm as lgb
    
training_x, val_x, training_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=17)
lgb_train = lgb.Dataset(data=training_x, label=training_y)
lgb_eval  = lgb.Dataset(data=val_x, label=val_y)

params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}

start = time.time()
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)
print("Training took {} seconds".format(round(time.time() - start)))

Training until validation scores don't improve for 150 rounds.
[200]	valid_0's auc: 0.741669
[400]	valid_0's auc: 0.756773
[600]	valid_0's auc: 0.76861
[800]	valid_0's auc: 0.774518
[1000]	valid_0's auc: 0.777448
[1200]	valid_0's auc: 0.779481
[1400]	valid_0's auc: 0.780592
[1600]	valid_0's auc: 0.781474
[1800]	valid_0's auc: 0.782084
[2000]	valid_0's auc: 0.782351
[2200]	valid_0's auc: 0.782535
[2400]	valid_0's auc: 0.78272
Early stopping, best iteration is:
[2381]	valid_0's auc: 0.78276
Training took 634 seconds


### NEW!  More iterations

add a clear output before each iteration

In [56]:
import gc

NUM_ITER = 5

preds = pd.DataFrame()

start1 = time.time()
for i in range(NUM_ITER):
    training_x, val_x, training_y, val_y = train_test_split(train_x, train_y, test_size=0.2)
    lgb_train = lgb.Dataset(data=training_x, label=training_y)
    lgb_eval  = lgb.Dataset(data=val_x, label=val_y)
    
    ### Delete old variables
    del training_x, val_x, training_y, val_y
    gc.collect()

    ### Training
    start2 = time.time()
    params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
    model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=500, verbose_eval=100)
    print("Training took {} seconds".format(round(time.time() - start2)))
    
    ### Predicting
    start2   = time.time()
    pred     = model.predict(test_x)
    preds[i] = pred
    print("Predicting took {} seconds".format(round(time.time() - start2)))
    
    ### Delete old variables
    del lgb_train, lgb_eval, pred, model
    gc.collect()
    
print("Total training took {} seconds".format(round(time.time() - start2)))



train size: 246008
Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.739765
[200]	valid_0's auc: 0.745511
[300]	valid_0's auc: 0.753766
[400]	valid_0's auc: 0.761816
[500]	valid_0's auc: 0.768521
[600]	valid_0's auc: 0.773005
[700]	valid_0's auc: 0.775818
[800]	valid_0's auc: 0.777982
[900]	valid_0's auc: 0.779624
[1000]	valid_0's auc: 0.780744
[1100]	valid_0's auc: 0.781763
[1200]	valid_0's auc: 0.782467
[1300]	valid_0's auc: 0.783046
[1400]	valid_0's auc: 0.7835
[1500]	valid_0's auc: 0.783864
[1600]	valid_0's auc: 0.7842
[1700]	valid_0's auc: 0.784374
[1800]	valid_0's auc: 0.784502
[1900]	valid_0's auc: 0.78462
[2000]	valid_0's auc: 0.784706
[2100]	valid_0's auc: 0.784671
[2200]	valid_0's auc: 0.784717
[2300]	valid_0's auc: 0.784727
[2400]	valid_0's auc: 0.784796
[2500]	valid_0's auc: 0.784857
[2600]	valid_0's auc: 0.784883
[2700]	valid_0's auc: 0.784813
[2800]	valid_0's auc: 0.784839
[2900]	valid_0's auc: 0.784895
[3000]	valid_0's auc: 0.784831


In [59]:
preds.mean(axis=1)

0        0.027977
1        0.129554
2        0.025091
3        0.033974
4        0.136601
5        0.047445
6        0.006938
7        0.022963
8        0.013601
9        0.090261
10       0.053548
11       0.027306
12       0.169998
13       0.043243
14       0.047789
15       0.235612
16       0.052854
17       0.017143
18       0.060678
19       0.034247
20       0.028206
21       0.008363
22       0.042622
23       0.101389
24       0.036326
25       0.094908
26       0.087442
27       0.062852
28       0.034161
29       0.046265
           ...   
48714    0.027262
48715    0.011388
48716    0.242871
48717    0.010625
48718    0.036675
48719    0.132723
48720    0.027010
48721    0.106396
48722    0.131091
48723    0.096111
48724    0.107426
48725    0.078470
48726    0.042967
48727    0.013109
48728    0.008085
48729    0.108638
48730    0.070992
48731    0.025013
48732    0.112732
48733    0.030133
48734    0.035333
48735    0.057536
48736    0.020606
48737    0.164455
48738    0

### Found on Kaggle

In [None]:
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
    trn_x, trn_y = data[feats].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = data[feats].iloc[val_idx], y.iloc[val_idx]
    
    clf = LGBMClassifier(
        # n_estimators=1000,
        # num_leaves=20,
        # colsample_bytree=.8,
        # subsample=.8,
        # max_depth=7,
        # reg_alpha=.1,
        # reg_lambda=.1,
        # min_split_gain=.01
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves = 40,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=10,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1,
    )
    
    clf.fit(trn_x, trn_y, 
            eval_set= [(trn_x, trn_y), (val_x, val_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=300  #30
           )
    
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

### Predictions and save to CSV

In [60]:
# predictions = model.predict(test_x)

pd.DataFrame({
    "SK_ID_CURR": test_id,
    "TARGET": preds.mean(axis=1)
}).to_csv("../submissions/5_runs.csv", index=False)