In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

After looking into some discussions of the competition, I decided to focus on the application table. The reason is that almost all of the most important features come from this table. Once the baseline model is created, we can explore the other tables further to improve the model.

In [None]:
# Load the application tables used for training and for the submission.
df = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
df_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")

# Preview the first rows. `head` must be called: the bare attribute only
# displays the bound method instead of the data.
df.head()

The first step in solving any problem is to look at the data we have and collect useful information at a high level.

In [None]:
# Share of each TARGET class in the training data (fractions summing to 1).
df["TARGET"].value_counts(normalize=True)

The training data has an imbalanced target. There are several options for balancing the dataset (undersampling, oversampling, etc.); however, in this case, we should first try the raw data.

In [None]:
# Summarise missing values per column: absolute count and share of rows.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
missing = (null_counts / null_mask.count() * 100).sort_values(ascending=False)

# Side-by-side table of both measures.
missing_percent = pd.concat(
    [total, missing],
    axis=1,
    keys=["Total", "Missing (%)"],
)
missing_percent.head()

There are lots of missing values in some columns. One option is to use imputation; the other is to leave the missing values as they are. In this kind of problem, missing values are also valuable information, so they will be left as null.

Now, I will visualize several features in relation to the target variable. Since this is only meant as an example, I chose the most important features according to the existing EDAs in the competition.

In [None]:
# Overlay the EXT_SOURCE_3 distributions of the two TARGET classes.
temp0 = df["EXT_SOURCE_3"][df["TARGET"] == 0]
temp1 = df["EXT_SOURCE_3"][df["TARGET"] == 1]

# Each class becomes its own column; rows of the other class are NaN and are
# skipped by the histogram.
con = pd.concat([temp0, temp1], axis=1, ignore_index=False)
con.columns = ["0", "1"]

# No `by=` here: the two columns are already the two series to overlay.
# pandas < 1.4 silently ignored `by`; newer versions would group the frame by
# the data columns themselves and no longer return a single Axes.
ax = con.plot.hist(bins=30, alpha=0.5, figsize=(15, 8))
ax.set_xlabel("Value")
ax.set_title("External Source 3 Credit Value Histogram")

# Free the temporaries; they are only needed for this plot.
del temp0, temp1, con

As we can see from the figure above, the distributions for defaulting and non-defaulting applicants are quite different. The non-default distribution is skewed toward higher values, while the default distribution is skewed slightly toward smaller values. Since we don't know the actual meaning of this external source credit value, we will just use it as it is.

In [None]:
# Overlay the age distributions of the two TARGET classes.
temp0 = df["DAYS_BIRTH"][df["TARGET"] == 0]
temp1 = df["DAYS_BIRTH"][df["TARGET"] == 1]

con = pd.concat([temp0, temp1], axis=1, ignore_index=False)
con.columns = ["0", "1"]
# Dividing by -365 turns the DAYS_BIRTH values into the positive ages (in
# years) that the plots below label as "Age".
con = con / -365
# No `by=` here: pandas < 1.4 silently ignored it, while newer versions would
# group by the data columns themselves instead of overlaying them.
ax = con.plot.hist(bins=30, alpha=0.5, figsize=(15, 8))
ax.set_xlabel("Age")
ax.set_title("Applicant's Age Histogram")

# Share of defaulting applicants inside each age bin.
temp_total = df["DAYS_BIRTH"] / -365
bins = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 100]
# sort=False keeps the bins in age order; the default sorts by frequency,
# which would leave the bar order to index alignment.
temp1 = pd.cut(temp1 / -365, bins).value_counts(sort=False)
temp_total = pd.cut(temp_total, bins).value_counts(sort=False)

percent = temp1.div(temp_total) * 100

fig, ax2 = plt.subplots(1, 1, figsize=(15, 8))
ax2 = percent.plot.bar(rot=0, ax=ax2)
ax2.set_xlabel("Age")
ax2.set_ylabel("Percentage")
ax2.set_title("Defaulting Applicant's Age in Percentage of Total")

# Free the temporaries; they are only needed for these plots.
del temp0, temp1, temp_total, con

In the figure comparing the applicants' ages, there is no significant difference between the two groups. However, when we look at the percentage of defaults within each age group, we can see a correlation between age and the default variable. Younger applicants are more likely to default than older ones. This feature will most likely be useful for the model.

In [None]:
# Default rate per education level: defaulting applicants in each level
# divided by all applicants in that level.
temp1 = df["NAME_EDUCATION_TYPE"][df["TARGET"] == 1].value_counts()
temp_total = df["NAME_EDUCATION_TYPE"].value_counts()

# Scale to percent so the values match the "Percentage" axis label (and the
# age chart, which is also plotted in percent).
percent = temp1.div(temp_total) * 100
percent = percent.sort_values(ascending=False)

fig, ax = plt.subplots(1, 1, figsize=(15, 8))
percent.plot.bar(rot=0, ax=ax)
ax.set_xlabel("Education")
ax.set_ylabel("Percentage")
ax.set_title("Defaulting Applicant's Education in Percentage of Total")

# Free the temporaries; they are only needed for this plot.
del temp1, temp_total, percent

Similar to the age figure, here we can see a trend across the education groups. Applicants with lower education are more likely to default on the loan than those with higher education.

From these few feature visualizations, we can see that even at a high level there is some information we can obtain. However, since there are already complete EDAs in the competition discussion, I will stop the feature exploration here and continue with feature processing and model building.

In [None]:
def one_hot_encoder(df):
    """Return a copy of ``df`` with every object-dtype column one-hot encoded.

    Each object-dtype column is replaced by one indicator column per distinct
    value, plus an extra ``<column>_nan`` indicator for missing values
    (``dummy_na=True``). Columns of any other dtype are passed through
    unchanged.
    """
    categorical = []
    for name in df.columns:
        if df[name].dtype == "object":
            categorical.append(name)
    return pd.get_dummies(df, columns=categorical, dummy_na=True)

Using the one-hot encoder, I replace each categorical feature with one 0/1 column per sub-category. This exposes each sub-category separately rather than lumping them all into a single feature.

In [None]:
# Stack train and test so both receive exactly the same encoded columns.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`.
# `drop=True` discards the old row labels; without it they would be kept as an
# extra "index" column and end up among the model features.
df_all = pd.concat([df, df_test]).reset_index(drop=True)

# Replacing N and Y variables with 0 and 1 for binary features with no missing values
for feat in ["FLAG_OWN_CAR", "FLAG_OWN_REALTY"]:
    df_all[feat], _ = pd.factorize(df_all[feat])

# One-hot encoding for the rest of the categorical features
df_all = one_hot_encoder(df_all)

# DAYS_EMPLOYED should be negative, but some rows carry the large positive
# placeholder 365243; treat those as missing. Assign the result back instead of
# using `inplace=True` on a selected column, which is chained assignment and is
# not guaranteed to modify df_all.
df_all["DAYS_EMPLOYED"] = df_all["DAYS_EMPLOYED"].replace(365243, np.nan)

Another feature engineering method is deriving new features from existing ones. For example, we have the applicant's income and the loan annuity, so we can combine those two features to obtain the ratio of the loan annuity to the applicant's monthly income. Another example is the ratio of the application credit amount to the applicant's total income. We could derive many more features manually; however, we must also take into consideration that more features bring the curse of dimensionality, which may worsen the performance of the model.

In [None]:
# Example of extra features: the loan annuity and the credit amount, each
# expressed relative to the applicant's income.
income_total = df_all["AMT_INCOME_TOTAL"]
income_per_month = income_total / 12

df_all["AMT_ANNUITY / AMT_INCOME_TOTAL per month"] = df_all["AMT_ANNUITY"] / income_per_month
df_all["AMT_CREDIT / AMT_INCOME_TOTAL"] = df_all["AMT_CREDIT"] / income_total

In [None]:
# Split the combined frame back apart: rows with a TARGET value form the
# training set, rows without one form the test set.
target_missing = df_all["TARGET"].isnull()

df = df_all[~target_missing]
df_test = df_all[target_missing].drop(columns=["TARGET"])

# The combined frame is no longer needed.
del df_all, target_missing

Besides extracting (adding) new features, we can also reduce the number of features. This is called feature selection: it removes irrelevant features from the available ones. The basic technique in feature selection is the filter method, which removes weak features based on a preliminary analysis such as the correlation coefficient, Weight of Evidence (WoE), or Information Value (IV). We remove the features with no correlation or weight with respect to the target variable, then proceed with the remaining features.
However, for this dataset and problem, the correlation-coefficient method is not effective: it actually worsened the performance of the model when I used only the 5 features with the highest correlation.

In [None]:
# Pearson correlation of every column with TARGET.
corrcoef = df.corr(method="pearson")["TARGET"]

# TARGET correlates perfectly with itself, so drop it explicitly before
# ranking. Then show the five most negative and the five most positive
# correlations; the former `np.r_[0:5, -5:-1]` slice returned only four of the
# latter.
ranked = corrcoef.drop("TARGET").dropna().sort_values()
pd.concat([ranked.head(5), ranked.tail(5)])

Due to time constraints, I trained the model using all the features in the application table without much modification. The result showed an average ROC AUC score of 0.76 on the training dataset and a score of 0.747 on the task submission.

In [None]:
# Callback helpers from the package that already provides LGBMClassifier.
# LightGBM 4 removed the `verbose=` / `early_stopping_rounds=` arguments of
# `fit`; callbacks are the supported replacement.
from lightgbm import early_stopping, log_evaluation

# Preparing data for k-fold cross-validation
df_y = df["TARGET"]
df_x = df.drop(["TARGET"], axis=1)


def _clean_feature_name(name):
    """Strip every character that is not a letter, digit or underscore."""
    return re.sub(r"[^A-Za-z0-9_]+", "", name)


# Removing special JSON characters so that lgbm could work
df_x = df_x.rename(columns=_clean_feature_name)

# Apply the same renaming to the test features and select the training
# columns in training order, so prediction sees exactly the features the model
# was fitted on (this also ignores a TARGET column left by an earlier run of
# this cell).
df_test_x = df_test.rename(columns=_clean_feature_name)[df_x.columns]

clf = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    min_split_gain=.01,
    min_child_weight=2,
    verbose=-1,  # replaces the removed `silent` argument
)

# Variable for probability result (averaged over the folds)
preds = np.zeros(df_test_x.shape[0])
fold_scores = []

# A fixed seed makes the fold assignment, and therefore the scores, reproducible.
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for n, (train_index, test_index) in enumerate(fold.split(df_x, df_y)):
    X_train, X_test = df_x.iloc[train_index], df_x.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric="auc",
        callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)],
    )

    # Each fold contributes 1 / n_splits of the final test prediction.
    preds += clf.predict_proba(df_test_x, num_iteration=clf.best_iteration_)[:, 1] / fold.n_splits

    # Score the held-out part once and reuse the probabilities below.
    valid_proba = clf.predict_proba(X_test)[:, 1]
    fold_score = roc_auc_score(y_test, valid_proba)
    fold_scores.append(fold_score)
    print("ROC AUC score of fold " + str(n+1) + " is: " + str(fold_score))

print("Mean ROC AUC score over " + str(fold.n_splits) + " folds is: " + str(np.mean(fold_scores)))

# ROC curve of the last fold only, as an illustration.
fpr, tpr, _ = roc_curve(y_test, valid_proba)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

df_test['TARGET'] = preds
df_test[['SK_ID_CURR', 'TARGET']].to_csv("sub.csv", index= False)