In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, normalize
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier as lgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
# Apply seaborn's "whitegrid" style to the figures drawn in the cells below.
sns.set_style("whitegrid")

# Application table

In [None]:
# Read the train and test application tables from the Kaggle input directory.
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/home-credit-default-risk/application_train.csv",
        "/kaggle/input/home-credit-default-risk/application_test.csv",
    )
)

In [None]:
# Print the (rows, columns) shape of each table on its own line, then preview
# the first rows of the training table as the cell output.
print(application_train.shape, application_test.shape, sep="\n")
application_train.head()

In [None]:
# Keep the training table's column index around and list every column name.
app_cols = application_train.columns
print(app_cols.tolist())

# 0. Data types and null counts

In [None]:
# Raise the row display limit so the per-column summary below is shown in
# full. Note: this changes the pandas display option for the whole session.
pd.set_option('display.max_rows', 500)

# Per-column summaries of application_train; after reset_index the column
# name lives in a column called "index".
null_cnt_df = application_train.isnull().sum().reset_index(name="null count")
dtypes_df = application_train.dtypes.reset_index(name="types")

# Join the two summaries on the column name. This replaces the earlier
# concat + double-transpose + drop_duplicates trick, which coerced every cell
# to object dtype merely to get rid of the duplicated "index" column.
null_cnt_df.merge(dtypes_df, on="index")

# 1. Gender distribution of applicants who did or did not repay the loan

In [None]:
# Count of applicants per CODE_GENDER value, one panel per TARGET value.
sns.catplot(
    data=application_train,
    kind="count",
    x="CODE_GENDER",
    hue="CODE_GENDER",
    col="TARGET",
)

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

# One pie per TARGET value showing the CODE_GENDER split; each pie gets its
# own colour list (two colours for TARGET == 1, three for TARGET == 0).
gender_pie_specs = (
    (1, ax1, ['orange', 'blue']),
    (0, ax2, ['orange', 'blue', 'gold']),
)
for target_value, pie_axis, pie_colors in gender_pie_specs:
    gender_counts = (
        application_train.loc[application_train["TARGET"] == target_value, ["CODE_GENDER"]]
        .groupby("CODE_GENDER")
        .size()
    )
    gender_counts.plot(kind="pie", ax=pie_axis, colors=pie_colors)

ax1.set_ylabel("target 1")
ax2.set_ylabel("target 0")

In [None]:
# Pie chart of how many test-set applicants fall under each CODE_GENDER value.
test_gender_counts = application_test.groupby("CODE_GENDER").size()
test_gender_counts.plot(kind="pie", colors=['orange', 'blue'])
plt.title("gender count comparison in test data")

There is no XNA gender in the test data, so we may be able to remove those rows from the train data as well, particularly since there are very few of them.

# 2. How does owning a car affect paying the loan?

In [None]:
# Count of applicants per FLAG_OWN_CAR value, one panel per TARGET value.
sns.catplot(
    data=application_train,
    kind="count",
    x="FLAG_OWN_CAR",
    hue="FLAG_OWN_CAR",
    col="TARGET",
)

In [None]:
# Pie chart of how many test-set applicants fall under each FLAG_OWN_CAR value.
test_own_car_counts = application_test.groupby("FLAG_OWN_CAR").size()
test_own_car_counts.plot(kind="pie", colors=['orange', 'blue'])
plt.title("FLAG_OWN_CAR comparison in test data")

# 3. Education Type

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(20, 5))

# Pie charts of the NAME_EDUCATION_TYPE split for TARGET == 1 and TARGET == 0.
for target_value, pie_axis in ((1, ax1), (0, ax2)):
    education_counts = (
        application_train.loc[application_train["TARGET"] == target_value, ["NAME_EDUCATION_TYPE"]]
        .groupby("NAME_EDUCATION_TYPE")
        .size()
    )
    education_counts.plot(kind="pie", ax=pie_axis)

# Bar chart of education type counts over the whole training table, ordered
# from the most to the least frequent category.
overall_education_counts = (
    application_train.groupby("NAME_EDUCATION_TYPE").size().sort_values(ascending=False)
)
overall_education_counts.plot(
    kind="bar",
    ax=ax3,
    color=["purple", "darkorange", "green", "red", "gold"],
)

ax1.set_xlabel("target 1")
ax2.set_xlabel("target 0")
ax3.set_xlabel("bar plot")

# 4. Marital Status

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 5))

# Same bar chart for both tables: NAME_FAMILY_STATUS counts, most frequent
# category first, drawn with a shared colour list.
status_bar_colors = ["purple", "darkorange", "green", "red", "gold", "blue"]
for table, bar_axis in ((application_train, ax1), (application_test, ax2)):
    status_counts = (
        table.groupby("NAME_FAMILY_STATUS").size().sort_values(ascending=False)
    )
    status_counts.plot(kind="bar", ax=bar_axis, color=status_bar_colors)

ax1.set_title("maritual states in train data")
ax2.set_title("maritual states in test data")

There is no "Unknown" family status in the test data and there are very few in the train data, so these records could probably be removed.

In [None]:
# Build df as a copy of the training table without the rows whose family
# status is "Unknown", then list the statuses that remain.
unknown_status_rows = application_train.index[
    application_train["NAME_FAMILY_STATUS"] == "Unknown"
]
df = application_train.drop(index=unknown_status_rows)
print(df["NAME_FAMILY_STATUS"].unique())

We can group civil marriage and married into one class, and widow, single and separated into another, to generate a new feature.

In [None]:
def maritual_state_classification(x):
    """Encode a family-status label as a binary flag.

    Returns 1 when ``x`` is 'Married' or 'Civil marriage', and 0 for any
    other value.
    """
    return 1 if x in ('Married', 'Civil marriage') else 0
# New binary feature on df: 1 for 'Married' / 'Civil marriage', 0 otherwise.
# It is derived from df's own NAME_FAMILY_STATUS column rather than from the
# unfiltered application_train, so the result no longer depends on implicit
# index alignment between two frames of different length.
df["maritual_state"] = df["NAME_FAMILY_STATUS"].apply(maritual_state_classification)

# 5. Which columns have the highest correlation with target?

In [None]:
# Correlation of every numeric column with TARGET, sorted ascending so the
# most negative values come first and the most positive ones last.
# The text columns are filtered out first: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
# Boolean columns are kept to match what older pandas versions included.
numeric_features = application_train.select_dtypes(include=["number", "bool"])
corr = numeric_features.corr()["TARGET"].sort_values()

# NOTE: TARGET's correlation with itself (1.0) is part of `corr`, so it shows
# up at the end of the "top positive" list.
top_negative_corr = corr.head(20)
top_positive_corr = corr.tail(20)
print("top negative corrolation: ", top_negative_corr)
print("top positive corrolations: ", top_positive_corr)

External sources and days employed have the highest negative correlation and days birth has the highest positive correlation, so let's investigate those columns.

# 6. External sources

In [None]:
# Histogram + KDE of the three external source columns, drawn once for the
# training table and once for the test table (one figure each).
ext_source_cols = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]

tr = sns.displot(application_train[ext_source_cols], kde=True)
plt.title("external source on train data")

ts = sns.displot(application_test[ext_source_cols], kde=True)
plt.title("external source on test data")

The external sources have a similar distribution in the train and test data.

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(20, 5))

# One panel per external source column; each panel overlays the density of
# the rows with TARGET == 1 and the rows with TARGET == 0.
ext_source_panels = (
    (ax1, "EXT_SOURCE_1", "ext source 1"),
    (ax2, "EXT_SOURCE_2", "ext source 2"),
    (ax3, "EXT_SOURCE_3", "ext source 3"),
)
for panel_axis, source_col, panel_title in ext_source_panels:
    for target_value in (1, 0):
        sns.kdeplot(
            application_train.loc[application_train["TARGET"] == target_value, source_col],
            label=f"target {target_value}",
            ax=panel_axis,
        )
    panel_axis.set_title(panel_title)
    # The curves are labelled, but labels are only shown once a legend is
    # drawn; without this call the two densities cannot be told apart.
    panel_axis.legend()

Let's generate new features from external sources

In [None]:
# Two derived features added to application_train in place: the sum and the
# product of the three external source columns. A missing value in any of the
# three columns makes the derived value missing as well.
application_train["EXT_SOURCE_SUM"] = (
    application_train["EXT_SOURCE_1"]
    + application_train["EXT_SOURCE_2"]
    + application_train["EXT_SOURCE_3"]
)
application_train["EXT_SOURCE_MUL"] = (
    application_train["EXT_SOURCE_1"]
    * application_train["EXT_SOURCE_2"]
    * application_train["EXT_SOURCE_3"]
)

# One panel per derived feature, overlaying the densities for TARGET == 0 and
# TARGET == 1.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 5))
for panel_axis, feature_col in ((ax1, "EXT_SOURCE_SUM"), (ax2, "EXT_SOURCE_MUL")):
    for target_value in (0, 1):
        sns.kdeplot(
            application_train.loc[application_train["TARGET"] == target_value, feature_col],
            label=f"target {target_value}",
            ax=panel_axis,
        )
    # Title and legend were missing: the panels and their curves were
    # otherwise unlabelled in the rendered figure.
    panel_axis.set_title(feature_col)
    panel_axis.legend()

# 7. Day_Birth

In [None]:
# Summary statistics of the DAYS_BIRTH column.
application_train.loc[:, "DAYS_BIRTH"].describe()

In [None]:
# Derive Age (in years) by dividing DAYS_BIRTH by -365.
# NOTE(review): presumably DAYS_BIRTH holds negative day offsets, which the
# sign flip turns into positive ages - confirm against the describe() output.
application_train["Age"] = application_train["DAYS_BIRTH"].div(-365)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 4))

# Left panel: overall age histogram with a KDE overlay.
sns.histplot(application_train["Age"], kde=True, ax=ax1)
ax1.set_title("age distribution")

# Right panel: age density for TARGET == 0 and TARGET == 1.
for target_value in (0, 1):
    sns.kdeplot(
        application_train.loc[application_train["TARGET"] == target_value, "Age"],
        label=f"target {target_value}",
        ax=ax2,
    )
plt.legend()
ax2.set_title("age distribution for people who paid or not")

Let's see relationship between age and external source

In [None]:
# Density of EXT_SOURCE_SUM per age bucket, where the bucket is Age / 10
# rounded to the nearest whole number (buckets 2 through 7).
age_decade = application_train["Age"].div(10).round()
for decade in range(2, 8):
    sns.kdeplot(
        application_train.loc[age_decade == decade, "EXT_SOURCE_SUM"],
        label=f"age {decade}",
    )
plt.legend()

On average, older people have a higher external source sum!

# 8. Days employed

In [None]:
# Summary statistics of the DAYS_EMPLOYED column.
application_train.loc[:, "DAYS_EMPLOYED"].describe()

365243 days employed! That cannot be correct: all values are supposed to be negative. Let's see how this value is distributed in the train and test data.

In [None]:
# Percentage of rows in each table whose DAYS_EMPLOYED is positive.
invalid_days_emplyed_train = (
    application_train["DAYS_EMPLOYED"].gt(0).sum() / len(application_train) * 100
)
invalid_days_emplyed_test = (
    application_test["DAYS_EMPLOYED"].gt(0).sum() / len(application_test) * 100
)
print(invalid_days_emplyed_train)
print(invalid_days_emplyed_test)

About 20% of the data have invalid employed days!