In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, normalize
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer as imputer
sns.set_style("whitegrid")

In [None]:
# Load the training and test application tables (in that order).
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/home-credit-default-risk/application_train.csv",
        "/kaggle/input/home-credit-default-risk/application_test.csv",
    )
)

In [None]:
# Drop training rows whose gender is "XNA" or whose family status is
# "Unknown", printing the frame shape before and after each step.
print("before shape: ", application_train.shape)
gender_is_xna = application_train["CODE_GENDER"] == "XNA"
application_train = application_train.drop(index=application_train.loc[gender_is_xna].index)
print("after shape: ", application_train.shape)
family_is_unknown = application_train["NAME_FAMILY_STATUS"] == "Unknown"
application_train = application_train.drop(index=application_train.loc[family_is_unknown].index)
print("after shape: ", application_train.shape)

In [None]:
# Keep only the columns that are non-null in at least 60% of the training
# rows, split off the target, and restrict the test set to the same columns.
print("number of features of the original application: ", application_train.shape[1])
threshold = round(0.6*len(application_train))
# Reassign instead of inplace=True so the result has an explicit lineage.
application_train = application_train.dropna(axis=1, thresh=threshold)
y_train = application_train["TARGET"]
application_train = application_train.drop("TARGET", axis=1)
cols = application_train.columns
print("number of features after removing missing values: ", len(cols))
# .copy() gives an independent frame: later cells assign columns into
# application_test, which on a bare column selection triggers
# SettingWithCopyWarning.
application_test = application_test[cols].copy()

In [None]:
# Group the training column names by dtype: object -> categorical,
# int64 -> discrete, float64 -> continuous.
categorical_data = application_train.select_dtypes(include=['object']).columns.tolist()
discrete_data = application_train.select_dtypes(include=['int64']).columns.tolist()
continuous_data = application_train.select_dtypes(include=['float64']).columns.tolist()

In [None]:
# For each dtype group, print the fraction of missing values per training
# column, largest first.
for banner, group_cols in (
    ("------ null values in categorical data ------", categorical_data),
    ("------ null values in discrete data ------", discrete_data),
    ("------ null values in continuous data ------", continuous_data),
):
    print(banner)
    null_fraction = application_train[group_cols].isnull().sum() / len(application_train)
    print(null_fraction.sort_values(ascending=False))

In [None]:
# For each dtype group, print the fraction of missing values per test
# column, largest first.
for banner, group_cols in (
    ("------ null values in categorical data ------", categorical_data),
    ("------ null values in discrete data ------", discrete_data),
    ("------ null values in continuous data ------", continuous_data),
):
    print(banner)
    null_fraction = application_test[group_cols].isnull().sum() / len(application_test)
    print(null_fraction.sort_values(ascending=False))

In [None]:
# Categorical gaps are filled with each column's most frequent value; the
# imputer is fitted on the training frame and reused for the test frame.
most_frequent_imp = imputer(missing_values=np.nan, strategy='most_frequent')
most_frequent_imp.fit(application_train[categorical_data])
application_train[categorical_data] = most_frequent_imp.transform(application_train[categorical_data])
application_test[categorical_data] = most_frequent_imp.transform(application_test[categorical_data])

In [None]:
# Histogram of selected training columns on a 4x4 grid, filled row by row.
# Driving the plots from one list replaces sixteen copy-pasted calls and
# avoids the stray Axes repr the last bare call used to emit.
hist_columns = [
    "EXT_SOURCE_3",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "AMT_ANNUITY",
    "EXT_SOURCE_2",
    "AMT_GOODS_PRICE",
    "CNT_FAM_MEMBERS",
    "DAYS_LAST_PHONE_CHANGE",
]
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20, 20))
# axes.flat walks the grid in row-major order, matching the list order.
for hist_ax, hist_col in zip(axes.flat, hist_columns):
    sns.histplot(application_train[hist_col], ax=hist_ax)

All of the continuous columns are skewed, so I fill their missing values with the median.

In [None]:
# Continuous gaps are filled with the per-column median; the imputer is
# fitted on the training frame and reused for the test frame.
median_imp = imputer(missing_values=np.nan, strategy='median')
median_imp.fit(application_train[continuous_data])
application_train[continuous_data] = median_imp.transform(application_train[continuous_data])
application_test[continuous_data] = median_imp.transform(application_test[continuous_data])

In [None]:
# Total count of remaining null cells in each frame. The "train" line
# previously reported the test frame's count.
print("null values in train: ", application_train.isnull().sum().sum())
print("null values in test: ", application_test.isnull().sum().sum())

In [None]:
def maritual_state_classification(x):
    """Return 1 if ``x`` is 'Married' or 'Civil marriage', otherwise 0."""
    return 1 if x in ('Married', 'Civil marriage') else 0
# Add the binary marital flag to both frames, then print their shapes.
for frame in (application_train, application_test):
    frame["maritual_state"] = frame["NAME_FAMILY_STATUS"].apply(maritual_state_classification)
print("after shape: ", application_train.shape)
print("after shape: ", application_test.shape)

In [None]:
# AGE is DAYS_BIRTH divided by -365, added to both frames.
application_train["AGE"] = application_train["DAYS_BIRTH"].div(-365)
application_test["AGE"] = application_test["DAYS_BIRTH"].div(-365)
for frame in (application_train, application_test):
    print("after shape: ", frame.shape)

In [None]:
# Column-name buckets filled by unique_vals().
two_class_cols = []
multiclass_cols = []


def unique_vals(col):
    """Print a column's distinct values and record its name in a bucket.

    The name goes into ``two_class_cols`` when the column has exactly two
    distinct values, and into ``multiclass_cols`` otherwise.

    Parameters
    ----------
    col : object exposing ``.name`` and ``.unique()`` (e.g. a pandas Series)

    Returns
    -------
    None
    """
    print("====", col.name, "====")
    distinct = col.unique()
    print(distinct)
    bucket = two_class_cols if len(distinct) == 2 else multiclass_cols
    bucket.append(col.name)
# Call unique_vals once per categorical column with a plain loop.
# DataFrame.apply was being used only for side effects: it displayed a
# Series of None, and older pandas versions may invoke the function twice
# on the first column, which would record that column's name twice.
for cat_col in categorical_data:
    unique_vals(application_train[cat_col])

In [None]:
# Columns listed in two_class_cols are replaced by integer codes; each
# encoder is fitted on the training column and reused for the test column.
for binary_col in two_class_cols:
    encoder = LabelEncoder().fit(application_train[binary_col])
    application_train[binary_col] = encoder.transform(application_train[binary_col])
    application_test[binary_col] = encoder.transform(application_test[binary_col])

In [None]:
# One-hot encode both frames, then keep only the columns they share
# (inner join on the column axis).
application_train, application_test = pd.get_dummies(application_train).align(
    pd.get_dummies(application_test), join='inner', axis=1
)

for frame in (application_train, application_test):
    print("after shape: ", frame.shape)

In [None]:
# Hold-out evaluation of a default LightGBM classifier.
eval_seed = 42  # fixed seed so the split, resampling and model are repeatable

x_train = normalize(application_train, axis=1)

# Split BEFORE oversampling. RandomOverSampler duplicates minority rows, so
# resampling first puts copies of the same row in both the training and the
# validation fold and inflates the validation AUC.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, stratify=y_train, random_state=eval_seed
)
# Only the training fold is balanced; validation keeps the original class mix.
x_tr, y_tr = RandomOverSampler(random_state=eval_seed).fit_resample(x_fit, y_fit)
print(x_tr.shape, x_val.shape, y_tr.shape, y_val.shape)

model = lgb(random_state=eval_seed)
model.fit(x_tr, y_tr)
y_pred = model.predict_proba(x_val)
# Column 1 of predict_proba is the probability of the positive class.
print("roc auc score", roc_auc_score(y_val, y_pred[:,1]))

In [None]:
# Fit on the full (oversampled) training set and write the submission file.
submission_seed = 42  # fixed seed so the submission can be regenerated exactly

x_train = normalize(application_train, axis=1)
x_test = normalize(application_test, axis=1)
x_resampled, y_resampled = RandomOverSampler(random_state=submission_seed).fit_resample(x_train, y_train)
model = lgb(random_state=submission_seed)
model.fit(x_resampled, y_resampled)
y_pred = model.predict_proba(x_test)

# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as application_test; confirm against the data.
submission = pd.read_csv("/kaggle/input/home-credit-default-risk/sample_submission.csv")
# Column 1 of predict_proba is the probability of the positive class.
submission["TARGET"] = y_pred[:,1]
submission.to_csv("simple_lgbm.csv", index=False)

In [None]:
# x_train = normalize(application_train, axis=1)
# x_resampled, y_resampled = RandomOverSampler().fit_resample(x_train, y_train)

# LGBM = lgb()
# distribution = {"boosting_type": ["gbdt", "goss", "rf"], "num_leaves": [31, 63, 127], \
#                 "max_depth":[-1, 100, 200], "learning_rate": [0.1, 0.01], \
#                "n_estimators": [100, 200, 300], "class_weight": ["balanced", None]}
# print("Start random search...")
# clf = RandomizedSearchCV(LGBM, distribution)
# search = clf.fit(x_resampled, y_resampled)
# print("best params: ", search.best_params_)
# print("best score: ", search.best_score_)

# best params:  {'num_leaves': 31, 'n_estimators': 200, 'max_depth': 200, 'learning_rate': 0.01, 'class_weight': 'balanced', 'boosting_type': 'gbdt'}
# best score:  0.5684218904768643

In [None]:
# x_train = normalize(application_train, axis=1)
# x_resampled, y_resampled = RandomOverSampler().fit_resample(x_train, y_train)
# x_tr, x_val, y_tr, y_val = train_test_split(x_resampled, y_resampled)
# print(x_tr.shape, x_val.shape, y_tr.shape, y_val.shape)

# model = lgb()
# model.fit(x_tr, y_tr)
# y_pred = model.predict_proba(x_val)
# print("roc auc score", roc_auc_score(y_val, y_pred[:,1]))

# model = lgb(**{'reg_lambda': 0.1, 
#                 'reg_alpha': 0.2, 
#                 'num_leaves': 70, 
#                 'n_estimators': 250, 
#                 'min_child_samples': 800, 
#                 'learning_rate': 0.05,
#                 'max_bin': 500,
#                 'objective': 'binary',
#                 'n_jobs': -1,
#                 'class_weight':'balanced',
#                 'random_state':100})
# model.fit(x_tr, y_tr)
# y_pred = model.predict_proba(x_val)
# print("roc auc score", roc_auc_score(y_val, y_pred[:,1]))

In [None]:
# x_train = normalize(application_train, axis=1)
# x_test = normalize(application_test, axis=1)
# x_resampled, y_resampled = RandomOverSampler().fit_resample(x_train, y_train)
# model = lgb()
# skf = StratifiedKFold(n_splits=10, shuffle=True)
# pred = np.zeros(len(x_test))
# for train_index, test_index in skf.split(x_resampled, y_resampled):
#     X_train, X_test = x_resampled[train_index], x_resampled[test_index]
#     Y_train, Y_test = y_resampled[train_index], y_resampled[test_index]
#     model.fit(X_train, Y_train)
#     y_pred = model.predict_proba(X_test)
#     print("roc auc score", roc_auc_score(Y_test, y_pred[:,1]))
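#     # NOTE(review): predict() returns class labels, not probabilities, so this averages votes;
#     # predict_proba(x_test)[:, 1] is probably what was intended. Confirm before re-enabling.
#     pred += model.predict(x_test)/skf.n_splits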

# submission = pd.read_csv("/kaggle/input/home-credit-default-risk/sample_submission.csv")
# submission["TARGET"] = pred
# submission.to_csv("kfold_lgbm.csv", index=False)