In [None]:
# Import the tools used in this notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever spelling this
# installation actually provides instead of failing on a fresh run.
_darkgrid_style = next(
    (style_name
     for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if style_name in plt.style.available),
    'default',
)
plt.style.use(_darkgrid_style)

# View data

In [None]:
# Read the train and test application tables from the competition input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
    )
)
# Show the first five training rows as the cell output (head() defaults to 5).
train.head()


In [None]:
# Preview the first five rows of the test table (head() defaults to 5 rows).
test.head()

In [None]:
# Number of training rows per TARGET value, i.e. the class balance of the label.
train.TARGET.value_counts()

In [None]:
# Histogram of the label values, cast to int first.
train['TARGET'].astype(int).plot(kind='hist')

In [None]:
# Summarise missing data per column: the absolute count and the share of rows,
# keeping only columns that have at least one gap, most incomplete first.
missing_values = train.isnull().sum()
missing_values_percent = 100 * missing_values / len(train)
missing_val_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_col = (
    missing_val_table
    .rename(columns={0: 'Missing Value', 1: "percent of value"})
    # Drop columns without any missing entries.
    .loc[lambda table: table["percent of value"] != 0]
    .sort_values("percent of value", ascending=False)
    .round(1)
)
missing_values_col

# Data Preparation

In [None]:
# Label-encode every object-typed column that holds at most two distinct
# values (unique() counts NaN as a value, so such columns are skipped).
le = LabelEncoder()
le_count = 0
for col in train:
    if train[col].dtype != 'object':
        continue
    if len(train[col].unique()) > 2:
        continue
    # Learn the mapping from the training values, then reuse it on the test set.
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    le_count += 1
print('Encoding ------------- completed')


In [None]:
# One-hot encode the remaining text columns of both tables.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# align(join='inner', axis=1) keeps only the columns present in both tables,
# so the label is set aside first and re-attached afterwards (presumably the
# test table has no TARGET column -- verify against the data).
train_labels = train['TARGET']
train, test_dat = train.align(test, join='inner', axis=1)
# Every later cell reads `test`, so it has to be the aligned table; previously
# the aligned result was only bound to `test_dat` and never used.
test = test_dat
train['TARGET'] = train_labels


In [None]:
# Histogram of the DAYS_EMPLOYED column; the x label is set on the axes that
# the pandas plot call returns.
train['DAYS_EMPLOYED'].plot.hist(
    title='Days Employment Histogram').set_xlabel('Days Employment')

In [None]:
# Raw columns that the engineered features are computed from.
_NEW_FEATURE_INPUTS = ('DAYS_EMPLOYED', 'DAYS_BIRTH', 'AMT_ANNUITY',
                       'AMT_INCOME_TOTAL', 'AMT_CREDIT')
# DAYS_EMPLOYED value that is flagged as anomalous and replaced with NaN.
_DAYS_EMPLOYED_ANOMALY = 365243


def _add_new_features(frame):
    """Add the anomaly flag and the four ratio columns to ``frame`` in place."""
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == _DAYS_EMPLOYED_ANOMALY
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: that form edits an intermediate object and does not
    # reach the frame when pandas copy-on-write is active.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace(
        _DAYS_EMPLOYED_ANOMALY, np.nan)
    frame['ANNUITY_INCOME_PERCENT'] = frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL']
    frame['CREDIT_TERM'] = frame['AMT_ANNUITY'] / frame['AMT_CREDIT']
    # Computed after the replacement above, so anomalous rows yield NaN here.
    frame['DAYS_EMPLOYED_PERCENT'] = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH']
    frame['CREDIT_INCOME_PERCENT'] = frame['AMT_CREDIT'] / frame['AMT_INCOME_TOTAL']


def new_features(train,test):
    """Add engineered features to the train and test frames.

    Both frames receive the same columns, in the same order:
    ``DAYS_EMPLOYED_ANOM`` (True where DAYS_EMPLOYED equals 365243),
    ``ANNUITY_INCOME_PERCENT``, ``CREDIT_TERM``, ``DAYS_EMPLOYED_PERCENT`` and
    ``CREDIT_INCOME_PERCENT``. The value 365243 in ``DAYS_EMPLOYED`` is
    replaced with NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the columns listed in ``_NEW_FEATURE_INPUTS``.
        They are modified in place.

    Returns
    -------
    tuple
        ``(train, test)``, the same objects that were passed in. If either
        frame lacks a required column a message is printed and both frames
        are returned untouched, so a failure never leaves them half-updated.
        Any other error propagates to the caller instead of being swallowed.
    """
    missing = {
        label: [col for col in _NEW_FEATURE_INPUTS if col not in frame.columns]
        for label, frame in (('train', train), ('test', test))
    }
    if any(missing.values()):
        print("New feature function is not working, try again.")
        print("Missing input columns:", missing)
        return train, test

    for frame in (train, test):
        _add_new_features(frame)
    return train, test
# Apply the feature engineering to both tables. Binding the result also keeps
# the cell from echoing the returned (train, test) tuple as its output.
train, test = new_features(train, test)


In [None]:
# Correlation of every column with TARGET, sorted in ascending order.
correlations = train.corr().loc[:, 'TARGET'].sort_values()
correlations

In [None]:
def change_age(age_days_negative):
    """Turn a negative day count into a positive number of years.

    The sign of ``age_days_negative`` is flipped and the result is divided by
    365. The arithmetic works the same for a scalar or an array-like value.
    """
    return -age_days_negative / 365

# Convert the day-count columns of both tables to positive years.
# change_age is plain arithmetic, so it is applied to each column as a whole
# rather than element by element through .apply().
# Note: the columns are overwritten, so re-running this cell flips the sign
# and rescales the values a second time.
for frame in (train, test):
    for day_column in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
        frame[day_column] = change_age(frame[day_column])

In [None]:
# Plot the clients' age distribution for each target class, because DAYS_BIRTH
# has a high correlation with the target.
plt.style.use('ggplot')
plt.figure(figsize=(10, 8))
plt.title('Age Distribution')
plt.xlabel('Age')
sns.kdeplot(train.loc[train['TARGET'] == 1, 'DAYS_BIRTH'], label='Target=1')
sns.kdeplot(train.loc[train['TARGET'] == 0, 'DAYS_BIRTH'], label='Target=0')
plt.grid()
# label= only names the curves; draw the legend explicitly so the two classes
# can actually be told apart in the figure.
plt.legend()
plt.show()



In [None]:
# Numeric branch: fill gaps with the column median, then rescale with MinMaxScaler.
numeric_transformer = Pipeline([
    ("num_imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of each feature.
categorical_transformer = Pipeline([
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop='first')),
])

# Route columns by dtype: object columns go to the categorical branch,
# everything else to the numeric branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, make_column_selector(dtype_exclude="object")),
    ("cat", categorical_transformer, make_column_selector(dtype_include="object")),
])


# Split data

In [None]:
# Features are every column except the label.
# NOTE(review): SK_ID_CURR appears to stay in X and would therefore be fed to
# the model as a feature -- confirm that this is intended.
X = train.drop(columns='TARGET')
y = train['TARGET']

# Stratified split with a fixed seed: 80% for fitting, the rest for validation.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42)


In [None]:
# Evaluation helper for fitted models
def evaluate_model(model_pipeline, X_eval=None, y_eval=None):
    """Print the ROC AUC score and the confusion matrix of a fitted model.

    Parameters
    ----------
    model_pipeline : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_eval, y_eval : optional
        Features and labels to evaluate on. When left as ``None`` the
        notebook-level hold-out split ``x_test`` / ``y_test`` is used, so
        existing one-argument calls behave as before.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    if X_eval is None:
        X_eval = x_test
    if y_eval is None:
        y_eval = y_test

    # Hard labels feed the confusion matrix; the second predict_proba column
    # is the score used for ROC AUC.
    test_pred = model_pipeline.predict(X_eval)
    test_pred_proba = model_pipeline.predict_proba(X_eval)

    print('Validation roc auc score ---->   {:.4f}'.format(
        roc_auc_score(y_eval, test_pred_proba[:, 1])))

    print('Confusion matrix:\n', confusion_matrix(y_eval, test_pred))


In [None]:
# Create the undersampler. It draws rows at random, so it is seeded like the
# train/validation split and the model to make the run reproducible.
undersampler = RandomUnderSampler(sampling_strategy=0.75, random_state=42)
# Create the classifier
model = LGBMClassifier(n_estimators=100, num_leaves=36, class_weight='balanced',
                       random_state=42, learning_rate=0.10)
# Chain preprocessing, resampling and the classifier. `Pipeline` here is the
# imblearn class (imported after, and shadowing, the sklearn one); this
# pipeline contains a sampler step.
steps = [('preprocessor', preprocessor),
         ('undersampler', undersampler), ('model', model)]
lgbm_pipeline = Pipeline(steps=steps)

lgbm_pipeline.fit(X_train, y_train)

evaluate_model(lgbm_pipeline)


# Submission

In [None]:
# The model is validated with ROC AUC, which scores the ranking of the
# predictions, so submit the predicted score (second predict_proba column)
# rather than hard 0/1 labels.
final_preds = lgbm_pipeline.predict_proba(test)[:, 1]
# One row per test client: its id and the predicted score.
submission = test[['SK_ID_CURR']].copy()
submission['TARGET'] = final_preds
submission

In [None]:
# Write the submission file without the index column (the header row is
# written by default).
submission.to_csv('submission_loan_light.csv', index=False)
