In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the train and test application tables in one pass.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/home-credit-default-risk/application_train.csv",
        "/kaggle/input/home-credit-default-risk/application_test.csv",
    )
)

# Both frames are kept together so later feature steps can be applied to each.
data_frames = [app_train, app_test]

In [None]:
# Summary statistics of the training application table.
app_train.describe()

**Correlações**
<br>Coeficiente de correlação de Pearson:
* 0.9 para mais ou para menos indica uma correlação muito forte.
* 0.7 a 0.9 positivo ou negativo indica uma correlação forte.
* 0.5 a 0.7 positivo ou negativo indica uma correlação moderada.
* 0.3 a 0.5 positivo ou negativo indica uma correlação fraca.
* 0 a 0.3 positivo ou negativo indica uma correlação desprezível.

In [None]:
# Pearson correlation of every numeric/boolean column with TARGET.
# The frame is restricted to numeric/bool columns explicitly: pandas >= 2.0 no
# longer drops text columns silently in DataFrame.corr() and raises instead.
correlations = app_train.select_dtypes(include=['number', 'bool']).corr()['TARGET']

In [None]:
# The 15 largest correlations with TARGET.
correlations.sort_values(ascending=False).head(15)

In [None]:
# The 15 smallest (most negative) correlations with TARGET.
correlations.sort_values().head(15)

**Critério utilizado - Selecionar as 4 maiores correlações para feature engineering**

**DAYS_BIRTH**

In [None]:
# Summary statistics of the DAYS_BIRTH column.
app_train['DAYS_BIRTH'].describe()

In [None]:
# Overlay the DAYS_BIRTH density of each TARGET class on one figure.
figure = plt.figure(figsize=(20, 10))
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    class_mask = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[class_mask, 'DAYS_BIRTH'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

In [None]:
def transform_days(df):
    """Replace ``DAYS_BIRTH`` with its absolute value, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``DAYS_BIRTH`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    days_birth = df['DAYS_BIRTH']
    df['DAYS_BIRTH'] = days_birth.abs()
    return df

In [None]:
# Apply the DAYS_BIRTH transformation to every frame (each is modified in place).
data_frames = list(map(transform_days, data_frames))

**External Sources**

In [None]:
# Overlay the EXT_SOURCE_1 density of each TARGET class on one figure.
figure = plt.figure(figsize=(20, 10))
figure.suptitle('EXT_SOURCE_1', fontsize=16)
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    class_mask = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[class_mask, 'EXT_SOURCE_1'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

In [None]:
# Overlay the EXT_SOURCE_2 density of each TARGET class on one figure.
figure = plt.figure(figsize=(20, 10))
figure.suptitle('EXT_SOURCE_2', fontsize=16)
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    class_mask = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[class_mask, 'EXT_SOURCE_2'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

In [None]:
# Overlay the EXT_SOURCE_3 density of each TARGET class on one figure.
figure = plt.figure(figsize=(20, 10))
figure.suptitle('EXT_SOURCE_3', fontsize=16)
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    class_mask = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[class_mask, 'EXT_SOURCE_3'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

In [None]:
# Columns fed to the polynomial-features branch of the column transformer.
poly_colums = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

In [None]:
# Correlation heatmap between TARGET, the three external sources and DAYS_BIRTH.
ext_data = app_train.loc[
    :, ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
]
ext_data_corrs = ext_data.corr()

sns.heatmap(ext_data_corrs, annot=True, cmap='RdYlGn', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(8, 8)
plt.xticks(rotation=75)
plt.show()

**Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 

def get_collumn_transformer(df):
    """Build the (unfitted) preprocessing ``ColumnTransformer`` for ``df``.

    Columns are routed by dtype and cardinality as observed in ``df``:

    * object columns with at most two distinct values (missing counted as a
      value) -> ordinal encoding followed by most-frequent imputation;
    * object columns with more than two distinct values -> one-hot encoding
      followed by most-frequent imputation;
    * numeric columns other than ``SK_ID_CURR`` and ``TARGET`` -> standard
      scaling followed by median imputation;
    * the module-level ``poly_colums`` -> median imputation followed by
      degree-3 polynomial features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used only to decide which column goes to which branch.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The assembled, not yet fitted, transformer.
    """
    # Split the text columns by how many distinct values each one holds.
    object_columns = df.select_dtypes("object").columns
    distinct_counts = {col: df[col].nunique(dropna=False) for col in object_columns}
    label_encoder_vars = [col for col in object_columns if distinct_counts[col] <= 2]
    dummies_vars = [col for col in object_columns if distinct_counts[col] > 2]

    # Identifier and target never go through the numeric branch.
    excluded_numeric = ["SK_ID_CURR", "TARGET"]
    numerical_vars = [
        col for col in df.select_dtypes("number").columns
        if col not in excluded_numeric
    ]

    # Two-valued categoricals: ordinal codes, then fill gaps with the mode.
    ordinal_encoder_pipe = Pipeline(steps=[
        ("label_encoder", OrdinalEncoder()),
        ('median', SimpleImputer(strategy='most_frequent')),
    ])

    # Multi-valued categoricals: one-hot columns, then fill gaps with the mode.
    dummies_pipe = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder()),
        ('median', SimpleImputer(strategy='most_frequent')),
    ])

    # Numeric columns: scale first, then fill gaps with the median.
    numerical_pipe = Pipeline(steps=[
        ("standard_scaler", StandardScaler()),
        ('median', SimpleImputer(strategy='median')),
    ])

    # Selected columns: fill gaps with the median, then expand to degree 3.
    polynomial_pipe = Pipeline(steps=[
        ("imputer_median", SimpleImputer(strategy='median')),
        ("polynomial_pipe", PolynomialFeatures(degree=3)),
    ])

    branches = [
        ("cat_label_encoder", ordinal_encoder_pipe, label_encoder_vars),
        ("cat_dummies", dummies_pipe, dummies_vars),
        ("numerical", numerical_pipe, numerical_vars),
        ("polynomial", polynomial_pipe, poly_colums),
    ]
    return ColumnTransformer(transformers=branches)

In [None]:
def train_model():
    """Fit the preprocessing transformer and a LightGBM classifier on ``app_train``.

    Reads the module-level ``app_train`` frame, transforms its features with
    the transformer built by ``get_collumn_transformer``, holds out 33% of the
    rows for validation, trains the classifier on the remaining rows with
    early stopping on the hold-out AUC, and prints the hold-out ROC AUC.

    Returns
    -------
    tuple
        ``(model, column_transformer)``: the fitted ``LGBMClassifier`` and the
        fitted ``ColumnTransformer``.
    """
    import lightgbm as lgb
    from sklearn.metrics import roc_auc_score

    X_treino = app_train.drop(["SK_ID_CURR", "TARGET"], axis=1)
    y_treino = app_train["TARGET"]

    # NOTE(review): the transformer is fitted on all rows before the split, so
    # imputation/scaling statistics still see the hold-out rows. Kept as is so
    # the returned transformer matches the original behaviour.
    column_transformer = get_collumn_transformer(app_train)
    X_treino_transf = column_transformer.fit_transform(X_treino)
    X_train, X_test, y_train, y_test = train_test_split(
        X_treino_transf, y_treino, test_size=0.33, random_state=42)

    # Create the model
    model = lgb.LGBMClassifier(n_estimators=2000, objective='binary',
                               class_weight='balanced', learning_rate=0.05,
                               reg_alpha=0.1, reg_lambda=0.1,
                               subsample=0.8, n_jobs=-1, random_state=50)

    # Train on the training split only. Fitting on the full matrix (as before)
    # let the model see the hold-out rows, which made both early stopping and
    # the reported AUC meaningless.
    # NOTE(review): `early_stopping_rounds`/`verbose` are fit() keywords of
    # older LightGBM releases; newer ones expect callbacks instead - confirm
    # against the installed version.
    model.fit(X_train, y_train, eval_metric='auc',
              eval_set=[(X_train, y_train), (X_test, y_test)],
              eval_names=['train', 'valid'],
              early_stopping_rounds=100, verbose=False)

    # Score the hold-out rows, which the model has not been trained on.
    prob = model.predict_proba(X_test)[:, 1]
    print("ROC_AUC_SCORE", roc_auc_score(y_test, prob))
    return model, column_transformer


In [None]:
import warnings
import sklearn
def get_feature_names(column_transformer):
    """Collect output feature names from a ColumnTransformer or a Pipeline.

    Walks every transformer yielded by the object's private ``_iter`` method.
    Nested pipelines are handled by calling this function recursively on
    them. Names coming from a transformer are prefixed with
    ``"<transformer name>__"``.

    NOTE(review): this relies on private scikit-learn attributes (``_iter``,
    ``_df_columns``, ``_n_features``) and on transformers exposing
    ``get_feature_names``; it is presumably tied to the scikit-learn release
    it was written against - confirm before upgrading.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Expected to be already fitted (the explicit fitted-check is commented
        out below).

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # The internal fitted-check of the original method was removed:
    #check_is_fitted(column_transformer)

    # The per-transformer lookup is a nested function so it can be reused for
    # pipeline steps. It reads `name` and `column` from the loop further down
    # (closure over the enclosing function's variables), not from arguments.
    def get_names(trans):
        # >> Original get_feature_names() method
        # Dropped transformers and empty column selections contribute nothing.
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                # Column selection already given as a list of string names.
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                # No stored column names: fall back to positional 'x<i>' names.
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method available
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the input
            # names to the column transformer
            if column is None:
                # Pipeline steps carry no column selection (see below).
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == Pipeline:
        # Pipeline steps have no column selection, hence the None placeholders.
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))


    # `name` and `column` bound here are the ones get_names() reads.
    for name, trans, column, _ in l_transformers: 
        if type(trans) == Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If no step of the pipeline returned names, fall back to the
            # input columns prefixed with the transformer name.
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))

    return feature_names

In [None]:
#train model
# Train the model and keep the fitted preprocessing transformer.
model, column_transformer = train_model()

# Pair every transformed feature name with its importance, highest first.
feature_name_transf = get_feature_names(column_transformer)
f_importance = sorted(
    (
        {'key': feature_name_transf[position], 'importance': score}
        for position, score in enumerate(model.feature_importances_)
    ),
    key=lambda entry: entry['importance'],
    reverse=True,
)
importance_df = pd.DataFrame(f_importance)

# Bar chart of the 20 most important features.
figure = plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="key", data=importance_df[:20])
figure.tight_layout()

In [None]:
# ----------
# Table - application_{train|test}.csv
# Col - AMT_INCOME_TOTAL
# Des -  Income of the client
# ----------
# Table - application_{train|test}.csv
# Col - AMT_CREDIT
# Des -  Credit amount of the loan
# ----------
# Table - application_{train|test}.csv
# Col - AMT_ANNUITY
# Des -  Loan annuity
# ----------

def domain_featuring(df):
    """Add four ratio features to ``df`` in place.

    New columns (each is ``numerator / denominator``):

    * ``CREDIT_INCOME_PERCENT``  = ``AMT_CREDIT`` / ``AMT_INCOME_TOTAL``
    * ``ANNUITY_INCOME_PERCENT`` = ``AMT_ANNUITY`` / ``AMT_INCOME_TOTAL``
    * ``CREDIT_TERM``            = ``AMT_ANNUITY`` / ``AMT_CREDIT``
    * ``DAYS_EMPLOYED_PERCENT``  = ``DAYS_EMPLOYED`` / ``DAYS_BIRTH``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # (new column, numerator column, denominator column), in creation order.
    ratio_specs = (
        ('CREDIT_INCOME_PERCENT', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),
        ('ANNUITY_INCOME_PERCENT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        ('CREDIT_TERM', 'AMT_ANNUITY', 'AMT_CREDIT'),
        ('DAYS_EMPLOYED_PERCENT', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
    )
    for new_column, numerator, denominator in ratio_specs:
        df[new_column] = df[numerator].div(df[denominator])
    return df

In [None]:
# Add the ratio features to every frame (each is modified in place).
data_frames = list(map(domain_featuring, data_frames))

In [None]:
#train model
# Retrain and keep the fitted preprocessing transformer.
model, column_transformer = train_model()

# Pair every transformed feature name with its importance, highest first.
feature_name_transf = get_feature_names(column_transformer)
importance_records = [
    {'key': feature_name_transf[position], 'importance': score}
    for position, score in enumerate(model.feature_importances_)
]
f_importance = sorted(
    importance_records,
    key=lambda entry: entry['importance'],
    reverse=True,
)
importance_df = pd.DataFrame(f_importance)

# Bar chart of the 20 most important features.
figure = plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="key", data=importance_df[:20])
figure.tight_layout()