In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install feature_engine

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.outliers import Winsorizer, OutlierTrimmer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
import lightgbm as lgb
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the application train and test tables in one statement.
app_train, app_test = (
    pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv"),
    pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv"),
)

**BUREAU**

In [None]:
# Load the bureau table and preview its first five rows.
bureau = pd.read_csv(filepath_or_buffer='/kaggle/input/home-credit-default-risk/bureau.csv')
bureau.head(n=5)

**Bureau Balance**

In [None]:
# Load the bureau_balance table and preview its first five rows.
bureau_balance = pd.read_csv(filepath_or_buffer='/kaggle/input/home-credit-default-risk/bureau_balance.csv')
bureau_balance.head(n=5)

In [None]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics (count, mean, max, min, sum) 
            aggregated by the `parent_var` for all numeric columns. Each 
            observation of the parent variable will have one row in the 
            dataframe with the parent variable as the index. 
            The columns are renamed to `<df_name>_<column>_<stat>`. Columns 
            whose values duplicate an earlier column are removed. 
    
    """
    
    # Remove id variables other than the grouping variable in a single drop
    id_columns = [col for col in df.columns
                  if col != parent_var and 'SK_ID' in col]
    df = df.drop(columns = id_columns)
            
    # Only want the numeric variables; re-attach the grouping key in case
    # it is not numeric itself
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Flatten the (variable, stat) column tuples into single names.
    # The names are taken from the column tuples themselves rather than from
    # `agg.columns.levels`: the levels only list the unique labels and their
    # order is not guaranteed to follow the column order, which would attach
    # the wrong name to a column.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]
    
    # Remove the columns with all redundant values
    # (np.unique returns the position of one representative per distinct column)
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [None]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with aggregated statistics (sum, count, mean, min, max) of the
        one-hot encoded object columns for each observation of the parent_var.
        The columns are renamed to `<df_name>_<dummy column>_<stat>` and columns
        whose values duplicate an earlier column are removed.
        
    """
    
    # One-hot encode the object columns. The integer dtype is requested
    # explicitly: with boolean dummies the aggregated frame would mix bool,
    # integer and float columns, and np.unique(axis=1) below cannot handle the
    # resulting object array.
    categorical = pd.get_dummies(df.select_dtypes('object'), dtype = np.uint8)

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the statistics
    stats = ['sum', 'count', 'mean', "min", "max"]
    categorical = categorical.groupby(parent_var).agg(stats)
    
    # Flatten the (variable, stat) column tuples into single names.
    # The names come from the column tuples themselves, not from
    # `categorical.columns.levels`, whose order is not guaranteed to follow
    # the column order.
    categorical.columns = ['%s_%s_%s' % (df_name, var, stat)
                           for var, stat in categorical.columns]
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

**Bureau_balance aggregation**

In [None]:
# Numeric statistics of bureau_balance, one row per SK_ID_BUREAU.
bureau_balance_agg_numeric = agg_numeric(
    df=bureau_balance,
    parent_var="SK_ID_BUREAU",
    df_name='bureau_balance',
)

In [None]:
# Preview the first five aggregated rows.
bureau_balance_agg_numeric.head(n=5)

In [None]:
# Statistics of the one-hot encoded bureau_balance categories, one row per SK_ID_BUREAU.
bureau_balance_agg_categorical = agg_categorical(
    df=bureau_balance,
    parent_var="SK_ID_BUREAU",
    df_name='bureau_balance',
)

In [None]:
# Preview the first five aggregated rows.
bureau_balance_agg_categorical.head(n=5)

**MERGE WITH BUREAU**

In [None]:
# Attach both bureau_balance aggregates to bureau; left joins keep every bureau row.
bureau = (
    bureau
    .merge(bureau_balance_agg_numeric, how='left', on='SK_ID_BUREAU')
    .merge(bureau_balance_agg_categorical, how='left', on='SK_ID_BUREAU')
)

**Bureau Aggregation**

In [None]:
# Numeric statistics of bureau (including the merged bureau_balance aggregates), one row per SK_ID_CURR.
bureau_agg_numeric = agg_numeric(
    df=bureau,
    parent_var="SK_ID_CURR",
    df_name='bureau',
)

In [None]:
# Statistics of the one-hot encoded bureau categories, one row per SK_ID_CURR.
bureau_agg_categorical = agg_categorical(
    df=bureau,
    parent_var="SK_ID_CURR",
    df_name='bureau',
)

**Domain Features**

In [None]:
# Add the same four ratio features to both application tables.
# The new columns are written onto the existing frames in place.
for frame in (app_train, app_test):
    frame['CREDIT_INCOME_PERCENT'] = frame['AMT_CREDIT'] / frame['AMT_INCOME_TOTAL']
    frame['ANNUITY_INCOME_PERCENT'] = frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL']
    frame['CREDIT_TERM'] = frame['AMT_ANNUITY'] / frame['AMT_CREDIT']
    frame['DAYS_EMPLOYED_PERCENT'] = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH']

**MERGE WITH APP**

In [None]:
# Attach the per-client bureau aggregates to both application tables.
# Left joins keep every application row.
app_train = (
    app_train
    .merge(bureau_agg_numeric, how='left', on='SK_ID_CURR')
    .merge(bureau_agg_categorical, how='left', on='SK_ID_CURR')
)

app_test = (
    app_test
    .merge(bureau_agg_numeric, how='left', on='SK_ID_CURR')
    .merge(bureau_agg_categorical, how='left', on='SK_ID_CURR')
)

In [None]:
# Correlate only the numeric/boolean columns. app_train still contains
# object (string) columns at this point, and DataFrame.corr() raises on
# non-numeric data in recent pandas releases instead of silently skipping it.
corr = app_train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# The 20 largest correlations with TARGET.
corr["TARGET"].sort_values(ascending=False).head(20)

In [None]:
# The 20 smallest correlations with TARGET.
corr["TARGET"].sort_values(ascending=True).head(20)

In [None]:
# Density of bureau_DAYS_CREDIT_mean, one curve per TARGET value.
figure = plt.figure(figsize=(15, 5))
figure.suptitle('DAYS_CREDIT', fontsize=16)
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    target_rows = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[target_rows, 'bureau_DAYS_CREDIT_mean'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

In [None]:
# Density of bureau_bureau_balance_MONTHS_BALANCE_min_mean, one curve per TARGET value.
figure = plt.figure(figsize=(15, 5))
figure.suptitle('MONTHS_BALANCE min_y_mean', fontsize=16)
plt.grid(linestyle='-')
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    target_rows = app_train['TARGET'] == target_value
    sns.kdeplot(app_train.loc[target_rows, 'bureau_bureau_balance_MONTHS_BALANCE_min_mean'], label=curve_label)
figure.legend(loc="center", prop={'size': 15})
plt.show()

**Pipelines**

In [None]:
# Object columns with at most two distinct values (a missing value counts as a value).
label_encoder_vars = [
    col
    for col in app_train.select_dtypes("object")
    if app_train[col].nunique(dropna=False) <= 2
]

In [None]:
# Object columns with more than two distinct values (a missing value counts as a value).
dummies_vars = [
    col
    for col in app_train.select_dtypes("object")
    if app_train[col].nunique(dropna=False) > 2
]

In [None]:
# Every numeric column except the row identifier and the label.
numerical_vars = list(
    filter(
        lambda col: col not in ("SK_ID_CURR", "TARGET"),
        app_train.select_dtypes("number").columns,
    )
)

In [None]:
# Feature matrix (identifier and label removed) and the label vector.
X_treino = app_train.drop(columns=["SK_ID_CURR", "TARGET"])
y_treino = app_train.loc[:, "TARGET"]

In [None]:
# One single-purpose pipeline per column group. The step names are kept
# as-is because they can appear in the generated feature names.

# One-hot encoding for categorical columns
dummies_pipe = Pipeline([('one_hot_encoder', OneHotEncoder())])

# Ordinal encoding for categorical columns
ordinal_encoder_pipe = Pipeline([("label_encoder", OrdinalEncoder())])

# Standardisation for numeric columns
numerical_pipe = Pipeline([("standard_scaler", StandardScaler())])

# Median imputation followed by degree-3 polynomial expansion
polynomial_pipe = Pipeline([
    ("imputer_median", SimpleImputer(strategy='median')),
    ("polynomial_pipe", PolynomialFeatures(degree=3)),
])

In [None]:
# Columns that receive the polynomial expansion
poly_colums = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

# (name, pipeline, columns) triples routed by the column transformer
transformer_specs = [
    ("cat_label_encoder", ordinal_encoder_pipe, label_encoder_vars),
    ("cat_dummies", dummies_pipe, dummies_vars),
    ("numerical", numerical_pipe, numerical_vars),
    ("polynomial", polynomial_pipe, poly_colums),
]
column_transformer = ColumnTransformer(transformer_specs)

# Fit the preprocessing on the training features and transform them in one pass
X_treino_transf = column_transformer.fit_transform(X_treino)

In [None]:
# Hold out a third of the rows for validation and early stopping.
# (train_test_split is already imported in the imports cell.)
X_train, X_test, y_train, y_test = train_test_split(X_treino_transf, y_treino, test_size=0.33, random_state=42)
# Create the model
model = lgb.LGBMClassifier(n_estimators=2000, objective = 'binary', 
                           class_weight = 'balanced', learning_rate = 0.05, 
                           reg_alpha = 0.1, reg_lambda = 0.1, 
                           subsample = 0.8, n_jobs = -1, random_state = 50)

# Train the model on the training split only. Fitting on the full matrix
# (X_treino_transf) would include the X_test rows in training, so the 'valid'
# AUC and the early-stopping decision would be measured on data the model
# has already seen.
model.fit(X_train, y_train, eval_metric = 'auc',
          eval_set = [(X_train, y_train), (X_test, y_test)],
          eval_names = ['train', 'valid'],
          early_stopping_rounds = 100, verbose = 200)

In [None]:
from sklearn.metrics import roc_auc_score
# Report AUC on the rows held out by train_test_split instead of on the full
# training matrix: scoring rows the model was fitted on overstates performance.
# (This is only a true validation score when the model is fitted without X_test.)
prob = model.predict_proba(X_test)[:,1]
print("ROC_AUC_SCORE",roc_auc_score(y_test, prob))

In [None]:
import warnings
import sklearn
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Walks the transformers of a column transformer (or the steps of a
    Pipeline) and collects one name per output feature. Nested pipelines
    are handled by calling this function recursively on them.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        The object whose transformers/steps are inspected. For a column
        transformer, its fitted transformers are iterated.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.

    Notes
    -----
    NOTE(review): this relies on the private `_iter` method and on the
    transformers exposing `get_feature_names`; confirm both against the
    installed scikit-learn version.
    """
    # The fitted-check used by the original scikit-learn method is disabled here:
    #check_is_fitted(column_transformer)
    
    # Turn lookup into a function for better handling with pipelines later.
    # `name` and `column` are not parameters: they are read from the enclosing
    # scope, i.e. from the current iteration of the loop at the bottom.
    def get_names(trans):
        # >> Original get_feature_names() method
        # Dropped transformers and empty column selections contribute nothing
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                # Column selection given as strings: return them unchanged
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                # No stored dataframe columns: fall back to positional names x0, x1, ...
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method available
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        # Transformer reports its own names: prefix them with the transformer name
        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == Pipeline:
        # Pipeline steps carry no column selection, so `column` is None for them
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            # NOTE(review): `column` is None when the outer object is itself a
            # Pipeline, in which case this fallback would fail -- confirm that
            # pipelines nested inside pipelines are never passed in.
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [None]:
# Feature importance: pair every importance score with its feature name,
# order from most to least important, and plot the top 20.
feature_name_transf = get_feature_names(column_transformer)
f_importance = sorted(
    (
        {'key': feature_name_transf[position], 'importance': score}
        for position, score in enumerate(model.feature_importances_)
    ),
    key=lambda entry: entry['importance'],
    reverse=True,
)
importance_df = pd.DataFrame(f_importance)
figure = plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="key", data=importance_df.iloc[:20])
figure.tight_layout()