In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Data manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Modeling
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Ignore warnings
import sys
import warnings

# Silence warnings only when no warning options were supplied to the interpreter
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### I. Import Data

In [None]:
# Folder holding the Home Credit Default Risk files; change this single
# constant to point the notebook at a different copy of the data.
DATA_DIR = '../input/home-credit-default-risk'

# import train and test data
train = pd.read_csv(f'{DATA_DIR}/application_train.csv')
test = pd.read_csv(f'{DATA_DIR}/application_test.csv')

# import other tables
pos = pd.read_csv(f'{DATA_DIR}/POS_CASH_balance.csv')
installment = pd.read_csv(f'{DATA_DIR}/installments_payments.csv')
credit = pd.read_csv(f'{DATA_DIR}/credit_card_balance.csv')
p_app = pd.read_csv(f'{DATA_DIR}/previous_application.csv')

### II. Data Cleaning

In [None]:
# Identify the distinct column dtypes present in the training table;
# the dtype-based split in the next cell relies on this list.
train.dtypes.unique()


In [None]:
# Split each application table by dtype:
# text (object) columns on one side, int64/float64 columns on the other.
cat_train, cat_test = (
    frame.select_dtypes(include=['object']) for frame in (train, test)
)
num_train, num_test = (
    frame.select_dtypes(include=['int64', 'float64']) for frame in (train, test)
)

In [None]:
# Sanity check: first five rows of the numerical columns
num_train.head(n=5)

In [None]:
# Sanity check: first five rows of the categorical columns
cat_train.head(n=5)

In [None]:
# Number of distinct (non-null) values in each categorical column
cat_train.nunique(axis=0)

#### Convert Binary to Boolean

In [None]:
# Collect the categorical columns that hold at most two distinct values.
# NaN is counted as a value (dropna=False), which matches what
# len(Series.unique()) reports.
binary_col = [
    col for col in cat_train.columns
    if cat_train[col].nunique(dropna=False) <= 2
]
cat_train[binary_col].head()  # Sanity Check: head must be *called* to display rows

#### Label Encoding for Train and Test

In [None]:
# Label-encode each binary column. The encoder is fitted on the training
# column only and the same mapping is applied to the test column, so a given
# category gets the same integer code in both tables. (Re-fitting on test
# could silently assign different codes if the observed categories differ.)
# Note: transform() raises ValueError if test contains a category that never
# appeared in train, which surfaces the mismatch instead of hiding it.
lb_mkr = LabelEncoder()
for col in binary_col:
    lb_mkr.fit(cat_train[col])
    cat_train[col] = lb_mkr.transform(cat_train[col])
    cat_test[col] = lb_mkr.transform(cat_test[col])

In [None]:
cat_train[binary_col].head(n=5)  # Sanity check: binary columns after label encoding

#### One Hot Encoding 

In [None]:
# Preview the categorical frame before one-hot encoding
cat_train.head(n=5)

In [None]:
# One-hot encode whatever categorical columns remain, train and test independently
cat_train, cat_test = (pd.get_dummies(frame) for frame in (cat_train, cat_test))
cat_train.head(n=5)

In [None]:
# Stitch the numerical and the encoded categorical columns back together, side by side
encoded_train = pd.concat((num_train, cat_train), axis='columns')
encoded_test = pd.concat((num_test, cat_test), axis='columns')

In [None]:
# Outlier investigation: summary statistics of DAYS_EMPLOYED
encoded_train.loc[:, 'DAYS_EMPLOYED'].describe()

In [None]:
# Split the rows on whether DAYS_EMPLOYED equals the value 365243,
# then report the mean of TARGET (as a percentage) among the matching rows.
is_anom = encoded_train['DAYS_EMPLOYED'] == 365243
anom = encoded_train[is_anom]
non_anom = encoded_train[~is_anom]
anom_default_pct = 100 * anom['TARGET'].mean()
print('The anomalies default on %0.2f%% of loans' % anom_default_pct)

In [None]:
# Create an anomalous flag column (True where DAYS_EMPLOYED equals 365243)
encoded_train['DAYS_EMPLOYED_ANOM'] = encoded_train['DAYS_EMPLOYED'] == 365243
encoded_test['DAYS_EMPLOYED_ANOM'] = encoded_test['DAYS_EMPLOYED'] == 365243

# Replace the anomalous values with nan.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
encoded_train['DAYS_EMPLOYED'] = encoded_train['DAYS_EMPLOYED'].replace({365243: np.nan})
encoded_test['DAYS_EMPLOYED'] = encoded_test['DAYS_EMPLOYED'].replace({365243: np.nan})

### Aggregating Numericals

In [None]:
def agg_numeric(df, group_var, df_name):
    """Aggregate the numeric columns of ``df`` per value of ``group_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate. It is not modified.
    group_var : str
        Column to group by.
    df_name : str
        Prefix used when naming the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_var`` value. The first column is
        ``group_var``; every other column is named
        ``<df_name>_<column>_<stat>`` with stat in ``count``, ``mean``, ``sum``.
    """
    # Remove id variables other than the grouping variable
    id_cols = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns=id_cols)

    # Explicit copy so that adding the grouping column below never writes
    # into (or warns about) a view of the caller's frame.
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'sum'])

    # Flatten the (column, stat) pairs into names. Iterating over the actual
    # column tuples keeps every name attached to the data it labels; rebuilding
    # names from `columns.levels` would depend on the internal level order.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

### Aggregating Categoricals

In [None]:
# We have four supplemental tables; each one is aggregated per client and then merged with the main table on SK_ID_CURR

# count_categorical one-hot encodes the categorical columns and, for each encoded column, calculates the per-group sum and proportion of each value
def count_categorical(df, group_var, df_name):
    """Count the categorical values of ``df`` per value of ``group_var``.

    The object-dtype columns are one-hot encoded, then each indicator column
    is summed and averaged within every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate.
    group_var : str
        Column to group by.
    df_name : str
        Prefix used when naming the output columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_var``. For every indicator column there are two
        columns: ``<df_name>_<indicator>_count`` (the sum) and
        ``<df_name>_<indicator>_count_norm`` (the mean).
    """
    # get the categorical data and do OHE
    cat = pd.get_dummies(df.select_dtypes('object'))

    # add the id
    cat[group_var] = df[group_var]

    # aggregate by id
    cat = cat.groupby(group_var).agg(['sum', 'mean'])

    # Name each column from its own (indicator, stat) tuple so a label can
    # never drift away from its data; `columns.levels` order is an internal
    # detail and is not guaranteed to follow the column order.
    stat_labels = {'sum': 'count', 'mean': 'count_norm'}
    cat.columns = [
        '%s_%s_%s' % (df_name, var, stat_labels[stat]) for var, stat in cat.columns
    ]

    return cat

### Aggregating Tables

In [None]:
# Pos: categorical counts and numeric aggregates per client.
# Both calls use the 'pos' prefix so every column derived from this table is
# named consistently (the counts were previously prefixed 'os' by a typo).
pos_counts = count_categorical(pos, group_var = 'SK_ID_CURR', df_name = 'pos')
pos_agg = agg_numeric(pos, group_var = 'SK_ID_CURR', df_name = 'pos')

In [None]:
# Installment payments: numeric aggregates per client
installment_agg = agg_numeric(installment, 'SK_ID_CURR', 'installment')

In [None]:
# Credit card balance: categorical counts and numeric aggregates per client
credit_counts = count_categorical(credit, 'SK_ID_CURR', 'credit')
credit_agg = agg_numeric(credit, 'SK_ID_CURR', 'credit')

In [None]:
# Previous applications: categorical counts and numeric aggregates per client
p_app_counts = count_categorical(p_app, 'SK_ID_CURR', 'p_app')
p_app_agg = agg_numeric(p_app, 'SK_ID_CURR', 'p_app')

### Merging with Test and Train 

In [None]:
# Left-join every aggregated table onto the training frame, one after another,
# so each training row is kept whether or not it has a match.
for extra in (pos_counts, pos_agg,
              installment_agg,
              credit_counts, credit_agg,
              p_app_counts, p_app_agg):
    encoded_train = encoded_train.merge(extra, on='SK_ID_CURR', how='left')

In [None]:
# Left-join every aggregated table onto the test frame, one after another,
# so each test row is kept whether or not it has a match.
for extra in (pos_counts, pos_agg,
              installment_agg,
              credit_counts, credit_agg,
              p_app_counts, p_app_agg):
    encoded_test = encoded_test.merge(extra, on='SK_ID_CURR', how='left')

In [None]:
# Report the size of both frames after all merges
for label, frame in (('Training set full shape: ', encoded_train),
                     ('Testing set full shape: ', encoded_test)):
    print(label, frame.shape)

In [None]:
# Keep independent snapshots of the merged frames before feature selection changes them
copy_encoded_train = encoded_train.copy(deep=True)
copy_encoded_test = encoded_test.copy(deep=True)

### III. Feature Selection

In [None]:
# Calculating proportion by summing NA values and dividing by length of DF
prop_na = encoded_train.isna().sum()/len(encoded_train)
# Keeping only columns with more than 30% NA values to clean up the visualization below
prop_na = prop_na[prop_na > 0.3]
# sort_values takes keyword arguments only in pandas >= 2.0; passing the axis
# positionally (sort_values(0, ...)) raises a TypeError there.
prop_na = prop_na.sort_values(ascending=True).rename('missing_perc').reset_index()

In [None]:
# Preview the columns with the smallest share of missing values above the cut-off
prop_na.head(n=5)

In [None]:
# Horizontal bar chart: one bar per column, length = proportion of missing values
fig, ax = plt.subplots(figsize=(20, 20))

barh = ax.barh(prop_na['index'], prop_na['missing_perc'], alpha=0.85, color='green')

ax.set_title('Proportion of NA Values')
ax.set_xticks(np.arange(.1, 1.01, .1))

fig.tight_layout()

In [None]:
def remove_missing(df_train, df_test, thredhold):
    """Drop mostly-missing columns and keep only columns shared by both frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame. Must contain a ``"TARGET"`` column.
    df_test : pandas.DataFrame
        Test frame.
    thredhold : float
        Fraction of missing values (e.g. ``0.6`` for 60%) above which a
        column is dropped. A column is dropped from both frames if it exceeds
        the threshold in either one. The parameter name keeps its original
        spelling so existing keyword callers continue to work.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` restricted to their common columns, with
        ``"TARGET"`` re-attached to the training frame as its last column.
    """
    # fraction of missing values per column
    train_missing = df_train.isnull().sum() / len(df_train)
    test_missing = df_test.isnull().sum() / len(df_test)

    # columns above the threshold in either frame
    train_missing = train_missing.index[train_missing > thredhold]
    test_missing = test_missing.index[test_missing > thredhold]
    all_missing = sorted(set(train_missing) | set(test_missing))
    # thredhold is a fraction, so scale it before printing it as a percentage
    print('There are %d columns with more than %s%% missing values'
          % (len(all_missing), thredhold * 100))

    # Save the target from the frame that was passed in, not from a
    # notebook-level global, so the function works on any training frame.
    train_labels = df_train["TARGET"]

    # A flagged column may exist in only one of the two frames (they are only
    # aligned below), so labels absent from a frame are skipped, not an error.
    df_train = df_train.drop(columns = all_missing, errors = 'ignore')
    df_test = df_test.drop(columns = all_missing, errors = 'ignore')

    # keep only the columns present in both tables
    df_train, df_test = df_train.align(df_test, join = 'inner', axis = 1)

    # TARGET is absent from the test frame, so the alignment removed it
    df_train["TARGET"] = train_labels

    print('Training set full shape: ', df_train.shape)
    print('Testing set full shape: ' , df_test.shape)

    return df_train, df_test

In [None]:
# Drop every column with more than 60% missing values in either frame
encoded_train, encoded_test = remove_missing(
    df_train=encoded_train,
    df_test=encoded_test,
    thredhold=0.60,
)

### Feature Importance

In [None]:
# Set the client ids aside, then remove the id column from both feature frames
train_id, test_id = encoded_train['SK_ID_CURR'], encoded_test['SK_ID_CURR']

encoded_train = encoded_train.drop(columns='SK_ID_CURR')
encoded_test = encoded_test.drop(columns='SK_ID_CURR')

In [None]:
# Set the training labels aside, then remove them from the feature frame
target_train = encoded_train['TARGET']
encoded_train = encoded_train.drop(columns=['TARGET'])

In [None]:
# Preview the feature frame once the id and target columns are removed
encoded_train.head(n=5)

In [None]:
# format the column names: every non-alphanumeric character becomes '_'
def _to_safe_name(label):
    """Return ``str(label)`` with each non-alphanumeric character replaced by '_'."""
    return ''.join(ch if ch.isalnum() else '_' for ch in str(label))


encoded_train.columns = [_to_safe_name(label) for label in encoded_train.columns]
encoded_test.columns = [_to_safe_name(label) for label in encoded_test.columns]

In [None]:
# build a LightGBM model to select important features
d_train = lgb.Dataset(encoded_train, label=target_train)
param = {'max_depth': 5, 'learning_rate' : 0.1, 'num_leaves': 900, 'n_estimators': 100}
model = lgb.train(params=param,train_set=d_train)
ax = lgb.plot_importance(model, max_num_features=15)
plt.show()

# The correlation step below reads LGBM_features_columns, which was never
# defined, so a fresh "Restart & Run All" stopped with a NameError.
# feature_importance() returns one value per training column, in column
# order, so it is indexed here by the training frame's own column labels.
# NOTE(review): keeping every feature with non-zero importance is an assumed
# selection rule; tighten it (e.g. top-N) if a stricter cut was intended.
feature_importance = pd.Series(model.feature_importance(), index=encoded_train.columns)
LGBM_features_columns = feature_importance[feature_importance > 0].index.tolist()

### Correlation

In [None]:
correlation_threshold = 0.8

# Work on the LightGBM-selected features only
selected = encoded_train[LGBM_features_columns]

# Absolute pairwise correlations between the selected features
corr_mat = selected.corr().abs()

# Zero out the diagonal and the lower triangle so each pair is considered once
upper = pd.DataFrame(np.triu(corr_mat, k=1), columns=selected.columns)

# A column is dropped when it correlates above the threshold with any earlier column
exceeds = (upper > correlation_threshold).any(axis=0)
corr_col_drop = upper.columns[exceeds].tolist()

print(f'There are {len(corr_col_drop)} columns to remove out of {len(encoded_train[LGBM_features_columns].columns)}.')

In [None]:

# Keep the LightGBM-selected features, minus the highly correlated ones
encoded_train_if = encoded_train[LGBM_features_columns].drop(columns=corr_col_drop)
encoded_test_if = encoded_test[LGBM_features_columns].drop(columns=corr_col_drop)

### IV. PCA

In [None]:
# define the number of components (a float below 1 is passed to PCA as n_components)
n_comp=.95

# create a pca pipeline with median imputation
pipeline = Pipeline(steps = [('scaler', StandardScaler()),
                             ('imputer', SimpleImputer(strategy = 'median')),
                             ('pca', PCA(n_components=n_comp, svd_solver='full', random_state=1))])

pca = pipeline.named_steps['pca']

# NOTE(review): the pipeline is fitted on encoded_train, not on the
# feature-selected encoded_train_if built above; confirm this is intended.
pipeline.fit(encoded_train)

train_pca = pipeline.transform(encoded_train)
test_pca = pipeline.transform(encoded_test)

# The x axis counts components, so it starts at 1: the first point is the
# variance explained by one component (starting at 0 shifted the curve by one).
component_counts = np.arange(1, train_pca.shape[1] + 1)

plt.figure(figsize = (8, 5))
plt.plot(component_counts, np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of PC'); plt.ylabel('Cumulative Variance Explained');
plt.title('Cumulative Variance Explained with PCA');

In [None]:
# Select top n principal components
# Cap the count at the number of components PCA actually kept; indexing a
# fixed 12 entries raises IndexError whenever fewer components were retained.
n_top_components = min(12, len(pca.explained_variance_ratio_))

total_variance_explained = 0
print('Individual variance contributions:')

for j in range(n_top_components):
    print(pca.explained_variance_ratio_[j])
    total_variance_explained += pca.explained_variance_ratio_[j]
print('Explained variance: %.4f' % total_variance_explained)

In [None]:
# Wrap the projected arrays in DataFrames (default integer column labels)
df_pca_train, df_pca_test = pd.DataFrame(train_pca), pd.DataFrame(test_pca)