Instruction   
1. Recommended environment: Anaconda virtual environment (Python 3.6).
2. Install tensorflow 1.8.0 (newest version is 1.9.0). 
3. Install all the other packages. Install feature_selector from https://github.com/WillKoehrsen/feature-selector (put it in the current directory).
4. Dataset from Kaggle competition https://www.kaggle.com/c/home-credit-default-risk/data.

In [None]:
import os
import tempfile
import numpy as np
import tensorflow as tf 
import pandas as pd
import gc
import time
from contextlib import contextmanager
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, StratifiedKFold
#download feature_selector package from https://github.com/WillKoehrsen/feature-selector
from feature_selector import FeatureSelector
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
# Directory the input CSV files are read from.
SRC_PATH = "./dataset/home_credit/sources"
# Presumably the directory for generated outputs; it is not referenced in the cells shown here.
OUT_PATH = "./dataset/home_credit/outputs"
# List the source directory so a missing or misplaced dataset is noticed immediately.
print(os.listdir(SRC_PATH))

# Functions

In [None]:
# Fill nan values
def fillna_df(df, verbose=False):
    """Fill missing values in place, grouped by column dtype.

    Columns are grouped by exact dtype: 'object' columns are filled with the
    string 'etc', 'int64' and 'float64' columns are filled with 0. Columns of
    any other dtype are left untouched.

    Args:
        df: DataFrame to fill. It is modified in place and also returned.
        verbose: when true, show a missing-value table for each group before
            filling. This relies on `display` and `get_misstable` being
            available in the notebook session.

    Returns:
        (df, cat_cols_object, cat_cols_int, numeric_cols), where the last
        three are the column indexes of the object / int64 / float64 groups.
    """
    dtypes = df.dtypes
    cat_cols_object = dtypes[dtypes == 'object'].index
    cat_cols_int = dtypes[dtypes == 'int64'].index
    numeric_cols = dtypes[dtypes == 'float64'].index

    # (columns, fill value) pairs, processed in this fixed order.
    fill_plan = (
        (cat_cols_object, 'etc'),
        (cat_cols_int, 0),
        (numeric_cols, 0),
    )

    if verbose:
        for columns, _ in fill_plan:
            display(get_misstable(df[columns]))

    for columns, fill_value in fill_plan:
        df[columns] = df[columns].fillna(fill_value)

    return df, cat_cols_object, cat_cols_int, numeric_cols

def cross_validation(df,train_labels):
    """Split features and labels into a training part and a validation part.

    Despite the name, this is a single random hold-out split (25% validation,
    seeded with 42), not k-fold cross-validation.

    Returns:
        (train_X, val_X, train_y, val_y)
    """
    from sklearn.model_selection import train_test_split
    # train_test_split returns a list; callers receive a 4-tuple.
    return tuple(train_test_split(df, train_labels, test_size=0.25, random_state=42))

def separate_columns(df):
    """Split the column names of `df` into categorical and numeric groups.

    A column counts as categorical when its dtype is 'object'; every other
    dtype counts as numeric. Both lists keep the column order of `df`.

    Args:
        df: pandas DataFrame to inspect (not modified).

    Returns:
        (cate_columns, num_columns): two lists of column names.
    """
    # Compute each dtype selection once; the previous version recomputed both
    # selections for every column inside the loop.
    object_cols = set(df.select_dtypes(include=['object']).columns)
    other_cols = set(df.select_dtypes(exclude=['object']).columns)
    cate_columns = [column for column in df.columns if column in object_cols]
    num_columns = [column for column in df.columns if column in other_cols]
    return cate_columns,num_columns

def conv_feature_columns(df):
    """Build TensorFlow feature columns for every column of `df`.

    Non-object columns become `numeric_column`s. Object columns become
    `categorical_column_with_vocabulary_list`, using the distinct values
    found in `df` as the vocabulary.

    Returns:
        (numeric feature columns, categorical feature columns), each in the
        column order produced by `separate_columns`.
    """
    cate_columns, num_columns = separate_columns(df)

    numeric_features = [
        tf.feature_column.numeric_column(name) for name in num_columns
    ]
    categorical_features = [
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, df[name].unique().tolist())
        for name in cate_columns
    ]
    # Ideas noted for later: hashed categorical columns, and bucketized
    # numeric columns, both via the feature_column API.
    return numeric_features, categorical_features

def indicator_deep_column(tf_cate_feature_column):
    """Wrap each categorical feature column in an `indicator_column`.

    Args:
        tf_cate_feature_column: iterable of categorical feature columns.

    Returns:
        A new list with one indicator column per input column, same order.
    """
    return [
        tf.feature_column.indicator_column(categorical)
        for categorical in tf_cate_feature_column
    ]

def cross_feature_selection(df):
    """Return the hand-picked crossed feature columns for the wide part.

    Each cross pairs two numeric features. Both are bucketized at fixed
    boundaries and the pair is hashed into 5000 buckets.

    Args:
        df: unused; the crosses are hard-coded by column name.

    Returns:
        A list of two crossed columns, in this order:
          1. DAYS_BIRTH x CNT_CHILDREN
          2. DEF_30_CNT_SOCIAL_CIRCLE x OBS_30_CNT_SOCIAL_CIRCLE
    """
    def _bucketize(name, boundaries):
        # Numeric column `name` cut into buckets at `boundaries`.
        return tf.feature_column.bucketized_column(
            tf.feature_column.numeric_column(name), boundaries=boundaries)

    # One entry per cross: ((feature, boundaries), (feature, boundaries)).
    cross_specs = [
        (('DAYS_BIRTH', [-25000, -20000, -15000, -10000]),
         ('CNT_CHILDREN', [2, 4, 6, 8, 10])),
        (('DEF_30_CNT_SOCIAL_CIRCLE', [2, 4, 6, 8]),
         ('OBS_30_CNT_SOCIAL_CIRCLE', [5, 10, 15, 20, 25, 30])),
    ]
    # Crosses that were tried and are currently disabled:
    #   * YEARS_BUILD_AVG x APARTMENTS_AVG (too many missing values)
    #   * DEF_30_CNT_SOCIAL_CIRCLE x OBS_60_CNT_SOCIAL_CIRCLE
    #   * DAYS_BIRTH x DAYS_REGISTRATION

    crossed_col = []
    for left, right in cross_specs:
        buckets = [_bucketize(*left), _bucketize(*right)]
        crossed_col.append(tf.feature_column.crossed_column(buckets, 5000))
    return crossed_col

def wide_deep_columns(df):
    """Assemble the feature columns for the wide and the deep part of the model.

    Wide (linear) part: the raw categorical columns plus the crossed columns
    from `cross_feature_selection`.
    Deep (DNN) part: the numeric columns plus indicator-wrapped categorical
    columns.

    Returns:
        (wide_column, deep_column): two lists of feature columns.
    """
    numeric_features, categorical_features = conv_feature_columns(df)
    indicator_features = indicator_deep_column(categorical_features)
    crossed_features = cross_feature_selection(df)

    wide_column = categorical_features + crossed_features
    deep_column = numeric_features + indicator_features
    return wide_column,deep_column


def grid_selection(train,train_labels,selection_params=None):
    """Run every FeatureSelector check and drop the features it flags.

    Args:
        train: feature DataFrame.
        train_labels: labels aligned with `train`.
        selection_params: optional dict passed to `FeatureSelector.identify_all`.
            When omitted, the original thresholds are used: missing fraction
            0.6, correlation 0.95, cumulative importance 0.99, scored with AUC
            for a classification task.

    Returns:
        The DataFrame returned by `fs.remove(methods='all', keep_one_hot=False)`.
    """
    if selection_params is None:
        selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.95,
                            'task': 'classification', 'eval_metric': 'auc',
                            'cumulative_importance': 0.99}
    fs = FeatureSelector(data = train, labels = train_labels)
    fs.identify_all(selection_params = selection_params)
    # The former `.head()` calls on fs.feature_importances / fs.record_collinear
    # discarded their results (nothing is displayed from inside a function),
    # so they were removed.
    return fs.remove(methods = 'all', keep_one_hot = False)


# Load data and preprocessing and get feature columns

In [None]:
# Load the application tables from SRC_PATH.
train = pd.read_csv(SRC_PATH + '/application_train.csv')
test= pd.read_csv(SRC_PATH + '/application_test.csv')
# NOTE(review): `test` is loaded but not used in any of the cells shown here.

# Keep the label aside before it is dropped from the feature frame.
train_labels = train['TARGET']
#y_df = pd.Series(y, index=X.index)

# Drop the label and the SK_ID_CURR id column so neither is fed to the model as a feature.
train = train.drop(columns = ['TARGET','SK_ID_CURR'])

# Preprocessing: feature selection first, then fill the remaining NaNs.
# fillna_df fills in place and returns the same frame it was given.
train_removed_all_once = grid_selection(train,train_labels)
df, cat_cols_object, cat_cols_int, numeric_cols=fillna_df(train_removed_all_once, verbose=False)

# test.columns = X.columns

# Scaling: standardize every non-object column.
# df_scale is another name for df (no copy is made), so the in-place
# assignment below changes df as well.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split further down, so validation rows influence the scaling statistics.
# NOTE(review): cross_feature_selection buckets DAYS_BIRTH, CNT_CHILDREN and the
# *_CNT_SOCIAL_CIRCLE columns at raw-unit boundaries (e.g. -25000 ... -10000),
# but those columns are standardized here -- confirm the buckets still make sense.
df_scale=df
scale_column=df_scale.select_dtypes(exclude=['object']).columns
scaler = StandardScaler().fit(df_scale[scale_column])
df_scale.loc[:,scale_column] = scaler.transform(df_scale[scale_column])

# Build the wide (linear) and deep (DNN) feature column lists from the scaled frame.
wide_columns,deep_columns = wide_deep_columns(df_scale)

# Hold out a validation part (cross_validation does a single 75/25 split).
train_X, val_X, train_y, val_y = cross_validation(df_scale,train_labels)

# Config 

In [None]:
# Directory handed to the estimator's RunConfig as model_dir.
model_dir ='./widendeep8'

# Checkpoint every 300 seconds and keep at most 3 checkpoints.
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    save_checkpoints_secs=300,
    keep_checkpoint_max=3,
)

In [None]:
# Wide & deep classifier: linear model over wide_columns combined with a
# 500-150-50 ReLU network (dropout 0.5) over deep_columns.
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[500, 150, 50],
    dnn_activation_fn=tf.nn.relu,
    dnn_dropout=0.5,
    config=run_config,
)

# estimator = tf.estimator.DNNLinearCombinedClassifier(linear_feature_columns=wide_columns, dnn_feature_columns=deep_columns,
#                                                     dnn_hidden_units=[500,250,150,50], dnn_activation_fn=tf.nn.relu,
#                                                     linear_optimizer = tf.train.FtrlOptimizer(0.0001,l1_regularization_strength=0.005),
#                                                     dnn_dropout=0.5,config=run_config)    
# estimator = tf.estimator.DNNLinearCombinedClassifier(
#         model_dir=model_dir,
#         linear_feature_columns=wide_columns,
#         dnn_feature_columns=deep_columns,
#         dnn_hidden_units=[100, 75, 50,25],
#         config=run_config,
#         linear_optimizer = tf.train.FtrlOptimizer(learning_rate=0.0001,l1_regularization_strength=0.005,l2_regularization_strength=0.001),
#         dnn_optimizer=tf.train.ProximalAdagradOptimizer(0.000 1,initial_accumulator_value=0.1,l1_regularization_strength=0.005,l2_regularization_strength=0.001))

# input fn

In [None]:
# Training input: shuffled mini-batches of 128 rows read by 3 threads.
# pandas_input_fn defaults to num_epochs=1, which makes the input run out
# after a single pass and ends training long before the max_steps set in the
# TrainSpec. num_epochs=None repeats the data so max_steps is the stop condition.
train_input_fn = tf.estimator.inputs.pandas_input_fn(train_X, train_y, batch_size = 128, num_epochs=None,
                                                     num_threads=3, shuffle=True)
# Periodic evaluation during training: ordered batches of 5000 validation rows.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(val_X, val_y, batch_size = 5000, shuffle=False)
# Final scoring: the whole validation part as one batch.
pred_input_fn = tf.estimator.inputs.pandas_input_fn(val_X, val_y, batch_size = len(val_X), shuffle=False)

# Train and evaluation

In [None]:
# Training stops after at most 10000 global steps.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=10000,
    hooks=None,
)
# Each evaluation runs 10 batches from eval_input_fn; the first one starts
# after 240 seconds and later ones are at least 600 seconds apart.
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn,
    steps=10,
    start_delay_secs=240,
    throttle_secs=600,
    exporters=None,
    hooks=None,
)

In [None]:
%%time
# Train the estimator, interleaving evaluations as configured by train_spec and eval_spec.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

In [None]:
# Score the validation part in one step (pred_input_fn delivers it as a single batch).
valid_results = estimator.evaluate(input_fn=pred_input_fn, steps=1)
# Blank line, rule, metrics, rule -- one item per output line.
print(
    "",
    "-------------------------------------------------------------------------------------------------------------",
    "# Valid Measures: {}".format(valid_results),
    "-------------------------------------------------------------------------------------------------------------",
    sep="\n",
)

# Prediction

In [None]:
#tf.estimator.prediction()

# --------------------------------------------------------------------------------------------

# Experiment on crossed feature columns selected

In [None]:
# Re-run the selector on the unprocessed training frame with a much lower
# correlation threshold (0.3), so that moderately correlated pairs are recorded.
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.3,
        'task': 'classification',
        'eval_metric': 'auc',
        'cumulative_importance': 0.99,
    }
)

# Missing value < 60%

In [None]:
# Run the missing-value check with a 0.6 threshold; fs.missing_stats is read in the next cell.
fs.identify_missing(missing_threshold = 0.6)

In [None]:
# Missing-value statistics table from the selector (it has a 'missing_fraction' column).
miss_df=fs.missing_stats

In [None]:
# Keep the rows whose missing fraction is below 0.6.
# NOTE(review): the name says "above" but the filter keeps the rows *below* the threshold.
miss_df_above=miss_df[miss_df['missing_fraction']<0.6]

In [None]:
#miss_df_above

# Feature importance Top 50

In [None]:
#fs.feature_importances.head(50)

In [None]:
# fs.record_collinear.head()

In [None]:
# First 50 rows of the feature-importance table (presumably sorted by importance -- confirm).
fea_importance=fs.feature_importances.head(50)

In [None]:
# Feature names of those 50 rows; this is a pandas Series, not a Python list.
list_fea_importance=fea_importance['feature']

In [None]:
#list_fea_importance

In [None]:
#fs.record_collinear

# Select corr 0.3-0.4

In [None]:
# Collinear pairs recorded by the selector (columns used below: drop_feature, corr_feature, corr_value).
df_pre_cross = fs.record_collinear

In [None]:
# Lower bound of the correlation band: corr_value >= 0.3 (negative correlations are excluded by this filter).
df2=df_pre_cross[df_pre_cross['corr_value']>=0.3]

In [None]:
# Upper bound of the correlation band: corr_value <= 0.4.
df3=df2[df2['corr_value']<=0.4]

In [None]:
#df3

In [None]:
# Keep the pairs whose drop_feature is one of the 50 features selected above.
df4=df3[df3['drop_feature'].isin(list_fea_importance)]

In [None]:
# Also require the other side of the pair (corr_feature) to be among the
# selected features. The previous filter repeated the drop_feature test
# already applied to df4, so df5 was identical to df4.
df5=df4[df4['corr_feature'].isin(list_fea_importance)]

In [None]:
# Candidate pairs for crossing: A is the drop_feature side, B the corr_feature side.
# NOTE(review): these read df4; df5 from the previous cell is not used -- confirm which was intended.
crossed_fea_A=df4['drop_feature']
crossed_fea_B=df4['corr_feature'] 

# Create crossed columns

In [None]:
# Cross #5: YEARS_BUILD_AVG x APARTMENTS_AVG, both bucketized at 0, 0.2, ..., 0.8
# and hashed into 5000 buckets.
YEARS_BUILD_AVG_c = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('YEARS_BUILD_AVG'),
    boundaries=[0, 0.2, 0.4, 0.6, 0.8],
)
APARTMENTS_AVG_c = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('APARTMENTS_AVG'),
    boundaries=[0, 0.2, 0.4, 0.6, 0.8],
)
crossed_col_5 = tf.feature_column.crossed_column(
    [YEARS_BUILD_AVG_c, APARTMENTS_AVG_c], 5000)

In [None]:
# Cross #1: DAYS_BIRTH x CNT_CHILDREN, hashed into 5000 buckets.
DAYS_BIRTH = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('DAYS_BIRTH'),
    boundaries=[-25000, -20000, -15000, -10000],
)
CNT_CHILDREN = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('CNT_CHILDREN'),
    boundaries=[2, 4, 6, 8, 10],
)
crossed_col_1 = tf.feature_column.crossed_column(
    [DAYS_BIRTH, CNT_CHILDREN], 5000)

In [None]:
# Cross #2: DEF_30_CNT_SOCIAL_CIRCLE x OBS_30_CNT_SOCIAL_CIRCLE, hashed into 5000 buckets.
DEF_30_CNT_SOCIAL_CIRCLE = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('DEF_30_CNT_SOCIAL_CIRCLE'),
    boundaries=[2, 4, 6, 8],
)
OBS_30_CNT_SOCIAL_CIRCLE = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('OBS_30_CNT_SOCIAL_CIRCLE'),
    boundaries=[5, 10, 15, 20, 25, 30],
)
crossed_col_2 = tf.feature_column.crossed_column(
    [DEF_30_CNT_SOCIAL_CIRCLE, OBS_30_CNT_SOCIAL_CIRCLE], 5000)

In [None]:
# Cross #3: DEF_30_CNT_SOCIAL_CIRCLE x OBS_60_CNT_SOCIAL_CIRCLE, hashed into 5000 buckets.
DEF_30_CNT_SOCIAL_CIRCLE=tf.feature_column.bucketized_column(tf.feature_column.numeric_column('DEF_30_CNT_SOCIAL_CIRCLE'),
                                                            boundaries = [2,4,6,8])
OBS_60_CNT_SOCIAL_CIRCLE=tf.feature_column.bucketized_column(tf.feature_column.numeric_column('OBS_60_CNT_SOCIAL_CIRCLE'),
                                                 boundaries = [5, 10,15,20,25,30])
# Cross with the 60-day column built just above. The previous version crossed
# with OBS_30_CNT_SOCIAL_CIRCLE, which made this cross a duplicate of
# crossed_col_2 and left OBS_60_CNT_SOCIAL_CIRCLE unused.
crossed_col_3 = tf.feature_column.crossed_column([DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE], 5000)

In [None]:
# Cross #4: DAYS_BIRTH x DAYS_REGISTRATION, hashed into 5000 buckets.
DAYS_REGISTRATION = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('DAYS_REGISTRATION'),
    boundaries=[-25000, -20000, -15000, -10000, -5000, 0],
)
DAYS_BIRTH = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('DAYS_BIRTH'),
    boundaries=[-25000, -20000, -15000, -10000],
)
crossed_col_4 = tf.feature_column.crossed_column(
    [DAYS_BIRTH, DAYS_REGISTRATION], 5000)