In [60]:
import os
import tempfile
import numpy as np
import tensorflow as tf 
import pandas as pd
import gc
import time
from contextlib import contextmanager
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, StratifiedKFold
#download feature_selector package from https://github.com/WillKoehrsen/feature-selector
from feature_selector import FeatureSelector
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [2]:
SRC_PATH = "./dataset/home_credit/sources"
OUT_PATH = "./dataset/home_credit/outputs"
print(os.listdir(SRC_PATH))

['application_test.csv', '.DS_Store', 'application_train.csv']


# Functions

In [52]:
# Fill nan values
def fillna_df(df, verbose=False):
    cat_cols_object = df.dtypes[df.dtypes == 'object'].index
    cat_cols_int = df.dtypes[df.dtypes == 'int64'].index
    numeric_cols = df.dtypes[df.dtypes == 'float64'].index
    if verbose:
        display(get_misstable(df[cat_cols_object]))
        display(get_misstable(df[cat_cols_int]))
        display(get_misstable(df[numeric_cols]))
    df[cat_cols_object] = df[cat_cols_object].fillna('etc')
    df[cat_cols_int] = df[cat_cols_int].fillna(0)
    df[numeric_cols] = df[numeric_cols].fillna(0)  
    return df, cat_cols_object, cat_cols_int, numeric_cols

def cross_validation(df,train_labels):
    from sklearn.model_selection import train_test_split 
    train_X, val_X, train_y, val_y = train_test_split(df, train_labels, test_size=0.25, random_state=42)
    return train_X, val_X, train_y, val_y

#def scaling():
# df2=df.values
# df2
# #normalization
# scaler = StandardScaler().fit(df2)
# all_cols = list(df2.columns)

# numeric_cols = [col for col in df.columns if df2[col].dtype == 'float64']
# for cols in all_cols:
#     if cols in numeric_cols:
#         df2[cols] = scaler.transform(df2[cols])

# from sklearn.preprocessing import MinMaxScaler

# def normalize_age(data):
#     scaler = MinMaxScaler()
#     data["Age"] = scaler.fit_transform(data["Age"].values.reshape(-1,1))
#     return data
# train_data = normalize_age(train_data)
# test_data = normalize_age(test_data)
# train_data.head()

def separate_columns(df):
    cate_columns = []
    num_columns = []
    #separate columns
    for column in df.columns:
        if column in list(df.select_dtypes(include=['object']).columns):
            cate_columns.append(column)
        if column in list(df.select_dtypes(exclude=['object']).columns):
            num_columns.append(column)
#     print(cate_columns)
#     print(num_columns)
    return cate_columns,num_columns


def conv_feature_columns(df):
    cate_columns, num_columns=separate_columns(df)
    tf_num_feature_column=[]
    tf_cate_feature_column=[]
    for column in num_columns:
        column_name =str(column)
        column_name = tf.feature_column.numeric_column(column)
        tf_num_feature_column.append(column_name)
    for column in cate_columns:
        column_name =str(column)
        vocabulary_list_c=df[column].unique().tolist()
        column_name = tf.feature_column.categorical_column_with_vocabulary_list(column,vocabulary_list_c)
        tf_cate_feature_column.append(column_name)     
    #hashing from categories to numerical use API 
    #transformation using bucketized for numerical to categories use API
    return tf_num_feature_column,tf_cate_feature_column

def indicator_deep_column(tf_cate_feature_column):
    tf_cate_feature_column_indicator=[]
    for column in tf_cate_feature_column:
        column_indicator=tf.feature_column.indicator_column(column)
        tf_cate_feature_column_indicator.append(column_indicator)
    return tf_cate_feature_column_indicator

# def cross_feature_selection():
#     return 

def wide_deep_columns(df):
    tf_num_feature_column,tf_cate_feature_column=conv_feature_columns(df)
    deep_column_indicator_part = indicator_deep_column(tf_cate_feature_column) 
    #categories in base_column
    base_column = tf_cate_feature_column
    #categories types with 0.3-0.7 cor
    crossed_column = []
    wide_column = []
    deep_column = []    
    wide_column = base_column + crossed_column
    deep_column = tf_num_feature_column + deep_column_indicator_part 
    return wide_column,deep_column


def grid_selection(train,train_labels):
    fs = FeatureSelector(data = train, labels = train_labels)
    fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.80, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
    train_removed_all_once = fs.remove(methods = 'all', keep_one_hot = False)
    fs.feature_importances.head()
    fs.record_collinear.head() 
    return train_removed_all_once

def get_model():
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 75, 50, 25],
        linear_optimizer = tf.train.FtrlOptimizer(learning_rate=0.00001,l1_regularization_strength=0.005,l2_regularization_strength=0.001),
        dnn_optimizer=tf.train.ProximalAdagradOptimizer(0.00001,initial_accumulator_value=0.1,l1_regularization_strength=0.005,l2_regularization_strength=0.001))

def get_model2():
    return tf.contrib.learn.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 75, 50, 25],
    linear_optimizer = tf.train.FtrlOptimizer(learning_rate=0.00001,l1_regularization_strength=0.005,l2_regularization_strength=0.001),
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(0.00001,initial_accumulator_value=0.1,l1_regularization_strength=0.005,l2_regularization_strength=0.001)
)

#partial_fit API
# def get_model():
#     return tf.contrib.learn.DNNLinearCombinedEstimator()

# Main 

In [64]:
#load dataset
train = pd.read_csv(SRC_PATH + '/application_train.csv')
test= pd.read_csv(SRC_PATH + '/application_test.csv')
train_labels = train['TARGET']
train = train.drop(columns = ['TARGET','SK_ID_CURR'])

#preprocessing 
#train_removed_all_once = grid_selection(train,train_labels)
df, cat_cols_object, cat_cols_int, numeric_cols=fillna_df(train_removed_all_once, verbose=False)

#get columns
wide_columns,deep_columns = wide_deep_columns(df)

# print("this is wide column")
# print(wide_columns)
# print("this is deep column")
# print(deep_columns)

df2=df
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    df,
    y=None,
    batch_size=128,
    num_epochs=10,
    shuffle= False,
    queue_capacity=1000,
    num_threads=1,
    target_column='TARGET'
)

model_dir = tempfile.mkdtemp()

m = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])

m.train(input_fn=input_fn_train, steps=100)




# #run the model
# model=get_model()

# train_X, val_X, train_y, val_y = cross_validation(df,train_labels)

# model.train(train_X, train_y)

# model.fit()
# model.partial_fit()


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/t4/chry13wj0k3gpdsc6r060rj40000gn/T/tmpbiqmmeas', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a21f3bac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


ValueError: You must provide a labels Tensor. Given: None. Suggested troubleshooting steps: Check that your data contain your label feature. Check that your input_fn properly parses and returns labels.

In [62]:
df1=df

df1['e'] = Series(np.random.randn(sLength), index=df1.index)
df1['e'] = p.Series(np.random.randn(sLength), index=df1.index)

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,Y,0,202500.000,406597.5,24700.5,Unaccompanied,Working,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,Cash loans,F,N,N,0,270000.000,1293502.5,35698.5,Family,State servant,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Revolving loans,M,Y,Y,0,67500.000,135000.0,6750.0,Unaccompanied,Working,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cash loans,F,N,Y,0,135000.000,312682.5,29686.5,Unaccompanied,Working,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,Cash loans,M,N,Y,0,121500.000,513000.0,21865.5,Unaccompanied,Working,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,Cash loans,M,N,Y,0,99000.000,490495.5,27517.5,"Spouse, partner",State servant,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
6,Cash loans,F,Y,Y,1,171000.000,1560726.0,41301.0,Unaccompanied,Commercial associate,...,0,1,0,0,0.0,0.0,0.0,1.0,1.0,2.0
7,Cash loans,M,Y,Y,0,360000.000,1530000.0,42075.0,Unaccompanied,State servant,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
8,Cash loans,F,N,Y,0,112500.000,1019610.0,33826.5,Children,Pensioner,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
9,Revolving loans,M,N,Y,0,135000.000,405000.0,20250.0,Unaccompanied,Working,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
