In [1]:
# Import all the libraries ever
import os
import time as time

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm, fully_connected, dropout

In [2]:
# Notebook display config: set pandas' 'display.max_columns' option to None
# so wide frames such as data.head() are rendered without column truncation.
pd.set_option('display.max_columns', None)

In [3]:
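# Disabled: per-run log directory setup. Re-enabling these lines also requires
# `from datetime import datetime`, which the import cell above does not provide.
#now = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')
#root_logdir = 'tf_logs'
#logdir = '{}/run-{}/'.format(root_logdir, now)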

In [3]:
# Load data

# Directory holding the CSV. Set the CCARD_DATA_DIR environment variable to
# point somewhere else instead of editing this machine-specific default.
data_dir = os.environ.get('CCARD_DATA_DIR', '/home/ryan/data/ccard_defaults/')

data = pd.read_csv(os.path.join(data_dir, 'ccard_defaults.csv'), index_col='ID')

# Remove the label column from the feature frame *by name* and keep it as the
# target, so the extraction and the removal can never refer to different columns.
target = data.pop('default payment next month')

In [4]:
data.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0


In [5]:
target.value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [6]:
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# Replace every PAY_* column with two derived columns:
#   <col>_ORD : the original values with everything below zero raised to 0
#   <col>_CAT : the original values with everything above zero collapsed to 1
# NOTE: this cell mutates `data` and removes the source columns, so it can only
# be run once per freshly loaded frame.
for col in pay_cols:
    status = data.pop(col)
    data[col + '_ORD'] = status.clip(lower=0)
    data[col + '_CAT'] = status.where(~(status > 0), 1)

In [7]:
# Columns treated as categorical (label-encoded then one-hot-encoded later).
cat_cols = [
    'SEX', 'EDUCATION', 'MARRIAGE',
    'PAY_0_CAT', 'PAY_2_CAT', 'PAY_3_CAT',
    'PAY_4_CAT', 'PAY_5_CAT', 'PAY_6_CAT',
]
# Every remaining column is numeric; setdiff1d returns them sorted and unique.
num_cols = np.setdiff1d(data.columns.tolist(), cat_cols)

In [8]:
def name_to_pos(index, names):
    """Translate labels into their integer positions within `index`.

    Parameters
    ----------
    index : iterable of hashable
        Ordered labels (e.g. ``DataFrame.columns``). If a label occurs more
        than once, its last position is used.
    names : iterable of hashable
        Labels to look up.

    Returns
    -------
    list of int
        Position of each entry of `names`, in the order given.

    Raises
    ------
    KeyError
        If a requested name is not present in `index`.
    """
    positions = {}
    for pos, label in enumerate(index):
        positions[label] = pos
    return list(map(positions.__getitem__, names))

In [9]:
# Positional indices of the categorical / numeric columns; the pipelines below
# operate on a bare ndarray (data.values), where column names are unavailable.
cat_cols_pos = name_to_pos(index=data.columns, names=cat_cols)
num_cols_pos = name_to_pos(index=data.columns, names=num_cols)

In [10]:
cat_cols_pos

[1, 2, 3, 18, 20, 22, 24, 26, 28]

In [11]:
# Define some helper classes

class ArrayColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps a fixed subset of array columns.

    Parameters
    ----------
    attrs : column indexer
        Whatever is valid as the second index of ``X[:, attrs]`` (here: a list
        of integer column positions).
    """

    def __init__(self, attrs):
        self.attrs = attrs

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return only the configured columns of `X`."""
        selected = X[:, self.attrs]
        return selected


class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to every column of a 2-D array.

    Parameters
    ----------
    labels : sequence of array-like, optional
        ``labels[i]`` is the full set of values column ``i`` may take. When
        given (and non-empty), each encoder is fitted on these values instead
        of on the data passed to ``fit``, so values absent from the training
        split still receive a code. When omitted, encoders are fitted on the
        columns of ``X``.
    """

    def __init__(self, labels=None):
        self.encoders = []
        self.labels = labels

    def fit(self, X, y=None):
        """Fit one encoder per column of `X` and return ``self``."""
        # Explicit None/length test: plain truthiness raises for ndarray input.
        use_given = self.labels is not None and len(self.labels) > 0
        self.encoders = [
            LabelEncoder().fit(self.labels[col] if use_given else X[:, col])
            for col in range(X.shape[1])
        ]
        return self

    def transform(self, X):
        """Return a float array of integer codes with the same shape as `X`.

        If no encoders have been fitted, `X` is returned unchanged.
        """
        if not self.encoders:
            return X
        # Preallocate the result instead of repeatedly np.append-ing columns
        # (which copies the whole array on every iteration). float64 matches
        # the dtype the previous implementation produced.
        out = np.empty((X.shape[0], len(self.encoders)))
        for col, le in enumerate(self.encoders):
            out[:, col] = le.transform(X[:, col])
        return out

# Distinct values of every categorical column, taken from the complete frame
# (before the train/test split) and handed to the label encoder.
cat_labels = [np.unique(data.iloc[:, pos]) for pos in cat_cols_pos]

# Numeric branch: select columns -> PCA keeping 90% of the variance -> scale.
# NOTE(review): PCA runs on the *unscaled* columns here; confirm that is
# intended rather than scaling first.
numeric_pipeline = Pipeline(steps=[
    ('selector', ArrayColumnSelector(num_cols_pos)),
    ('principal_components', PCA(n_components=.9, svd_solver='full')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select columns -> integer codes -> one-hot.
categ_pipeline = Pipeline(steps=[
    ('selector', ArrayColumnSelector(cat_cols_pos)),
    ('label_encoder', MultiColumnLabelEncoder(cat_labels)),
    ('onehot_encoder', OneHotEncoder()),
])

# Run both branches (two parallel jobs) and concatenate their outputs.
preproc_pipeline = FeatureUnion(
    transformer_list=[
        ('numeric_pipeline', numeric_pipeline),
        ('categ_pipeline', categ_pipeline),
    ],
    n_jobs=2)

In [12]:
# Hold out 10% of the rows as the test set, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target,
    test_size=0.1, random_state=49413, stratify=target)

In [13]:
X_train.shape

(27000, 29)

In [14]:
# Fit the preprocessing on the training rows only, then apply the fitted
# transform to the test rows. Each result is converted to a dense array with
# toarray() and cast to float32.
# NOTE: X_train / X_test are overwritten, so this cell is not re-runnable
# without re-running the split above.
X_train = preproc_pipeline.fit_transform(X_train).toarray().astype(np.float32)
X_test = preproc_pipeline.transform(X_test).toarray().astype(np.float32)

# Labels as plain int64 arrays.
y_train = np.array(y_train, dtype=np.int64)
y_test = np.array(y_test, dtype=np.int64)

In [15]:
def get_batch(epoch_num, batch_num, batch_size, n_batches, n_rows, X, y):
    """Draw a reproducible random mini-batch.

    The rows are chosen by a generator seeded with
    ``epoch_num * n_batches + batch_num``, so the same (epoch, batch) pair
    always yields the same rows. A private ``RandomState`` is used so that the
    process-wide NumPy random state is left untouched; the selected indices
    are the same as those produced by seeding the global generator.

    Parameters
    ----------
    epoch_num, batch_num : int
        Current epoch and batch counters (only used to derive the seed).
    batch_size : int
        Number of rows to draw, without replacement.
    n_batches : int
        Batches per epoch (only used to derive the seed).
    n_rows : int
        Number of rows available to sample from.
    X : ndarray, 2-D
        Feature matrix, indexed as ``X[idx, :]``.
    y : ndarray
        Labels, indexed as ``y[idx]``.

    Returns
    -------
    (X_batch, y_batch)
        The sampled rows of `X` and the matching entries of `y`.

    Raises
    ------
    ValueError
        If ``batch_size > n_rows`` (sampling is without replacement).
    """
    rng = np.random.RandomState(epoch_num * n_batches + batch_num)
    idx = rng.choice(n_rows, batch_size, replace=False)
    X_batch = X[idx, :]
    y_batch = y[idx]
    return X_batch, y_batch

def mlp_accs(X_train, X_test, y_train, y_test, n_epochs, batch_size, learning_rate, 
             reg_param, keep_prob, decay, n_hidden1, n_hidden2, n_hidden3, n_hidden4,
             activation1, activation2, activation3, activation4):
    """Train a four-hidden-layer MLP and report per-epoch error rates.

    Resets the default TensorFlow graph, builds a network of four fully
    connected layers (each with batch normalisation, L2 weight regularisation
    and dropout) plus a 2-unit output layer, and trains it with Adam on
    mini-batches drawn by ``get_batch``.

    Parameters
    ----------
    X_train, X_test : ndarray, 2-D
        Feature matrices fed to a float32 placeholder.
    y_train, y_test : ndarray, 1-D
        Integer class labels fed to an int64 placeholder.
    n_epochs : int
        Number of passes; each pass runs ``ceil(rows / batch_size)`` batches.
    batch_size : int
        Rows per mini-batch.
    learning_rate : float
        Adam learning rate.
    reg_param : float
        Scale of the L2 weight regulariser.
    keep_prob : float
        Dropout keep probability applied after every hidden layer.
    decay : float
        Batch-norm moving-average decay.
    n_hidden1 .. n_hidden4 : int
        Width of each hidden layer.
    activation1 .. activation4 : callable
        Activation function of each hidden layer.

    Returns
    -------
    train_accs : list of float
        Despite the name these are *error rates* (``1 - accuracy``), one per
        epoch, measured on the last mini-batch of that epoch only.
    test_accs : list of float
        ``1 - accuracy`` on ``(X_test, y_test)`` after each epoch.
    hidden4_train, hidden4_test : ndarray
        Output of the fourth hidden layer (evaluated with
        ``is_training=False``) for the full train and test inputs.
    """

    tf.reset_default_graph()
    
    rows_train, cols_train = X_train.shape

    n_batches = int(np.ceil(rows_train / batch_size))
    n_outputs = 2
    
    X = tf.placeholder(tf.float32, shape=(None, cols_train), name='X')
    # 1-D label vector. `(None)` without the comma is just None, i.e. an
    # entirely unspecified shape.
    y = tf.placeholder(tf.int64,   shape=(None,),            name='y')

    is_training = tf.placeholder(tf.bool, shape=(), name='is_training')
    bnorm_params = {'is_training': is_training,
                    'decay': decay,
                    'scale': True,
                    'updates_collections': None}

    # Variance-scaling initialiser with fan averaging.
    weights_init = tf.contrib.layers.variance_scaling_initializer(mode='FAN_AVG')
    
    with tf.contrib.framework.arg_scope([fully_connected],
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bnorm_params,
                                        weights_initializer=weights_init,
                                        weights_regularizer=tf.contrib.layers.l2_regularizer(scale=reg_param)):
        hidden1 = fully_connected(X,            n_hidden1, activation_fn=activation1, scope='hidden1')
        hidden1_drop = dropout(hidden1, keep_prob=keep_prob, is_training=is_training)

        hidden2 = fully_connected(hidden1_drop, n_hidden2, activation_fn=activation2, scope='hidden2')
        hidden2_drop = dropout(hidden2, keep_prob=keep_prob, is_training=is_training)

        hidden3 = fully_connected(hidden2_drop, n_hidden3, activation_fn=activation3, scope='hidden3')
        hidden3_drop = dropout(hidden3, keep_prob=keep_prob, is_training=is_training)

        hidden4 = fully_connected(hidden3_drop, n_hidden4, activation_fn=activation4, scope='hidden4')
        hidden4_drop = dropout(hidden4, keep_prob=keep_prob, is_training=is_training)

        logits  = fully_connected(hidden4_drop, n_outputs, activation_fn=None,       scope='outputs')
        
    with tf.name_scope('loss'):
        cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        base_loss = tf.reduce_mean(cross_ent, name='loss')
        # Total loss = mean cross-entropy + all collected L2 penalties.
        reg_loss  = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss      = tf.add_n([base_loss] + reg_loss, name='loss')
        
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)
    
    with tf.name_scope('eval'):
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    init = tf.global_variables_initializer()
    
    train_accs = []
    test_accs = []
    
    with tf.Session() as sess:
        init.run()
        for epoch in range(n_epochs):
            for batch in range(n_batches):
                X_batch, y_batch = get_batch(epoch, batch, batch_size, n_batches, rows_train, X_train, y_train)
                sess.run(training_op, feed_dict={X: X_batch, y: y_batch, is_training: True})
            # Error rates after this epoch. The training figure uses only the
            # final mini-batch of the epoch, not the whole training set.
            train_accs.append(1. - accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training: False}))
            test_accs.append(1. - accuracy.eval(feed_dict={X: X_test,  y: y_test,  is_training: False}))

        # Fourth-hidden-layer activations for the full data sets, returned so
        # the caller can reuse them as features.
        hidden4_train = hidden4.eval(feed_dict={X: X_train, y: y_train, is_training: False})
        hidden4_test  = hidden4.eval(feed_dict={X: X_test,  y: y_test,  is_training: False})
    
    return train_accs, test_accs, hidden4_train, hidden4_test

In [16]:
class ledger():
    """Callable run log: stores every record and prints a timestamped counter.

    Attributes are ``inc`` (number of calls so far) and ``hist`` (list of the
    ``(d, l, n)`` tuples received, in call order).
    """

    def __init__(self):
        self.inc = 0
        self.hist = []

    def __call__(self, d, l, n):
        """Record ``(d, l, n)`` and print ``<call number>\\t<current time>``."""
        record = (d, l, n)
        self.hist.append(record)
        stamp = time.asctime()
        print('{:04d}\t{}'.format(self.inc, stamp))
        self.inc += 1


# Shared log instance used by the hyper-parameter search below.
led = ledger()

In [17]:
def mlp_objective(param_dict):
    """Train one MLP configuration and return its best per-epoch test error.

    `param_dict` is one sample from the hyperopt search space. The network is
    trained for 250 epochs via ``mlp_accs`` on the module-level
    ``X_train / X_test / y_train / y_test``; the smallest value in the
    per-epoch test error list is returned, and the parameters, that value and
    the epoch index where it occurred are recorded through ``led``.

    NOTE(review): the value minimised is measured on ``X_test``, so the test
    set takes part in model selection.
    """
    settings = dict(
        n_epochs=250,
        # Sizes arrive as floats from the sampler; the network needs ints.
        batch_size=int(param_dict['batch_size']),
        n_hidden1=int(param_dict['n_hidden1']),
        n_hidden2=int(param_dict['n_hidden2']),
        n_hidden3=int(param_dict['n_hidden3']),
        n_hidden4=int(param_dict['n_hidden4']),
        learning_rate=param_dict['learning_rate'],
        reg_param=param_dict['reg_param'],
        keep_prob=param_dict['keep_prob'],
        decay=param_dict['decay'],
        activation1=param_dict['activation1'],
        activation2=param_dict['activation2'],
        activation3=param_dict['activation3'],
        activation4=param_dict['activation4'],
    )

    results = mlp_accs(X_train, X_test, y_train, y_test, **settings)
    test_errors = results[1]

    best_epoch = np.argmin(test_errors)
    lowest = min(test_errors)

    led(d=param_dict, l=lowest, n=best_epoch)

    return lowest

def objective(x):
    """Wrap ``mlp_objective`` in the result dictionary handed back to fmin."""
    result = {'loss': mlp_objective(x)}
    result['status'] = STATUS_OK
    # Wall-clock time at which this evaluation finished.
    result['eval_time'] = time.time()
    return result


# Trials container passed to fmin in the search cell.
trials = Trials()

In [19]:
# Candidate activation functions, shared by all four hidden layers. fmin's
# result refers to these by list position, so the order matters.
_ACTIVATIONS = [tf.nn.relu, tf.nn.elu, tf.nn.tanh, tf.sigmoid, tf.nn.softplus]

# Search space consumed by mlp_objective (keys must match its lookups).
space = dict(
    batch_size=hp.quniform('batch_size', 200, 1000, 50),
    n_hidden1=hp.quniform('n_hidden1', 200, 1000, 5),
    n_hidden2=hp.quniform('n_hidden2', 200, 1000, 5),
    n_hidden3=hp.quniform('n_hidden3', 200, 1000, 5),
    n_hidden4=hp.quniform('n_hidden4', 200, 1000, 5),
    learning_rate=hp.uniform('learning_rate', 1e-5, 1e-2),
    reg_param=hp.uniform('reg_param', 1e-5, 1e1),
    keep_prob=hp.uniform('keep_prob', 0.4, 0.95),
    decay=hp.uniform('decay', 0.9, 0.9999),
    activation1=hp.choice('activation1', list(_ACTIVATIONS)),
    activation2=hp.choice('activation2', list(_ACTIVATIONS)),
    activation3=hp.choice('activation3', list(_ACTIVATIONS)),
    activation4=hp.choice('activation4', list(_ACTIVATIONS)),
)

In [20]:
# Log an empty record first so the printed timestamps include the start time.
led(d=None, l=None, n=None)

# 25 TPE-guided evaluations of `objective` over `space`.
best = fmin(objective, space,
            algo=tpe.suggest, max_evals=25, trials=trials)

0000	Wed Jul 12 20:10:51 2017
0001	Wed Jul 12 20:11:47 2017
0002	Wed Jul 12 20:12:56 2017
0003	Wed Jul 12 20:14:21 2017
0004	Wed Jul 12 20:15:30 2017
0005	Wed Jul 12 20:16:49 2017
0006	Wed Jul 12 20:17:47 2017
0007	Wed Jul 12 20:18:44 2017
0008	Wed Jul 12 20:19:57 2017
0009	Wed Jul 12 20:20:52 2017
0010	Wed Jul 12 20:22:18 2017
0011	Wed Jul 12 20:23:44 2017
0012	Wed Jul 12 20:24:40 2017
0013	Wed Jul 12 20:25:46 2017
0014	Wed Jul 12 20:27:26 2017
0015	Wed Jul 12 20:28:14 2017
0016	Wed Jul 12 20:29:31 2017
0017	Wed Jul 12 20:30:25 2017
0018	Wed Jul 12 20:31:24 2017
0019	Wed Jul 12 20:32:31 2017
0020	Wed Jul 12 20:33:23 2017
0021	Wed Jul 12 20:35:55 2017
0022	Wed Jul 12 20:38:21 2017
0023	Wed Jul 12 20:40:36 2017
0024	Wed Jul 12 20:42:08 2017
0025	Wed Jul 12 20:43:11 2017


In [21]:
print(best)

{'activation1': 4, 'activation2': 2, 'activation3': 1, 'activation4': 0, 'batch_size': 200.0, 'decay': 0.9404959259504099, 'keep_prob': 0.7441456645169185, 'learning_rate': 8.436449239381113e-05, 'n_hidden1': 585.0, 'n_hidden2': 835.0, 'n_hidden3': 370.0, 'n_hidden4': 410.0, 'reg_param': 0.07524309738417223}


In [18]:
# Re-train with the (rounded) best hyper-parameters printed by the search:
#   keep_prob 0.744 -> .74 (previously mistyped as .074)
#   activation indices 4, 2, 1, 0 into
#   [relu, elu, tanh, sigmoid, softplus] -> softplus, tanh, elu, relu
#   (the last two were previously swapped)
# NOTE(review): 296 epochs differs from the 250 used during the search;
# confirm this is intentional.
# The last two return values are the fourth hidden layer's activations
# (see mlp_accs), kept under the existing `logits_*` names for the cells below.
train, test, logits_train, logits_test = mlp_accs(X_train, X_test, y_train, y_test, 
                                                  296,  200,  8.4e-5, 
                                                  .075, .74, .94, 
                                                  585,  835,  370, 410,
                                                  tf.nn.softplus, tf.nn.tanh, tf.nn.elu, tf.nn.relu)

In [None]:
# Fit a gradient-boosted classifier on the network outputs returned by the
# final mlp_accs run. GradientBoostingClassifier is imported in the import
# cell at the top of the notebook; the seed matches the train/test split.
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=None, random_state=49413)
gbc.fit(logits_train, y_train)

In [None]:
# Test error rate of the stacked classifier: fraction of mismatched predictions.
1 - np.mean(gbc.predict(logits_test) == y_test)