In [1]:
import os
import pandas as pd
import numpy as np
import json

from keras.utils import to_categorical
from keras.engine import Input, Model
from keras.layers import Dense, Flatten
from keras.layers.core import Activation
from keras.optimizers import Adam
import keras.backend as K

Using TensorFlow backend.


In [2]:
# helper functions to turn arbitrary numpy arrays into a single tensor

from itertools import zip_longest

def find_shape(seq):
    """Return the shape of the smallest rectangular array that can hold `seq`.

    `seq` may be an arbitrarily nested, possibly ragged, sequence. Anything
    that does not support ``len()`` is treated as a scalar and has shape ``()``.
    For every nesting level the largest length found among the siblings is
    used; a sibling that is shallower than the others counts as length 1 on
    the levels it lacks.
    """
    try:
        outer = len(seq)
    except TypeError:
        # no length -> scalar, contributes no dimensions
        return ()

    child_shapes = map(find_shape, seq)
    inner_dims = []
    # walk the children's shapes level by level, keeping the widest entry
    for level_sizes in zip_longest(*child_shapes, fillvalue=1):
        inner_dims.append(max(level_sizes))
    return (outer, *inner_dims)

def fill_array(arr, seq):
    """Copy the nested sequence `seq` into `arr` in place, zero-padding the rest.

    `arr` is modified by reference and nothing is returned. On a 1-D `arr` the
    first ``len(seq)`` slots receive `seq` and the remaining slots are set
    to 0. If `seq` has no length (a scalar) the whole row is set to 0.
    For higher-dimensional `arr` the function recurses over matching
    sub-arrays and sub-sequences; sub-arrays without a matching sub-sequence
    are filled from an empty tuple, i.e. zeroed.
    """
    if arr.ndim != 1:
        for child_arr, child_seq in zip_longest(arr, seq, fillvalue=()):
            fill_array(child_arr, child_seq)
        return

    try:
        count = len(seq)
    except TypeError:
        # scalar: nothing is copied, the row ends up all zeros
        count = 0
    arr[:count] = seq
    arr[count:] = 0
# translate one encoded entry of the tensor back into its category label
def interpret_tensor(tensor, row, feature_dicts, feature):
    """Return the label of `feature` with the highest score at `row`.

    The position of `feature` among the keys of `feature_dicts` selects the
    first axis of `tensor` and `row` selects the second, so `tensor` is
    indexed as ``tensor[feature_index, row]``. The argmax of that slice is
    looked up in ``feature_dicts[feature]`` (a code -> label mapping).

    NOTE(review): this expects the feature axis first; a tensor laid out as
    (samples, features, one-hot) would need its axes swapped before being
    passed in - confirm against the caller.
    """
    feature_names = list(feature_dicts.keys())
    feature_pos = feature_names.index(feature)
    best_code = np.argmax(tensor[feature_pos, row])
    return feature_dicts[feature][best_code]

In [3]:
# directory holding the input data (relative path)
DATA_DIR = "data"
# path of test_bank.csv inside DATA_DIR
test_bank_csv = os.path.join(DATA_DIR, "test_bank.csv")

In [4]:
# load the CSV; its fields are separated by ';' rather than ','
df = pd.read_csv(test_bank_csv, sep=';')

In [5]:
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [6]:
# test regression: predict balance from job, marital, education, housing and
# the duration of the last contact (only the columns listed in `keep` are used)
columns = df.columns
keep = ['job', 'marital', 'education', 'housing', 'duration', 'balance']

# prune unused features
df = df.loc[:, keep]

# set up variables: 'balance' is the target, every other kept column a predictor
y = df['balance']
X = df.drop('balance', axis=1)

In [7]:
# one-hot encode every predictor column and remember, per column, which
# integer code stands for which original value
feature_dicts = {}
X_new = []
for col in X.columns:
    # convert once so the labels and the codes come from the same categorical
    categorical = X[col].astype('category')
    # code -> label lookup for THIS column (previously always built from 'job')
    feature_dicts[col] = dict(enumerate(categorical.cat.categories))
    X_new.append(to_categorical(categorical.cat.codes))

# NOTE(review): every column in X is treated as categorical, including
# 'duration', so it gets one one-hot slot per distinct value - confirm this
# is intended.

# convert list into np array
X_train = np.empty(find_shape(X_new))
fill_array(X_train, X_new) # no return, fills by reference

# reshape such that samples is first element
# (num_samples, num_features, one-hot length)
X_train = np.rollaxis(X_train,1,0)

In [8]:
# check the tensor layout: (num_samples, num_features, one-hot length)
X_train.shape

(4521, 5, 875)

In [13]:
# build basic dnn
def dnn(input_shape, model_path, lr=1e-4, verbose=0):
    """Build and compile a small fully connected regression network.

    As a side effect the architecture returned by ``model.to_json()`` is
    written to `model_path`.

    Args:
        input_shape: shape of the full training tensor including the sample
            axis; the first entry is dropped and the rest is used as the
            per-sample input shape.
        model_path: path of the file that receives the serialised
            architecture.
        lr: learning rate handed to the Adam optimizer.
        verbose: when truthy, print the layer summary.

    Returns:
        The compiled Keras ``Model`` (loss ``mse``, metric ``mae``).
    """
    inputs = Input(shape=input_shape[1:])

    x = Dense(64, activation='relu')(inputs)
    x = Flatten()(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(1)(x)

    # NOTE(review): a ReLU on the output means the model can never predict a
    # negative value - confirm the regression target is non-negative.
    outputs = Activation('relu')(x)

    model = Model(inputs=inputs, outputs=outputs)

    # mean absolute error as a human-readable metric next to the mse loss
    model.compile(optimizer=Adam(lr=lr),
                  metrics=['mae'],
                  loss='mse')

    # save json
    # NOTE(review): to_json() already returns a JSON string, so json.dump()
    # stores it as a single quoted string; a reader has to json.load() the
    # file to get the architecture string back. Kept as-is so existing
    # readers of the file keep working.
    json_string = model.to_json()
    with open(model_path, 'w') as f:
        json.dump(json_string, f)

    # selectively print model
    if verbose:
        # summary() prints the table itself and returns None; wrapping it in
        # print() added a stray "None" line to the output
        model.summary()

    return model

In [14]:
# build the network for X_train's shape, write its architecture to dnn.json
# and print the layer summary
model = dnn(X_train.shape, 'dnn.json', verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5, 875)            0         
_________________________________________________________________
dense_5 (Dense)              (None, 5, 64)             56064     
_________________________________________________________________
flatten_2 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                5136      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 61,217
Trainable params: 61,217
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
# train on the full data set; validation_split=0 holds no samples out
model.fit(x=X_train, y=y, batch_size=64, epochs=10,
          validation_split=0, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2559bced550>

In [21]:
# predict on the same tensor that was passed to fit(), i.e. these are
# in-sample predictions
results = model.predict(X_train, batch_size=64, verbose=1)



In [22]:
# expect one predicted value per sample: (num_samples, 1)
results.shape

(4521, 1)

In [25]:
# work on a copy so the prediction column is not added to df itself
df_new = df.copy()

In [26]:
# attach the model output as a new column next to the true 'balance' values
df_new['predictions'] = results

In [27]:
# compare predictions with the actual balance for the first rows
df_new.head()

Unnamed: 0,job,marital,education,housing,duration,balance,predictions
0,unemployed,married,primary,no,79,1787,1312.040894
1,services,married,secondary,yes,220,4789,1337.067139
2,management,single,tertiary,yes,185,1350,1536.342163
3,management,married,tertiary,yes,199,1476,1525.445679
4,blue-collar,married,secondary,yes,226,0,1396.865112
