In [1]:
import os
import pandas as pd
import numpy as np
import json

from keras.utils import to_categorical
from keras.engine import Input, Model
from keras.layers import Dense, Flatten
from keras.layers.core import Activation
from keras.optimizers import Adam
import keras.backend as K

Using TensorFlow backend.


In [2]:
# helper functions to turn arbitrary numpy arrays into a single tensor

from itertools import zip_longest

def find_shape(seq):
    """Return the smallest rectangular shape that can hold a ragged nested sequence.

    Parameters
    ----------
    seq : object
        A scalar, or an arbitrarily nested sequence (lists, tuples, numpy
        arrays, ...) whose sub-sequences may differ in length and depth.

    Returns
    -------
    tuple of int
        One entry per nesting level: the length of ``seq`` followed by the
        largest size found among its children on every deeper axis. Objects
        without a length, and ``str`` values, are scalar leaves and give ``()``.
    """
    # A one-character str iterates to itself, so descending into strings would
    # never terminate; treat them as scalar leaves (numpy does the same).
    if isinstance(seq, str):
        return ()
    try:
        len_ = len(seq)
    except TypeError:
        # Anything without a length is a scalar leaf and adds no dimension.
        return ()
    shapes = [find_shape(subseq) for subseq in seq]
    # zip_longest groups the children's sizes axis by axis; a child that is
    # shallower than its siblings counts as size 1 on the axes it lacks.
    return (len_,) + tuple(max(sizes) for sizes in zip_longest(*shapes,
                                                                fillvalue=1))

def fill_array(arr, seq):
    """Copy a ragged nested sequence into ``arr`` in place, padding with zeros.

    ``arr`` is walked one axis at a time alongside ``seq``. Rows of ``arr``
    that have no counterpart in ``seq`` are paired with an empty tuple and
    therefore end up all zero. On the innermost axis the values of ``seq``
    are written to the front of the row and the remainder is zeroed; a
    ``seq`` without a length is treated as empty there, so the row becomes
    all zeros.

    Nothing is returned: ``arr`` is modified by reference.
    """
    if arr.ndim != 1:
        # Not at the innermost axis yet: recurse into each row/child pair.
        for child_arr, child_seq in zip_longest(arr, seq, fillvalue=()):
            fill_array(child_arr, child_seq)
        return

    try:
        count = len(seq)
    except TypeError:
        count = 0
    arr[:count] = seq
    arr[count:] = 0
# map one encoded entry of a tensor back to its human-readable label
def interpret_tensor(tensor, row, feature_dicts, feature):
    """Return the label stored for ``feature`` at ``row`` of ``tensor``.

    ``feature_dicts`` maps each feature name to a lookup of
    ``{integer code: label}``. The position of ``feature`` among the keys of
    ``feature_dicts`` selects the first axis of ``tensor``, ``row`` selects
    the second, and the index of the largest value in that slice is used as
    the code to look up.
    """
    # Resolve the label table first so an unknown feature raises KeyError.
    labels = feature_dicts[feature]
    feature_position = list(feature_dicts.keys()).index(feature)
    winning_code = np.argmax(tensor[feature_position, row])
    return labels[winning_code]

In [4]:
# Directory holding the input data (relative path).
DATA_DIR = "data"
# Path of the CSV that is loaded into `df` below.
quiet_csv = os.path.join(DATA_DIR, "quiet.csv")

In [5]:
# Load the raw table.
# NOTE(review): the preview shows an 'Unnamed: 0' column, so the CSV was
# presumably written with its index included — TODO confirm.
df = pd.read_csv(quiet_csv)

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,task_id,start_time,end_time,music,num_interruptions,workspace_volume,num_meetings,num_breaks,progress
0,0,1.488769e+19,1541228000000.0,1541228000000.0,0.0,3.0,0.145156,5.0,6.0,0.678035
1,1,1.488769e+19,1541228000000.0,1541228000000.0,1.0,7.0,0.819426,4.0,2.0,0.532048
2,2,1.488769e+19,1541228000000.0,1541228000000.0,0.0,5.0,0.56876,2.0,6.0,0.836014
3,3,1.488769e+19,1541228000000.0,1541228000000.0,1.0,5.0,1.0,2.0,3.0,0.808164
4,4,1.488769e+19,1541228000000.0,1541228000000.0,0.0,3.0,0.626914,1.0,10.0,0.35439


In [9]:
# regression setup: predict `progress` from the remaining columns
# (music, num_interruptions, workspace_volume, num_meetings, num_breaks)
# NOTE(review): `columns` is not used by any later cell shown here.
columns = df.columns
drop = ['Unnamed: 0', 
        'task_id', 
        'start_time', 
        'end_time',]
# prune unused features
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the CSV raises KeyError because the listed columns are already gone.
df = df.drop(drop, axis=1)

# set up variables: y is the regression target, X holds every other column
y = df['progress']
X = df.drop('progress', axis=1)

In [16]:
# feature engineering could go here
# set up duration, split start time into categorical or integer value
# that corresponds to time of day (morn/aft/night)
# and adjust the X dataframe appropriately
# NOTE(review): start_time / end_time were already dropped from `df` in the
# previous cell, so they would have to be kept there for this to work.

# normalize X with column-wise linear downscale: every column is divided by
# its own maximum
# NOTE(review): a column whose maximum is 0 would presumably yield NaN/inf
# here — confirm the data rules that out.
X = X.divide(X.max(axis=0))

In [17]:
# NOTE(review): the block below is disabled code kept as a bare string
# literal, so it does nothing except echo itself as the cell output.
# If it is revived: the loop builds every feature's dictionary from X['job'],
# which is not a column of X in this notebook — presumably X[col] was meant.
'''
# only necessary if categorical data
feature_dicts = {}
X_new = []
for col in X.columns:
    feature_dicts[col] = dict(enumerate(X['job'].astype('category').cat.categories))
    X_new.append(to_categorical(X[col].astype('category').cat.codes))
    
# convert list into np array
X_train = np.empty(find_shape(X_new))
fill_array(X_train, X_new) # no return, fills by reference

# reshape such that samples is first element
# (num_samples, num_features, one-hot length)
X_train = np.rollaxis(X_train,1,0)
'''

"\n# only necessary if categorical data\nfeature_dicts = {}\nX_new = []\nfor col in X.columns:\n    feature_dicts[col] = dict(enumerate(X['job'].astype('category').cat.categories))\n    X_new.append(to_categorical(X[col].astype('category').cat.codes))\n    \n# convert list into np array\nX_train = np.empty(find_shape(X_new))\nfill_array(X_train, X_new) # no return, fills by reference\n\n# reshape such that samples is first element\n# (num_samples, num_features, one-hot length)\nX_train = np.rollaxis(X_train,1,0)\n"

In [18]:
# Check the feature matrix dimensions as (rows, columns).
X.shape

(1000, 5)

In [24]:
# build basic dnn
def dnn(input_shape, model_path, lr=1e-4, verbose=0):
    """Build and compile a small fully connected regression network.

    The network is Dense(64, relu) -> Dense(16, relu) -> Dense(1) followed by
    a sigmoid activation, compiled with the Adam optimizer, mean squared
    error as the loss and mean absolute error as the reported metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of the full training matrix, batch axis first. Only
        ``input_shape[1:]`` is used as the per-sample input shape.
    model_path : str
        File the model architecture is written to. Its parent directory is
        created when it does not exist yet.
    lr : float, optional
        Learning rate handed to Adam.
    verbose : int, optional
        When truthy, the layer summary is printed.

    Returns
    -------
    The compiled, untrained Keras model.
    """
    inputs = Input(shape=input_shape[1:])

    x = Dense(64, activation='relu')(inputs)
    x = Dense(16, activation='relu')(x)
    x = Dense(1)(x)

    # sigmoid keeps the single output inside (0, 1)
    outputs = Activation('sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    # mean absolute error as a human-readable metric next to the mse loss
    model.compile(optimizer=Adam(lr=lr),
                  metrics=['mae'],
                  loss='mse')

    # save json; create the target directory first so that a fresh checkout
    # without e.g. a "models" folder does not fail on open()
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    json_string = model.to_json()
    # NOTE(review): to_json() already returns a JSON string, so json.dump
    # stores it as one quoted JSON string value. Readers have to json.load()
    # the file before model_from_json(). Kept as-is for compatibility with
    # existing readers; consider f.write(json_string) instead.
    with open(model_path, 'w') as f:
        json.dump(json_string, f)

    # selectively print model; summary() prints by itself and returns None,
    # so wrapping it in print() would emit an extra "None" line
    if verbose:
        model.summary()

    return model

In [25]:
# File that dnn() writes the model architecture to.
model_path = os.path.join("models", "dnn.json")
# Build the network for the feature matrix shape and print its layer summary.
model = dnn(X.shape, model_path, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                384       
_________________________________________________________________
dense_5 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 1,441
Trainable params: 1,441
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
# Train on X / y for 1000 epochs, silently (verbose=0), with
# validation_split=0.2 asking Keras to hold out 20% of the rows.
# NOTE(review): no random seed is set in the visible cells, so results will
# presumably differ between runs; the returned History object is not kept,
# so the training / validation curves cannot be inspected afterwards.
model.fit(X, y, 
          epochs=1000,
          batch_size=128,
          validation_split=0.2,
          verbose=0)

<keras.callbacks.History at 0x1c460e0a320>

In [28]:
# predict on the full feature matrix (predict() does not change the model)
# NOTE(review): X is the same data that was passed to fit(), so these
# predictions are not an out-of-sample evaluation.
results = model.predict(X, batch_size=64, verbose=1)



In [29]:
# Check the prediction array dimensions: one output per input row.
results.shape

(1000, 1)

In [30]:
# Work on a copy so the pruned `df` stays free of the prediction column.
df_new = df.copy()

In [31]:
# Attach the model output (shape (1000, 1) above) as a new column.
df_new['predictions'] = results

In [32]:
# Compare `progress` with `predictions` on the first rows.
df_new.head()

Unnamed: 0,music,num_interruptions,workspace_volume,num_meetings,num_breaks,progress,predictions
0,0.0,3.0,0.145156,5.0,6.0,0.678035,0.713078
1,1.0,7.0,0.819426,4.0,2.0,0.532048,0.697032
2,0.0,5.0,0.56876,2.0,6.0,0.836014,0.707898
3,1.0,5.0,1.0,2.0,3.0,0.808164,0.730441
4,0.0,3.0,0.626914,1.0,10.0,0.35439,0.683142


In [33]:
# Persist features, target and predictions next to the notebook.
# NOTE(review): to_csv writes the index by default, so reloading this file
# with read_csv will produce another 'Unnamed: 0' column.
df_new.to_csv("predicted.csv")
