In [1]:
import pandas
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
import plotly
from datetime import datetime, timedelta
import logging
logging.getLogger().setLevel(logging.INFO)


In [2]:
responses = pandas.read_csv('C:/local datasets/RAD/rad response level 20180716.csv',
                                 parse_dates = ['EVENT_DATE', 'DATE_CALLSTART', 'FIRST_NFD_DT',"CLI_DATE","DATE_ACCEPTED","DATE_DISPATCH_FD","DATE_DISPATCH_FDA","DATE_DISPATCH_FA","DATE_ARRIVED_FD","DATE_ARRIVED_FDA","DATE_ARRIVED_FA","DATE_FIRST_P0","DATE_FIRST_C1","DATE_FIRST_C2","DATE_FIRST_C3"],
                                 infer_datetime_format = True)

In [3]:
print(responses.dtypes)

EVENT_DATE_WID                           int64
EVENT_DATE                      datetime64[ns]
COMMONEVENTID                            int64
COMM_EVENT_WID                           int64
N_EVENT_NUMBERS                          int64
REFCOM_DISPATCH                         object
GEO_UCL                                 object
GEO_CATCHMENT                           object
GEO_REGION                              object
GEO_LONGITUDE                          float64
GEO_LATITUDE                           float64
N_EVENT_TYPES                            int64
FIRST_EVENT_TYPE                        object
LAST_EVENT_TYPE                         object
FINAL_EVENT_TYPE_CARD                   object
FIRST_PRIORITY_CODE                      int64
LAST_PRIORITY_CODE                       int64
MIN_PRIORITY_CODE                        int64
N_DISPATCHED                             int64
N_ARRIVED                                int64
DATE_CALLSTART                  datetime64[ns]
FIRST_NFD_DT 

In [4]:
responses.dropna(subset = ['DATE_CALLSTART'], inplace = True)

In [30]:
responses['DATE_CALLSTARThour'] = responses.DATE_CALLSTART.dt.hour.astype(int)
responses['DATE_CALLSTARTmonth'] = responses.DATE_CALLSTART.dt.month.astype(int)
responses['DATE_CALLSTARTweekday'] = responses.DATE_CALLSTART.dt.weekday.astype(int)
responses.GEO_CATCHMENT = responses.GEO_CATCHMENT.fillna('Other')
responses.GEO_REGION = responses.GEO_REGION.fillna('Other')

responses['training'] = responses.DATE_CALLSTART < responses.DATE_CALLSTART.max() - timedelta(days = 30)
responses['weight'] = responses.REFERRED_TO_DM.apply(lambda x: 1 if x == 'Y' else 2)
responses.weight[responses.training] = 1.5



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [6]:
def make_dataset(batch_sz, x, y=None, shuffle=False, shuffle_buffer_size=1000):
    """Create a slice Dataset from a pandas DataFrame and labels"""

    def input_fn():
        if y is not None:
            dataset = tf.data.Dataset.from_tensor_slices((dict(x), y))
        else:
            dataset = tf.data.Dataset.from_tensor_slices(dict(x))
        if shuffle:
            dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_sz).repeat()
        else:
            dataset = dataset.batch(batch_sz)
        return dataset.make_one_shot_iterator().get_next()

    return input_fn


In [40]:
DATE_CALLSTARThour = tf.feature_column.categorical_column_with_identity(key='DATE_CALLSTARThour', num_buckets = 25)
DATE_CALLSTARTmonth = tf.feature_column.categorical_column_with_identity(key='DATE_CALLSTARTmonth', num_buckets = 13)
DATE_CALLSTARTweekday = tf.feature_column.categorical_column_with_identity(key='DATE_CALLSTARTweekday', num_buckets = 8)
GEO_CATCHMENT = tf.feature_column.categorical_column_with_hash_bucket(key="GEO_CATCHMENT", hash_bucket_size=50)
FIRST_EVENT_TYPE = tf.feature_column.categorical_column_with_hash_bucket(key="FIRST_EVENT_TYPE", hash_bucket_size=50)
REFCOM_DISPATCH = tf.feature_column.categorical_column_with_vocabulary_list(key="REFCOM_DISPATCH", vocabulary_list = ['Y', 'N'])
GEO_REGION = tf.feature_column.categorical_column_with_vocabulary_list(key="REFCOM_DISPATCH", vocabulary_list = responses.GEO_REGION.unique())
weight = tf.feature_column.numeric_column(key='weight')

column_names = {
    'DATE_CALLSTARThour',
    'DATE_CALLSTARTmonth',
    'DATE_CALLSTARTweekday',
    'GEO_CATCHMENT',
    'FIRST_EVENT_TYPE',
    'REFCOM_DISPATCH',
    'GEO_REGION',
    'weight'
}
feature_columns = [
    tf.feature_column.indicator_column(tf.feature_column.crossed_column([DATE_CALLSTARThour, DATE_CALLSTARTweekday], hash_bucket_size=50)),
    #tf.feature_column.indicator_column(tf.feature_column.crossed_column([GEO_REGION, DATE_CALLSTARTmonth], hash_bucket_size=50)),
    tf.feature_column.indicator_column(GEO_CATCHMENT),
    tf.feature_column.indicator_column(FIRST_EVENT_TYPE),
    tf.feature_column.indicator_column(REFCOM_DISPATCH),
    tf.feature_column.indicator_column(DATE_CALLSTARThour),
    tf.feature_column.indicator_column(DATE_CALLSTARTmonth),
    tf.feature_column.indicator_column(DATE_CALLSTARTweekday),
    weight
    
]


In [41]:
batch_size = 100
# Provide the training input dataset.
train_set = responses[responses.training == True]
test_set = responses[responses.training == False]

train_x = train_set[list(column_names)]
train_y = train_set['REFERRED_TO_DM'] == 'Y'

test_x = test_set[list(column_names)]
test_y = test_set['REFERRED_TO_DM'] == 'Y'

train_input_fn = make_dataset(batch_size, train_x, train_y, True, 1000)

test_input_fn = make_dataset(batch_size, test_x, test_y)

In [42]:
model = tf.estimator.DNNClassifier(hidden_units=[1024, 512, 256],
                                   feature_columns=feature_columns,
                                   weight_column = 'weight',
                                   optimizer=tf.train.ProximalAdagradOptimizer(
                                       learning_rate=0.1,
                                       l1_regularization_strength=0.001,
                                       l2_regularization_strength=0.001
                                   ))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\ri072731\\AppData\\Local\\Temp\\tmpkutnfzpw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002800283E550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [43]:

hooks = [tf.train.SummarySaverHook(scaffold=tf.train.Scaffold(summary_op=tf.summary.merge_all()),
          save_steps = 5,
          output_dir='/tmp/tf'
          )]
classifier = model.train(input_fn=train_input_fn, steps=100, hooks = hooks)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\ri072731\AppData\Local\Temp\tmpkutnfzpw\model.ckpt.
INFO:tensorflow:loss = 100.940384, step = 0
INFO:tensorflow:Saving checkpoints for 100 into C:\Users\ri072731\AppData\Local\Temp\tmpkutnfzpw\model.ckpt.
INFO:tensorflow:Loss for final step: 43.31354.


In [44]:
eval_result = classifier.evaluate(input_fn=test_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-27-02:46:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ri072731\AppData\Local\Temp\tmpkutnfzpw\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-27-02:46:46
INFO:tensorflow:Saving dict for global step 100: accuracy = 0.9613662, accuracy_baseline = 0.9623504, auc = 0.5030465, auc_precision_recall = 0.03806567, average_loss = 0.17974292, global_step = 100, label/mean = 0.037649564, loss = 34.63843, precision = 0.031847134, prediction/mean = 0.058173094, recall = 0.000889205
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: C:\Users\ri072731\AppData\Local\Temp\tmpkutnfzpw\model.ckpt-100


In [45]:
print(eval_result)

{'accuracy': 0.9613662, 'accuracy_baseline': 0.9623504, 'auc': 0.5030465, 'auc_precision_recall': 0.03806567, 'average_loss': 0.17974292, 'label/mean': 0.037649564, 'loss': 34.63843, 'precision': 0.031847134, 'prediction/mean': 0.058173094, 'recall': 0.000889205, 'global_step': 100}


In [46]:
predictions=list(classifier.predict(input_fn=test_input_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ri072731\AppData\Local\Temp\tmpkutnfzpw\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [47]:
out_y = [p['class_ids'][0] for p in predictions]

In [48]:
with tf.Session():
    print(tf.Tensor.eval(tf.confusion_matrix(labels=list(test_y), predictions=out_y)))

[[71788    76]
 [ 5618     5]]
