In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
def parse(example):
    features = tf.parse_single_example(
        example,
        features={
            'index': tf.FixedLenFeature([], tf.string),
            'values': tf.FixedLenFeature([], tf.string),
            'target': tf.FixedLenFeature([], tf.float32),
            'is_train': tf.FixedLenFeature([], tf.float32),
        })
    
    index = features['index']
    target = features['target']
    values = tf.decode_raw(features['values'], tf.float32)
    values = tf.reshape(values, [4991])
    
    is_train = tf.cast(features['is_train'], tf.int32)
    
    return index, values, target, is_train

def train_input_fn(params):
    if 'batch_size' not in params:
        params['batch_size'] = 128
    batch_size = params['batch_size']

    dataset = tf.data.TFRecordDataset('train.tfrecord')
    dataset = dataset.map(parse, num_parallel_calls=5)
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(1)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    index, values, labels, is_train = next_element
    features = {
        'indexes': index,
        'values': values,
        'is_train': is_train,
        'labels': labels
    }
    return features, labels

def valid_input_fn(params):
    if 'batch_size' not in params:
        params['batch_size'] = 128
    batch_size = params['batch_size']

    dataset = tf.data.TFRecordDataset('valid.tfrecord')
    dataset = dataset.map(parse, num_parallel_calls=5)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(1)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    index, values, labels, is_train = next_element
    features = {
        'indexes': index,
        'values': values,
        'is_train': is_train,
        'labels': labels
    }
    return features, labels

def predict_input_fn(params):
    if 'batch_size' not in params:
        params['batch_size'] = 128
    batch_size = params['batch_size']

    dataset = tf.data.TFRecordDataset(['valid.tfrecord', 'train.tfrecord'])
    dataset = dataset.map(parse, num_parallel_calls=5)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(1)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    index, values, labels, is_train = next_element
    features = {
        'indexes': index,
        'values': values,
        'is_train': is_train,
        'labels': labels
    }
    return features, labels




In [3]:
# https://github.com/GoogleCloudPlatform/tf-estimator-tutorials/blob/master/05_Autoencoding/02.0%20-%20Dimensionality%20Reduction%20-%20Autoencoding%20%2B%20Custom%20Estimator.ipynb

def model_fn(features, labels, mode, params):

    default_params = {
        'noise_level': .5,
        'learning_rate':.001,
        'dropout_rate':0.1,
        'l2_regularizer':0.0001,
        'activation':tf.nn.relu,
    }
    default_params.update(params)
    params = default_params
    
    encoder_hidden_units = params['hidden_units'][:]
    Z_units = encoder_hidden_units.pop()
    decoder_hidden_units = encoder_hidden_units[:]
    decoder_hidden_units.reverse()
    
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    
    input_layer = features['values']
    
    input_size = input_layer.get_shape().as_list()[1]
    
    net = input_layer
    
    if params['noise_level']:
        net = net + tf.random_normal(tf.shape(input_layer), stddev=params['noise_level'])

    net = tf.layers.dropout(net, rate=params['dropout_rate'], training=is_training)

    regularizer = tf.contrib.layers.l2_regularizer(scale=params['l2_regularizer'])

    for units in encoder_hidden_units:
        net = tf.layers.dense(net, units, activation=params['activation'], kernel_regularizer=regularizer)

    net = tf.layers.dense(net, Z_units, activation=None, kernel_regularizer=regularizer)
    encoding = net
    

    for units in decoder_hidden_units:
        net = tf.layers.dense(net, units, activation=params['activation'], kernel_regularizer=regularizer)

    reconstruction = tf.layers.dense(net, units=input_size, activation=None)
    
    if mode == tf.estimator.ModeKeys.PREDICT:

        predictions = {
            'encoding': encoding,
            'reconstruction': reconstruction,
            'indexes': features['indexes'],
            'is_train': features['is_train'],
            'labels':features['labels'],
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.mean_squared_error(input_layer, reconstruction)
    
    loss = loss + tf.losses.get_regularization_loss()
    
    tf.summary.histogram('encoding', encoding)
    tf.summary.scalar('loss', loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)
    
    optimizer = tf.train.AdamOptimizer(params['learning_rate'])
    train_op = optimizer.minimize(
        loss=loss, global_step=tf.train.get_global_step())

    estimator_spec = tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_op)
    return estimator_spec

In [4]:
#!rm -fr model_dirs/ed

In [5]:
estimator = tf.estimator.Estimator(
    model_dir='model_dirs/ed',
    model_fn=model_fn,
    params=dict(
        hidden_units=[200,100,50,25,10],
        learning_rate=.0001
    )
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_model_dir': 'model_dirs/ed', '_keep_checkpoint_max': 5, '_is_chief': True, '_device_fn': None, '_train_distribute': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f30c76c4d30>, '_session_config': None, '_service': None, '_evaluation_master': '', '_task_type': 'worker', '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_num_ps_replicas': 0, '_task_id': 0, '_tf_random_seed': None, '_save_checkpoints_steps': None, '_master': '', '_num_worker_replicas': 1}


In [6]:
tf.logging.set_verbosity(tf.logging.WARN)

In [8]:
max_not_improve = 5

best_loss = np.inf
best_global_step = 0

not_improve_counter = 0
while not_improve_counter < max_not_improve:
    
    estimator.train(input_fn=train_input_fn, steps=100)
    eval_result = estimator.evaluate(input_fn=valid_input_fn)
    print(eval_result)
    if eval_result['loss'] < best_loss:
        best_loss = eval_result['loss']
        best_global_step = eval_result['global_step']
        not_improve_counter = 0
    else:
        not_improve_counter += 1
best_global_step

{'loss': 2.674037, 'global_step': 13300}
{'loss': 2.6743405, 'global_step': 13400}
{'loss': 2.6745453, 'global_step': 13500}
{'loss': 2.6738298, 'global_step': 13600}
{'loss': 2.6744816, 'global_step': 13700}
{'loss': 2.6740952, 'global_step': 13800}
{'loss': 2.6745055, 'global_step': 13900}
{'loss': 2.6746054, 'global_step': 14000}
{'loss': 2.6744936, 'global_step': 14100}


13600

In [15]:
for i in range(100):
    estimator.train(input_fn=train_input_fn, steps=100)
    print(estimator.evaluate(input_fn=valid_input_fn))

{'loss': 2.6739094, 'global_step': 13000}
{'loss': 2.6742392, 'global_step': 13100}
{'loss': 2.6740763, 'global_step': 13200}


KeyboardInterrupt: 

In [18]:
pred

NameError: name 'pred' is not defined

In [26]:
result = pd.DataFrame()
for pred in estimator.predict(input_fn=predict_input_fn):
    row = pd.DataFrame([pred['encoding']], index=[pred['indexes'].decode()])
    row['is_train'] = pred['is_train']
    row['label'] = pred['labels']
    result = result.append(row)

In [28]:
result.max()

0           1.040512e+02
1           3.864991e+02
2           2.449245e+02
3           1.177961e+02
4           1.943235e+02
5           1.788980e+02
6           2.095470e+02
7           2.443124e+02
8           5.424358e+01
9           2.294001e+02
is_train    1.000000e+00
label       4.000000e+07
dtype: float64

In [30]:
result = result.astype(np.float32)
result['is_train'] = result['is_train'].astype(bool)
result.to_hdf('data/encoding.h5', 'data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_items] [items->None]

  f(store)


In [29]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,is_train,label
963527a30,-6.069702,8.863812,9.248442,-6.863579,1.531899,1.841633,4.088735,3.819022,-4.530791,-1.502972,0,0.0
a72857156,-12.292958,12.584062,16.552395,-12.429898,6.82753,6.994571,1.646915,10.453823,-9.560718,-1.596989,0,0.0
6e4053c09,-0.36595,-0.065784,0.869526,-1.11746,0.973589,1.07016,0.023595,2.345379,-0.874985,-0.953466,0,0.0
09b1ce445,-0.16217,-0.252848,0.308378,0.026218,0.168719,0.015531,-0.12698,0.212688,-0.233593,-0.14761,0,0.0
2fee6539d,-2.018758,2.266798,3.365917,-2.826028,1.026311,1.409894,1.315633,2.846865,-2.241588,-1.142895,0,0.0
