In [1]:
# Copy every object under the OL653374/20200916/ prefix of the
# pjm-pipeline-train bucket into the current working directory.
! gsutil cp gs://pjm-pipeline-train/OL653374/20200916/* .

Copying gs://pjm-pipeline-train/OL653374/20200916/OL653374_FM2_BS20200916205228.model_training.bin.tfrecord.gz...

Operation completed over 1 objects/164.2 MiB.                                    


In [2]:
# List the files in the working directory whose names contain "tfrecord",
# to confirm the download above landed where the later cells expect it.
! ls | grep tfrecord

OL653374_FM2_BS20200916205228.model_training.bin.tfrecord.gz


In [3]:
import tensorflow as tf

In [4]:
class SparseConstructorLayer(tf.keras.layers.Layer):
    """Turn a batch of variable-length index lists into a multi-hot sparse matrix.

    The layer takes a 2-D ``tf.SparseTensor`` whose *values* are the active
    column ids of each example (one row per example) and returns a 2-D
    ``tf.SparseTensor`` of dense shape ``(batch, n)`` in which every named
    ``(row, id)`` position holds 1.0 and every other position is implicitly 0.

    Args:
        n: Number of columns of the returned sparse tensor. The ids carried by
            the input are used as column indices as-is; they are not
            range-checked against ``n`` here.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (``name``, ``dtype``,
            ``trainable``, ...). Accepting them is what allows the layer to be
            rebuilt from the full config returned by ``get_config``.
    """

    def __init__(self, n, **kwargs):
        # Initialise the Keras base class first so attribute tracking is set
        # up before any attribute is assigned on the layer.
        super(SparseConstructorLayer, self).__init__(**kwargs)
        self.n = n

    def call(self, inputs):
        """Build the ``(batch, n)`` multi-hot sparse tensor for ``inputs``.

        Args:
            inputs: 2-D ``tf.SparseTensor``; its values are the column ids.

        Returns:
            ``tf.SparseTensor`` with one entry of value 1.0 per input value.
        """
        # Column 0 of the input's index matrix is the example (row) number.
        row_inds = inputs.indices[:, 0]
        # The stored values themselves become the output column indices.
        col_inds = tf.cast(inputs.values, tf.int64)

        # Shape (nnz, 2): one (row, col) pair per input value.
        indices = tf.stack([row_inds, col_inds], axis=1)
        values = tf.ones(tf.shape(inputs.values))
        # Ask for int64 directly so both dense_shape components share the
        # dtype that tf.SparseTensor requires.
        batch_size = tf.shape(inputs, out_type=tf.int64)[0]
        dense_shape = [batch_size, tf.cast(self.n, tf.int64)]

        return tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)

    def get_config(self):
        """Return the base layer config extended with ``n``."""
        config = super(SparseConstructorLayer, self).get_config()
        config['n'] = self.n
        return config


In [5]:
# Parsing schema applied to every serialized example in the training file.
FEATURE_SPEC = {
    # Variable-length list of int64 ids (parsed into a SparseTensor).
    'indices': tf.io.VarLenFeature(tf.int64),
    # Variable-length list of float32 numbers (parsed into a SparseTensor).
    'values': tf.io.VarLenFeature(tf.float32),
    # Scalar int64 target; falls back to 0 when the example has no label.
    'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
}

def _gzip_reader_fn(filenames):
    """Open ``filenames`` as a TFRecord dataset with GZIP decompression.

    Args:
        filenames: Passed straight through to ``tf.data.TFRecordDataset``.

    Returns:
        The ``tf.data.TFRecordDataset`` created with ``compression_type='GZIP'``.
    """
    gzip_records = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    return gzip_records
      
# Inputs of the training dataset, named so they are easy to find and change.
TRAIN_FILE_PATTERN = 'OL653374_FM2_BS20200916205228.model_training.bin.tfrecord.gz'
BATCH_SIZE = 1024

# Because label_key is set, each element is a (features_dict, label) pair:
# 'label' is popped out of the parsed features and returned separately, so any
# later .map() on this dataset must accept two arguments (features, label).
# num_epochs and shuffle are left at their defaults, which per the TensorFlow
# documentation means the dataset repeats indefinitely and is shuffled.
tfde = tf.data.experimental.make_batched_features_dataset(
    file_pattern=TRAIN_FILE_PATTERN,
    batch_size=BATCH_SIZE,
    features=FEATURE_SPEC,
    reader=_gzip_reader_fn,
    label_key='label',
)

In [6]:
# Peek at the first three elements of the dataset; each one is printed whole.
for batch in tfde.take(3):
    print(batch)

({'indices': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0cb3099b90>, 'values': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0cb3090e10>}, <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([0, 0, 0, ..., 0, 0, 0])>)
({'indices': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0cb3096f90>, 'values': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0d18536290>}, <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([0, 0, 0, ..., 0, 1, 0])>)
({'indices': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0cadfc1f50>, 'values': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f0cadf63ed0>}, <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([0, 0, 1, ..., 0, 1, 0])>)


In [7]:
"""
Normally we'd use tft.compute_and_apply_vocabulary to convert the inputs
from a very sparse 50MM-dimensional space down to something more tractable
like 40,000. As is, the model still works even without that transform.
"""
# Width of the model's feature space; every id is used directly as a column
# index, so this must be larger than the largest id in the data.
MAX_IDX = int(50.01e6)

# Training hyper-parameters, named so they can be tuned in one place.
LEARNING_RATE = 0.1
EPOCHS = 10
STEPS_PER_EPOCH = 20

# The input is named 'indices' so Keras picks that entry out of the feature
# dict yielded by the dataset. Its dtype is declared as int64 to match the
# parsed feature: a float32 input (the Keras default) cannot represent every
# id above 2**24 exactly if the framework casts the fed tensor to it.
input_layer = tf.keras.layers.Input(
    shape=(MAX_IDX,),
    sparse=True,
    dtype=tf.int64,
    name='indices',
)

# Re-encode the id lists as a (batch, MAX_IDX) multi-hot sparse matrix.
sparsed_input = SparseConstructorLayer(MAX_IDX)(input_layer)

# A single sigmoid unit over the multi-hot input, i.e. logistic regression.
# An L2 kernel regularizer (e.g. tf.keras.regularizers.l2(0.001)) was tried
# here and is currently disabled.
lin_fn = tf.keras.layers.Dense(1, activation='sigmoid')(sparsed_input)

reg_model = tf.keras.Model(inputs=input_layer, outputs=lin_fn)

reg_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)
reg_model.summary()

# steps_per_epoch bounds each epoch explicitly (STEPS_PER_EPOCH batches).
history = reg_model.fit(tfde, epochs=EPOCHS, steps_per_epoch=STEPS_PER_EPOCH)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
indices (InputLayer)         [(None, 50010000)]        0         
_________________________________________________________________
sparse_constructor_layer (Sp (None, 50010000)          0         
_________________________________________________________________
dense (Dense)                (None, 1)                 50010001  
Total params: 50,010,001
Trainable params: 50,010,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
