In [None]:
import glob
import logging
import pandas as pd
import os
import json
from ml4ir.base.io import file_io
from ml4ir.base.data import tfrecord_writer
from sklearn.datasets import load_iris
from ml4ir.base.features.feature_config import parse_config
from ml4ir.base.features.feature_config import ExampleFeatureConfig
from ml4ir.base.config.keys import TFRecordTypeKey


# Configure the root logger so DEBUG-and-above records are emitted; the same
# logger object is handed to the ml4ir helpers in later cells.
logger = logging.getLogger()
logger.setLevel("DEBUG")
# Emit a first DEBUG record through the module-level helper to confirm the setup.
logging.debug("Logger is initialized...")


In [None]:
# Build a pandas DataFrame from scikit-learn's iris dataset.
df = load_iris()

# Shorten each feature name to "<first character><second word>",
# e.g. "sepal length (cm)" -> "slength".
short_names = [name[0] + name.split()[1] for name in df.feature_names]
df.feature_names = short_names

# One row per sample: the measurement columns, then the class label, then a
# query_key column copied from the row index.
data = pd.DataFrame(df.data, columns=df.feature_names).assign(
    label=df['target'],
    query_key=lambda frame: frame.index,
)

In [None]:
# Preview the first rows of the assembled iris frame (features, label, query_key).
data.head()

In [None]:
# YAML feature configuration, kept inline as a string and parsed in a later cell.
# It declares three top-level sections:
#   - query_key: int64, not trainable, not required at serving (default 0)
#   - label:     int64, not trainable, not required at serving (default 0)
#   - features:  slength, swidth, plength, pwidth -- each float, trainable,
#                required at serving (default 0.0), numeric feature layer
# The names are meant to line up with the columns of the `data` frame built above.
# NOTE(review): entries declare tfrecord_type `context`/`sequence`, while the
# notebook parses and writes with TFRecordTypeKey.EXAMPLE -- confirm these
# per-feature values are ignored (or valid) for Example records.
feature_config_yaml = '''
query_key: 
  name: query_key
  node_name: query_key
  trainable: false
  dtype: int64
  log_at_inference: true
  feature_layer_info:
    type: numeric
    shape: null
  serving_info:
    required: false
    default_value: 0
  tfrecord_type: context
label:
  name: label
  node_name: label
  trainable: false
  dtype: int64
  log_at_inference: true
  feature_layer_info:
    type: numeric
    shape: null
  serving_info:
    required: false
    default_value: 0
  tfrecord_type: sequence
features:
  - name: slength
    node_name: slength
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: swidth
    node_name: swidth
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: plength
    node_name: plength
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: pwidth
    node_name: pwidth
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
'''

In [None]:
# Parse the inline YAML text into a feature config for Example-type records,
# passing the notebook's logger through to the parser.
feature_config: ExampleFeatureConfig = parse_config(
    TFRecordTypeKey.EXAMPLE,
    feature_config_yaml,
    logger=logger,
)

In [None]:

# Save the iris DataFrame as a TFRecord file of Example records.
TFRECORD_DIR = '/tmp/classification/'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(TFRECORD_DIR, exist_ok=True)

# Write the `data` frame built earlier in the notebook. The previous code passed
# an undefined name (`d`), which raised NameError on a fresh kernel.
tfrecord_writer.write_from_df(data,
                              tfrecord_file=os.path.join(TFRECORD_DIR, 'file_0.tfrecord'),
                              feature_config=feature_config,
                              tfrecord_type=TFRecordTypeKey.EXAMPLE)

# Let's see what it looks like. `df` is the object returned by load_iris(), not a
# DataFrame, so the preview must come from `data`.
data.head()