In [1]:
# a bit theory about tf data structure

`tf.train.Feature` is part of `TensorFlow's protocol buffers` (protobuf) format, which is a `way to serialize structured data`.
It `represents a single feature`, which can be of three types: `BytesList`, `FloatList`, or `Int64List`. These types are used to store byte strings, floating-point numbers, and integers, respectively.

 Types of tf.train.Feature:

    BytesList: Used for storing byte strings, like images or text data encoded as bytes.
    FloatList: Used for storing floating-point numbers, such as numerical features.
    Int64List: Used for storing integer values, commonly used for categorical or numerical features represented as integers.

In [6]:
import tensorflow as tf

# Assume we have some data about a person
person_data = {
    'name': 'John Doe',
    'age': 30,
    'height': 175.5,
    'image_bytes': b'...',  # Assume this is the byte representation of an image
}

# Convert each feature to tf.train.Feature
name_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[person_data['name'].encode('utf-8')]))
age_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[person_data['age']]))
height_feature = tf.train.Feature(float_list=tf.train.FloatList(value=[person_data['height']]))
image_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[person_data['image_bytes']]))

# Create a TensorFlow Example using these features
example = tf.train.Example(features=tf.train.Features(feature={
    'name': name_feature,
    'age': age_feature,
    'height': height_feature,
    'image': image_feature,
}))

# Serialize the Example to string
example_str = example.SerializeToString()


In [7]:
print(example)
print(example_str)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 30
      }
    }
  }
  feature {
    key: "height"
    value {
      float_list {
        value: 175.5
      }
    }
  }
  feature {
    key: "image"
    value {
      bytes_list {
        value: "..."
      }
    }
  }
  feature {
    key: "name"
    value {
      bytes_list {
        value: "John Doe"
      }
    }
  }
}

b'\nJ\n\x14\n\x04name\x12\x0c\n\n\n\x08John Doe\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x1e\n\x12\n\x06height\x12\x08\x12\x06\n\x04\x00\x80/C\n\x10\n\x05image\x12\x07\n\x05\n\x03...'


`name_feature, age_feature, height_feature, and image_feature` are instances of tf.train.Feature, each representing a different type of feature.
We then create a `TensorFlow Example` using these features and `serialize it into a string (example_str)` that can be `stored or used in TensorFlow pipelines.`

`serialization` is often part of the final steps before feeding data into models. It encapsulates the processed data into a `serialized format` that is ready to be consumed by training or inference processes.


Serialization is the process of `converting complex data structures, such as objects or data sets, into a format that can be easily stored, transmitted, or reconstructed later`. It's like packaging information into a compact and standardized format for efficient handling.

In [1]:
import tensorflow as tf
import csv # TO READ CSV FILES
import os

In [3]:
# Define functions for creating TFRecord features
def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))

#The function's purpose is to convert a single floating-point number (value) into a specific format that TensorFlow can understand when working with data.
#This format is a tf.train.Feature object, which is a way to represent a feature (piece of data) within a TensorFlow record.

#When you call the encode() method on a string, it converts the string into a bytes-like object using a specified encoding (e.g., UTF-8). 
#This conversion is necessary when passing a string to a function that expects bytes, such as _bytes_feature in TensorFlow.

In [3]:
base_dir = os.getcwd()
data_dir = os.path.join(os.pardir, "tfrecord_data/housing.tfrecord")
data_dir

'..\\tfrecord_data/housing.tfrecord'

In [5]:
original_data_file = os.path.join("data", "housing.csv")
print(f"original_data_file: {original_data_file}")
tfrecord_file_path =os.path.join(base_dir, data_dir)
print(f"tfrecord_file_path: {tfrecord_file_path}")


original_data_file: data\housing.csv
tfrecord_file_path: C:\Users\ASUS\building-machine-learning-pipelines\Untitled Folder\..\tfrecord_data/housing.tfrecord


In [14]:
# tf_record_writer = tf.io.TFRecordWriter(tfrecord_filename)


In [17]:
with open(original_data_file, mode='r', newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    with tf.io.TFRecordWriter(tfrecord_file_path) as writer:# writing the tfrecord file in desired path using  tf.io.TFRecordWriter
        for row in reader:
            # Check if the value is not empty before converting to float
            total_bedrooms = float(row["total_bedrooms"]) if row["total_bedrooms"] else 0.0 # since we have missing values in this column
            # Convert each row to TFRecord format
            example = tf.train.Example(features=tf.train.Features(feature={
                "longitude": _float_feature(float(row["longitude"])),
                "latitude": _float_feature(float(row["latitude"])),
                "housing_median_age": _float_feature(float(row["housing_median_age"])),
                "total_rooms": _float_feature(float(row["total_rooms"])),
                "total_bedrooms": _float_feature(float(row["total_bedrooms"]) if row["total_bedrooms"] else 0.0),  # Use the checked value
                "population": _float_feature(float(row["population"])),
                "households": _float_feature(float(row["households"])),
                "median_income": _float_feature(float(row["median_income"])),
                "median_house_value": _float_feature(float(row["median_house_value"])),
                "ocean_proximity": _bytes_feature(row["ocean_proximity"])
            }))
            # Write the serialized example to the TFRecord file
            writer.write(example.SerializeToString())