In [None]:
jupyter nbextension enable --py widgetsnbextension --sys-prefix 
jupyter nbextension install --py --symlink tensorflow_model_analysis --sys-prefix 
jupyter nbextension enable --py tensorflow_model_analysis --sys-prefix 

In [None]:
# Upgrade pip to the latest version and install required packages
!pip install -U pip
!pip install --use-deprecated=legacy-resolver tensorflow_data_validation==1.1.0
!pip install --use-deprecated=legacy-resolver tensorflow-transform==1.0.0
!pip install --use-deprecated=legacy-resolver tensorflow-model-analysis==0.32.0
!pip install apache-beam

In [None]:
# Checking the installation

In [None]:
# Import packages and print versions
import tensorflow as tf
import tensorflow_model_analysis as tfma
import tensorflow_data_validation as tfdv

# Report the installed version of each library
print(f'TF version: {tf.__version__}')
print(f'TFMA version: {tfma.__version__}')
print(f'TFDV version: {tfdv.__version__}')

Loading datasets

In [None]:
import os

# String variables for file and directory names
# NOTE(review): URL is a placeholder — replace it with the real download link
# before running the download cell.
URL = 'path to file.tar.gz'
# Archive name that the download cell extracts and then deletes
TAR_NAME = 'C3_W4_Lab_1_starter_files.tar.gz'
# Root directory; every other path below is built relative to it
BASE_DIR = 'starter_files'
DATA_DIR = os.path.join(BASE_DIR, 'data')
# CSV inputs and the TFRecord files generated from them live side by side under DATA_DIR
CSV_DIR = os.path.join(DATA_DIR, 'csv')
TFRECORD_DIR = os.path.join(DATA_DIR, 'tfrecord')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
# Text-format schema file, loaded later with tfdv.load_schema_text
SCHEMA_FILE = os.path.join(BASE_DIR, 'schema.pbtxt')

In [None]:
# uncomment this line if you've downloaded the files before and want to reset
# !rm -rf {BASE_DIR}

# Download the tar file from GCP
!wget {URL}

# Extract the tar file to the base directory
!tar xzf {TAR_NAME}

# Delete tar file
!rm {TAR_NAME}

In [None]:
# List the contents of BASE_DIR
print("Here's what we downloaded:")
!ls {BASE_DIR}

In [None]:
# Preview the dataset
# Path to the full test set (data_test.csv inside CSV_DIR)
TEST_DATA_PATH = os.path.join(CSV_DIR, 'data_test.csv')

# Preview the first few rows (`head` prints the first 10 lines by default)
!head {TEST_DATA_PATH}

In [None]:
# Load the schema as a protocol buffer (SCHEMA_FILE points at schema.pbtxt under BASE_DIR)
SCHEMA = tfdv.load_schema_text(SCHEMA_FILE)

# Display the schema
tfdv.display_schema(SCHEMA)

In [None]:
# imports for helper function
import csv
from tensorflow.core.example import example_pb2
from tensorflow_metadata.proto.v0 import schema_pb2

def csv_to_tfrecord(schema, csv_file, tfrecord_file):
  ''' Converts a csv file into a tfrecord

  Every CSV row becomes one TF Example. For each feature listed in the
  schema, the cell in the column of the same name is stored as a
  single-element float, int64 or bytes list according to the feature type.
  An empty cell yields an empty list. Features of any other type are not
  populated.

  Args:
    schema (schema_pb2) - Schema protobuf from TFDV
    csv_file (string) - file to convert to tfrecord
    tfrecord_file (string) - filename of tfrecord to create

  Returns:
    filename of tfrecord

  Raises:
    KeyError - if a schema feature has no matching column in the CSV header
    ValueError - if a non-empty cell cannot be parsed as a float / int
  '''

  # Initialize TF examples list. All rows are converted before anything is
  # written, so a bad row fails the call without leaving a partial tfrecord.
  examples = []

  # Open CSV file for reading. Each row is mapped as a dictionary.
  # The `with` block closes the file when done; newline='' is what the csv
  # module asks for so that newlines inside quoted fields are kept intact.
  with open(csv_file, 'r', newline='') as csv_handle:
    reader = csv.DictReader(csv_handle)

    # For each row in CSV, create a TF Example based on
    # the Schema and append to the list
    for line in reader:

      # Initialize example
      example = example_pb2.Example()

      # Loop through features in the schema
      for feature in schema.feature:

        # Get current feature name
        key = feature.name

        # DictReader pads rows that are shorter than the header with None;
        # treat such a cell like an empty one instead of failing on len(None).
        raw = line[key] or ''

        # Populate values based on data type of current feature
        if feature.type == schema_pb2.FLOAT:
          example.features.feature[key].float_list.value[:] = (
              [float(raw)] if len(raw) > 0 else [])
        elif feature.type == schema_pb2.INT:
          example.features.feature[key].int64_list.value[:] = (
              [int(raw)] if len(raw) > 0 else [])
        elif feature.type == schema_pb2.BYTES:
          example.features.feature[key].bytes_list.value[:] = (
              [raw.encode('utf8')] if len(raw) > 0 else [])

      # Append to the list
      examples.append(example)

  # Write examples to tfrecord file
  with tf.io.TFRecordWriter(tfrecord_file) as writer:
    for example in examples:
      writer.write(example.SerializeToString())

  return tfrecord_file

In [None]:
# Create tfrecord directory. Unlike `!mkdir`, this does not complain when the
# directory already exists, so the cell can be re-run.
os.makedirs(TFRECORD_DIR, exist_ok=True)

# Create list of tfrecord files, one per CSV file in CSV_DIR.
# - Only `*.csv` entries are converted, so stray entries such as
#   `.ipynb_checkpoints` are skipped instead of breaking the conversion.
# - The extension is swapped with splitext rather than str.replace, which
#   would also rewrite a 'csv' occurring elsewhere in the file name.
# - sorted() makes the processing and printing order deterministic.
tfrecord_files = [
    csv_to_tfrecord(
        SCHEMA,
        os.path.join(CSV_DIR, name),
        os.path.join(TFRECORD_DIR, os.path.splitext(name)[0] + '.tfrecord'))
    for name in sorted(os.listdir(CSV_DIR))
    if name.endswith('.csv')
]

# Print created files
print(f'files created: {tfrecord_files}')

# Create variables for each tfrecord
TFRECORD_FULL = os.path.join(TFRECORD_DIR, 'data_test.tfrecord')
TFRECORD_DAY1 = os.path.join(TFRECORD_DIR, 'data_test_1.tfrecord')
TFRECORD_DAY2 = os.path.join(TFRECORD_DIR, 'data_test_2.tfrecord')
TFRECORD_DAY3 = os.path.join(TFRECORD_DIR, 'data_test_3.tfrecord')

# Delete unneeded variable
del tfrecord_files

In [None]:
# list model directories
!ls {MODELS_DIR}

# Create string variables for each model directory
# NOTE(review): assumes MODELS_DIR contains model1, model2 and model3 —
# confirm against the `ls` output above.
MODEL1_FILE = os.path.join(MODELS_DIR, 'model1')
MODEL2_FILE = os.path.join(MODELS_DIR, 'model2')
MODEL3_FILE = os.path.join(MODELS_DIR, 'model3')

In [None]:
# Load model 1 from the model1 directory under MODELS_DIR.
# `model` is reused by the cells that follow.
model = tf.keras.models.load_model(MODEL1_FILE)

# Print summary. You can ignore the warnings at the start.
model.summary()

In [None]:
# Transformation layer can be accessed in two ways. These are equivalent.
# The attribute is `tft_layer` (singular), the same name the later cells call.
model.get_layer('transform_features_layer') is model.tft_layer

In [None]:
from tensorflow_transform.tf_metadata import schema_utils

# Read the day-1 TFRecord file as a dataset of serialized examples
tfrecord_file = tf.data.TFRecordDataset(TFRECORD_DAY1)

# Derive the parsing spec from the schema
feature_spec = schema_utils.schema_as_feature_spec(SCHEMA).feature_spec

# Only the first batch (a single record) is inspected
for records in tfrecord_file.batch(1).take(1):

  # Decode the serialized record into a dictionary of raw feature tensors
  parsed_examples = tf.io.parse_example(records, feature_spec)

  # Show every raw feature, label included
  print("\nRAW FEATURES:")
  for key in parsed_examples:
    value = parsed_examples[key]
    print(f'{key}: {value.numpy()}')

  # The model does not expect a label input, so drop it before transforming
  del parsed_examples['label']

  # Run the remaining raw features through the model's transform layer
  transformed_examples = model.tft_layer(parsed_examples)

  # Show what the model itself receives
  print("\nTRANSFORMED FEATURES:")
  for key in transformed_examples:
    value = transformed_examples[key]
    print(f'{key}: {value.numpy()}')

In [None]:
from tensorflow_transform.tf_metadata import schema_utils

# Load one tfrecord
tfrecord_file = tf.data.TFRecordDataset(TFRECORD_DAY1)

# Parse schema object as a feature spec
feature_spec = schema_utils.schema_as_feature_spec(SCHEMA).feature_spec

# Create a batch from the dataset (only the first batch of 5 records is used)
for records in tfrecord_file.batch(5).take(1):

  # Get the label values from the raw input.
  # pop() also removes 'label' from the dict, so only the features are
  # passed to the transform layer below.
  parsed_examples = tf.io.parse_example(records, feature_spec)
  y_true = parsed_examples.pop('label')
  print(f'labels:\n {y_true.numpy()}\n')
  
  # Transform the raw features and pass to the model to get predictions
  transformed_examples = model.tft_layer(parsed_examples)
  y_pred = model(transformed_examples)
  print(f'predictions:\n {y_pred.numpy()}\n')
  
  # Measure the binary accuracy of this one batch.
  # NOTE(review): 0.3 is the decision threshold applied to the predictions;
  # the reason for choosing 0.3 is not visible here — confirm.
  metric = tf.keras.metrics.BinaryAccuracy(threshold=0.3)
  metric.update_state(y_true, y_pred)
  print(f'binary accuracy: {metric.result().numpy()}\n')

In [None]:
# Read the day-1 TFRecord file as a dataset of serialized examples
tfrecord_file = tf.data.TFRecordDataset(TFRECORD_DAY1)

# Show which signatures the loaded model exposes
print(f'model signatures: {model.signatures}\n')

# Only the first batch of 5 serialized records is scored
for records in tfrecord_file.batch(5).take(1):

  # The serving signature takes the serialized batch directly
  serving_fn = model.signatures['serving_default']
  output = serving_fn(examples=records)

  # Predictions are returned under the 'output_0' key
  predictions = output['output_0']
  print(f"predictions:\n {predictions}\n")