# TFDV with Titanic Data

## 1. Import Packages

In [1]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
# Record the library versions this notebook was run with, so the outputs
# below can be matched to a specific TF / TFDV release.
for label, module in (('TF version : ', tf), ('TFDV version : ', tfdv)):
    print(label, module.__version__)

TF version :  2.8.0
TFDV version :  1.7.0


## 2. Set Data Path

In [2]:
import os

BASE_DIR = os.path.join('D:\Study\ML\So1s-Study\일섭\week2', 'titanic')
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
TRAIN_DATA = os.path.join(DATA_DIR, 'train.csv')
TEST_DATA = os.path.join(DATA_DIR, 'test.csv')

os.path.isdir(BASE_DIR), os.path.isdir(DATA_DIR), os.path.isdir(OUTPUT_DIR), os.path.isfile(TRAIN_DATA), os.path.isfile(TEST_DATA)

(True, True, True, True, True)

## 3. Train Data Statistics

In [3]:
# Compute descriptive statistics over the training CSV and render them with
# TFDV's interactive statistics viewer.
train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)





Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


## 4. Train Data Schema Check

In [4]:
# Infer a schema (feature types, presence, domains) from the training
# statistics and show it as a table.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Name',BYTES,required,,-
'Sex',STRING,required,,'Sex'
'Ticket',BYTES,required,,-
'Cabin',BYTES,optional,single,-
'Embarked',STRING,optional,single,'Embarked'
'PassengerId',INT,required,,-
'Survived',INT,required,,-
'Pclass',INT,required,,-
'Age',FLOAT,optional,single,-
'SibSp',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Sex',"'female', 'male'"
'Embarked',"'C', 'Q', 'S'"


## 5. Compare Train with Test

In [5]:
# Compute statistics for the test CSV and overlay them on the training
# statistics so the two distributions can be compared side by side.
test_stats = tfdv.generate_statistics_from_csv(TEST_DATA)
tfdv.visualize_statistics(
    lhs_statistics=test_stats,
    lhs_name="TEST_DATA",
    rhs_statistics=train_stats,
    rhs_name="TRAIN_DATA",
)



## 6. Check Anomalies (Test Data)

In [6]:
# Validate the test statistics against the schema inferred from the training
# data and list every anomaly that is found.
anomalies = tfdv.validate_statistics(test_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Survived',Column dropped,Column is completely missing
'Fare',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.997608"


In [7]:
# Regenerate the test statistics with the schema supplied through
# StatsOptions (infer_type_from_schema=True), then validate them again.
options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
test_stats = tfdv.generate_statistics_from_csv(TEST_DATA, stats_options=options)
test_anomalies = tfdv.validate_statistics(test_stats, schema)

# Declare the TRAIN and TEST environments on the schema. The membership
# checks keep this cell idempotent: re-running it must not append duplicate
# environment names to the schema.
for env in ('TRAIN', 'TEST'):
    if env not in schema.default_environment:
        schema.default_environment.append(env)

# 'Survived' is reported as "Column dropped" for the test data, so exclude
# it from the TEST environment instead of treating its absence as an anomaly.
survived = tfdv.get_feature(schema, 'Survived')
if 'TEST' not in survived.not_in_environment:
    survived.not_in_environment.append('TEST')

# Validate once more, this time in the TEST environment, and show what remains.
test_anomalies_with_env = tfdv.validate_statistics(test_stats, schema, environment='TEST')

tfdv.display_anomalies(test_anomalies_with_env)



Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Fare',Multiple errors,"The feature has a shape, but it's not always present (if the feature is nested, then it should always be present at each nested level) or its value lengths vary. The feature was present in fewer examples than expected: minimum fraction = 1.000000, actual = 0.997608"


In [8]:
# Add a skew comparator for the 'Sex' feature: flag it when the L-infinity
# distance between the two compared distributions exceeds the threshold.
sex = tfdv.get_feature(schema, 'Sex')
sex.skew_comparator.infinity_norm.threshold = 0.01

# A drift comparator for 'Sex' could be configured the same way (kept
# disabled here):
# sex2 = tfdv.get_feature(schema, 'Sex')
# sex2.drift_comparator.infinity_norm.threshold = 0.01

# Compare the training statistics against the test statistics, with the
# latter passed in the "serving" role.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=test_stats,
)

tfdv.display_anomalies(skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Sex',High Linfty distance between training and serving,"The Linfty distance between training and serving is 0.0112233 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: male"
