In [None]:
## Create a dummy data set

In [1]:
import pandas as pd
import numpy as np
import tensorflow_data_validation as tfdv
 
# Draw from a dedicated, seeded generator so the dummy training data is
# identical on every Restart & Run All instead of changing with each run.
train_rng = np.random.default_rng(42)

# 100 rows: two string columns and one integer column, each sampled
# uniformly from a small fixed set of values.
df = pd.DataFrame({
    'Name': train_rng.choice(['Tom', 'John', 'Harry'], 100),
    'Age': train_rng.choice([15, 27, 18, 20, 25, 16, 10], 100),
    'Region': train_rng.choice(['America', 'Europe'], 100),
})

df.head()

Unnamed: 0,Name,Age,Region
0,Harry,18,America
1,Harry,27,America
2,John,20,America
3,Harry,20,America
4,John,25,America


In [2]:
# Confirm the frame's dimensions as (rows, columns).
df.shape

(100, 3)

## Create the schema

In [4]:
# Compute summary statistics for the training frame, then let TFDV infer a
# schema from those statistics.
df_stats = tfdv.generate_statistics_from_dataframe(dataframe=df)
schema = tfdv.infer_schema(statistics=df_stats)

# Keep the schema as the cell's last expression so it is displayed.
schema

feature {
  name: "Name"
  type: BYTES
  domain: "Name"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Age"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Region"
  type: BYTES
  domain: "Region"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
string_domain {
  name: "Name"
  value: "Harry"
  value: "John"
  value: "Tom"
}
string_domain {
  name: "Region"
  value: "America"
  value: "Europe"
}

## Display the schema in a tabular format that is easier to read

In [5]:
# Render the inferred schema as tables rather than raw protobuf text.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Name',STRING,required,,'Name'
'Age',INT,required,,-
'Region',STRING,required,,'Region'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Name',"'Harry', 'John', 'Tom'"
'Region',"'America', 'Europe'"


## Create dummy test data: add a new column, introduce new values in existing columns, and change the type of `Age`

In [7]:
# Draw from a dedicated, seeded generator so the test data is identical on
# every Restart & Run All instead of changing with each run.
test_rng = np.random.default_rng(7)

# 50 rows that deliberately differ from the training frame:
#   - 'Name' and 'Region' each include one value absent from the training
#     data ('Sam' and 'Asia'),
#   - 'Age' is drawn from a standard normal, so it is float rather than int,
#   - 'gender' is an extra constant column.
# The extra column is added with .assign() so the frame is built in one
# expression instead of being mutated after construction.
df_test = pd.DataFrame({
    'Name': test_rng.choice(['Tom', 'John', 'Harry', 'Sam'], 50),
    'Age': test_rng.standard_normal(50),
    'Region': test_rng.choice(['America', 'Europe', 'Asia'], 50),
}).assign(gender='Male')

df_test.head()

Unnamed: 0,Name,Age,Region,gender
0,Harry,-1.722624,Asia,Male
1,Sam,-0.345447,America,Male
2,Sam,0.858938,Asia,Male
3,Sam,0.193204,America,Male
4,John,1.037678,America,Male


In [9]:
## Detect anomalies using TFDV

In [10]:
# Compute statistics for the test frame, then validate them against the
# schema inferred from the training data.
new_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_test)
anomalies = tfdv.validate_statistics(new_stats, schema)

# Show every detected deviation from the schema as a table.
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'gender',New column,New column (column in data but not in schema)
'Region',Unexpected string values,Examples contain values missing from the schema: Asia (~26%).
'Age',Unexpected data type,Expected data of type: INT but got FLOAT
'Name',Unexpected string values,Examples contain values missing from the schema: Sam (~30%).
