In [1]:
try:
  import colab
  !pip install --upgrade pip
except:
  pass

In [2]:
print('Installing TensorFlow Data Validation')
!pip install --upgrade 'tensorflow_data_validation[visualization]<2'

Installing TensorFlow Data Validation
Collecting tensorflow_data_validation<2 (from tensorflow_data_validation[visualization]<2)
  Downloading tensorflow_data_validation-1.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pandas<2,>=1.0 (from tensorflow_data_validation<2->tensorflow_data_validation[visualization]<2)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pyarrow<11,>=10 (from tensorflow_data_validation<2->tensorflow_data_validation[visualization]<2)
  Downloading pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pyfarmhash<0.4,>=0.2.2 (from tensorflow_data_validation<2->tensorflow_data_validation[visualization]<2)
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... 

In [3]:
import os
import pandas as pd
import tensorflow as tf
import tempfile, urllib, zipfile
import tensorflow_data_validation as tfdv

tf.get_logger().setLevel('ERROR')

In [37]:
df = pd.read_csv('laptop_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [38]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Company,0
TypeName,0
Inches,0
ScreenResolution,0
Cpu,0
Ram,0
Memory,0
Gpu,0
OpSys,0


In [39]:
def prepare_data_splits_from_dataframe(df):
    '''
    Splits a Pandas Dataframe into training, evaluation and serving sets.

    Parameters:
            df : pandas dataframe to split

    Returns:
            train_df: Training dataframe(70% of the entire dataset)
            eval_df: Evaluation dataframe (15% of the entire dataset)
            serving_df: Serving dataframe (15% of the entire dataset, label column dropped)
    '''

    # 70% of records for generating the training set
    train_len = int(len(df) * 0.7)

    # Remaining 30% of records for generating the evaluation and serving sets
    eval_serv_len = len(df) - train_len

    # Half of the 30%, which makes up 15% of total records, for generating the evaluation set
    eval_len = eval_serv_len // 2

    # Remaining 15% of total records for generating the serving set
    serv_len = eval_serv_len - eval_len

    # Split the dataframe into the three subsets
    train_df = df.iloc[:train_len].reset_index(drop=True)
    eval_df = df.iloc[train_len: train_len + eval_len].reset_index(drop=True)
    serving_df = df.iloc[train_len + eval_len: train_len + eval_len + serv_len].reset_index(drop=True)

    # Serving data emulates the data that would be submitted for predictions, so it should not have the label column.
    serving_df = serving_df.drop(['Price'], axis=1)

    return train_df, eval_df, serving_df

In [40]:
train_df, eval_df, serving_df = prepare_data_splits_from_dataframe(df)
print('Training dataset has {} records\nValidation dataset has {} records\nServing dataset has {} records'.format(len(train_df),len(eval_df),len(serving_df)))

Training dataset has 912 records
Validation dataset has 195 records
Serving dataset has 196 records


In [41]:
features_to_remove = {"Unnamed: 0"}

allowed_cols = [col for col in df.columns if col not in features_to_remove]

stats_options = tfdv.StatsOptions(feature_allowlist=allowed_cols)

for feature in stats_options.feature_allowlist:
    print(feature)

Company
TypeName
Inches
ScreenResolution
Cpu
Ram
Memory
Gpu
OpSys
Weight
Price


### Visualize stats

In [42]:
train_stats = tfdv.generate_statistics_from_dataframe(train_df, stats_options=stats_options)
tfdv.visualize_statistics(train_stats)

In [43]:
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Company',STRING,required,,'Company'
'TypeName',STRING,required,,'TypeName'
'Inches',FLOAT,required,,-
'ScreenResolution',STRING,required,,'ScreenResolution'
'Cpu',STRING,required,,'Cpu'
'Ram',STRING,required,,'Ram'
'Memory',STRING,required,,'Memory'
'Gpu',STRING,required,,'Gpu'
'OpSys',STRING,required,,'OpSys'
'Weight',BYTES,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Company',"'Acer', 'Apple', 'Asus', 'Chuwi', 'Dell', 'Fujitsu', 'Google', 'HP', 'Huawei', 'LG', 'Lenovo', 'MSI', 'Mediacom', 'Microsoft', 'Razer', 'Samsung', 'Toshiba', 'Vero', 'Xiaomi'"
'TypeName',"'2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook', 'Ultrabook', 'Workstation'"
'ScreenResolution',"'1366x768', '1440x900', '1600x900', '1920x1080', '2560x1440', '4K Ultra HD / Touchscreen 3840x2160', '4K Ultra HD 3840x2160', 'Full HD / Touchscreen 1920x1080', 'Full HD 1920x1080', 'IPS Panel 1366x768', 'IPS Panel 2560x1440', 'IPS Panel 4K Ultra HD / Touchscreen 3840x2160', 'IPS Panel 4K Ultra HD 3840x2160', 'IPS Panel Full HD / Touchscreen 1920x1080', 'IPS Panel Full HD 1366x768', 'IPS Panel Full HD 1920x1080', 'IPS Panel Full HD 1920x1200', 'IPS Panel Full HD 2160x1440', 'IPS Panel Full HD 2560x1440', 'IPS Panel Quad HD+ / Touchscreen 3200x1800', 'IPS Panel Quad HD+ 2560x1440', 'IPS Panel Quad HD+ 3200x1800', 'IPS Panel Retina Display 2304x1440', 'IPS Panel Retina Display 2560x1600', 'IPS Panel Retina Display 2736x1824', 'IPS Panel Retina Display 2880x1800', 'IPS Panel Touchscreen / 4K Ultra HD 3840x2160', 'IPS Panel Touchscreen 1366x768', 'IPS Panel Touchscreen 1920x1200', 'IPS Panel Touchscreen 2560x1440', 'Quad HD+ / Touchscreen 3200x1800', 'Touchscreen / Full HD 1920x1080', 'Touchscreen / Quad HD+ 3200x1800', 'Touchscreen 1366x768', 'Touchscreen 2256x1504', 'Touchscreen 2400x1600', 'Touchscreen 2560x1440'"
'Cpu',"'AMD A10-Series 9600P 2.4GHz', 'AMD A10-Series 9620P 2.5GHz', 'AMD A10-Series A10-9620P 2.5GHz', 'AMD A12-Series 9700P 2.5GHz', 'AMD A12-Series 9720P 2.7GHz', 'AMD A12-Series 9720P 3.6GHz', 'AMD A4-Series 7210 2.2GHz', 'AMD A6-Series 7310 2GHz', 'AMD A6-Series 9220 2.5GHz', 'AMD A6-Series 9220 2.9GHz', 'AMD A6-Series A6-9220 2.5GHz', 'AMD A8-Series 7410 2.2GHz', 'AMD A9-Series 9420 2.9GHz', 'AMD A9-Series 9420 3GHz', 'AMD A9-Series A9-9420 3GHz', 'AMD E-Series 6110 1.5GHz', 'AMD E-Series 7110 1.8GHz', 'AMD E-Series 9000e 1.5GHz', 'AMD E-Series E2-6110 1.5GHz', 'AMD E-Series E2-9000 2.2GHz', 'AMD E-Series E2-9000e 1.5GHz', 'AMD FX 9830P 3GHz', 'AMD Ryzen 1600 3.2GHz', 'AMD Ryzen 1700 3GHz', 'Intel Atom Z8350 1.92GHz', 'Intel Atom x5-Z8300 1.44GHz', 'Intel Atom x5-Z8350 1.44GHz', 'Intel Atom x5-Z8550 1.44GHz', 'Intel Celeron Dual Core 3205U 1.5GHz', 'Intel Celeron Dual Core 3855U 1.6GHz', 'Intel Celeron Dual Core N3050 1.6GHz', 'Intel Celeron Dual Core N3060 1.60GHz', 'Intel Celeron Dual Core N3060 1.6GHz', 'Intel Celeron Dual Core N3350 1.1GHz', 'Intel Celeron Dual Core N3350 2.0GHz', 'Intel Celeron Quad Core N3160 1.6GHz', 'Intel Celeron Quad Core N3450 1.1GHz', 'Intel Celeron Quad Core N3710 1.6GHz', 'Intel Core M 1.2GHz', 'Intel Core M 6Y75 1.2GHz', 'Intel Core M 7Y30 1.0GHz', 'Intel Core M M3-6Y30 0.9GHz', 'Intel Core M m3 1.2GHz', 'Intel Core M m3-7Y30 2.2GHz', 'Intel Core i3 6006U 2.0GHz', 'Intel Core i3 6006U 2.2GHz', 'Intel Core i3 6006U 2GHz', 'Intel Core i3 6100U 2.1GHz', 'Intel Core i3 6100U 2.3GHz', 'Intel Core i3 7100U 2.4GHz', 'Intel Core i3 7130U 2.7GHz', 'Intel Core i5 1.3GHz', 'Intel Core i5 1.6GHz', 'Intel Core i5 1.8GHz', 'Intel Core i5 2.0GHz', 'Intel Core i5 2.3GHz', 'Intel Core i5 2.9GHz', 'Intel Core i5 3.1GHz', 'Intel Core i5 6200U 2.3GHz', 'Intel Core i5 6260U 1.8GHz', 'Intel Core i5 6300HQ 2.3GHz', 'Intel Core i5 6300U 2.4GHz', 'Intel Core i5 6440HQ 2.6GHz', 'Intel Core i5 7200U 2.5GHz', 'Intel Core i5 7300HQ 2.5GHz', 'Intel Core i5 7300U 2.6GHz', 'Intel Core i5 7440HQ 2.8GHz', 'Intel Core i5 7500U 2.7GHz', 'Intel Core i5 7Y54 1.2GHz', 'Intel Core i5 7Y57 1.2GHz', 'Intel Core i5 8250U 1.6GHz', 'Intel Core i7 2.2GHz', 'Intel Core i7 2.7GHz', 'Intel Core i7 2.8GHz', 'Intel Core i7 2.9GHz', 'Intel Core i7 6500U 2.5GHz', 'Intel Core i7 6600U 2.6GHz', 'Intel Core i7 6700HQ 2.6GHz', 'Intel Core i7 6820HK 2.7GHz', 'Intel Core i7 6820HQ 2.7GHz', 'Intel Core i7 6920HQ 2.9GHz', 'Intel Core i7 7500U 2.5GHz', 'Intel Core i7 7500U 2.7GHz', 'Intel Core i7 7560U 2.4GHz', 'Intel Core i7 7600U 2.8GHz', 'Intel Core i7 7660U 2.5GHz', 'Intel Core i7 7700HQ 2.7GHz', 'Intel Core i7 7700HQ 2.8GHz', 'Intel Core i7 7820HK 2.9GHz', 'Intel Core i7 7820HQ 2.9GHz', 'Intel Core i7 7Y75 1.3GHz', 'Intel Core i7 8550U 1.8GHz', 'Intel Core i7 8650U 1.9GHz', 'Intel Pentium Dual Core 4405U 2.1GHz', 'Intel Pentium Dual Core N4200 1.1GHz', 'Intel Pentium Quad Core N3710 1.6GHz', 'Intel Pentium Quad Core N4200 1.1GHz', 'Intel Xeon E3-1505M V6 3GHz', 'Intel Xeon E3-1535M v5 2.9GHz', 'Intel Xeon E3-1535M v6 3.1GHz'"
'Ram',"'12GB', '16GB', '24GB', '2GB', '32GB', '4GB', '6GB', '8GB'"
'Memory',"'1.0TB Hybrid', '128GB Flash Storage', '128GB HDD', '128GB SSD', '128GB SSD + 1TB HDD', '128GB SSD + 2TB HDD', '16GB Flash Storage', '16GB SSD', '180GB SSD', '1TB HDD', '1TB HDD + 1TB HDD', '1TB SSD', '1TB SSD + 1TB HDD', '240GB SSD', '256GB Flash Storage', '256GB SSD', '256GB SSD + 1TB HDD', '256GB SSD + 256GB SSD', '256GB SSD + 2TB HDD', '256GB SSD + 500GB HDD', '2TB HDD', '32GB Flash Storage', '32GB HDD', '32GB SSD', '500GB HDD', '512GB Flash Storage', '512GB SSD', '512GB SSD + 1TB HDD', '512GB SSD + 256GB SSD', '512GB SSD + 2TB HDD', '512GB SSD + 512GB SSD', '64GB Flash Storage', '64GB Flash Storage + 1TB HDD', '64GB SSD'"
'Gpu',"'AMD FirePro W4190M ', 'AMD FirePro W5130M', 'AMD R17M-M1-70', 'AMD R4 Graphics', 'AMD Radeon 520', 'AMD Radeon 530', 'AMD Radeon 540', 'AMD Radeon Pro 455', 'AMD Radeon Pro 555', 'AMD Radeon Pro 560', 'AMD Radeon R2', 'AMD Radeon R2 Graphics', 'AMD Radeon R3', 'AMD Radeon R4', 'AMD Radeon R4 Graphics', 'AMD Radeon R5', 'AMD Radeon R5 430', 'AMD Radeon R5 520', 'AMD Radeon R5 M420', 'AMD Radeon R5 M420X', 'AMD Radeon R5 M430', 'AMD Radeon R7', 'AMD Radeon R7 Graphics', 'AMD Radeon R7 M440', 'AMD Radeon R7 M445', 'AMD Radeon R7 M460', 'AMD Radeon R7 M465', 'AMD Radeon RX 540', 'AMD Radeon RX 550', 'AMD Radeon RX 560', 'AMD Radeon RX 580', 'Intel Graphics 620', 'Intel HD Graphics', 'Intel HD Graphics 400', 'Intel HD Graphics 405', 'Intel HD Graphics 500', 'Intel HD Graphics 505', 'Intel HD Graphics 510', 'Intel HD Graphics 515', 'Intel HD Graphics 520', 'Intel HD Graphics 530', 'Intel HD Graphics 5300', 'Intel HD Graphics 540', 'Intel HD Graphics 6000', 'Intel HD Graphics 615', 'Intel HD Graphics 620', 'Intel HD Graphics 630', 'Intel Iris Graphics 540', 'Intel Iris Graphics 550', 'Intel Iris Plus Graphics 640', 'Intel Iris Plus Graphics 650', 'Intel Iris Pro Graphics', 'Intel UHD Graphics 620', 'Nvidia GTX 980 SLI', 'Nvidia GeForce 150MX', 'Nvidia GeForce 920', 'Nvidia GeForce 920M', 'Nvidia GeForce 920MX', 'Nvidia GeForce 920MX ', 'Nvidia GeForce 930M', 'Nvidia GeForce 930MX', 'Nvidia GeForce 930MX ', 'Nvidia GeForce 940M', 'Nvidia GeForce 940MX', 'Nvidia GeForce GT 940MX', 'Nvidia GeForce GTX 1050', 'Nvidia GeForce GTX 1050 Ti', 'Nvidia GeForce GTX 1050M', 'Nvidia GeForce GTX 1050Ti', 'Nvidia GeForce GTX 1060', 'Nvidia GeForce GTX 1070', 'Nvidia GeForce GTX 1070M', 'Nvidia GeForce GTX 1080', 'Nvidia GeForce GTX 930MX', 'Nvidia GeForce GTX 940M', 'Nvidia GeForce GTX 940MX', 'Nvidia GeForce GTX 950M', 'Nvidia GeForce GTX 960', 'Nvidia GeForce GTX 960<U+039C>', 'Nvidia GeForce GTX 960M', 'Nvidia GeForce GTX 965M', 'Nvidia GeForce GTX 970M', 'Nvidia GeForce GTX 980M', 'Nvidia GeForce GTX1050 Ti', 'Nvidia GeForce GTX1060', 'Nvidia GeForce GTX1080', 'Nvidia GeForce MX130', 'Nvidia GeForce MX150', 'Nvidia Quadro M1000M', 'Nvidia Quadro M1200', 'Nvidia Quadro M2000M', 'Nvidia Quadro M2200', 'Nvidia Quadro M2200M', 'Nvidia Quadro M520M', 'Nvidia Quadro M620', 'Nvidia Quadro M620M'"
'OpSys',"'Android', 'Chrome OS', 'Linux', 'Mac OS X', 'No OS', 'Windows 10', 'Windows 10 S', 'Windows 7', 'macOS'"


In [44]:
eval_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=eval_df,
    stats_options=stats_options
)

tfdv.visualize_statistics(
    rhs_statistics=eval_stats,
    rhs_name="EVAL_STATS",
    lhs_statistics=train_stats,
    lhs_name="TRAIN_STATS"
)

### Check anomalies in the eval set

In [45]:
def calculate_and_display_anomalies(statistics, schema):
    anomalies = tfdv.validate_statistics(statistics, schema)
    tfdv.display_anomalies(anomalies)

In [46]:
calculate_and_display_anomalies(statistics=eval_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Memory',Unexpected string values,"Examples contain values missing from the schema: 1.0TB HDD (<1%), 508GB Hybrid (<1%), 512GB SSD + 1.0TB Hybrid (<1%), 8GB SSD (<1%)."
'ScreenResolution',Unexpected string values,Examples contain values missing from the schema: Quad HD+ 3200x1800 (~1%).
'Cpu',Unexpected string values,"Examples contain values missing from the schema: AMD FX 8800P 2.1GHz (<1%), Intel Atom X5-Z8350 1.44GHz (<1%), Intel Core M 1.1GHz (<1%), Intel Core M M7-6Y75 1.2GHz (<1%), Intel Core M m7-6Y75 1.2GHz (<1%), Intel Core i5 7200U 2.50GHz (<1%), Intel Core i5 7200U 2.70GHz (<1%), Intel Core i5 7200U 2.7GHz (<1%), Intel Core i7 6560U 2.2GHz (<1%), Intel Pentium Dual Core 4405Y 1.5GHz (<1%), Intel Pentium Quad Core N3700 1.6GHz (<1%)."
'Gpu',Unexpected string values,"Examples contain values missing from the schema: AMD FirePro W4190M (<1%), AMD FirePro W6150M (<1%), AMD Radeon R5 M330 (<1%), AMD Radeon R7 M365X (<1%), AMD Radeon R9 M385 (<1%), Intel HD Graphics 620 (<1%), Nvidia GeForce GTX 980 (<1%), Nvidia Quadro 3000M (<1%)."
'Ram',Unexpected string values,Examples contain values missing from the schema: 64GB (<1%).


In [47]:
ram_domain = tfdv.get_domain(schema, 'Ram')
print(ram_domain)
ram_domain.value.append("64GB")

screen_resolution_domain = tfdv.get_domain(schema, 'ScreenResolution')
print(screen_resolution_domain)
screen_resolution_domain.value.append("Quad HD+ 3200x1800")

calculate_and_display_anomalies(eval_stats, schema=schema)

name: "Ram"
value: "12GB"
value: "16GB"
value: "24GB"
value: "2GB"
value: "32GB"
value: "4GB"
value: "6GB"
value: "8GB"

name: "ScreenResolution"
value: "1366x768"
value: "1440x900"
value: "1600x900"
value: "1920x1080"
value: "2560x1440"
value: "4K Ultra HD / Touchscreen 3840x2160"
value: "4K Ultra HD 3840x2160"
value: "Full HD / Touchscreen 1920x1080"
value: "Full HD 1920x1080"
value: "IPS Panel 1366x768"
value: "IPS Panel 2560x1440"
value: "IPS Panel 4K Ultra HD / Touchscreen 3840x2160"
value: "IPS Panel 4K Ultra HD 3840x2160"
value: "IPS Panel Full HD / Touchscreen 1920x1080"
value: "IPS Panel Full HD 1366x768"
value: "IPS Panel Full HD 1920x1080"
value: "IPS Panel Full HD 1920x1200"
value: "IPS Panel Full HD 2160x1440"
value: "IPS Panel Full HD 2560x1440"
value: "IPS Panel Quad HD+ / Touchscreen 3200x1800"
value: "IPS Panel Quad HD+ 2560x1440"
value: "IPS Panel Quad HD+ 3200x1800"
value: "IPS Panel Retina Display 2304x1440"
value: "IPS Panel Retina Display 2560x1600"
value: "IPS Pa

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Memory',Unexpected string values,"Examples contain values missing from the schema: 1.0TB HDD (<1%), 508GB Hybrid (<1%), 512GB SSD + 1.0TB Hybrid (<1%), 8GB SSD (<1%)."
'Gpu',Unexpected string values,"Examples contain values missing from the schema: AMD FirePro W4190M (<1%), AMD FirePro W6150M (<1%), AMD Radeon R5 M330 (<1%), AMD Radeon R7 M365X (<1%), AMD Radeon R9 M385 (<1%), Intel HD Graphics 620 (<1%), Nvidia GeForce GTX 980 (<1%), Nvidia Quadro 3000M (<1%)."
'Cpu',Unexpected string values,"Examples contain values missing from the schema: AMD FX 8800P 2.1GHz (<1%), Intel Atom X5-Z8350 1.44GHz (<1%), Intel Core M 1.1GHz (<1%), Intel Core M M7-6Y75 1.2GHz (<1%), Intel Core M m7-6Y75 1.2GHz (<1%), Intel Core i5 7200U 2.50GHz (<1%), Intel Core i5 7200U 2.70GHz (<1%), Intel Core i5 7200U 2.7GHz (<1%), Intel Core i7 6560U 2.2GHz (<1%), Intel Pentium Dual Core 4405Y 1.5GHz (<1%), Intel Pentium Quad Core N3700 1.6GHz (<1%)."


In [53]:
gpu_list = ["AMD FirePro W4190M", "AMD FirePro W6150M", "AMD Radeon R5 M330", "AMD Radeon R7 M365X",
            "AMD Radeon R9 M385", "Intel HD Graphics 620 ", "Nvidia GeForce GTX 980",
            "Nvidia Quadro 3000M", "Nvidia GeForce GTX 980 "]

gpu_domain = tfdv.get_domain(schema, 'Gpu')
for gpu in gpu_list:
    gpu_domain.value.append(gpu)

memory_list = ["1.0TB HDD", "508GB Hybrid", "512GB SSD + 1.0TB Hybrid",  "8GB SSD"]
memory_domain = tfdv.get_domain(schema, 'Memory')
for memory in memory_list:
    memory_domain.value.append(memory)

cpu_list = ["AMD FX 8800P 2.1GHz", "Intel Atom X5-Z8350 1.44GHz", "Intel Core M 1.1GHz", "Intel Core M M7-6Y75 1.2GHz",
            "Intel Core M m7-6Y75 1.2GHz", "Intel Core i5 7200U 2.50GHz", "Intel Core i5 7200U 2.70GHz",
            "Intel Core i5 7200U 2.7GHz", "Intel Core i7 6560U 2.2GHz", "Intel Pentium Dual Core 4405Y 1.5GHz",
            "Intel Pentium Quad Core N3700 1.6GHz"]

cpu_domain = tfdv.get_domain(schema, 'Cpu')
for cpu in cpu_list:
    cpu_domain.value.append(cpu)

calculate_and_display_anomalies(eval_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Memory',Unexpected string values,Examples contain values missing from the schema: 512GB SSD + 1.0TB Hybrid (<1%).


In [54]:
memory = tfdv.get_feature(
    feature_path="Memory",
    schema=schema
)

memory.distribution_constraints.min_domain_mass = 0.9

calculate_and_display_anomalies(eval_stats, schema=schema)

In [55]:
print(gpu_domain)
print("===============================================")

print(cpu_domain)
print("===============================================")

print(memory_domain)

name: "Gpu"
value: "AMD FirePro W4190M "
value: "AMD FirePro W5130M"
value: "AMD R17M-M1-70"
value: "AMD R4 Graphics"
value: "AMD Radeon 520"
value: "AMD Radeon 530"
value: "AMD Radeon 540"
value: "AMD Radeon Pro 455"
value: "AMD Radeon Pro 555"
value: "AMD Radeon Pro 560"
value: "AMD Radeon R2"
value: "AMD Radeon R2 Graphics"
value: "AMD Radeon R3"
value: "AMD Radeon R4"
value: "AMD Radeon R4 Graphics"
value: "AMD Radeon R5"
value: "AMD Radeon R5 430"
value: "AMD Radeon R5 520"
value: "AMD Radeon R5 M420"
value: "AMD Radeon R5 M420X"
value: "AMD Radeon R5 M430"
value: "AMD Radeon R7"
value: "AMD Radeon R7 Graphics"
value: "AMD Radeon R7 M440"
value: "AMD Radeon R7 M445"
value: "AMD Radeon R7 M460"
value: "AMD Radeon R7 M465"
value: "AMD Radeon RX 540"
value: "AMD Radeon RX 550"
value: "AMD Radeon RX 560"
value: "AMD Radeon RX 580"
value: "Intel Graphics 620"
value: "Intel HD Graphics"
value: "Intel HD Graphics 400"
value: "Intel HD Graphics 405"
value: "Intel HD Graphics 500"
value: "

### Check anomalies in the serving set



In [56]:
options = tfdv.StatsOptions(
    schema = schema,
    infer_type_from_schema = True,
    feature_allowlist = allowed_cols
)

serving_stats = tfdv.generate_statistics_from_dataframe(serving_df, stats_options=options)
calculate_and_display_anomalies(serving_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Price',Column dropped,Column is completely missing
'ScreenResolution',Unexpected string values,"Examples contain values missing from the schema: IPS Panel Touchscreen 2400x1600 (<1%), Touchscreen / 4K Ultra HD 3840x2160 (<1%)."
'Cpu',Unexpected string values,"Examples contain values missing from the schema: AMD A9-Series 9410 2.9GHz (~1%), AMD E-Series 9000 2.2GHz (<1%), Intel Celeron Dual Core N3350 2GHz (~1%), Intel Core M 6Y30 0.9GHz (~1%), Intel Core M 6Y54 1.1GHz (<1%), Intel Core i7 6500U 2.50GHz (~1%), Samsung Cortex A72&A53 2.0GHz (<1%)."
'Gpu',Unexpected string values,"Examples contain values missing from the schema: AMD Radeon R5 M315 (<1%), AMD Radeon R7 M360 (<1%), ARM Mali T860 MP4 (<1%), Nvidia GeForce 960M (<1%), Nvidia Quadro M3000M (<1%), Nvidia Quadro M500M (<1%)."


In [58]:
screen_resolution = tfdv.get_feature(schema, "ScreenResolution")
screen_resolution.distribution_constraints.min_domain_mass = 0.9

cpu = tfdv.get_feature(schema, "Cpu")
cpu.distribution_constraints.min_domain_mass = 0.9

gpu = tfdv.get_feature(schema, "Gpu")
gpu.distribution_constraints.min_domain_mass = 0.9

calculate_and_display_anomalies(serving_stats, schema=schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Price',Column dropped,Column is completely missing


The `Price` feature (which is the label column) is showing up as an anomaly ('Column dropped').

Since labels are not expected in the serving data, as we are going to predict its value, we need to tell TFDV to ignore this detected anomaly.

In [62]:
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

tfdv.get_feature(schema, 'Price').not_in_environment.append('SERVING')

serving_anomalies_with_env = tfdv.validate_statistics(serving_stats, schema, environment='SERVING')
tfdv.display_anomalies(serving_anomalies_with_env)

### Freeze the schema

In [63]:
schema_file = os.path.join("./", 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)