In [1]:
import tensorflow_data_validation as tfdv
from google.protobuf import text_format
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..')

from analyzers import DataType 

# Seed NumPy's global RNG so the randomly drawn perturbations are reproducible.
# `seed` has to be called: assigning to it would merely overwrite the function.
np.random.seed(1)

from ssc.hilda.datasets import *
from ssc.hilda.perturbations import *
from ssc.hilda.learners import *
from ssc.hilda.experiments import *

import warnings
warnings.simplefilter("ignore")

In [2]:
# Dataset under test. Alternatives that were tried:
#   dataset = CardioDataset()
#   dataset = AdultDataset()
dataset = BalancedAdultDataset()

def gen_perturbations():
    """Yield randomized ``MissingValues`` perturbations over the categorical columns.

    For each requested number of affected columns (1 to 4) and each fraction in
    the list below, 100 perturbations are produced (2400 in total). Every
    perturbation gets a fresh random draw of columns and ``-1`` as third argument.

    The columns are drawn *without* replacement. ``np.random.choice`` samples
    with replacement by default, which can return the same column several times
    and thus affect fewer columns than requested.
    """
    candidate_columns = dataset.categorical_columns
    for num_columns_affected in range(1, 5):
        # Never ask for more distinct columns than the dataset offers.
        num_to_draw = min(num_columns_affected, len(candidate_columns))
        for fraction_of_values_to_delete in [0.0, 0.05, 0.25, 0.5, 0.75, 0.99]:
            for _ in range(100):
                columns_affected = np.random.choice(candidate_columns, num_to_draw, replace=False)
                yield MissingValues(fraction_of_values_to_delete, columns_affected, -1)

# Two independent (lazy) perturbation streams: the first is handed over as the
# training perturbations, the second as the evaluation perturbations
perturbations_for_training, perturbations_for_evaluation = gen_perturbations(), gen_perturbations()

# Name of this perturbation scenario (it shows up in the result/figure file names)
perturbations_name = "missing_values_at_random"

# Learner to use. Alternatives that were tried:
#   learner = DNN('accuracy')
#   learner = LogisticRegression('roc_auc')
learner = LogisticRegression('accuracy')

# Run the experiment
log_line, model, mse, mae = reapply_perturbations(
    dataset,
    learner,
    perturbations_for_training,
    perturbations_for_evaluation,
    perturbations_name,
)

# Uncomment to inspect the raw result line:
# print("----------------------------------------------------------------------------------------------")
# print(log_line)


Training model on perturbed data.
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.7min finished


('accuracy', 'on train data: ', 0.8055001992825827)
('accuracy', 'on test data: ', 0.8188775510204082)
('accuracy', 'on target data: ', 0.7954110898661568)

Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
MSE 0.00043, MAE 0.0182
Writing plot to /home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../results/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.tsv
reapply_perturbations	adult_income_balanced	0.8055001992825827	0.8188775510204082	0.7954110898661568	logistic_regression	accuracy	missing_values_at_random	0.00043434546984068523	0.018177884558061585	/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf


### TFDV checks under data corruption: 40% missing values in 3 columns

In [3]:
# Directory and file name of the dataset; os.path.split also copes with paths
# that manual '/' splitting gets wrong (e.g. a file directly under the root)
data_path, file_name = os.path.split(dataset.path)

X_train, X_test, X_target = learner.split(dataset.df)

# All intermediate CSV files go to <data_path>/tmp; create it on first use
if not os.path.isdir(os.path.join(data_path, 'tmp')):
    os.makedirs(os.path.join(data_path, 'tmp'))

# save train and test data and generate tfdv stats based on these csv files
X_train.to_csv(os.path.join(data_path, 'tmp/X_train.csv'))
X_test.to_csv(os.path.join(data_path, 'tmp/X_test.csv'))

train_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_train.csv'), delimiter=',')
test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_test.csv'), delimiter=',')

# 40% missing values in 3 distinct columns from the list of categorical features, saved to a csv file.
# replace=False is required: by default np.random.choice samples with replacement and
# may return the same column more than once, corrupting fewer than 3 columns.
columns_affected = np.random.choice(
    dataset.categorical_columns,
    min(3, len(dataset.categorical_columns)),
    replace=False,
)
corrupted_X_test = MissingValues(.4, columns_affected, -1).transform(X_test)
corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/corrupted_test_missing.csv'))

# Generating stats based on the corrupted data
corrupted_test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/corrupted_test_missing.csv'), delimiter=',')

# Inferring the schema from the clean training stats and checking the corrupted stats for violations
schema = tfdv.infer_schema(train_stats)
anomalies = tfdv.validate_statistics(statistics=corrupted_test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'occupation',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'marital_status',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'workclass',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).


### Data-linter helper utilities

In [4]:
class TFRecordHelper:
    """Singleton-style helper that appends single values to ``tf.train.Example`` features.

    All ``TFRecordHelper()`` objects share one inner helper, created the first
    time the class is instantiated. Attribute lookups that fail on the outer
    object (``run``, ``foo``, ``data_type``) are forwarded to that shared helper.
    """

    class __TFRecordHelper:
        """Holds the dispatch tables and performs the actual feature writes."""

        def __init__(self):
            # DataType -> function appending value `y` to the matching value list of feature `x`.
            # bytes_list only accepts byte strings, so string/object values are converted first
            # (unencoded text raises TypeError under Python 3).
            self.foo = {
                DataType.STRING: lambda x, y: x.bytes_list.value.extend([TFRecordHelper.to_bytes(y)]),
                DataType.INTEGER: lambda x, y: x.int64_list.value.extend([y]),
                DataType.FLOAT: lambda x, y: x.float_list.value.extend([y]),
                DataType.OBJECT: lambda x, y: x.bytes_list.value.extend([TFRecordHelper.to_bytes(y)])
            }
            # dtype name (as produced by str(dtype)) -> DataType
            self.data_type = {
                'int': DataType.INTEGER,
                'int32': DataType.INTEGER,
                'int64': DataType.INTEGER,
                'float': DataType.FLOAT,
                'float32': DataType.FLOAT,
                'float64': DataType.FLOAT,
                'byte': DataType.OBJECT,
                # 'string': DataType.STRING,
                'object': DataType.OBJECT
            }

        def run(self, example, feature_name, dtype, val):
            """Append ``val`` to feature ``feature_name`` of ``example``.

            Args:
                example: object exposing ``features.feature[feature_name]``
                    (a ``tf.train.Example`` in this notebook).
                feature_name: key of the feature to write to.
                dtype: a ``DataType`` member, or anything whose ``str()`` is a
                    key of ``self.data_type`` (e.g. a numpy/pandas dtype).
                val: the value to append.

            Raises:
                KeyError: if ``dtype`` is not a ``DataType`` and its name is unknown.
            """
            if not isinstance(dtype, DataType):
                dtype_name = str(dtype)
                if dtype_name not in self.data_type:
                    raise KeyError('Unsupported dtype %r for feature %r' % (dtype_name, feature_name))
                dtype = self.data_type[dtype_name]
            return self.foo[dtype](example.features.feature[feature_name], val)

    # The shared inner helper; None until the first TFRecordHelper() is created
    instance = None

    def __init__(self):
        if not TFRecordHelper.instance:
            TFRecordHelper.instance = TFRecordHelper.__TFRecordHelper()

    def __getattr__(self, name):
        # Only reached when normal lookup fails: delegate to the shared helper
        return getattr(self.instance, name)

    @staticmethod
    def to_bytes(value):
        """Return ``value`` as a byte string.

        Byte strings pass through unchanged, text is UTF-8 encoded, and any other
        object (e.g. a number or a NaN found in an object column) is converted
        with ``str()`` first.
        """
        if not isinstance(value, (bytes, type(u''))):
            value = str(value)
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')


def convert_csv_to_tfrecord(data_path, file_name, dtypes=None):
    """Convert a CSV file into a TFRecord file with one ``tf.train.Example`` per row.

    Args:
        data_path: directory that ``file_name`` is relative to.
        file_name: path of the CSV file, relative to ``data_path``.
        dtypes: optional sequence with one type per column, in column order
            (anything ``TFRecordHelper.run`` accepts). Defaults to the dtypes
            pandas infers when reading the file.

    Returns:
        Path of the written file: the CSV path with its extension replaced by
        ``.tfrecords``.
    """
    # splitext strips only the final extension; split('.')[0] would cut the path
    # at the first dot anywhere in it (e.g. '../x.csv' or 'a.b.csv').
    filename = os.path.join(data_path, os.path.splitext(file_name)[0] + '.tfrecords')
    data = pd.read_csv(os.path.join(data_path, file_name))
    helper = TFRecordHelper()
    columns = data.columns
    # Plain list: positional [j] on a label-indexed Series is deprecated in pandas.
    dtypes = list(data.dtypes if dtypes is None else dtypes)

    # tf.io.TFRecordWriter is the current name; tf.python_io exists only in TF 1.x.
    try:
        writer_cls = tf.io.TFRecordWriter
    except AttributeError:
        writer_cls = tf.python_io.TFRecordWriter

    with writer_cls(filename) as writer:
        # itertuples avoids one iloc scalar lookup per cell
        for row in data.itertuples(index=False, name=None):
            example = tf.train.Example()
            for j, value in enumerate(row):
                helper.run(example, columns[j], dtypes[j], value)
            writer.write(example.SerializeToString())
    return filename

In [5]:
# data-linter works on TFRecord files, so convert the train/test CSVs (in that order)
train_tfrecord_filename, test_tfrecord_filename = (
    convert_csv_to_tfrecord(data_path, csv_name)
    for csv_name in ('tmp/X_train.csv', 'tmp/X_test.csv')
)

In [6]:
# python2 required to work with data-linter, so its scripts are launched with a separate interpreter
# NOTE(review): absolute, machine-specific interpreter path -- has to be adapted on any other machine
python = ["/home/reds/install/miniconda3/envs/python2/bin/python"]

# `_dh` is read from the notebook's globals; presumably IPython's directory history, whose
# first entry is the directory the notebook was started in -- TODO confirm
dir_path = os.path.join(globals()['_dh'][0], '../third_party/data-linter')

# Step 1: summarize the training TFRecords; the stats are written to /tmp/adult_summary.bin
!{python[0]} {dir_path}/demo/summarize_data.py --dataset_path {train_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --dataset_name adult

# Step 2: run the linters on the test TFRecords, using the stats file from step 1
!{python[0]} {dir_path}/data_linter_main.py --dataset_path {test_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --results_path /tmp/datalinter/results/lint_results.bin

# Step 3: print the lint results written by step 2
!{python[0]} {dir_path}/lint_explorer_main.py --results_path /tmp/datalinter/results/lint_results.bin

# linters are activated, detect anomalies in the data, and explain how these anomalies might be handled

The following linter(s) triggered on your dataset:
* NonNormalNumericFeatureDetector
* TailedDistributionDetector


NonNormalNumericFeatureDetector
A feature flagged by this linter has a distribution that varies significantly
from the other numeric features.
Especially for linear models, poorly scaled features with high variance
(e.g., all but one are in the range [-10, 10] but one is in [0, 100000])
can wash out the effects of the other features.

Quickfix: use the [standard score](https://en.wikipedia.org/wiki/Standard_score)
of (at least) the flagged features.
-----
A 'typical' numeric feature in the dataset has mean 2.97e+04 and std dev 17801 but
* fnlwgt had mean = 1.8911e+05, std_dev = 1.0417e+05


TailedDistributionDetector
A feature flagged by this linter has an extremal value that significantly
affects the mean. This may be because the value is an outlier but it may also
be due to the extremal value being very common. In either case, however, it
would be beneficial to check th

In [7]:
from jupyter_decorator import validate_on

def gen_perturbations():
    """Yield randomized ``MissingValues`` perturbations over the categorical columns.

    For each requested number of affected columns (1 to 4) and each fraction in
    the list below, 100 perturbations are produced (1600 in total). Every
    perturbation gets a fresh random draw of columns and ``-1`` as third argument.

    The columns are drawn *without* replacement. ``np.random.choice`` samples
    with replacement by default, which can return the same column several times
    and thus affect fewer columns than requested.
    """
    candidate_columns = dataset.categorical_columns
    for num_columns_affected in range(1, 5):
        # Never ask for more distinct columns than the dataset offers.
        num_to_draw = min(num_columns_affected, len(candidate_columns))
        for fraction_of_values_to_delete in [0.01, 0.05, 0.25, 0.4]:
            for _ in range(100):
                columns_affected = np.random.choice(candidate_columns, num_to_draw, replace=False)
                yield MissingValues(fraction_of_values_to_delete, columns_affected, -1)

# Labels of the test and target splits, as extracted by the dataset
y_test, y_target = dataset.labels_from(X_test), dataset.labels_from(X_target)


# validate_on wraps the function below, which builds and trains the model and returns the learner
@validate_on(X_test, y_test, X_target, y_target)
def learner_foo():
    """Fit a logistic regression on ``X_train`` and return the learner.

    ``X_train`` is taken from the notebook namespace. The fitted model is
    attached as ``.model`` and a fresh perturbation generator as
    ``.perturbations`` (presumably consumed by ``validate_on`` -- TODO confirm).
    """
    adult_data = BalancedAdultDataset()
    fitted_learner = LogisticRegression('accuracy')
    fitted_learner.model = fitted_learner.fit(adult_data, X_train)
    fitted_learner.perturbations = gen_perturbations()
    return fitted_learner


learner = learner_foo()

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished



Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
(array([0.04170154]), array([0.05135771]))


### 70% missing values

In [8]:
# Corrupt every categorical column of the test split (MissingValues with fraction 0.7)
columns_affected = dataset.categorical_columns
new_corrupted_X_test = MissingValues(0.7, columns_affected, -1).transform(X_test)
new_corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/new_corrupted_test.csv'))

# The schema is inferred from the statistics of the clean training data
train_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/X_train.csv'), delimiter=',')
schema = tfdv.infer_schema(train_stats)

# Statistics of the clean and of the corrupted test data
test_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/X_test.csv'), delimiter=',')
corrupted_test_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/new_corrupted_test.csv'), delimiter=',')

# Validate the corrupted statistics against the schema and show the violations
anomalies = tfdv.validate_statistics(schema=schema, statistics=corrupted_test_stats)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'occupation',Unexpected string values,Examples contain values missing from the schema: -1 (~70%).
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~70%).
'marital_status',Unexpected string values,Examples contain values missing from the schema: -1 (~70%).
'workclass',Unexpected string values,Examples contain values missing from the schema: -1 (~70%).


In [10]:
def gen_perturbations():
    """Yield ``MissingValues`` perturbations that target all categorical columns at once.

    The sweep over the fractions 0.7, 0.8 and 0.9 (100 perturbations each) is
    repeated four times, giving 1200 perturbations in total. Each one receives
    ``dataset.categorical_columns`` and ``-1`` as third argument.
    """
    fractions = [0.7, 0.8, 0.9]
    # The repetition index is not used: the outer loop only repeats the sweep
    for _repetition in range(4):
        for fraction in fractions:
            remaining = 100
            while remaining > 0:
                yield MissingValues(fraction, dataset.categorical_columns, -1)
                remaining -= 1

@validate_on(X_test, y_test, X_target, y_target)
def learner_foo():
    """Fit a logistic regression on ``X_train`` and return the learner.

    ``X_train`` is taken from the notebook namespace. The fitted model is
    attached as ``.model`` and a fresh perturbation generator as
    ``.perturbations`` (presumably consumed by ``validate_on`` -- TODO confirm).
    """
    adult_data = BalancedAdultDataset()
    fitted_learner = LogisticRegression('accuracy')
    fitted_learner.model = fitted_learner.fit(adult_data, X_train)
    fitted_learner.perturbations = gen_perturbations()
    return fitted_learner


learner = learner_foo()

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.8min finished



Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
(array([0.20845324]), array([0.25672146]))


In [None]:
# Corrupt every numerical column of the test split (Outliers with fraction 0.1)
columns_affected = dataset.numerical_columns
new_corrupted_X_test = Outliers(0.1, columns_affected).transform(X_test)
new_corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/corrupted_test_anomalies.csv'))

# The schema is inferred from the statistics of the clean training data
train_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/X_train.csv'), delimiter=',')
schema = tfdv.infer_schema(train_stats)

# Statistics of the clean and of the corrupted test data
test_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/X_test.csv'), delimiter=',')
corrupted_test_stats = tfdv.generate_statistics_from_csv(
    os.path.join(data_path, 'tmp/corrupted_test_anomalies.csv'), delimiter=',')

# Validate the corrupted statistics against the schema and show the violations
anomalies = tfdv.validate_statistics(schema=schema, statistics=corrupted_test_stats)
tfdv.display_anomalies(anomalies)

def gen_perturbations():
    """Yield ``Outliers`` perturbations that target all numerical columns at once.

    The sweep over the fractions 0.1, 0.05 and 0.25 (100 perturbations each) is
    repeated four times, giving 1200 perturbations in total.
    """
    outlier_fractions = [0.1, 0.05, 0.25]
    # The repetition index is not used: the outer loop only repeats the sweep
    for _repetition in range(4):
        for outlier_fraction in outlier_fractions:
            remaining = 100
            while remaining > 0:
                yield Outliers(outlier_fraction, dataset.numerical_columns)
                remaining -= 1

@validate_on(X_test, y_test, X_target, y_target)
def learner_foo():
    """Fit a logistic regression on ``X_train`` and return the learner.

    ``X_train`` is taken from the notebook namespace. The fitted model is
    attached as ``.model`` and a fresh perturbation generator as
    ``.perturbations`` (presumably consumed by ``validate_on`` -- TODO confirm).
    """
    adult_data = BalancedAdultDataset()
    fitted_learner = LogisticRegression('accuracy')
    fitted_learner.model = fitted_learner.fit(adult_data, X_train)
    fitted_learner.perturbations = gen_perturbations()
    return fitted_learner


learner = learner_foo()

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.4min finished



Training meta regressor on perturbed test data.


In [None]:
dataset = TrollingDataset()
# Directory and file name of the dataset (os.path.split instead of manual '/' splitting)
data_path, file_name = os.path.split(dataset.path)
# NOTE: `learner` is the one left over from the previous cells; only its split() is used here
X_train, X_test, X_target = learner.split(dataset.df)

# All intermediate CSV files go to <data_path>/tmp; create it on first use
if not os.path.isdir(os.path.join(data_path, 'tmp')):
    os.makedirs(os.path.join(data_path, 'tmp'))

# This dataset is written tab-separated. The corrupted split must use the same
# separator as train/test, so that the schema inferred from the training file
# and the statistics of the corrupted file come from identically formatted input.
X_train.to_csv(os.path.join(data_path, 'tmp/X_train.csv'), sep='\t')
X_test.to_csv(os.path.join(data_path, 'tmp/X_test.csv'), sep='\t')
new_corrupted_X_test = Leetspeak(.01, 'content', 'label', 1).transform(X_test)
new_corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/corrupted_test_adversarial.csv'), sep='\t')

train_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_train.csv'), delimiter='\t')
test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_test.csv'), delimiter='\t')

corrupted_test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/corrupted_test_adversarial.csv'), delimiter='\t')

# Infer the schema from the clean training stats and check the corrupted stats against it
schema = tfdv.infer_schema(train_stats)
anomalies = tfdv.validate_statistics(statistics=corrupted_test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# Labels of the test and target splits, as extracted by the dataset
y_test = dataset.labels_from(X_test)
y_target = dataset.labels_from(X_target)

def gen_perturbations():
    """Yield ``Leetspeak`` perturbations: 500 for each of the fractions 0.1, 0.05 and 0.25.

    Every perturbation is created with the arguments ``'content'``, ``'label'``
    and ``1`` after the fraction (1500 perturbations in total).
    """
    for fraction in (0.1, 0.05, 0.25):
        remaining = 500
        while remaining > 0:
            yield Leetspeak(fraction, 'content', 'label', 1)
            remaining -= 1

@validate_on(X_test, y_test, X_target, y_target)
def learner_foo():
    """Fit a logistic regression on ``X_train`` and return the learner.

    ``X_train`` is taken from the notebook namespace. The fitted model is
    attached as ``.model`` and a fresh perturbation generator as
    ``.perturbations`` (presumably consumed by ``validate_on`` -- TODO confirm).
    """
    trolling_data = TrollingDataset()
    fitted_learner = LogisticRegression('accuracy')
    fitted_learner.model = fitted_learner.fit(trolling_data, X_train)
    fitted_learner.perturbations = gen_perturbations()
    return fitted_learner


learner = learner_foo()