In [1]:
import tensorflow_data_validation as tfdv
from google.protobuf import text_format
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..')

from analyzers import DataType 

np.random.seed = 1

In [2]:
from ssc.hilda.datasets import *
from ssc.hilda.perturbations import *
from ssc.hilda.learners import *
from ssc.hilda.experiments import *

import warnings
warnings.simplefilter("ignore")

# Pick a dataset
# dataset = CardioDataset()
dataset = BalancedAdultDataset()
# dataset = AdultDataset()

def gen_perturbations():
    for num_columns_affected in range(1, 5):
        for fraction_of_values_to_delete in [0.0, 0.05, 0.25, 0.5, 0.75, 0.99]:
            for _ in range(100):
                columns_affected = np.random.choice(dataset.categorical_columns, num_columns_affected)
                yield MissingValues(fraction_of_values_to_delete, columns_affected, -1)

# generate a bunch of perturbations for training
perturbations_for_training = list(gen_perturbations())

# generate a bunch of perturbations for evaluation
perturbations_for_evaluation = list(gen_perturbations())

# name the perturbations
perturbations_name = "missing_values_at_random"

# define the learner
# learner = DNN('accuracy')
# learner = LogisticRegression('roc_auc')
learner = LogisticRegression('accuracy')

# run an experiment
log_line, model, mse, mae = reapply_perturbations(dataset, learner, perturbations_for_training,
                                                  perturbations_for_evaluation, perturbations_name)

# print("----------------------------------------------------------------------------------------------")
# print(log_line)


Training model on perturbed data.
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


('accuracy', 'on train data: ', 0.8038262255878836)
('accuracy', 'on test data: ', 0.7895408163265306)
('accuracy', 'on target data: ', 0.8068833652007649)

Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
MSE 0.00034, MAE 0.0173
Writing plot to /home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../results/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.tsv
reapply_perturbations	adult_income_balanced	0.8038262255878836	0.7895408163265306	0.8068833652007649	logistic_regression	accuracy	missing_values_at_random	0.00033986933018817896	0.01733073822761259	/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf


In [5]:
# def train_test_split_csv(data_path, file_name, test_ratio=.2):
#     data = pd.read_csv(os.path.join(data_path, file_name))
#     train, test = train_test_split(data, test_size=test_ratio, random_state=1)
#     if not os.path.exists(os.path.join(data_path, 'tmp')):
#         os.makedirs(os.path.join(data_path, 'tmp'))
#     train.to_csv(os.path.join(data_path, 'tmp/train.csv'))
#     test.to_csv(os.path.join(data_path, 'tmp/test.csv'))

data_path, file_name = "/".join(dataset.path.split('/')[:-1]), dataset.path.split('/')[-1]
# train_test_split_csv(data_path, file_name, test_ratio=.2)

X_train, X_test, X_target = learner.split(dataset.df)

columns_affected = np.random.choice(dataset.categorical_columns, 3)
corrupted_X_test = MissingValues(.4, columns_affected, -1).transform(X_test)
corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/corrupted_test.csv'))

X_train.to_csv(os.path.join(data_path, 'tmp/X_train.csv'))
X_test.to_csv(os.path.join(data_path, 'tmp/X_test.csv'))

train_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_train.csv'), delimiter=',')
test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_test.csv'), delimiter=',')

corrupted_test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/corrupted_test.csv'), delimiter=',')

schema = tfdv.infer_schema(train_stats)
# print(schema)
# tfdv.display_schema(schema)
anomalies = tfdv.validate_statistics(statistics=corrupted_test_stats, schema=schema)
# print(anomalies)
tfdv.display_anomalies(anomalies)
# print(text_format.MessageToString(anomalies))

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'marital_status',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).


In [6]:
class TFRecordHelper:
    class __TFRecordHelper:
        def __init__(self):
            self.foo = dict({
                DataType.STRING: lambda x, y: x.bytes_list.value.extend([y]),
                DataType.INTEGER: lambda x, y: x.int64_list.value.extend([y]),
                DataType.FLOAT: lambda x, y: x.float_list.value.extend([y]),
                DataType.OBJECT: lambda x, y: x.bytes_list.value.extend([y])
            })
            self.data_type = dict({
                'int': DataType.INTEGER,
                'int32': DataType.INTEGER,
                'int64': DataType.INTEGER,
                'float': DataType.FLOAT,
                'float32': DataType.FLOAT,
                'float64': DataType.FLOAT,
                'byte': DataType.OBJECT,
                # 'string': DataType.STRING,
                'object': DataType.OBJECT
            })

        def run(self, example, feature_name, dtype, val):
            if not isinstance(dtype, DataType):
                dtype = self.data_type[str(dtype)]
            return self.foo[dtype](example.features.feature[feature_name], val)

    instance = None

    def __init__(self):
        if not TFRecordHelper.instance:
            TFRecordHelper.instance = TFRecordHelper.__TFRecordHelper()

    def __getattr__(self, name):
        return getattr(self.instance, name)


def convert_csv_to_tfrecord(data_path, file_name, dtypes=None):
    filename = os.path.join(data_path, file_name.split('.')[0] + '.tfrecords')
    data = pd.read_csv(os.path.join(data_path, file_name))
    helper = TFRecordHelper()
    columns = data.columns
    if dtypes is None:
        dtypes = data.dtypes
    with tf.python_io.TFRecordWriter(filename) as writer:
        for i in range(data.shape[0]):
            example = tf.train.Example()
            for j in range(data.shape[1]):
                helper.run(example, columns[j], dtypes[j], data.iloc[i, j])
            writer.write(example.SerializeToString())
    return filename

In [7]:
train_tfrecord_filename = convert_csv_to_tfrecord(data_path, 'tmp/X_train.csv')
test_tfrecord_filename = convert_csv_to_tfrecord(data_path, 'tmp/X_test.csv')

In [None]:
python = ["/home/reds/install/miniconda3/envs/python2/bin/python"]

dir_path = os.path.join(globals()['_dh'][0], '../third_party/data-linter')

!{python[0]} {dir_path}/demo/summarize_data.py --dataset_path {train_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --dataset_name adult

!{python[0]} {dir_path}/data_linter_main.py --dataset_path {test_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --results_path /tmp/datalinter/results/lint_results.bin

!{python[0]} {dir_path}/lint_explorer_main.py --results_path /tmp/datalinter/results/lint_results.bin

In [43]:
def validate_on(X_test, X_target, evaluation=True):
    def percentiles_of_probas(predictions):
        probs_class_a = np.transpose(predictions)[0]
        probs_class_b = np.transpose(predictions)[1]
        features_a = np.percentile(probs_class_a, np.arange(0, 101, 5))
        features_b = np.percentile(probs_class_b, np.arange(0, 101, 5))
        return np.concatenate((features_a, features_b), axis=0)
    
    def validate(f):
        def wrapper(*args):
            y_test = dataset.labels_from(X_test)
            y_target = dataset.labels_from(X_target)

            learner = f(*args)
            model = learner.model

            # Training
            print("\nTraining meta regressor on perturbed test data.")
            perturbations_for_training = gen_perturbations()
            meta_regressor = train_random_forest_regressor(X_test, y_test,
                                                           perturbations_for_training,
                                                           model, learner)

            # Evaluation
            print("\nEvaluating meta regressor on perturbed target data.")
            predicted_scores, true_scores = [], []

            for perturbation in learner.perturbations:
                corrupted_target_data = perturbation.transform(X_target)

                predictions = model.predict_proba(corrupted_target_data)
                features = percentiles_of_probas(predictions)

                score_on_corrupted_target_data = learner.score(y_target, model.predict(corrupted_target_data))
                predicted_score_on_corrupted_target_data = meta_regressor.predict([features])

                predicted_scores.append(predicted_score_on_corrupted_target_data)
                true_scores.append(score_on_corrupted_target_data)

            from sklearn.metrics import mean_squared_error, mean_absolute_error

            mse = mean_squared_error(true_scores, predicted_scores)
            mae = mean_absolute_error(true_scores, predicted_scores)

            print("MSE %.5f, MAE %.5f" % (mse, mae))
            
            learner.meta_regressor = meta_regressor
            
            threshold = .01 # 1%
                        
            # Check whether mean of true_scores makes sense
            print(np.mean(true_scores), mae / np.mean(true_scores))
            if mae / np.mean(true_scores) > threshold:
                print("WARNING! Performance drop")
            else:
                print("Everything is fine")
            
            return learner
        return wrapper
    return validate

In [37]:
@validate_on(X_test, X_target, evaluation=False)
def learner_foo():
    dataset = BalancedAdultDataset()
    learner = LogisticRegression('accuracy')
    # X_train = pd.read_csv(os.path.join(data_path, 'tmp/X_train.csv'))
    model = learner.fit(dataset, X_train)
    learner.model = model
    learner.perturbations = gen_perturbations()
    return learner


learner = learner_foo()

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.8min finished



Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
MSE 0.00007, MAE 0.00572


In [44]:
columns_affected = dataset.categorical_columns
new_corrupted_X_test = MissingValues(.004, columns_affected, -1).transform(X_test)
new_corrupted_X_test.to_csv(os.path.join(data_path, 'tmp/new_corrupted_test.csv'))

train_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_train.csv'), delimiter=',')
test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/X_test.csv'), delimiter=',')

corrupted_test_stats = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/new_corrupted_test.csv'), delimiter=',')

schema = tfdv.infer_schema(train_stats)
# print(schema)
# tfdv.display_schema(schema)
anomalies = tfdv.validate_statistics(statistics=corrupted_test_stats, schema=schema)
# print(anomalies)
tfdv.display_anomalies(anomalies)
# print(text_format.MessageToString(anomalies))

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'marital_status',Unexpected string values,Examples contain values missing from the schema: -1 (<1%).
'workclass',Unexpected string values,Examples contain values missing from the schema: -1 (<1%).
'occupation',Unexpected string values,Examples contain values missing from the schema: -1 (<1%).
'education',Unexpected string values,Examples contain values missing from the schema: -1 (<1%).


In [None]:
def gen_perturbations():
    for num_columns_affected in range(1, 5):
        for fraction_of_values_to_delete in [0.7, 0.8, 0.9]:
            for _ in range(100):
                columns_affected = dataset.categorical_columns
                yield MissingValues(fraction_of_values_to_delete, columns_affected, -1)

@validate_on(X_test, X_target, evaluation=False)
def learner_foo():
    dataset = BalancedAdultDataset()
    learner = LogisticRegression('accuracy')
    # X_train = pd.read_csv(os.path.join(data_path, 'tmp/X_train.csv'))
    model = learner.fit(dataset, X_train)
    learner.model = model
    learner.perturbations = gen_perturbations()
    return learner


learner = learner_foo()

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.6min finished



Training meta regressor on perturbed test data.
