In [1]:
from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from google.protobuf import text_format
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..')

from analyzers import DataType 

np.random.seed = 1

In [2]:
from ssc.hilda.datasets import *
from ssc.hilda.perturbations import *
from ssc.hilda.learners import *
from ssc.hilda.experiments import *

import warnings
warnings.simplefilter("ignore")

# Pick a dataset
# dataset = CardioDataset()
dataset = BalancedAdultDataset()
# dataset = AdultDataset()

def gen_perturbations():
    for num_columns_affected in range(1, 5):
        for fraction_of_values_to_delete in [0.0, 0.05, 0.25, 0.5, 0.75, 0.99]:
            for _ in range(100):
                columns_affected = np.random.choice(dataset.categorical_columns, num_columns_affected)
                yield MissingValues(fraction_of_values_to_delete, columns_affected, -1)

# generate a bunch of perturbations for training
perturbations_for_training = list(gen_perturbations())

# generate a bunch of perturbations for evaluation
perturbations_for_evaluation = list(gen_perturbations())

# name the perturbations
perturbations_name = "missing_values_at_random"

# define the learner
# learner = DNN('accuracy')
# learner = LogisticRegression('roc_auc')
learner = LogisticRegression('accuracy')

# run an experiment
log_line, model = reapply_perturbations(dataset, learner, perturbations_for_training,
                                        perturbations_for_evaluation, perturbations_name)

print("----------------------------------------------------------------------------------------------")
print(log_line)


Training model on perturbed data.
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   35.2s finished


('accuracy', 'on train data: ', 0.8038262255878836)
('accuracy', 'on test data: ', 0.8150510204081632)
('accuracy', 'on target data: ', 0.8030592734225621)

Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
MSE 0.00009, MAE 0.0075
Writing plot to /home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
reapply_perturbations	adult_income_balanced	0.8038262255878836	0.8150510204081632	0.8030592734225621	logistic_regression	accuracy	missing_values_at_random	9.237058083013448e-05	0.007518214974007451	/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
----------------------------------------------------------------------------------------------
reapply_perturbations	adult_income_balanced	0.8038262255878836	0.8150510204081632	0.8030592734225621	logistic_regression	ac

In [3]:
def train_test_split_csv(data_path, file_name, test_ratio=.2):
    data = pd.read_csv(os.path.join(data_path, file_name))
    train, test = train_test_split(data, test_size=test_ratio, random_state=1)
    if not os.path.exists(os.path.join(data_path, 'tmp')):
        os.makedirs(os.path.join(data_path, 'tmp'))
    train.to_csv(os.path.join(data_path, 'tmp/train.csv'))
    test.to_csv(os.path.join(data_path, 'tmp/test.csv'))

data_path, file_name = "/".join(dataset.path.split('/')[:-1]), dataset.path.split('/')[-1]
train_test_split_csv(data_path, file_name, test_ratio=.2)

columns_affected = np.random.choice(dataset.categorical_columns, 3)
(MissingValues(.4, columns_affected, -1)
    .transform(pd.read_csv(os.path.join(data_path, 'tmp/test.csv')))
    .to_csv(os.path.join(data_path, 'tmp/corrupted_test.csv')))

train = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/train.csv'), delimiter=',')
test = tfdv.generate_statistics_from_csv(os.path.join(data_path, 'tmp/corrupted_test.csv'), delimiter=',')
schema = tfdv.infer_schema(train)
# print(schema)
# tfdv.display_schema(schema)
anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
# print(anomalies)
tfdv.display_anomalies(anomalies)
# print(text_format.MessageToString(anomalies))

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'marital_status',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'workclass',Unexpected string values,Examples contain values missing from the schema: -1 (~39%).
'Unnamed: 0',New column,New column (column in data but not in schema)
'native_country',Unexpected string values,Examples contain values missing from the schema: Holand-Netherlands (<1%).


In [4]:
class TFRecordHelper:
    class __TFRecordHelper:
        def __init__(self):
            self.foo = dict({
                DataType.STRING: lambda x, y: x.bytes_list.value.extend([y]),
                DataType.INTEGER: lambda x, y: x.int64_list.value.extend([y]),
                DataType.FLOAT: lambda x, y: x.float_list.value.extend([y]),
                DataType.OBJECT: lambda x, y: x.bytes_list.value.extend([y])
            })
            self.data_type = dict({
                'int': DataType.INTEGER,
                'int32': DataType.INTEGER,
                'int64': DataType.INTEGER,
                'float': DataType.FLOAT,
                'float32': DataType.FLOAT,
                'float64': DataType.FLOAT,
                'byte': DataType.OBJECT,
                # 'string': DataType.STRING,
                'object': DataType.OBJECT
            })

        def run(self, example, feature_name, dtype, val):
            if not isinstance(dtype, DataType):
                dtype = self.data_type[str(dtype)]
            return self.foo[dtype](example.features.feature[feature_name], val)

    instance = None

    def __init__(self):
        if not TFRecordHelper.instance:
            TFRecordHelper.instance = TFRecordHelper.__TFRecordHelper()

    def __getattr__(self, name):
        return getattr(self.instance, name)


def convert_csv_to_tfrecord(data_path, file_name, dtypes=None):
    filename = os.path.join(data_path, file_name.split('.')[0] + '.tfrecords')
    data = pd.read_csv(os.path.join(data_path, file_name))
    helper = TFRecordHelper()
    columns = data.columns
    if dtypes is None:
        dtypes = data.dtypes
    with tf.python_io.TFRecordWriter(filename) as writer:
        for i in range(data.shape[0]):
            example = tf.train.Example()
            for j in range(data.shape[1]):
                helper.run(example, columns[j], dtypes[j], data.iloc[i, j])
            writer.write(example.SerializeToString())
    return filename

In [5]:
train_tfrecord_filename = convert_csv_to_tfrecord(data_path, 'tmp/train.csv')
test_tfrecord_filename = convert_csv_to_tfrecord(data_path, 'tmp/test.csv')

In [6]:
dir_path = os.path.join(globals()['_dh'][0], '../third_party/data-linter')
python = '/home/reds/install/miniconda3/envs/python2/bin/python'

!{python} {dir_path}/demo/summarize_data.py --dataset_path {train_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --dataset_name adult

!{python} {dir_path}/data_linter_main.py --dataset_path {test_tfrecord_filename} \
  --stats_path /tmp/adult_summary.bin \
  --results_path /tmp/datalinter/results/lint_results.bin

!{python} {dir_path}/lint_explorer_main.py --results_path /tmp/datalinter/results/lint_results.bin

The following linter(s) triggered on your dataset:
* NonNormalNumericFeatureDetector
* TailedDistributionDetector


NonNormalNumericFeatureDetector
A feature flagged by this linter has a distribution that varies significantly
from the other numeric features.
Especially for linear models, poorly scaled features with high variance
(e.g., all but one are in the range [-10, 10] but one is in [0, 100000])
can wash out the effects of the other features.

Quickfix: use the [standard score](https://en.wikipedia.org/wiki/Standard_score)
of (at least) the flagged features.
-----
A 'typical' numeric feature in the dataset has mean 2.97e+04 and std dev 17589 but
* fnlwgt had mean = 1.8999e+05, std_dev = 1.0595e+05


TailedDistributionDetector
A feature flagged by this linter has an extremal value that significantly
affects the mean. This may be because the value is an outlier but it may also
be due to the extremal value being very common. In either case, however, it
would be beneficial to check th

In [7]:
from profilers import DataFrameProfiler, SklearnPipelineProfiler
from test_suite import AutomatedTestSuite 

automated_suite = AutomatedTestSuite()
data_profile = DataFrameProfiler().on(dataset.df)
pipeline_profile = SklearnPipelineProfiler().on(model)

<class 'sklearn.model_selection._search.GridSearchCV'>
<class 'sklearn.pipeline.Pipeline'>
