In [1]:
from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from google.protobuf import text_format
# import apache_beam as beam
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..')

from analyzers import DataType
from error_generation import ExplicitMissingValues

# Seed NumPy's global RNG so the random draws made later in the notebook are
# reproducible. This must be a *call*: assigning `np.random.seed = 1` would
# overwrite the seeding function with an int and leave the RNG unseeded.
np.random.seed(1)

In [2]:
class TFRecordHelper:
    """Shared helper that appends values to the features of a ``tf.train.Example``.

    This public class is a thin facade. The first construction creates one
    ``__TFRecordHelper`` object and stores it in the class attribute
    ``instance``; every facade object forwards attribute lookups it cannot
    resolve itself (e.g. ``run``) to that shared object.
    """

    class __TFRecordHelper:
        """Holds the dispatch tables used to write one value into a feature."""

        def __init__(self):
            # DataType -> callable(feature, value) appending `value` to the
            # list field of `feature` that matches the type.
            self.foo = {
                DataType.STRING: self._append_bytes,
                DataType.INTEGER: lambda x, y: x.int64_list.value.extend([y]),
                DataType.FLOAT: lambda x, y: x.float_list.value.extend([y]),
                DataType.OBJECT: self._append_bytes,
            }
            # str(dtype) -> DataType, used when `run` is handed something other
            # than a DataType (for example a pandas/numpy dtype).
            self.data_type = {
                'int': DataType.INTEGER,
                'int32': DataType.INTEGER,
                'int64': DataType.INTEGER,
                'float': DataType.FLOAT,
                'float32': DataType.FLOAT,
                'float64': DataType.FLOAT,
                'byte': DataType.OBJECT,
                # 'string': DataType.STRING,
                'object': DataType.OBJECT,
            }

        @staticmethod
        def _append_bytes(feature, val):
            """Append `val` to ``feature.bytes_list``, UTF-8 encoding text first.

            A bytes list only accepts ``bytes``; text read from a CSV arrives
            as ``str`` on Python 3 and has to be encoded. Values that are
            neither bytes nor text are passed through unchanged.
            """
            if not isinstance(val, bytes) and isinstance(val, type(u'')):
                val = val.encode('utf-8')
            return feature.bytes_list.value.extend([val])

        def run(self, example, feature_name, dtype, val):
            """Append `val` to ``example.features.feature[feature_name]``.

            Args:
                example: Object exposing ``features.feature[...]``.
                feature_name: Key of the feature to append to.
                dtype: A ``DataType``, or any object whose ``str()`` is a key
                    of ``self.data_type``.
                val: The value to append.

            Returns:
                Whatever the selected append callable returns.

            Raises:
                KeyError: If `dtype` is not a ``DataType`` and its string form
                    is not a supported dtype name.
            """
            if not isinstance(dtype, DataType):
                dtype_name = str(dtype)
                if dtype_name not in self.data_type:
                    raise KeyError(
                        'unsupported dtype %r; expected a DataType or one of %s'
                        % (dtype_name, sorted(self.data_type)))
                dtype = self.data_type[dtype_name]
            return self.foo[dtype](example.features.feature[feature_name], val)

    # The single shared implementation object, created on first construction.
    instance = None

    def __init__(self):
        if TFRecordHelper.instance is None:
            TFRecordHelper.instance = TFRecordHelper.__TFRecordHelper()

    def __getattr__(self, name):
        # Only reached for names not found on the facade; delegate to the
        # shared implementation object.
        return getattr(self.instance, name)


def convert_csv_to_tfrecord(data_path, file_name, dtypes=None):
    """Convert the CSV ``<data_path>/<file_name>`` into a TFRecord file beside it.

    Every CSV row becomes one serialized ``tf.train.Example`` whose features
    are named after the CSV columns.

    Args:
        data_path: Directory that contains the CSV; the output goes here too.
        file_name: Name of the CSV file. The output file name is the same with
            its final extension replaced by ``.tfrecords``.
        dtypes: Optional per-column types in column order. Each entry is
            passed to ``TFRecordHelper.run`` as its ``dtype`` argument.
            Defaults to the dtypes pandas inferred for the CSV.

    Returns:
        The path of the TFRecord file that was written.
    """
    # splitext strips only the last extension ("a.b.csv" -> "a.b"), whereas
    # splitting on the first dot would truncate such names to "a".
    filename = os.path.join(
        data_path, os.path.splitext(file_name)[0] + '.tfrecords')
    data = pd.read_csv(os.path.join(data_path, file_name))
    helper = TFRecordHelper()
    columns = list(data.columns)
    if dtypes is None:
        dtypes = data.dtypes
    if isinstance(dtypes, pd.Series):
        # `data.dtypes` is indexed by column label; positional `[j]` lookups on
        # such a Series are deprecated in pandas, so work on a plain list.
        dtypes = dtypes.tolist()

    # TF 2.x dropped `tf.python_io`; prefer `tf.io.TFRecordWriter` and fall
    # back to the TF 1.x location only when it is missing.
    writer_cls = getattr(getattr(tf, 'io', None), 'TFRecordWriter', None)
    if writer_cls is None:
        writer_cls = tf.python_io.TFRecordWriter

    with writer_cls(filename) as writer:
        # itertuples walks the frame once instead of doing one `iloc` lookup
        # per cell.
        for row in data.itertuples(index=False, name=None):
            example = tf.train.Example()
            for j, val in enumerate(row):
                helper.run(example, columns[j], dtypes[j], val)
            writer.write(example.SerializeToString())
    return filename


def train_test_split_csv(data_path, file_name, test_size=0.33, random_state=1):
    """Split a CSV into ``train.csv`` and ``test.csv`` inside `data_path`.

    Args:
        data_path: Directory containing the CSV; both outputs are written here.
        file_name: Name of the CSV file to split.
        test_size: Forwarded to ``train_test_split`` (default 0.33).
        random_state: Forwarded to ``train_test_split`` so the split is
            repeatable (default 1).
    """
    data = pd.read_csv(os.path.join(data_path, file_name))
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state)
    # index=False: do not persist the shuffled row index, which would show up
    # as an extra unnamed column when the files are read back.
    train.to_csv(os.path.join(data_path, 'train.csv'), index=False)
    test.to_csv(os.path.join(data_path, 'test.csv'), index=False)

In [3]:
def data_validation(data_path):
    """Validate ``test.csv`` against a schema inferred from ``train.csv``.

    Statistics are generated for both files in `data_path`, a schema is
    inferred from the training statistics, and the anomalies found in the test
    statistics are displayed.

    Args:
        data_path: Directory containing ``train.csv`` and ``test.csv``.
    """
    train = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    # Must read test.csv: generating these statistics from train.csv again
    # would validate the training data against its own schema.
    test = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')
    schema = tfdv.infer_schema(train)
    # Debugging aids: print(schema) / tfdv.display_schema(schema)
    anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
    # Debugging aids: print(anomalies) /
    # print(text_format.MessageToString(anomalies))
    tfdv.display_anomalies(anomalies)

In [4]:
# --- Wine-quality pipeline: CSV -> TFRecord, inject missing values, validate ---

# Working directory for the data set; created on first use.
data_path = os.path.join('../resources/data/', 'wine-quality')
if not os.path.exists(data_path):
    os.makedirs(data_path)

file_name = 'wine-quality-red.csv'
convert_csv_to_tfrecord(data_path, file_name)

# The train/test split is expected to exist already; to regenerate it run:
# train_test_split_csv(data_path, file_name)

# Build test.csv by applying ExplicitMissingValues to test_old.csv.
# NOTE(review): test_old.csv is not produced anywhere in this notebook —
# confirm where it comes from.
missing_value_injector = ExplicitMissingValues()
test_old = pd.read_csv(os.path.join(data_path, 'test_old.csv'))
test_perturbed = missing_value_injector.on(test_old)
test_perturbed.to_csv(os.path.join(data_path, 'test.csv'))

data_validation(data_path)

In [5]:
from ssc.hilda.datasets import *
from ssc.hilda.perturbations import *
from ssc.hilda.learners import *
from ssc.hilda.experiments import *

import warnings
# Silence every warning for the rest of the session so the experiment log
# stays readable. Note this also hides deprecation and convergence warnings.
warnings.simplefilter("ignore")

# Pick the dataset the experiment runs on. The dataset classes are brought in
# by the star imports above; the commented lines are the alternatives.
# dataset = CardioDataset()
dataset = BalancedAdultDataset()
# dataset = AdultDataset()

def gen_perturbations():
    """Lazily yield randomized ``MissingValues`` perturbations.

    For each column count in 1..4 and each deletion fraction listed below,
    100 perturbations are yielded. The affected columns are drawn with
    ``np.random.choice`` from the module-level ``dataset.categorical_columns``.
    """
    fractions = [0.0, 0.05, 0.25, 0.5, 0.75, 0.99]
    draws_per_setting = 100
    for n_columns in range(1, 5):
        for fraction in fractions:
            drawn = 0
            while drawn < draws_per_setting:
                # NOTE(review): choice() samples with replacement by default,
                # so one column may be picked more than once — confirm this is
                # intended.
                targets = np.random.choice(dataset.categorical_columns, n_columns)
                # The trailing -1 is passed through as in the original call;
                # its meaning is defined by MissingValues (not visible here).
                yield MissingValues(fraction, targets, -1)
                drawn += 1

# Materialize two independent runs of the generator: the first becomes the
# training perturbations, the second the evaluation perturbations.
perturbations_for_training, perturbations_for_evaluation = (
    list(gen_perturbations()) for _ in range(2)
)

# Label identifying this family of perturbations.
perturbations_name = "missing_values_at_random"

# Learner under test. Alternatives that were tried:
#   DNN('accuracy'), LogisticRegression('roc_auc')
learner = LogisticRegression('accuracy')

# Run the experiment and keep its summary line.
log_line = reapply_perturbations(
    dataset,
    learner,
    perturbations_for_training,
    perturbations_for_evaluation,
    perturbations_name,
)

# Print the summary line under a separator.
separator = "----------------------------------------------------------------------------------------------"
print(separator)
print(log_line)


Training model on perturbed data.
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   25.1s finished


('accuracy', 'on train data: ', 0.8016739736946991)
('accuracy', 'on test data: ', 0.8086734693877551)
('accuracy', 'on target data: ', 0.798597833014659)

Training meta regressor on perturbed test data.

Evaluating meta regressor on perturbed target data.
MSE 0.00017, MAE 0.0098
Writing plot to /home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
reapply_perturbations	adult_income_balanced	0.8016739736946991	0.8086734693877551	0.798597833014659	logistic_regression	accuracy	missing_values_at_random	0.00016677128574140687	0.009775168259225333	/home/reds/myrepo/unit-tests-ml-python/ssc/hilda/../figures/adult_income_balanced__missing_values_at_random__logistic_regression__accuracy.pdf
----------------------------------------------------------------------------------------------
reapply_perturbations	adult_income_balanced	0.8016739736946991	0.8086734693877551	0.798597833014659	logistic_regression	accu