In [1]:
from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from google.protobuf import text_format
# import apache_beam as beam
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..')

from analyzers import DataType
from error_generation import ExplicitMissingValues

# Seed NumPy's global RNG so random operations in this notebook are reproducible.
# This must be a call: assigning to ``np.random.seed`` would overwrite the
# seeding function with an integer and leave the generator unseeded.
np.random.seed(1)

  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


In [5]:
class TFRecordHelper:
    """Facade over one shared helper that appends values to Example features.

    Every ``TFRecordHelper()`` object forwards attribute lookups to the single
    inner helper stored in the class attribute ``instance``; that inner helper
    is created the first time the class is instantiated and reused afterwards.
    """

    class __TFRecordHelper:
        """Holds the dispatch tables used to write one value into a feature."""

        def __init__(self):
            # DataType -> function(feature, value) that appends ``value`` to
            # the feature's bytes_list / int64_list / float_list.
            self.foo = {
                DataType.STRING: self._append_bytes,
                DataType.INTEGER: lambda x, y: x.int64_list.value.extend([y]),
                DataType.FLOAT: lambda x, y: x.float_list.value.extend([y]),
                DataType.OBJECT: self._append_bytes,
            }
            # dtype name (``str(dtype)``) -> DataType used to pick a writer.
            self.data_type = {
                'int': DataType.INTEGER,
                'int32': DataType.INTEGER,
                'int64': DataType.INTEGER,
                'float': DataType.FLOAT,
                'float32': DataType.FLOAT,
                'float64': DataType.FLOAT,
                'byte': DataType.OBJECT,
                # 'string': DataType.STRING,
                'object': DataType.OBJECT,
            }

        @staticmethod
        def _append_bytes(feature, value):
            """Append ``value`` to ``feature.bytes_list``, encoding text as UTF-8.

            Protobuf ``bytes`` fields reject text strings on Python 3, so any
            non-bytes value exposing ``encode`` is encoded first. Values with
            no ``encode`` method are passed through unchanged.
            """
            if not isinstance(value, bytes) and hasattr(value, 'encode'):
                value = value.encode('utf-8')
            feature.bytes_list.value.extend([value])

        def run(self, example, feature_name, dtype, val):
            """Append ``val`` to the feature ``feature_name`` of ``example``.

            Args:
                example: object exposing ``features.feature[name]``
                    (a ``tf.train.Example`` in this notebook).
                feature_name: key of the feature to write to.
                dtype: a ``DataType`` member, or any object whose ``str()`` is
                    a key of ``self.data_type`` (e.g. a NumPy dtype).
                val: the value to append.

            Returns:
                Whatever the selected writer returns (``None`` for the
                built-in writers).

            Raises:
                KeyError: if ``dtype`` is not a ``DataType`` and its string
                    form is not a key of ``self.data_type``.
            """
            if not isinstance(dtype, DataType):
                dtype = self.data_type[str(dtype)]
            return self.foo[dtype](example.features.feature[feature_name], val)

    # The shared inner helper; populated by the first TFRecordHelper().
    instance = None

    def __init__(self):
        if TFRecordHelper.instance is None:
            TFRecordHelper.instance = TFRecordHelper.__TFRecordHelper()

    def __getattr__(self, name):
        # Only called for names not found on the wrapper itself; forward them
        # to the shared inner helper (``run``, ``foo``, ``data_type``).
        return getattr(self.instance, name)


def convert_csv_to_tfrecord(data_path, file_name, dtypes=None):
    """Convert a CSV file into a TFRecord file with one Example per row.

    The output is written next to the input as ``<base name>.tfrecords``.

    Args:
        data_path: directory that contains the CSV and receives the output.
        file_name: name of the CSV file inside ``data_path``.
        dtypes: optional per-column types, indexed by column position; each
            entry is a ``DataType`` or a dtype accepted by
            ``TFRecordHelper.run``. Defaults to the dtypes pandas infers.
    """
    # splitext removes only the final extension, so 'a.b.csv' maps to
    # 'a.b.tfrecords' instead of being truncated at the first dot.
    base_name = os.path.splitext(file_name)[0]
    filename = os.path.join(data_path, base_name + '.tfrecords')
    data = pd.read_csv(os.path.join(data_path, file_name))
    helper = TFRecordHelper()
    columns = list(data.columns)
    if dtypes is None:
        dtypes = data.dtypes
    if isinstance(dtypes, pd.Series):
        # ``data.dtypes`` is indexed by column label; integer lookups on such
        # a Series are deprecated, so switch to a plain positional list.
        dtypes = dtypes.tolist()
    with tf.python_io.TFRecordWriter(filename) as writer:
        # itertuples walks the rows once instead of one ``iloc`` call per cell.
        for row in data.itertuples(index=False, name=None):
            example = tf.train.Example()
            for j, value in enumerate(row):
                helper.run(example, columns[j], dtypes[j], value)
            writer.write(example.SerializeToString())


def train_test_split_csv(data_path, file_name, test_size=0.33, random_state=1):
    """Split a CSV file into ``train.csv`` and ``test.csv`` in ``data_path``.

    Args:
        data_path: directory that contains the CSV and receives both outputs.
        file_name: name of the CSV file inside ``data_path``.
        test_size: forwarded to ``train_test_split``; defaults to 0.33.
        random_state: forwarded to ``train_test_split``; defaults to 1.
    """
    data = pd.read_csv(os.path.join(data_path, file_name))
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state)
    # NOTE(review): ``to_csv`` keeps its default ``index=True`` here, so the
    # row index is written as an extra unnamed first column. Confirm whether
    # downstream consumers expect that column before changing it.
    train.to_csv(os.path.join(data_path, 'train.csv'))
    test.to_csv(os.path.join(data_path, 'test.csv'))

In [6]:
def data_validation(data_path):
    """Validate ``test.csv`` against a schema inferred from ``train.csv``.

    Computes TFDV statistics for both CSV files in ``data_path``, infers a
    schema from the training statistics, validates the test statistics
    against that schema and displays the resulting anomalies.

    Args:
        data_path: directory containing ``train.csv`` and ``test.csv``.
    """
    train_stats = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    # The statistics being validated must come from test.csv; reading
    # train.csv here would only compare the training data with itself.
    test_stats = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')
    schema = tfdv.infer_schema(train_stats)
    anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
    tfdv.display_anomalies(anomalies)

In [7]:
# Dataset directory; created on first run if it does not exist yet.
data_path = os.path.join('../resources/data/', 'wine-quality')
if not os.path.exists(data_path):
    os.makedirs(data_path)
file_name = 'wine-quality-red.csv'

# Write the TFRecord version of the raw CSV.
convert_csv_to_tfrecord(data_path, file_name)

# The split step is disabled; train.csv / test_old.csv must already exist.
# train_test_split_csv(data_path, file_name)

# Build test.csv by passing test_old.csv through ExplicitMissingValues
# (presumably injects explicit missing values; confirm in error_generation).
error_gen = ExplicitMissingValues()
test_old = pd.read_csv(os.path.join(data_path, 'test_old.csv'))
test_with_errors = error_gen.on(test_old)
test_with_errors.to_csv(os.path.join(data_path, 'test.csv'))

data_validation(data_path)