[REVIEW] Update orc reader and writer fuzz tests #7357

Merged
merged 5 commits on Feb 11, 2021
23 changes: 17 additions & 6 deletions python/cudf/cudf/_fuzz_testing/orc.py
@@ -67,6 +67,7 @@ def generate_input(self):
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)

self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2 ** 32 - 1)
self._current_params["seed"] = seed
@@ -106,7 +107,6 @@ def set_rand_params(self, params):
elif param == "stripes":
f = io.BytesIO(self._current_buffer)
reader = pyorc.Reader(f)
print("READ: ", reader.num_of_stripes)
stripes = [i for i in range(reader.num_of_stripes)]
params_dict[param] = np.random.choice(
[
@@ -125,10 +125,10 @@ def set_rand_params(self, params):
)
elif param == "use_index":
params_dict[param] = np.random.choice([True, False])
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
else:
if not isinstance(values, list):
raise TypeError("values must be of type list")
@@ -143,12 +143,16 @@ def __init__(
max_rows=100_000,
max_columns=1000,
max_string_length=None,
max_lists_length=None,
max_lists_nesting_depth=None,
):
super().__init__(
dirs=dirs,
max_rows=max_rows,
max_columns=max_columns,
max_string_length=max_string_length,
max_lists_length=max_lists_length,
max_lists_nesting_depth=max_lists_nesting_depth,
)
self._df = None

@@ -163,11 +167,18 @@ def generate_input(self):
else:
dtypes_list = list(
cudf.utils.dtypes.ALL_TYPES
- {"category"}
# TODO: Remove "bool" from below
# list after following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6763
- {"category", "bool"}
# The following dtypes are not supported by ORC:
# https://orc.apache.org/specification/ORCv0/
- cudf.utils.dtypes.TIMEDELTA_TYPES
- cudf.utils.dtypes.UNSIGNED_TYPES
# TODO: Remove `DATETIME_TYPES` once
# following bug is fixed:
# https://github.com/rapidsai/cudf/issues/7355
- cudf.utils.dtypes.DATETIME_TYPES
)

dtypes_meta, num_rows, num_cols = _generate_rand_meta(
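For context on the dtype filtering above: the updated `generate_input` simply subtracts the ORC-unsupported and temporarily excluded type names from cudf's full dtype set. A minimal standalone sketch of that set arithmetic, assuming the constants behave as plain sets of dtype-name strings (the values below are an illustrative subset, not cudf's real lists):

# Sketch only: stand-ins for the cudf.utils.dtypes constants.
ALL_TYPES = {"int8", "int32", "float64", "str", "bool", "category",
             "uint32", "timedelta64[ns]", "datetime64[ms]"}
TIMEDELTA_TYPES = {"timedelta64[ns]"}
UNSIGNED_TYPES = {"uint32"}
DATETIME_TYPES = {"datetime64[ms]"}

dtypes_list = list(
    ALL_TYPES
    - {"category", "bool"}  # "bool" excluded until cudf issue 6763 is fixed
    - TIMEDELTA_TYPES       # timedelta is not part of the ORC spec
    - UNSIGNED_TYPES        # unsigned ints are not part of the ORC spec
    - DATETIME_TYPES        # excluded until cudf issue 7355 is fixed
)
print(sorted(dtypes_list))  # ['float64', 'int32', 'int8', 'str']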
20 changes: 10 additions & 10 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
@@ -12,7 +12,6 @@
orc_to_pandas,
run_test,
)
from cudf.tests.utils import assert_eq


@pythonfuzz(
@@ -24,19 +23,16 @@
"use_index": ALL_POSSIBLE_VALUES,
},
)
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
# TODO: Remove skiprows=0 after
# following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6563
skiprows = 0

def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
pdf, file_buffer = input_tuple
expected_pdf = pdf.iloc[skiprows:]
if num_rows is not None:
expected_pdf = expected_pdf.head(num_rows)
if skiprows is not None or num_rows is not None:
expected_pdf.reset_index(drop=True, inplace=True)
if columns is not None:
if columns is not None and len(columns) > 0:
# ORC reader picks columns only if
# there are any elements in `columns`
expected_pdf = expected_pdf[columns]
if use_index is False:
expected_pdf.reset_index(drop=True, inplace=True)
@@ -48,6 +44,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
num_rows=num_rows,
use_index=use_index,
)

compare_dataframe(expected_pdf, gdf)


@@ -61,14 +58,16 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
file_io_obj=io.BytesIO(file_buffer), stripes=stripes
)

if columns is not None:
if columns is not None and len(columns) > 0:
# ORC reader picks columns only if
# there are any elements in `columns`
expected_pdf = expected_pdf[columns]

gdf = cudf.read_orc(
io.BytesIO(file_buffer), columns=columns, stripes=stripes
)

assert_eq(expected_pdf, gdf, check_dtype=False)
compare_dataframe(expected_pdf, gdf)


@pythonfuzz(
@@ -91,6 +90,7 @@ def orc_writer_test(pdf, compression, enable_statistics):
file_to_strore.seek(0)

actual_df = cudf.read_orc(file_to_strore)

compare_dataframe(pdf, actual_df)


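The reader test above builds its expected result with plain pandas slicing that mirrors the `skiprows`, `num_rows`, and `columns` arguments passed to `cudf.read_orc`. A self-contained sketch of that logic (the helper name is illustrative, not part of the PR):

import pandas as pd

def build_expected(pdf, skiprows=None, num_rows=None, columns=None):
    # pdf.iloc[None:] is a full slice, so a missing skiprows keeps every row.
    expected = pdf.iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    if skiprows is not None or num_rows is not None:
        expected = expected.reset_index(drop=True)
    if columns is not None and len(columns) > 0:
        # Mirror the reader: an empty column list means "all columns".
        expected = expected[columns]
    return expected

pdf = pd.DataFrame({"a": range(10), "b": list("abcdefghij")})
print(build_expected(pdf, skiprows=2, num_rows=3, columns=["a"]))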
33 changes: 27 additions & 6 deletions python/cudf/cudf/_fuzz_testing/utils.py
@@ -58,6 +58,18 @@
np.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
pyorc.TinyInt().name: pd.Int8Dtype(),
pyorc.Int().name: pd.Int32Dtype(),
pyorc.Boolean().name: pd.BooleanDtype(),
pyorc.SmallInt().name: pd.Int16Dtype(),
pyorc.BigInt().name: pd.Int64Dtype(),
pyorc.String().name: np.dtype("O"),
pyorc.Float().name: np.dtype("float32"),
pyorc.Double().name: np.dtype("float64"),
pyorc.Timestamp().name: np.dtype("<M8[ns]"),
}


def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
obj._current_params = {}
@@ -73,7 +85,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
if null_frequency_override is None
else null_frequency_override
)
cardinality = obj._rand(obj._max_rows)
# `cardinality` has to be at least 1.
cardinality = max(1, obj._rand(obj._max_rows))
meta = dict()
if dtype == "str":
# We want to operate near the limits of string column
@@ -190,7 +203,8 @@ def get_avro_schema(df):

def get_orc_schema(df):
ordered_dict = OrderedDict(
(col_name, col_dtype) for col_name, col_dtype in df.dtypes.items()
(col_name, get_orc_dtype_info(col_dtype))
for col_name, col_dtype in df.dtypes.items()
)

schema = pyorc.Struct(**ordered_dict)
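As background for the `get_orc_schema` change: `pyorc.Struct` takes per-column pyorc type descriptions as keyword arguments, and its string form is the usual ORC schema string. A hedged sketch, with `PANDAS_TO_ORC_TYPES` as an illustrative stand-in for the fuller dtype mapping kept at the top of this module:

from collections import OrderedDict

import numpy as np
import pandas as pd
import pyorc

# Illustrative lookup only; the real module maps many more dtypes.
PANDAS_TO_ORC_TYPES = {
    np.dtype("int32"): pyorc.Int(),
    np.dtype("float64"): pyorc.Double(),
    np.dtype("O"): pyorc.String(),
}

df = pd.DataFrame(
    {"a": pd.Series([1, 2], dtype="int32"), "b": ["x", "y"]}
)
ordered_dict = OrderedDict(
    (col, PANDAS_TO_ORC_TYPES[dtype]) for col, dtype in df.dtypes.items()
)
schema = pyorc.Struct(**ordered_dict)
print(str(schema))  # struct<a:int,b:string>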
@@ -269,13 +283,11 @@ def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):

if file_name is not None:
with open(file_name, "wb") as data:
with pyorc.Writer(
data, str(schema), stripe_size=stripe_size
) as writer:
with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
writer.writerows(tuple_list)
elif file_io_obj is not None:
with pyorc.Writer(
file_io_obj, str(schema), stripe_size=stripe_size
file_io_obj, schema, stripe_size=stripe_size
) as writer:
writer.writerows(tuple_list)

@@ -288,6 +300,11 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):

reader = pyorc.Reader(f)

dtypes = {
col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
for col, pyorc_type in reader.schema.fields.items()
}

if stripes is None:
df = pd.DataFrame.from_records(
reader, columns=reader.schema.fields.keys()
@@ -300,6 +317,10 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
records, columns=reader.schema.fields.keys()
)

# Need to cast to the `dtypes` extracted from the pyorc schema because
# an entirely empty / all-<NA> column can make pandas infer an incorrect dtype.
df = df.astype(dtypes)

return df


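Finally, the `astype(dtypes)` call added to `orc_to_pandas` guards against pandas' dtype inference on all-null columns: `DataFrame.from_records` has no values to infer from, so such columns come back as `object` unless they are cast explicitly. A small pandas-only illustration of the problem and the fix (the mapping below is hypothetical, standing in for the `ORC_TO_PANDAS_TYPES` lookup):

import pandas as pd

# Two records whose values are entirely null; pandas cannot infer the
# intended integer/boolean dtypes and falls back to object.
records = [(None, None), (None, None)]
df = pd.DataFrame.from_records(records, columns=["a", "b"])
print(df.dtypes)  # a: object, b: object

# Explicit cast using schema-derived dtypes, as orc_to_pandas now does.
dtypes = {"a": pd.Int32Dtype(), "b": pd.BooleanDtype()}
df = df.astype(dtypes)
print(df.dtypes)  # a: Int32, b: boolean; nulls become <NA>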