Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Update orc reader and writer fuzz tests #7357

Merged
merged 5 commits into from
Feb 11, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions python/cudf/cudf/_fuzz_testing/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@ def generate_input(self):
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
if num_cols == 0:
"""
If a dataframe has no columns, then pyorc writer will throw
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wondered what the desired behavior is here, i.e. whether ORC as a format supports having no columns.
I think we also have some issues writing empty dataframes to ORC.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we need to write an empty struct. With pyorc we can do it this way:

>>> import pyorc
>>> import pandas as pd
>>> output = open("sample.orc", "wb")
>>> writer = pyorc.Writer(output, pyorc.Struct())
>>> writer.close()
>>> pd.read_orc('sample.orc')
Empty DataFrame
Columns: []
Index: []

Looks like I also need to make some code changes in this PR. I'll update it.

the following error:
ValueError: Struct type must contain at least one sub type.
Hence this is a work-around to skip generating an empty
dataframe.
"""
while num_cols == 0:
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)

self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2 ** 32 - 1)
self._current_params["seed"] = seed
Expand Down Expand Up @@ -106,7 +119,6 @@ def set_rand_params(self, params):
elif param == "stripes":
f = io.BytesIO(self._current_buffer)
reader = pyorc.Reader(f)
print("READ: ", reader.num_of_stripes)
stripes = [i for i in range(reader.num_of_stripes)]
params_dict[param] = np.random.choice(
[
Expand All @@ -125,10 +137,10 @@ def set_rand_params(self, params):
)
elif param == "use_index":
params_dict[param] = np.random.choice([True, False])
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
else:
if not isinstance(values, list):
raise TypeError("values must be of type list")
Expand All @@ -143,12 +155,16 @@ def __init__(
max_rows=100_000,
max_columns=1000,
max_string_length=None,
max_lists_length=None,
max_lists_nesting_depth=None,
):
super().__init__(
dirs=dirs,
max_rows=max_rows,
max_columns=max_columns,
max_string_length=max_string_length,
max_lists_length=None,
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
max_lists_nesting_depth=None,
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
)
self._df = None

Expand All @@ -163,11 +179,18 @@ def generate_input(self):
else:
dtypes_list = list(
cudf.utils.dtypes.ALL_TYPES
- {"category"}
# TODO: Remove "bool" from below
# list after following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6763
- {"category", "bool"}
# Following dtypes are not supported by orc
# https://orc.apache.org/specification/ORCv0/
- cudf.utils.dtypes.TIMEDELTA_TYPES
- cudf.utils.dtypes.UNSIGNED_TYPES
# TODO: Remove `DATETIME_TYPES` once
# following bug is fixed:
# https://github.com/rapidsai/cudf/issues/7355
- cudf.utils.dtypes.DATETIME_TYPES
)

dtypes_meta, num_rows, num_cols = _generate_rand_meta(
Expand Down
16 changes: 6 additions & 10 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
orc_to_pandas,
run_test,
)
from cudf.tests.utils import assert_eq


@pythonfuzz(
Expand All @@ -24,19 +23,14 @@
"use_index": ALL_POSSIBLE_VALUES,
},
)
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
# TODO: Remove skiprows=0 after
# following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6563
skiprows = 0

def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
pdf, file_buffer = input_tuple
expected_pdf = pdf.iloc[skiprows:]
if num_rows is not None:
expected_pdf = expected_pdf.head(num_rows)
if skiprows is not None or num_rows is not None:
expected_pdf.reset_index(drop=True, inplace=True)
if columns is not None:
if columns is not None and len(columns) > 0:
expected_pdf = expected_pdf[columns]
if use_index is False:
expected_pdf.reset_index(drop=True, inplace=True)
Expand All @@ -48,6 +42,7 @@ def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
num_rows=num_rows,
use_index=use_index,
)

compare_dataframe(expected_pdf, gdf)


Expand All @@ -61,14 +56,14 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
file_io_obj=io.BytesIO(file_buffer), stripes=stripes
)

if columns is not None:
if columns is not None and len(columns) > 0:
expected_pdf = expected_pdf[columns]

gdf = cudf.read_orc(
io.BytesIO(file_buffer), columns=columns, stripes=stripes
)

assert_eq(expected_pdf, gdf, check_dtype=False)
compare_dataframe(expected_pdf, gdf)


@pythonfuzz(
Expand All @@ -91,6 +86,7 @@ def orc_writer_test(pdf, compression, enable_statistics):
file_to_strore.seek(0)

actual_df = cudf.read_orc(file_to_strore)

compare_dataframe(pdf, actual_df)


Expand Down
23 changes: 21 additions & 2 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@
np.dtype("<M8[us]"): pyorc.Timestamp(),
}

# Maps a pyorc type's ``name`` to the pandas/NumPy dtype that a column of
# that ORC type is cast to. ``orc_to_pandas`` looks up each field of the
# reader's schema in this table and passes the result to ``DataFrame.astype``.
# Integer and boolean ORC types map to pandas extension dtypes
# (``pd.Int*Dtype`` / ``pd.BooleanDtype``); the remaining types map to plain
# NumPy dtypes.
ORC_TO_PANDAS_TYPES = {
pyorc.TinyInt().name: pd.Int8Dtype(),
pyorc.Int().name: pd.Int32Dtype(),
pyorc.Boolean().name: pd.BooleanDtype(),
pyorc.SmallInt().name: pd.Int16Dtype(),
pyorc.BigInt().name: pd.Int64Dtype(),
pyorc.String().name: np.dtype("O"),
pyorc.Float().name: np.dtype("float32"),
pyorc.Double().name: np.dtype("float64"),
pyorc.Timestamp().name: np.dtype("<M8[ns]"),
}


def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
obj._current_params = {}
Expand All @@ -73,7 +85,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
if null_frequency_override is None
else null_frequency_override
)
cardinality = obj._rand(obj._max_rows)
cardinality = max(1, obj._rand(obj._max_rows))
meta = dict()
if dtype == "str":
# We want to operate near the limits of string column
Expand Down Expand Up @@ -190,7 +202,8 @@ def get_avro_schema(df):

def get_orc_schema(df):
ordered_dict = OrderedDict(
(col_name, col_dtype) for col_name, col_dtype in df.dtypes.items()
(col_name, get_orc_dtype_info(col_dtype))
for col_name, col_dtype in df.dtypes.items()
)

schema = pyorc.Struct(**ordered_dict)
Expand Down Expand Up @@ -288,6 +301,11 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):

reader = pyorc.Reader(f)

dtypes = {
col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
for col, pyorc_type in reader.schema.fields.items()
}

if stripes is None:
df = pd.DataFrame.from_records(
reader, columns=reader.schema.fields.keys()
Expand All @@ -299,6 +317,7 @@ def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
df = pd.DataFrame.from_records(
records, columns=reader.schema.fields.keys()
)
df = df.astype(dtypes)

return df

Expand Down