Check that certain read_csv and scan_csv arguments are exactly one byte long. #3389

ghuls · 2022-05-12T20:07:13Z

No description provided.

ghuls · 2022-05-12T20:15:40Z

Before:

In [23]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep=",")
Out[23]: 
shape: (1, 1)
┌──────────┐
│ column_1 │
│ ---      │
│ str      │
╞══════════╡
│ x§y§z    │
└──────────┘

In [24]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")
thread '<unnamed>' panicked at 'index out of bounds: the len is 0 but the index is 0', src/dataframe.rs:159:29
---------------------------------------------------------------------------
PanicException                            Traceback (most recent call last)
Input In [24], in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/io.py:402, in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    396         dtypes = {
    397             new_to_current.get(column_name, column_name): column_dtype
    398             for column_name, column_dtype in dtypes.items()
    399         }
    401 with _prepare_file_arg(file, **storage_options) as data:
--> 402     df = DataFrame._read_csv(
    403         file=data,
    404         has_header=has_header,
    405         columns=columns if columns else projection,
    406         sep=sep,
    407         comment_char=comment_char,
    408         quote_char=quote_char,
    409         skip_rows=skip_rows,
    410         dtypes=dtypes,
    411         null_values=null_values,
    412         ignore_errors=ignore_errors,
    413         parse_dates=parse_dates,
    414         n_threads=n_threads,
    415         infer_schema_length=infer_schema_length,
    416         batch_size=batch_size,
    417         n_rows=n_rows,
    418         encoding=encoding,
    419         low_memory=low_memory,
    420         rechunk=rechunk,
    421         skip_rows_after_header=skip_rows_after_header,
    422         row_count_name=row_count_name,
    423         row_count_offset=row_count_offset,
    424         sample_size=sample_size,
    425     )
    427 if new_columns:
    428     return update_columns(df, new_columns)

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/internals/frame.py:586, in DataFrame._read_csv(cls, file, has_header, columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_count_name, row_count_offset, sample_size)
    580         raise ValueError(
    581             "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
    582         )
    584 projection, columns = handle_projection_columns(columns)
--> 586 self._df = PyDataFrame.read_csv(
    587     file,
    588     infer_schema_length,
    589     batch_size,
    590     has_header,
    591     ignore_errors,
    592     n_rows,
    593     skip_rows,
    594     projection,
    595     sep,
    596     rechunk,
    597     columns,
    598     encoding,
    599     n_threads,
    600     path,
    601     dtype_list,
    602     dtype_slice,
    603     low_memory,
    604     comment_char,
    605     quote_char,
    606     processed_null_values,
    607     parse_dates,
    608     skip_rows_after_header,
    609     _prepare_row_count_args(row_count_name, row_count_offset),
    610     sample_size=sample_size,
    611 )
    612 return self

PanicException: index out of bounds: the len is 0 but the index is 0

In [25]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")
---------------------------------------------------------------------------
ComputeError                              Traceback (most recent call last)
Input In [25], in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/io.py:402, in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    396         dtypes = {
    397             new_to_current.get(column_name, column_name): column_dtype
    398             for column_name, column_dtype in dtypes.items()
    399         }
    401 with _prepare_file_arg(file, **storage_options) as data:
--> 402     df = DataFrame._read_csv(
    403         file=data,
    404         has_header=has_header,
    405         columns=columns if columns else projection,
    406         sep=sep,
    407         comment_char=comment_char,
    408         quote_char=quote_char,
    409         skip_rows=skip_rows,
    410         dtypes=dtypes,
    411         null_values=null_values,
    412         ignore_errors=ignore_errors,
    413         parse_dates=parse_dates,
    414         n_threads=n_threads,
    415         infer_schema_length=infer_schema_length,
    416         batch_size=batch_size,
    417         n_rows=n_rows,
    418         encoding=encoding,
    419         low_memory=low_memory,
    420         rechunk=rechunk,
    421         skip_rows_after_header=skip_rows_after_header,
    422         row_count_name=row_count_name,
    423         row_count_offset=row_count_offset,
    424         sample_size=sample_size,
    425     )
    427 if new_columns:
    428     return update_columns(df, new_columns)

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/internals/frame.py:586, in DataFrame._read_csv(cls, file, has_header, columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_count_name, row_count_offset, sample_size)
    580         raise ValueError(
    581             "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
    582         )
    584 projection, columns = handle_projection_columns(columns)
--> 586 self._df = PyDataFrame.read_csv(
    587     file,
    588     infer_schema_length,
    589     batch_size,
    590     has_header,
    591     ignore_errors,
    592     n_rows,
    593     skip_rows,
    594     projection,
    595     sep,
    596     rechunk,
    597     columns,
    598     encoding,
    599     n_threads,
    600     path,
    601     dtype_list,
    602     dtype_slice,
    603     low_memory,
    604     comment_char,
    605     quote_char,
    606     processed_null_values,
    607     parse_dates,
    608     skip_rows_after_header,
    609     _prepare_row_count_args(row_count_name, row_count_offset),
    610     sample_size=sample_size,
    611 )
    612 return self

ComputeError: invalid utf8 data in csv

After:

In [6]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep=",")
Out[6]: 
shape: (1, 1)
┌──────────┐
│ column_1 │
│ ---      │
│ str      │
╞══════════╡
│ x§y§z    │
└──────────┘

In [7]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-c3e3834a5958> in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")

~/software/polars/py-polars/polars/io.py in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    288         columns = kwargs.pop("projection", None)
    289 
--> 290     _check_arg_is_1byte("sep", sep, False)
    291     _check_arg_is_1byte("comment_char", comment_char, False)
    292     _check_arg_is_1byte("quote_char", quote_char, True)

~/software/polars/py-polars/polars/io.py in _check_arg_is_1byte(arg_name, arg, can_be_empty)
     69                 )
     70         elif arg_byte_length != 1:
---> 71             raise ValueError(
     72                 f'{arg_name}="{arg}" should be a single byte character, but is {arg_byte_length} bytes long.'
     73             )

ValueError: sep="" should be a single byte character, but is 0 bytes long.

In [8]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-df7d8494e3b7> in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")

~/software/polars/py-polars/polars/io.py in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    288         columns = kwargs.pop("projection", None)
    289 
--> 290     _check_arg_is_1byte("sep", sep, False)
    291     _check_arg_is_1byte("comment_char", comment_char, False)
    292     _check_arg_is_1byte("quote_char", quote_char, True)

~/software/polars/py-polars/polars/io.py in _check_arg_is_1byte(arg_name, arg, can_be_empty)
     69                 )
     70         elif arg_byte_length != 1:
---> 71             raise ValueError(
     72                 f'{arg_name}="{arg}" should be a single byte character, but is {arg_byte_length} bytes long.'
     73             )

ValueError: sep="§" should be a single byte character, but is 2 bytes long.

…pola-rs#3389)

Check if some arguments for read_csv and scan_csv got a 1 byte input.

08e0543

github-actions bot added the python Related to Python Polars label May 12, 2022

ritchie46 merged commit fad45bb into pola-rs:master May 13, 2022

moritzwilksch pushed a commit to moritzwilksch/polars that referenced this pull request May 29, 2022

Check if some arguments for read_csv and scan_csv got a 1 byte input. (pola-rs#3389)

382a622

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Check if some arguments for read_csv and scan_csv got a 1 byte input. #3389

Check if some arguments for read_csv and scan_csv got a 1 byte input. #3389

ghuls commented May 12, 2022

ghuls commented May 12, 2022

Check if some arguments for read_csv and scan_csv got a 1 byte input. #3389

Check if some arguments for read_csv and scan_csv got a 1 byte input. #3389

Conversation

ghuls commented May 12, 2022

ghuls commented May 12, 2022