Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check whether some arguments for read_csv and scan_csv received a 1-byte input. #3389

Merged
ritchie46 merged 1 commit into pola-rs:master on
May 13, 2022

Conversation

ghuls
Copy link
Collaborator

@ghuls ghuls commented May 12, 2022

No description provided.

@github-actions github-actions bot added the python Related to Python Polars label May 12, 2022
@ghuls
Copy link
Collaborator Author

ghuls commented May 12, 2022

Before:

In [23]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep=",")
Out[23]: 
shape: (1, 1)
┌──────────┐
│ column_1 │
│ ---      │
│ str      │
╞══════════╡
│ x§y§z    │
└──────────┘

In [24]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")
thread '<unnamed>' panicked at 'index out of bounds: the len is 0 but the index is 0', src/dataframe.rs:159:29
---------------------------------------------------------------------------
PanicException                            Traceback (most recent call last)
Input In [24], in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/io.py:402, in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    396         dtypes = {
    397             new_to_current.get(column_name, column_name): column_dtype
    398             for column_name, column_dtype in dtypes.items()
    399         }
    401 with _prepare_file_arg(file, **storage_options) as data:
--> 402     df = DataFrame._read_csv(
    403         file=data,
    404         has_header=has_header,
    405         columns=columns if columns else projection,
    406         sep=sep,
    407         comment_char=comment_char,
    408         quote_char=quote_char,
    409         skip_rows=skip_rows,
    410         dtypes=dtypes,
    411         null_values=null_values,
    412         ignore_errors=ignore_errors,
    413         parse_dates=parse_dates,
    414         n_threads=n_threads,
    415         infer_schema_length=infer_schema_length,
    416         batch_size=batch_size,
    417         n_rows=n_rows,
    418         encoding=encoding,
    419         low_memory=low_memory,
    420         rechunk=rechunk,
    421         skip_rows_after_header=skip_rows_after_header,
    422         row_count_name=row_count_name,
    423         row_count_offset=row_count_offset,
    424         sample_size=sample_size,
    425     )
    427 if new_columns:
    428     return update_columns(df, new_columns)

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/internals/frame.py:586, in DataFrame._read_csv(cls, file, has_header, columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_count_name, row_count_offset, sample_size)
    580         raise ValueError(
    581             "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
    582         )
    584 projection, columns = handle_projection_columns(columns)
--> 586 self._df = PyDataFrame.read_csv(
    587     file,
    588     infer_schema_length,
    589     batch_size,
    590     has_header,
    591     ignore_errors,
    592     n_rows,
    593     skip_rows,
    594     projection,
    595     sep,
    596     rechunk,
    597     columns,
    598     encoding,
    599     n_threads,
    600     path,
    601     dtype_list,
    602     dtype_slice,
    603     low_memory,
    604     comment_char,
    605     quote_char,
    606     processed_null_values,
    607     parse_dates,
    608     skip_rows_after_header,
    609     _prepare_row_count_args(row_count_name, row_count_offset),
    610     sample_size=sample_size,
    611 )
    612 return self

PanicException: index out of bounds: the len is 0 but the index is 0

In [25]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")
---------------------------------------------------------------------------
ComputeError                              Traceback (most recent call last)
Input In [25], in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/io.py:402, in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    396         dtypes = {
    397             new_to_current.get(column_name, column_name): column_dtype
    398             for column_name, column_dtype in dtypes.items()
    399         }
    401 with _prepare_file_arg(file, **storage_options) as data:
--> 402     df = DataFrame._read_csv(
    403         file=data,
    404         has_header=has_header,
    405         columns=columns if columns else projection,
    406         sep=sep,
    407         comment_char=comment_char,
    408         quote_char=quote_char,
    409         skip_rows=skip_rows,
    410         dtypes=dtypes,
    411         null_values=null_values,
    412         ignore_errors=ignore_errors,
    413         parse_dates=parse_dates,
    414         n_threads=n_threads,
    415         infer_schema_length=infer_schema_length,
    416         batch_size=batch_size,
    417         n_rows=n_rows,
    418         encoding=encoding,
    419         low_memory=low_memory,
    420         rechunk=rechunk,
    421         skip_rows_after_header=skip_rows_after_header,
    422         row_count_name=row_count_name,
    423         row_count_offset=row_count_offset,
    424         sample_size=sample_size,
    425     )
    427 if new_columns:
    428     return update_columns(df, new_columns)

File ~/software/anaconda3/envs/create_cistarget_databases/lib/python3.10/site-packages/polars/internals/frame.py:586, in DataFrame._read_csv(cls, file, has_header, columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_count_name, row_count_offset, sample_size)
    580         raise ValueError(
    581             "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
    582         )
    584 projection, columns = handle_projection_columns(columns)
--> 586 self._df = PyDataFrame.read_csv(
    587     file,
    588     infer_schema_length,
    589     batch_size,
    590     has_header,
    591     ignore_errors,
    592     n_rows,
    593     skip_rows,
    594     projection,
    595     sep,
    596     rechunk,
    597     columns,
    598     encoding,
    599     n_threads,
    600     path,
    601     dtype_list,
    602     dtype_slice,
    603     low_memory,
    604     comment_char,
    605     quote_char,
    606     processed_null_values,
    607     parse_dates,
    608     skip_rows_after_header,
    609     _prepare_row_count_args(row_count_name, row_count_offset),
    610     sample_size=sample_size,
    611 )
    612 return self

ComputeError: invalid utf8 data in csv

After:

In [6]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep=",")
Out[6]: 
shape: (1, 1)
┌──────────┐
│ column_1 │
│ ---      │
│ str      │
╞══════════╡
│ x§y§z    │
└──────────┘

In [7]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-c3e3834a5958> in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="")

~/software/polars/py-polars/polars/io.py in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    288         columns = kwargs.pop("projection", None)
    289 
--> 290     _check_arg_is_1byte("sep", sep, False)
    291     _check_arg_is_1byte("comment_char", comment_char, False)
    292     _check_arg_is_1byte("quote_char", quote_char, True)

~/software/polars/py-polars/polars/io.py in _check_arg_is_1byte(arg_name, arg, can_be_empty)
     69                 )
     70         elif arg_byte_length != 1:
---> 71             raise ValueError(
     72                 f'{arg_name}="{arg}" should be a single byte character, but is {arg_byte_length} bytes long.'
     73             )

ValueError: sep="" should be a single byte character, but is 0 bytes long.

In [8]: pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-df7d8494e3b7> in <module>
----> 1 pl.read_csv("x§y§z\n".encode("utf-8"), has_header=False, sep="§")

~/software/polars/py-polars/polars/io.py in read_csv(file, has_header, columns, new_columns, sep, comment_char, quote_char, skip_rows, dtypes, null_values, ignore_errors, parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_count_name, row_count_offset, sample_size, **kwargs)
    288         columns = kwargs.pop("projection", None)
    289 
--> 290     _check_arg_is_1byte("sep", sep, False)
    291     _check_arg_is_1byte("comment_char", comment_char, False)
    292     _check_arg_is_1byte("quote_char", quote_char, True)

~/software/polars/py-polars/polars/io.py in _check_arg_is_1byte(arg_name, arg, can_be_empty)
     69                 )
     70         elif arg_byte_length != 1:
---> 71             raise ValueError(
     72                 f'{arg_name}="{arg}" should be a single byte character, but is {arg_byte_length} bytes long.'
     73             )

ValueError: sep="§" should be a single byte character, but is 2 bytes long.

@ritchie46 ritchie46 merged commit fad45bb into pola-rs:master May 13, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
python Related to Python Polars
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

2 participants