Skip to content

Commit

Permalink
csv: exclude delimiter from whitespace skip; and python don't automat…
Browse files Browse the repository at this point in the history
…ically infer dates
  • Loading branch information
ritchie46 committed Dec 4, 2021
1 parent 0ca6ff8 commit 53c1f6c
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 4 deletions.
15 changes: 15 additions & 0 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1358,4 +1358,19 @@ A3,\"B4_\"\"with_embedded_double_quotes\"\"\",C4,4";

Ok(())
}

#[test]
fn test_tsv_header_offset() -> Result<()> {
    // Tab-separated input in which every data row begins with the delimiter,
    // so the first field of each row is empty.
    let tsv = "foo\tbar\n\t1000011\t1\n\t1000026\t2\n\t1000949\t2";
    let df = CsvReader::new(Cursor::new(tsv))
        .with_delimiter(b'\t')
        .finish()?;

    // Three data rows, two columns named by the header.
    assert_eq!(df.shape(), (3, 2));
    assert_eq!(df.dtypes(), &[DataType::Utf8, DataType::Int64]);

    // The leading empty field must be read as an empty string rather than
    // being skipped along with other whitespace.
    let foo = df.column("foo")?.utf8()?;
    assert_eq!(foo.get(0), Some(""));

    Ok(())
}
}
10 changes: 9 additions & 1 deletion polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ pub(crate) fn skip_whitespace(input: &[u8]) -> (&[u8], usize) {
skip_condition(input, |b| is_whitespace(b) || is_line_ending(b))
}

/// Skips leading whitespace and line-ending bytes, but never skips `exclude`.
///
/// Useful when the delimiter itself is a whitespace byte (for example a tab):
/// passing it as `exclude` keeps it in the input so that leading empty fields
/// are preserved.
///
/// Returns whatever `skip_condition` yields for this predicate
/// (presumably the remaining slice and the number of bytes skipped — confirm
/// against `skip_condition`).
#[inline]
pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> (&[u8], usize) {
    let is_skippable = |byte: u8| {
        // The excluded byte always stops the skip, even if it is whitespace.
        if byte == exclude {
            return false;
        }
        is_whitespace(byte) || is_line_ending(byte)
    };
    skip_condition(input, is_skippable)
}

/// Local version of slice::starts_with (as it won't inline)
#[inline]
fn starts_with(bytes: &[u8], needle: u8) -> bool {
Expand Down Expand Up @@ -406,7 +414,7 @@ pub(crate) fn parse_lines(
let end = bytes.as_ptr() as usize;
return Ok(end - start);
}
let (b, _) = skip_whitespace(bytes);
let (b, _) = skip_whitespace_exclude(bytes, delimiter);
bytes = b;
if bytes.is_empty() {
return Ok(original_bytes_len);
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def read_csv(
quote_char: Optional[str] = r'"',
storage_options: Optional[Dict] = None,
null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
parse_dates: bool = True,
parse_dates: bool = False,
) -> DataFrame:
"""
Read into a DataFrame from a csv file.
Expand Down
10 changes: 8 additions & 2 deletions py-polars/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io
import pickle
import zlib
from functools import partial
from pathlib import Path
from typing import Dict, Type

Expand All @@ -20,7 +21,12 @@ def test_to_from_buffer(df: pl.DataFrame) -> None:

for to_fn, from_fn, text_based in zip(
[df.to_parquet, df.to_csv, df.to_ipc, df.to_json],
[pl.read_parquet, pl.read_csv, pl.read_ipc, pl.read_json],
[
pl.read_parquet,
partial(pl.read_csv, parse_dates=True),
pl.read_ipc,
pl.read_json,
],
[False, True, False, True],
):
f = io.BytesIO()
Expand Down Expand Up @@ -170,7 +176,7 @@ def test_datetime_parsing() -> None:
"""

f = io.StringIO(csv)
df = pl.read_csv(f)
df = pl.read_csv(f, parse_dates=True)
assert df.dtypes == [pl.Datetime, pl.Float64, pl.Float64]


Expand Down

0 comments on commit 53c1f6c

Please sign in to comment.