-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
245 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ pub(crate) mod parser; | |
pub(crate) mod utils; | ||
#[cfg(feature = "private")] | ||
pub mod utils; | ||
mod write; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
use std::borrow::Cow; | ||
use std::fmt::{Display, Formatter}; | ||
use std::io::Write; | ||
use memchr::memchr; | ||
use polars_core::{ | ||
prelude::*, | ||
series::SeriesIter, | ||
POOL | ||
}; | ||
use super::*; | ||
use rayon::prelude::*; | ||
use polars_utils::contention_pool::LowContentionPool; | ||
use arrow::temporal_conversions; | ||
use polars_core::export::chrono::FixedOffset; | ||
|
||
fn fmt_and_escape_str(f: &mut Vec<u8>, v: &str, options: &SerializeOptions) -> std::io::Result<()> { | ||
|
||
let surround_with_quotes = memchr(options.delimiter, v.as_bytes()).is_some(); | ||
let needs_escaping = memchr(options.quote, v.as_bytes()).is_some(); | ||
|
||
let value = if needs_escaping { | ||
let replaced = unsafe { v.replace(v, std::str::from_utf8_unchecked(&[options.quote, options.quote])) }; | ||
Cow::Owned(replaced) | ||
} else { | ||
Cow::Borrowed(v) | ||
}; | ||
|
||
if surround_with_quotes { | ||
write!(f, "\"{}\"", v) | ||
} else { | ||
write!(f, "{}", v) | ||
} | ||
|
||
} | ||
|
||
fn write_anyvalue(f: &mut Vec<u8>, value: AnyValue, options: &SerializeOptions) { | ||
match value { | ||
AnyValue::Null => write!(f, ""), | ||
AnyValue::Int8(v) => write!(f, "{}", v), | ||
AnyValue::Int16(v) => write!(f, "{}", v), | ||
AnyValue::Int32(v) => write!(f, "{}", v), | ||
AnyValue::Int64(v) => write!(f, "{}", v), | ||
AnyValue::UInt8(v) => write!(f, "{}", v), | ||
AnyValue::UInt16(v) => write!(f, "{}", v), | ||
AnyValue::UInt32(v) => write!(f, "{}", v), | ||
AnyValue::UInt64(v) => write!(f, "{}", v), | ||
AnyValue::Float32(v) => write!(f, "{}", v), | ||
AnyValue::Float64(v) => write!(f, "{}", v), | ||
AnyValue::Boolean(v) => write!(f, "{}", v), | ||
AnyValue::Utf8(v) => { | ||
fmt_and_escape_str(f, v, options) | ||
}, | ||
AnyValue::Categorical(idx, rev_map) => { | ||
let v = rev_map.get(idx); | ||
fmt_and_escape_str(f, v, options) | ||
}, | ||
AnyValue::Date(v) => { | ||
let date = temporal_conversions::date32_to_date(v); | ||
match &options.date_format { | ||
None => write!(f, "{}", date), | ||
Some(fmt) => write!(f, "{}", date.format(fmt)) | ||
} | ||
} | ||
AnyValue::Datetime(v, tu, tz) => { | ||
match tz { | ||
None => { | ||
let dt = match tu { | ||
TimeUnit::Nanoseconds => temporal_conversions::timestamp_ns_to_datetime(v), | ||
TimeUnit::Microseconds => temporal_conversions::timestamp_us_to_datetime(v), | ||
TimeUnit::Milliseconds => temporal_conversions::timestamp_ms_to_datetime(v), | ||
}; | ||
match &options.datetime_format { | ||
None => write!(f, "{}", dt), | ||
Some(fmt) => write!(f, "{}", dt.format(fmt)) | ||
} | ||
} | ||
Some(tz) => { | ||
let tz = temporal_conversions::parse_offset(&tz).unwrap(); | ||
|
||
let dt = temporal_conversions::timestamp_to_datetime(v, tu.to_arrow(), &tz); | ||
match &options.datetime_format { | ||
None => write!(f, "{}", dt), | ||
Some(fmt) => write!(f, "{}", dt.format(fmt)) | ||
} | ||
} | ||
} | ||
}, | ||
AnyValue::Time(v) => { | ||
let date = temporal_conversions::time64ns_to_time(v); | ||
match &options.time_format { | ||
None => write!(f, "{}", date), | ||
Some(fmt) => write!(f, "{}", date.format(fmt)) | ||
} | ||
} | ||
dt => panic!("DataType: {} not supported in writing to csv", dt) | ||
}.unwrap(); | ||
|
||
} | ||
|
||
/// Options to serialize logical types to CSV | ||
/// The default is to format times and dates as `chrono` crate formats them. | ||
#[derive(Debug, PartialEq, Eq, Hash, Clone)] | ||
pub struct SerializeOptions { | ||
/// used for [`DataType::Date`] | ||
pub date_format: Option<String>, | ||
/// used for [`DataType::Time64`] | ||
pub time_format: Option<String>, | ||
/// used for [`DataType::Timestamp`] | ||
pub datetime_format: Option<String>, | ||
/// used as separator/delimiter | ||
pub delimiter: u8, | ||
/// quoting character | ||
pub quote: u8, | ||
} | ||
|
||
impl Default for SerializeOptions { | ||
fn default() -> Self { | ||
SerializeOptions { | ||
date_format: None, | ||
time_format: None, | ||
datetime_format: None, | ||
delimiter: b',', | ||
quote: b'"', | ||
} | ||
} | ||
} | ||
|
||
/// Utility to write to `&mut Vec<u8>` buffer | ||
struct StringWrap<'a>(pub &'a mut Vec<u8>); | ||
|
||
impl<'a> std::fmt::Write for StringWrap<'a> { | ||
fn write_str(&mut self, s: &str) -> std::fmt::Result { | ||
self.0.extend_from_slice(s.as_bytes()); | ||
Ok(()) | ||
} | ||
} | ||
|
||
pub(super) fn write<W: Write>(writer: &mut W, df: &DataFrame, chunk_size: usize, options: &SerializeOptions) -> Result<()> { | ||
// check that the double quote is valid utf8 | ||
std::str::from_utf8(&[options.quote, options.quote]).map_err(|_| PolarsError::ComputeError("quote char leads invalid utf8".into())); | ||
|
||
let len = df.height(); | ||
let n_threads = POOL.current_num_threads(); | ||
|
||
let total_rows_per_pool_iter = n_threads * chunk_size; | ||
|
||
let mut any_value_iter_pool = LowContentionPool::<Vec<_>>::new(df.width()); | ||
let mut write_buffer_pool = LowContentionPool::<Vec<_>>::new(df.width()); | ||
|
||
let mut n_rows_finished = 0; | ||
|
||
// holds the buffers that will be written | ||
let mut result_buf = Vec::with_capacity(n_threads); | ||
while n_rows_finished < len { | ||
|
||
|
||
let par_iter = (0..n_threads) | ||
.into_par_iter() | ||
.map(|thread_no| { | ||
let thread_offset = thread_no * chunk_size; | ||
let total_offset = n_rows_finished + thread_offset; | ||
let df = df.slice(total_offset as i64, chunk_size); | ||
let cols = df.get_columns(); | ||
let any_value_iters = cols.iter().map(|s| s.iter()); | ||
let mut col_iters = any_value_iter_pool.get(); | ||
col_iters.extend(any_value_iters); | ||
|
||
let mut write_buffer = write_buffer_pool.get(); | ||
|
||
let last_ptr = &col_iters[col_iters.len() - 1] as *const SeriesIter; | ||
// loop rows | ||
loop { | ||
for col in &mut col_iters { | ||
match col.next() { | ||
Some(value) => { | ||
write_anyvalue(&mut write_buffer, value, options); | ||
|
||
}, | ||
None => { | ||
break | ||
} | ||
} | ||
let current_ptr = col as *const SeriesIter; | ||
if current_ptr != last_ptr { | ||
write!(&mut write_buffer, ",").unwrap() | ||
} | ||
} | ||
} | ||
|
||
// return buffers to the pool | ||
col_iters.clear(); | ||
any_value_iter_pool.set(col_iters); | ||
|
||
write_buffer | ||
}); | ||
|
||
// rayon will ensure the right order | ||
result_buf.par_extend(par_iter); | ||
|
||
for mut buf in result_buf.drain(..) { | ||
writer.write(&buf)?; | ||
buf.clear(); | ||
write_buffer_pool.set(buf); | ||
} | ||
|
||
|
||
|
||
n_rows_finished += total_rows_per_pool_iter; | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters