-
Notifications
You must be signed in to change notification settings - Fork 208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(executor): support reading and writing csv files #112
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
// TODO(wrj): remove this once linked to plan | ||
#![allow(dead_code)] | ||
|
||
use super::*; | ||
use crate::{ | ||
array::ArrayBuilderImpl, | ||
physical_planner::{FileFormat, PhysicalCopyFromFile}, | ||
}; | ||
use std::fs::File; | ||
|
||
/// The executor of loading file data.
///
/// Opens the file described by `plan` (csv only, per `FileFormat`), parses it
/// into column arrays, and emits the result as a `DataChunk` stream.
pub struct CopyFromFileExecutor {
    /// Source path, file format options, and the expected column types.
    plan: PhysicalCopyFromFile,
}
|
||
impl CopyFromFileExecutor { | ||
pub fn execute(self) -> impl Stream<Item = Result<DataChunk, ExecutorError>> { | ||
try_stream! { | ||
let chunk = tokio::task::spawn_blocking(|| self.read_file_blocking()).await.unwrap()?; | ||
yield chunk; | ||
} | ||
} | ||
|
||
// TODO(wrj): process a window at a time | ||
fn read_file_blocking(self) -> Result<DataChunk, ExecutorError> { | ||
let mut array_builders = self | ||
.plan | ||
.column_types | ||
.iter() | ||
.map(ArrayBuilderImpl::new) | ||
.collect::<Vec<ArrayBuilderImpl>>(); | ||
|
||
let file = File::open(&self.plan.path)?; | ||
let mut reader = match self.plan.format { | ||
FileFormat::Csv { | ||
delimiter, | ||
quote, | ||
escape, | ||
header, | ||
} => csv::ReaderBuilder::new() | ||
.delimiter(delimiter) | ||
.quote(quote) | ||
.escape(escape) | ||
.has_headers(header) | ||
.from_reader(file), | ||
}; | ||
|
||
for result in reader.records() { | ||
let record = result?; | ||
if record.len() != array_builders.len() { | ||
return Err(ExecutorError::LengthMismatch { | ||
expected: array_builders.len(), | ||
actual: record.len(), | ||
}); | ||
} | ||
for ((s, builder), ty) in record | ||
.iter() | ||
.zip(&mut array_builders) | ||
.zip(&self.plan.column_types) | ||
{ | ||
if !ty.is_nullable() && s.is_empty() { | ||
return Err(ExecutorError::NotNullable); | ||
} | ||
builder.push_str(s)?; | ||
} | ||
} | ||
let chunk = array_builders | ||
.into_iter() | ||
.map(|builder| builder.finish()) | ||
.collect(); | ||
Ok(chunk) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        array::ArrayImpl,
        types::{DataTypeExt, DataTypeKind},
    };
    use std::io::Write;

    /// Round-trip: write a small csv to a temp file and load it back through
    /// the executor, checking the typed column arrays.
    #[tokio::test]
    async fn read_csv() {
        // Two rows, three columns: int, double, string.
        let input = "1,1.5,one\n2,2.5,two\n";

        let mut file = tempfile::NamedTempFile::new().expect("failed to create temp file");
        write!(file, "{}", input).expect("failed to write file");

        let executor = CopyFromFileExecutor {
            plan: PhysicalCopyFromFile {
                path: file.path().into(),
                format: FileFormat::Csv {
                    delimiter: b',',
                    quote: b'"',
                    escape: None,
                    header: false,
                },
                column_types: vec![
                    DataTypeKind::Int.not_null(),
                    DataTypeKind::Double.not_null(),
                    DataTypeKind::String.not_null(),
                ],
            },
        };
        let actual = executor.execute().boxed().next().await.unwrap().unwrap();

        let expected = [
            ArrayImpl::Int32([1, 2].into_iter().collect()),
            ArrayImpl::Float64([1.5, 2.5].into_iter().collect()),
            ArrayImpl::UTF8(["one", "two"].iter().map(Some).collect()),
        ]
        .into_iter()
        .collect::<DataChunk>();
        assert_eq!(actual, expected);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
// TODO(wrj): remove this once linked to plan | ||
#![allow(dead_code)] | ||
|
||
use super::*; | ||
use crate::physical_planner::FileFormat; | ||
use std::{fs::File, path::PathBuf}; | ||
use tokio::sync::mpsc; | ||
|
||
/// The executor of saving data to file.
pub struct CopyToFileExecutor {
    /// Destination file path.
    pub path: PathBuf,
    /// Output file format (csv only for now).
    pub format: FileFormat,
    /// Upstream executor producing the chunks to be written.
    pub child: BoxedExecutor,
}
|
||
impl CopyToFileExecutor { | ||
pub fn execute(self) -> impl Stream<Item = Result<DataChunk, ExecutorError>> { | ||
try_stream! { | ||
let Self { path, format, child } = self; | ||
let (sender, recver) = mpsc::channel(1); | ||
let writer = tokio::task::spawn_blocking(move || Self::write_file_blocking(path, format, recver)); | ||
for await batch in child { | ||
sender.send(batch?).await.unwrap(); | ||
} | ||
drop(sender); | ||
writer.await.unwrap()?; | ||
yield DataChunk::single(1); | ||
} | ||
} | ||
|
||
fn write_file_blocking( | ||
path: PathBuf, | ||
format: FileFormat, | ||
mut recver: mpsc::Receiver<DataChunk>, | ||
) -> Result<(), ExecutorError> { | ||
let file = File::create(&path)?; | ||
let mut writer = match format { | ||
FileFormat::Csv { | ||
delimiter, | ||
quote, | ||
escape, | ||
header, | ||
} => csv::WriterBuilder::new() | ||
.delimiter(delimiter) | ||
.quote(quote) | ||
.escape(escape.unwrap_or(b'\\')) | ||
.has_headers(header) | ||
.from_writer(file), | ||
}; | ||
|
||
while let Some(chunk) = recver.blocking_recv() { | ||
for i in 0..chunk.cardinality() { | ||
// TODO(wrj): avoid dynamic memory allocation (String) | ||
let row = chunk.arrays().iter().map(|a| a.get_to_string(i)); | ||
writer.write_record(row)?; | ||
} | ||
writer.flush()?; | ||
} | ||
Ok(()) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::ArrayImpl;

    /// Feed one in-memory chunk through the executor and compare the csv text
    /// it writes to the temp file against the expected literal.
    #[tokio::test]
    async fn write_csv() {
        let file = tempfile::NamedTempFile::new().expect("failed to create temp file");

        // Child stream yielding a single chunk: int, double, string columns.
        let child = try_stream! {
            yield [
                ArrayImpl::Int32([1, 2].into_iter().collect()),
                ArrayImpl::Float64([1.5, 2.5].into_iter().collect()),
                ArrayImpl::UTF8(["one", "two"].iter().map(Some).collect()),
            ]
            .into_iter()
            .collect();
        }
        .boxed();

        let executor = CopyToFileExecutor {
            path: file.path().into(),
            format: FileFormat::Csv {
                delimiter: b',',
                quote: b'"',
                escape: None,
                header: false,
            },
            child,
        };
        executor.execute().boxed().next().await.unwrap().unwrap();

        let actual = std::fs::read_to_string(file.path()).unwrap();
        assert_eq!(actual, "1,1.5,one\n2,2.5,two\n");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
use crate::types::DataType; | ||
use std::path::PathBuf; | ||
|
||
/// The physical plan of `copy`.
#[derive(Debug, PartialEq, Clone)]
pub struct PhysicalCopyFromFile {
    /// The file path to copy from.
    pub path: PathBuf,
    /// The file format.
    pub format: FileFormat,
    /// The expected types of the columns, in file column order.
    pub column_types: Vec<DataType>,
}
|
||
/// File format.
#[derive(Debug, PartialEq, Clone)]
pub enum FileFormat {
    /// Comma-separated values (configurable delimiter/quoting).
    Csv {
        /// Field delimiter byte to parse (e.g. `b','`).
        delimiter: u8,
        /// Quote byte to use (e.g. `b'"'`).
        quote: u8,
        /// Escape byte to use; `None` means no escape character.
        escape: Option<u8>,
        /// Whether or not the file has a header line.
        header: bool,
    },
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
use super::{FileFormat, PhysicalPlan}; | ||
use crate::types::DataType; | ||
use std::path::PathBuf; | ||
|
||
/// The physical plan of `copy`.
#[derive(Debug, PartialEq, Clone)]
pub struct PhysicalCopyToFile {
    /// The file path to copy to.
    pub path: PathBuf,
    /// The file format.
    pub format: FileFormat,
    /// The types of the columns produced by `child`, in output order.
    pub column_types: Vec<DataType>,
    /// The child plan whose output is written to the file.
    pub child: PhysicalPlan,
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How can we know that column types when constructing the plan? Do we need to open that file before actually executing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The column types can be inferred from table catalog or query results, so we don't need to open the file.
btw, I have another question: If we are going to export data to an existing file, should we append or truncate it to empty first?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should only allow export to a new file. e.g.
`OpenOptions::default().create_new(true)`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
... if exporting to an existing file, we should truncate the content, as it might contain data from other tables. btw, we should warn users before really overwrite a file.