Skip to content

Commit

Permalink
Merge pull request #29 from soma-smart/datetime_provider
Browse files Browse the repository at this point in the history
Implements Random datetime provider
  • Loading branch information
vianneybacoup committed Mar 5, 2024
2 parents d44e188 + cf6dde6 commit b130d1b
Show file tree
Hide file tree
Showing 12 changed files with 469 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fakelake"
version = "1.2.0"
version = "1.3.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
16 changes: 16 additions & 0 deletions docs/columns/providers/random.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@ Create a random date with:

[Options](../options.md) are also possible.

##### datetime
```yaml
- name: connection
provider: Random.Date.datetime
format: "%m-%d-%Y %H:%M:%S"
after: 02-15-2000 12:01:01
before: 07-17-2020 15:06:06
```
Create a random datetime with:

- an optional parameter **format**. Default is "%Y-%m-%d %H:%M:%S"
- an optional parameter **after** as a lower boundary. It should follow the **format** parameter. Default is 1980-01-01 12:00:00
- an optional parameter **before** as an upper boundary. It should follow the **format** parameter. Default is 2000-01-01 12:00:00

[Options](../options.md) are also possible.

### Number
##### i32
```yaml
Expand Down
17 changes: 16 additions & 1 deletion src/generate/csv/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ impl OutputFormat for OutputCsv {
Value::Bool(value) => value.to_string(),
Value::Int32(value) => value.to_string(),
Value::String(value) => value,
Value::Date(value) => value.to_string(),
Value::Date(value, date_format) => value.format(&date_format).to_string(),
Value::Timestamp(value, date_format) => {
value.format(&date_format).to_string()
}
};
}
row.push(str_value);
Expand All @@ -82,6 +85,7 @@ mod tests {
use crate::providers::increment::integer::IncrementIntegerProvider;
use crate::providers::random::bool::BoolProvider;
use crate::providers::random::date::date::DateProvider;
use crate::providers::random::date::datetime::DatetimeProvider;
use crate::providers::random::string::alphanumeric::AlphanumericProvider;

use yaml_rust::YamlLoader;
Expand Down Expand Up @@ -180,6 +184,17 @@ mod tests {
&YamlLoader::load_from_str("presence: 1").unwrap()[0],
),
},
Column {
name: "id".to_string(),
provider: Box::new(DatetimeProvider {
format: "%Y-%m-%d %H:%M:%S".to_string(),
after: 10_000_000,
before: 12_000_000,
}),
presence: presence::new_from_yaml(
&YamlLoader::load_from_str("presence: 1").unwrap()[0],
),
},
];

let config = Config {
Expand Down
104 changes: 100 additions & 4 deletions src/generate/parquet/batch_generator.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use super::utils::get_parquet_type_from_column;
use crate::config::Column;
use crate::providers::provider::Value;
use arrow_array::{Array, ArrayRef, BooleanArray, Date32Array, Int32Array, StringArray};
use arrow_schema::DataType;
use arrow_array::{
Array, ArrayRef, BooleanArray, Date32Array, Int32Array, StringArray, TimestampSecondArray,
};
use arrow_schema::{DataType, TimeUnit};
use chrono::{Datelike, NaiveDate};
use std::sync::Arc;

Expand Down Expand Up @@ -130,7 +132,7 @@ impl ParquetBatchGenerator for DateBatchGenerator {
for i in 0..rows_to_generate {
if self.column.is_next_present() {
match self.column.provider.value(i) {
Value::Date(value) => {
Value::Date(value, _) => {
vec.push(Some(value.num_days_from_ce() - epoch.num_days_from_ce()))
}
_ => panic!("Wrong provider type"),
Expand All @@ -151,12 +153,42 @@ impl ParquetBatchGenerator for DateBatchGenerator {
}
}

/// Batch generator producing a `TimestampSecondArray` from a column whose
/// provider yields `Value::Timestamp`.
#[derive(Clone)]
struct TimestampBatchGenerator {
    column: Column,
}
impl ParquetBatchGenerator for TimestampBatchGenerator {
    /// Builds an array of `rows_to_generate` entries.
    ///
    /// An entry is `None` when the column reports the next value as absent;
    /// otherwise it holds the `timestamp()` of the generated datetime.
    ///
    /// # Panics
    /// Panics when the provider returns anything other than `Value::Timestamp`.
    fn batch_array(&self, rows_to_generate: u32) -> Arc<dyn Array> {
        let seconds: Vec<Option<i64>> = (0..rows_to_generate)
            .map(|row| {
                // Presence is checked first; the provider is only asked for a
                // value when the row is actually present.
                if !self.column.is_next_present() {
                    return None;
                }
                match self.column.provider.value(row) {
                    Value::Timestamp(datetime, _) => Some(datetime.timestamp()),
                    _ => panic!("Wrong provider type"),
                }
            })
            .collect();
        Arc::new(TimestampSecondArray::from(seconds)) as ArrayRef
    }

    /// Name of the wrapped column.
    fn name(&self) -> &str {
        self.column.name.as_str()
    }

    /// Wraps `column` in a timestamp batch generator.
    fn new(column: Column) -> TimestampBatchGenerator {
        Self { column }
    }
}

/// Builds the batch generator matching the Arrow data type reported for `column`.
///
/// # Panics
/// Panics when the reported data type has no dedicated batch generator.
pub fn parquet_batch_generator_builder(column: Column) -> Box<dyn ParquetBatchGenerator> {
    // The type lookup takes its argument by value, so it receives the only
    // clone; the selected arm then takes ownership of `column` itself instead
    // of cloning it a second time.
    match get_parquet_type_from_column(column.clone()) {
        DataType::Boolean => Box::new(BoolBatchGenerator::new(column)),
        DataType::Int32 => Box::new(IntBatchGenerator::new(column)),
        DataType::Utf8 => Box::new(StrBatchGenerator::new(column)),
        DataType::Date32 => Box::new(DateBatchGenerator::new(column)),
        DataType::Timestamp(TimeUnit::Second, None) => {
            Box::new(TimestampBatchGenerator::new(column))
        }
        _ => panic!("Parquet type expected not handled."),
    }
}
Expand All @@ -167,7 +199,8 @@ mod tests {
use crate::options::presence::new_from_yaml;
use crate::providers::{
increment::integer::IncrementIntegerProvider, random::bool::BoolProvider,
random::date::date::DateProvider, random::string::alphanumeric::AlphanumericProvider,
random::date::date::DateProvider, random::date::datetime::DatetimeProvider,
random::string::alphanumeric::AlphanumericProvider,
};

use yaml_rust::YamlLoader;
Expand Down Expand Up @@ -387,4 +420,67 @@ mod tests {
let batch_generator = DateBatchGenerator { column };
let _ = batch_generator.batch_array(1);
}

// Timestamp batch generator
#[test]
fn given_timestamp_provider_should_return_batch_generator() {
    // The builder must accept a datetime-backed column and keep its name.
    let yaml_docs = YamlLoader::load_from_str("name: test").unwrap();
    let datetime_provider = DatetimeProvider {
        format: String::from("%Y-%m-%d %H:%M:%S"),
        after: 10_000_000,
        before: 12_000_000,
    };
    let column = Column {
        name: String::from("timestamp_column"),
        provider: Box::new(datetime_provider),
        presence: new_from_yaml(&yaml_docs[0]),
    };

    let generator = parquet_batch_generator_builder(column);
    assert_eq!(generator.name(), "timestamp_column");
}

#[test]
fn given_timestamp_batch_generator_should_batch_correctly() {
    // Presence is built from a YAML document that has no `presence` key.
    let yaml_docs = YamlLoader::load_from_str("name: test").unwrap();
    let datetime_provider = DatetimeProvider {
        format: String::from("%Y-%m-%d %H:%M:%S"),
        after: 10_000_000,
        before: 12_000_000,
    };
    let column = Column {
        name: String::from("timestamp_column"),
        provider: Box::new(datetime_provider),
        presence: new_from_yaml(&yaml_docs[0]),
    };

    let generated = TimestampBatchGenerator { column }.batch_array(1000);

    // One entry per requested row.
    assert_eq!(generated.len(), 1000);
}

#[test]
fn given_timestamp_batch_generator_with_presence_should_batch_correctly() {
    // Presence is configured explicitly with `presence: 0.5`.
    let yaml_docs = YamlLoader::load_from_str("presence: 0.5").unwrap();
    let datetime_provider = DatetimeProvider {
        format: String::from("%Y-%m-%d %H:%M:%S"),
        after: 10_000_000,
        before: 12_000_000,
    };
    let column = Column {
        name: String::from("timestamp_column"),
        provider: Box::new(datetime_provider),
        presence: new_from_yaml(&yaml_docs[0]),
    };

    let generated = TimestampBatchGenerator { column }.batch_array(1000);

    // Absent rows still occupy a slot, so the length equals the request.
    assert_eq!(generated.len(), 1000);
}

#[test]
#[should_panic]
fn given_timestamp_batch_generator_with_wrong_provider_should_panic() {
    // An integer provider is deliberately paired with the timestamp generator.
    let yaml_docs = YamlLoader::load_from_str("name: temp").unwrap();
    let mismatched_provider = IncrementIntegerProvider { start: 0 };
    let column = Column {
        name: String::from("timestamp_column"),
        provider: Box::new(mismatched_provider),
        presence: new_from_yaml(&yaml_docs[0]),
    };

    let generator = TimestampBatchGenerator { column };
    let _ = generator.batch_array(1);
}
}
27 changes: 23 additions & 4 deletions src/generate/parquet/utils.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use crate::config::Column;
use crate::providers::provider::Value;
use arrow_schema::DataType;
use arrow_schema::{DataType, TimeUnit};

pub fn get_parquet_type_from_column(column: Column) -> DataType {
match column.provider.value(0) {
Value::Bool(_) => DataType::Boolean,
Value::Int32(_) => DataType::Int32,
Value::String(_) => DataType::Utf8,
Value::Date(_) => DataType::Date32,
Value::Date(_, _) => DataType::Date32,
Value::Timestamp(_, _) => DataType::Timestamp(TimeUnit::Second, None),
}
}

Expand All @@ -19,10 +20,11 @@ mod tests {
use crate::options::presence::new_from_yaml;
use crate::providers::{
increment::integer::IncrementIntegerProvider, random::bool::BoolProvider,
random::date::date::DateProvider, random::string::alphanumeric::AlphanumericProvider,
random::date::date::DateProvider, random::date::datetime::DatetimeProvider,
random::string::alphanumeric::AlphanumericProvider,
};

use arrow_schema::DataType;
use arrow_schema::{DataType, TimeUnit};
use yaml_rust::YamlLoader;

#[test]
Expand Down Expand Up @@ -68,4 +70,21 @@ mod tests {
};
assert_eq!(get_parquet_type_from_column(column), DataType::Date32);
}

#[test]
fn given_timestamp_provider_should_return_timestamp_datatype() {
    // A datetime provider must map to a second-resolution timestamp without timezone.
    let yaml_docs = YamlLoader::load_from_str("name: test").unwrap();
    let datetime_provider = DatetimeProvider {
        format: String::from("%Y-%m-%d %H:%M:%S"),
        after: 10_000_000,
        before: 12_000_000,
    };
    let column = Column {
        name: String::from("timestamp_column"),
        provider: Box::new(datetime_provider),
        presence: new_from_yaml(&yaml_docs[0]),
    };

    let expected = DataType::Timestamp(TimeUnit::Second, None);
    assert_eq!(get_parquet_type_from_column(column), expected);
}
}
5 changes: 3 additions & 2 deletions src/providers/provider.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::errors::FakeLakeError;
use crate::providers;

use chrono::NaiveDate;
use chrono::{NaiveDate, NaiveDateTime};
use core::fmt;
use yaml_rust::Yaml;

Expand All @@ -10,7 +10,8 @@ pub enum Value {
Bool(bool),
Int32(i32),
String(String),
Date(NaiveDate),
Date(NaiveDate, String),
Timestamp(NaiveDateTime, String),
}

pub trait CloneProvider {
Expand Down
15 changes: 15 additions & 0 deletions src/providers/random/date/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::errors::FakeLakeError;
use crate::providers::provider::Provider;

use super::date::DateProvider;
use super::datetime::DatetimeProvider;

use yaml_rust::Yaml;

Expand All @@ -11,6 +12,7 @@ pub fn get_corresponding_provider(
) -> Result<Box<dyn Provider>, FakeLakeError> {
match provider_split.next() {
Some("date") => Ok(Box::new(DateProvider::new_from_yaml(column))),
Some("datetime") => Ok(Box::new(DatetimeProvider::new_from_yaml(column))),
_ => Err(FakeLakeError::BadYAMLFormat("".to_string())),
}
}
Expand All @@ -34,6 +36,19 @@ mod tests {
}
}

#[test]
fn given_datetime_should_return_provider() {
    // The "datetime" segment must resolve to a provider rather than an error.
    let provider_name = "datetime";
    let yaml_source = format!("name: created_at{}provider: {}", '\n', provider_name);
    let yaml_docs = YamlLoader::load_from_str(yaml_source.as_str()).unwrap();

    let outcome = get_corresponding_provider(provider_name.split('.'), &yaml_docs[0]);
    if outcome.is_err() {
        panic!();
    }
}

#[test]
fn given_wrong_provider_should_return_error() {
let provider_name = "not_a_provider";
Expand Down
7 changes: 4 additions & 3 deletions src/providers/random/date/date.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ impl Provider for DateProvider {
fn value(&self, _: u32) -> Value {
Value::Date(
NaiveDate::from_num_days_from_ce_opt(fastrand::i32(self.after..self.before)).unwrap(),
self.format.clone(),
)
}
fn new_from_yaml(column: &Yaml) -> DateProvider {
Expand Down Expand Up @@ -124,7 +125,7 @@ mod tests {
fn given_nothing_should_return_parquet_type() {
let provider: DateProvider = generate_provider(None, None, None);
match provider.value(0) {
Value::Date(_) => (),
Value::Date(_, _) => (),
_ => panic!(),
};
}
Expand Down Expand Up @@ -237,7 +238,7 @@ mod tests {

for value in 1..100 {
match provider.value(value) {
Value::Date(value) => {
Value::Date(value, _) => {
assert!(value.num_days_from_ce() >= provider.after);
assert!(value.num_days_from_ce() < provider.before);
}
Expand All @@ -256,7 +257,7 @@ mod tests {

for value in 1..100 {
match provider.value(value) {
Value::Date(value) => {
Value::Date(value, _) => {
assert_eq!(
value.num_days_from_ce(),
get_day_since_year0("2020-05-18", DEFAULT_FORMAT)
Expand Down
Loading

0 comments on commit b130d1b

Please sign in to comment.