Skip to content

Commit

Permalink
Adding jsonExtract function (#84)
Browse files Browse the repository at this point in the history
* Adding jsonExtract

* Adding infer_schema_len argument

* Adding dtype option to jsonExtract

* Resolving PR comments

* Update src/series.rs

Co-authored-by: Cory Grinstead <universalmind.candy@gmail.com>

---------

Co-authored-by: Darek <dchrostowski@medallia.com>
Co-authored-by: Cory Grinstead <universalmind.candy@gmail.com>
  • Loading branch information
3 people committed Aug 7, 2023
1 parent 1972e27 commit 2ad0904
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 2 deletions.
34 changes: 34 additions & 0 deletions __tests__/expr.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,40 @@ describe("expr.str", () => {
expect(actual).toFrameEqual(expected);
expect(seriesActual).toFrameEqual(expected);
});
// Covers `str.jsonExtract` through the expression API and the Series API:
// with an inferred schema, with an explicit dtype, and on an empty Series.
test("jsonExtract", () => {
const df = pl.DataFrame({
json: ['{"a":1, "b": true}', null, '{"a":2, "b": false}'],
});
// Expression API, no dtype given.
const actual = df.select(pl.col("json").str.jsonExtract());
// The null input row is expected as a struct whose fields are all null.
const expected = pl.DataFrame({
json: [
{ a: 1, b: true },
{ a: null, b: null },
{ a: 2, b: false },
],
});
expect(actual).toFrameEqual(expected);
// Series API on JSON arrays: same expected result with and without an explicit List dtype.
let s = pl.Series(["[1, 2, 3]", null, "[4, 5, 6]"]);
let dtype = pl.List(pl.Int64);
const expSeries = pl.Series([[1, 2, 3], null, [4, 5, 6]]);
expect(s.str.jsonExtract()).toSeriesEqual(expSeries);
expect(s.str.jsonExtract(dtype)).toSeriesEqual(expSeries);
// Series API on JSON objects: same expected result with and without an explicit Struct dtype.
dtype = pl.Struct([
new pl.Field("a", pl.Int64),
new pl.Field("b", pl.Bool),
]);
s = pl.Series("json", ['{"a":1, "b": true}', null, '{"a":2, "b": false}']);
expect(s.str.jsonExtract().as("json")).toSeriesEqual(
expected.getColumn("json"),
);
expect(s.str.jsonExtract(dtype).as("json")).toSeriesEqual(
expected.getColumn("json"),
);
// Empty Utf8 Series with an explicit dtype: expected to equal an empty List(Int64) Series.
s = pl.Series("col_a", [], pl.Utf8);
const exp = pl.Series("col_a", []).cast(pl.List(pl.Int64));
dtype = pl.List(pl.Int64);
expect(s.str.jsonExtract(dtype).as("col_a")).toSeriesEqual(exp);
});
test("jsonPathMatch", () => {
const df = pl.DataFrame({
data: [
Expand Down
17 changes: 17 additions & 0 deletions __tests__/lazyframe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1202,4 +1202,21 @@ describe("lazyframe", () => {
});
expect(actual).toFrameEqual(expected);
});
// Checks that `str.jsonExtract` gives the expected struct column when run through a lazy query.
test("json:extract", () => {
  // Raw JSON strings; the null row is expected to come back as a struct of nulls.
  const rawRows = ['{"a": 1, "b": true}', null, '{"a": 2, "b": false}'];
  const parsedRows = [
    { a: 1, b: true },
    { a: null, b: null },
    { a: 2, b: false },
  ];
  const lazyQuery = pl
    .DataFrame({ json: rawRows })
    .lazy()
    .select(pl.col("json").str.jsonExtract());
  const actual = lazyQuery.collectSync();
  expect(actual).toFrameEqual(pl.DataFrame({ json: parsedRows }));
});
});
30 changes: 30 additions & 0 deletions polars/lazy/expr/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,39 @@ export interface StringNamespace extends StringFunctions<Expr> {
* ```
*/
extract(pat: string | RegExp, groupIndex: number): Expr;

/**
 * Parse string values as JSON.
 * Throws an error if an invalid JSON string is encountered.
 * @param dtype - Optional dtype to parse the values into. Presumably inferred from the JSON values when omitted (TODO confirm against the native implementation).
 * @param inferSchemaLength - Optional number of rows to inspect when inferring the schema. Presumably all rows are used when omitted (TODO confirm).
 * @returns An expression producing the parsed values (a struct column in the example below).
 * @example
 * ```
 * >>> df = pl.DataFrame( {json: ['{"a":1, "b": true}', null, '{"a":2, "b": false}']} )
 * >>> df.select(pl.col("json").str.jsonExtract())
 * shape: (3, 1)
 * ┌─────────────┐
 * │ json        │
 * │ ---         │
 * │ struct[2]   │
 * ╞═════════════╡
 * │ {1,true}    │
 * │ {null,null} │
 * │ {2,false}   │
 * └─────────────┘
 * ```
 * @see jsonPathMatch - Extract the first match of json string with provided JSONPath expression.
 */
jsonExtract(dtype?: DataType, inferSchemaLength?: number): Expr;
/**
* Extract the first match of json string with provided JSONPath expression.
* Throw errors if encounter invalid json strings.
* All return value will be casted to Utf8 regardless of the original value.
* @see https://goessner.net/articles/JsonPath/
* @param jsonPath - A valid JSON path query string
* @param dtype - The dtype to cast the extracted value to. If None, the dtype will be inferred from the JSON value.
* @param inferSchemaLength - How many rows to parse to determine the schema. If ``None`` all rows are used.
* @returns Utf8 array. Contain null if original value is null or the `jsonPath` return nothing.
* @example
* ```
Expand Down Expand Up @@ -316,6 +343,9 @@ export const ExprStringFunctions = (_expr: any): StringNamespace => {
extract(pat: string | RegExp, groupIndex: number) {
return wrap("strExtract", regexToString(pat), groupIndex);
},
// Forwards the optional dtype and inferSchemaLength, unchanged, to the "strJsonExtract" call.
jsonExtract(dtype?: DataType, inferSchemaLength?: number) {
return wrap("strJsonExtract", dtype, inferSchemaLength);
},
jsonPathMatch(pat: string) {
return wrap("strJsonPathMatch", pat);
},
Expand Down
20 changes: 20 additions & 0 deletions polars/series/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,29 @@ export interface StringNamespace extends StringFunctions<Series> {
* ```
*/
extract(pattern: string | RegExp, groupIndex: number): Series;
/**
 * Parse string values as JSON.
 * @param dtype - Optional dtype to parse the values into. Presumably inferred from the JSON values when omitted (TODO confirm against the native implementation).
 * @param inferSchemaLength - Optional number of rows to inspect when inferring the schema. Presumably all rows are used when omitted (TODO confirm).
 * @returns Series of the parsed values (a struct Series in the example below, where the null input becomes a struct of nulls).
 * @example
 * ```
 * s = pl.Series("json", ['{"a":1, "b": true}', null, '{"a":2, "b": false}']);
 * s.str.jsonExtract().as("json");
 * shape: (3,)
 * Series: 'json' [struct[2]]
 * [
 *   {1,true}
 *   {null,null}
 *   {2,false}
 * ]
 * ```
 */
jsonExtract(dtype?: DataType, inferSchemaLength?: number): Series;
/**
* Extract the first match of json string with provided JSONPath expression.
* Throw errors if encounter invalid json strings.
* All return value will be casted to Utf8 regardless of the original value.
* @see https://goessner.net/articles/JsonPath/
* @param jsonPath - A valid JSON path query string
* @param dtype - The dtype to cast the extracted value to. If None, the dtype will be inferred from the JSON value.
* @param inferSchemaLength - How many rows to parse to determine the schema. If ``None`` all rows are used.
* @returns Utf8 array. Contain null if original value is null or the `jsonPath` return nothing.
* @example
* ```
Expand Down Expand Up @@ -283,6 +300,9 @@ export const SeriesStringFunctions = (_s: any): StringNamespace => {
extract(pat: string | RegExp, groupIndex: number) {
return wrap("strExtract", regexToString(pat), groupIndex);
},
// Forwards the optional dtype and inferSchemaLength, unchanged, to the "strJsonExtract" call.
jsonExtract(dtype?: DataType, inferSchemaLength?: number) {
return wrap("strJsonExtract", dtype, inferSchemaLength);
},
jsonPathMatch(pat: string) {
return wrap("strJsonPathMatch", pat);
},
Expand Down
16 changes: 14 additions & 2 deletions src/conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -619,10 +619,22 @@ impl FromNapiValue for Wrap<DataType> {
"Time" => DataType::Time,
"Object" => DataType::Object("object"),
"Categorical" => DataType::Categorical(None),
"Struct" => DataType::Struct(vec![]),
"Struct" => {
    // A struct dtype is read from the JS object's `fields` array; every element
    // must expose a `name` string and a nested `dtype` object. Malformed input
    // is reported as an error instead of panicking on `unwrap`.
    let inner = obj
        .get::<_, Array>("fields")?
        .ok_or_else(|| Error::from_reason("Struct dtype is missing its `fields` array"))?;
    let mut fldvec: Vec<Field> = Vec::with_capacity(inner.len() as usize);
    for i in 0..inner.len() {
        let inner_dtype: Object = inner.get::<Object>(i)?.ok_or_else(|| {
            Error::from_reason(format!("Struct field at index {} is missing", i))
        })?;
        // NOTE(review): this to/from napi round trip looks redundant, since the
        // element is already an `Object`; kept as-is pending confirmation.
        let napi_dt = Object::to_napi_value(env, inner_dtype)?;
        let obj = Object::from_napi_value(env, napi_dt)?;
        let name = obj.get::<_, String>("name")?.ok_or_else(|| {
            Error::from_reason(format!("Struct field at index {} has no `name`", i))
        })?;
        // Recursive conversion, so nested dtypes go through this same impl.
        let dt = obj.get::<_, Wrap<DataType>>("dtype")?.ok_or_else(|| {
            Error::from_reason(format!("Struct field `{}` has no `dtype`", name))
        })?;
        fldvec.push(Field::new(&name, dt.0));
    }
    DataType::Struct(fldvec)
}
tp => panic!("Type {} not implemented in str_to_polarstype", tp),
};

Ok(Wrap(dtype))
}
_ => Err(Error::new(
Expand Down
14 changes: 14 additions & 0 deletions src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,20 @@ impl JsExpr {
.into()
}
/// Builds an expression that parses string values as JSON.
///
/// `dtype` is unwrapped and `infer_schema_len` is converted to `usize`; both
/// are then passed straight to the string namespace's `json_extract`.
#[napi(catch_unwind)]
pub fn str_json_extract(
    &self,
    dtype: Option<Wrap<DataType>>,
    infer_schema_len: Option<i64>,
) -> JsExpr {
    // `dtype` is owned, so the wrapper can be consumed without cloning.
    let dt = dtype.map(|d| d.0);
    // NOTE(review): `as` does not reject negative values; they are converted
    // by the cast rather than reported as an error.
    let infer_schema_len = infer_schema_len.map(|l| l as usize);
    self.inner
        .clone()
        .str()
        .json_extract(dt, infer_schema_len)
        .into()
}
#[napi(catch_unwind)]
pub fn str_json_path_match(&self, pat: String) -> JsExpr {
let function = move |s: Series| {
let ca = s.utf8()?;
Expand Down
16 changes: 16 additions & 0 deletions src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,22 @@ impl JsSeries {
Ok(s.into())
}

/// Parses the string values of this series as JSON and returns the result as a new series.
///
/// Returns an error when the series cannot be viewed as Utf8 or when
/// `json_extract` itself fails.
#[napi(catch_unwind)]
pub fn str_json_extract(
    &self,
    dtype: Option<Wrap<DataType>>,
    infer_schema_len: Option<i64>,
) -> napi::Result<JsSeries> {
    let target_dtype = dtype.map(|wrapped| wrapped.0);
    let schema_rows = infer_schema_len.map(|n| n as usize);
    let utf8 = self.series.utf8().map_err(JsPolarsErr::from)?;
    let parsed = utf8
        .json_extract(target_dtype, schema_rows)
        .map_err(JsPolarsErr::from)?;
    Ok(parsed.into_series().into())
}

#[napi(catch_unwind)]
pub fn str_json_path_match(&self, pat: String) -> napi::Result<JsSeries> {
let ca = self.series.utf8().map_err(JsPolarsErr::from)?;
Expand Down

0 comments on commit 2ad0904

Please sign in to comment.