Skip to content

Commit

Permalink
feat: encoding and decoding of strings (#2236)
Browse files Browse the repository at this point in the history
  • Loading branch information
universalmind303 committed Jan 1, 2022
1 parent abfd444 commit 6bd5016
Show file tree
Hide file tree
Showing 23 changed files with 790 additions and 29 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ polars/vendor
AUTO_CHANGELOG.md
node_modules/
.coverage
venv/
venv/
*.iml
1 change: 1 addition & 0 deletions nodejs-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ features = [
"repeat_by",
"horizontal_concat",
"dataframe_arithmetic",
"string_encoding",
]
path = "../polars"

Expand Down
152 changes: 152 additions & 0 deletions nodejs-polars/__tests__/expr.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1453,6 +1453,158 @@ describe("expr.str", () => {
expect(actual).toFrameEqual(expected);
expect(seriesActual).toFrameEqual(expected);
});
// Hex-encoding must yield the same frame through the Series API and the Expr API.
test("hex encode", () => {
  const source = pl.DataFrame({"original": ["foo", "bar", null]});
  const want = pl.DataFrame({"encoded": ["666f6f", "626172", null]});

  // Series path: encode the column directly, then lift it back into a frame.
  const originalColumn = source.getColumn("original");
  const fromSeries = originalColumn.str.encode("hex").rename("encoded").toFrame();

  // Expression path: the same operation expressed as a select.
  const encodeExpr = col("original").str.encode("hex").as("encoded");
  const fromExpr = source.select(encodeExpr);

  expect(fromExpr).toFrameEqual(want);
  expect(fromSeries).toFrameEqual(want);
});
// Hex-decoding must yield the same frame through the Series API and the Expr API.
test("hex decode", () => {
  const source = pl.DataFrame({"encoded": ["666f6f", "626172", null]});
  const want = pl.DataFrame({"decoded": ["foo", "bar", null]});

  // Series path: decode the column directly, then lift it back into a frame.
  const encodedColumn = source.getColumn("encoded");
  const fromSeries = encodedColumn.str.decode("hex").rename("decoded").toFrame();

  // Expression path: the same operation expressed as a select.
  const decodeExpr = col("encoded").str.decode("hex").as("decoded");
  const fromExpr = source.select(decodeExpr);

  expect(fromExpr).toFrameEqual(want);
  expect(fromSeries).toFrameEqual(want);
});
// With `strict: true`, a value that is not valid hex must make the select throw.
test("hex decode strict", () => {
  const source = pl.DataFrame({"encoded": ["666f6f", "626172", "not a hex", null]});

  // The expression is built inside the callback so that any error, whether
  // raised while building or while executing, is observed by `toThrow`.
  expect(() => {
    const strictDecode = col("encoded").str.decode({encoding: "hex", strict: true});

    return source.select(strictDecode.as("decoded"));
  }).toThrow();
});

// Base64-encoding must yield the same frame through the Series API and the Expr API.
test("encode base64", () => {
  const source = pl.DataFrame({"original": ["foo", "bar", null]});
  const want = pl.DataFrame({"encoded": ["Zm9v", "YmFy", null]});

  // Series path: encode the column directly, then lift it back into a frame.
  const originalColumn = source.getColumn("original");
  const fromSeries = originalColumn.str.encode("base64").rename("encoded").toFrame();

  // Expression path: the same operation expressed as a select.
  const encodeExpr = col("original").str.encode("base64").as("encoded");
  const fromExpr = source.select(encodeExpr);

  expect(fromExpr).toFrameEqual(want);
  expect(fromSeries).toFrameEqual(want);
});
// Base64-decoding must yield the same frame through the Series API and the
// Expr API. Declared with plain `test` so the other tests in this file still run.
test("base64 decode", () => {
  const df = pl.DataFrame({
    "encoded": [
      "Zm9v",
      "YmFy",
      null
    ]
  });
  const expected = pl.DataFrame({
    "decoded": [
      "foo",
      "bar",
      null
    ]
  });
  // Series path: relies on the default (non-strict) behaviour.
  const seriesActual = df.getColumn("encoded")
    .str
    .decode("base64")
    .rename("decoded")
    .toFrame();
  // Expression path: passes `strict = false` explicitly via the positional form.
  const actual = df.select(
    col("encoded")
      .str
      .decode("base64", false)
      .as("decoded")
  );
  expect(actual).toFrameEqual(expected);
  expect(seriesActual).toFrameEqual(expected);
});
// With `strict: true`, a value that is not valid base64 must make the select throw.
test("base64 decode strict", () => {
  const source = pl.DataFrame({"encoded": ["not a base64"]});

  // The expression is built inside the callback so that any error, whether
  // raised while building or while executing, is observed by `toThrow`.
  expect(() => {
    const strictDecode = col("encoded").str.decode({encoding: "base64", strict: true});

    return source.select(strictDecode.as("decoded"));
  }).toThrow();
});
});
describe("expr.lst", () => {
test("get", () => {
Expand Down
40 changes: 40 additions & 0 deletions nodejs-polars/__tests__/series.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -821,4 +821,44 @@ describe("StringFunctions", () => {

expect(expected).toStrictEqual(actual);
});

// Series-level hex encoding; the null entry is expected to stay null.
test("hex encode", () => {
  const input = pl.Series("strings", ["foo", "bar", null]);
  const want = pl.Series("encoded", ["666f6f", "626172", null]);

  expect(
    input.str.encode("hex").alias("encoded")
  ).toSeriesEqual(want);
});
// Non-strict hex decoding: the undecodable entry is expected to become null.
test("hex decode", () => {
  const input = pl.Series("encoded", ["666f6f", "626172", "invalid", null]);
  const want = pl.Series("decoded", ["foo", "bar", null, null]);

  expect(
    input.str.decode("hex").alias("decoded")
  ).toSeriesEqual(want);
});
// Strict hex decoding must throw on the invalid entry, for both call forms.
test("hex decode strict", () => {
  const input = pl.Series("encoded", ["666f6f", "626172", "invalid", null]);
  const attempts = [
    // positional form
    () => input.str.decode("hex", true).alias("decoded"),
    // options-object form
    () => input.str.decode({encoding: "hex", strict: true}).alias("decoded"),
  ];

  for (const attempt of attempts) {
    expect(attempt).toThrow();
  }
});
// Series-level base64 encoding of plain strings.
test("encode base64", () => {
  const input = pl.Series("strings", ["foo", "bar"]);
  const want = pl.Series("encoded", ["Zm9v", "YmFy"]);

  expect(
    input.str.encode("base64").alias("encoded")
  ).toSeriesEqual(want);
});
// Strict base64 decoding must throw on the invalid entry, for both call forms.
test("base64 decode strict", () => {
  const input = pl.Series("encoded", ["Zm9v", "YmFy", "not base64 encoded", null]);
  const attempts = [
    // positional form
    () => input.str.decode("base64", true).alias("decoded"),
    // options-object form
    () => input.str.decode({encoding: "base64", strict: true}).alias("decoded"),
  ];

  for (const attempt of attempts) {
    expect(attempt).toThrow();
  }
});
// Non-strict base64 decoding: the undecodable entry is expected to become null.
test("base64 decode", () => {
  const input = pl.Series("encoded", ["Zm9v", "YmFy", "invalid", null]);
  const want = pl.Series("decoded", ["foo", "bar", null, null]);

  expect(
    input.str.decode("base64").alias("decoded")
  ).toSeriesEqual(want);
});
});
76 changes: 76 additions & 0 deletions nodejs-polars/polars/lazy/expr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,55 @@ export interface ExprStringFunctions {
concat(delimiter: string): Expr;
/** Check if strings in Series contain regex pattern. */
contains(pat: string | RegExp): Expr;
/**
 * Decodes a value using the provided encoding.
 *
 * Two call forms are accepted: positional (`decode("hex", true)`) and an
 * options object (`decode({encoding: "hex", strict: true})`).
 * @param encoding - hex | base64
 * @param strict - how to handle invalid inputs (the positional form defaults to `false`)
 *
 *     - true: method will throw error if unable to decode a value
 *     - false: unhandled values will be replaced with `null`
 * @throws RangeError when `encoding` is neither "hex" nor "base64"
 * @example
 * ```
 * >>> df = pl.DataFrame({"strings": ["666f6f", "626172", null]})
 * >>> df.select(col("strings").str.decode("hex"))
 * shape: (3, 1)
 * ┌─────────┐
 * │ strings │
 * │ ---     │
 * │ str     │
 * ╞═════════╡
 * │ foo     │
 * ├╌╌╌╌╌╌╌╌╌┤
 * │ bar     │
 * ├╌╌╌╌╌╌╌╌╌┤
 * │ null    │
 * └─────────┘
 * ```
 */
decode(encoding: "hex" | "base64", strict?: boolean): Expr
decode(options: {encoding: "hex" | "base64", strict?: boolean}): Expr
/**
 * Encodes a value using the provided encoding.
 * @param encoding - hex | base64
 * @throws RangeError when `encoding` is neither "hex" nor "base64"
 * @example
 * ```
 * >>> df = pl.DataFrame({"strings": ["foo", "bar", null]})
 * >>> df.select(col("strings").str.encode("hex"))
 * shape: (3, 1)
 * ┌─────────┐
 * │ strings │
 * │ ---     │
 * │ str     │
 * ╞═════════╡
 * │ 666f6f  │
 * ├╌╌╌╌╌╌╌╌╌┤
 * │ 626172  │
 * ├╌╌╌╌╌╌╌╌╌┤
 * │ null    │
 * └─────────┘
 * ```
 */
encode(encoding: "hex" | "base64"): Expr
/**
* Extract the target capture group from provided patterns.
* @param pattern A valid regex pattern
Expand Down Expand Up @@ -948,6 +997,16 @@ const ExprStringFunctions = (_expr: JsExpr): ExprStringFunctions => {

return Expr(pli.expr.str[method]({_expr, ...args }));
};
/**
 * Routes a decode request to the `wrap` method name matching `encoding`,
 * forwarding `strict` unchanged.
 * Throws a RangeError for any encoding other than "hex" or "base64".
 */
const handleDecode = (encoding, strict) => {
  if (encoding === "hex") {
    return wrap("decodeHex", {strict});
  }
  if (encoding === "base64") {
    return wrap("decodeBase64", {strict});
  }

  throw new RangeError("supported encodings are 'hex' and 'base64'");
};

return {
concat(delimiter: string) {
Expand All @@ -956,6 +1015,23 @@ const ExprStringFunctions = (_expr: JsExpr): ExprStringFunctions => {
contains(pat: string | RegExp) {
return wrap("contains", {pat: regexToString(pat)});
},
/**
 * Decodes each value using the requested encoding.
 *
 * Accepts either positional arguments (`decode("hex", true)`) or an options
 * object (`decode({encoding: "hex", strict: true})`). In both forms an
 * omitted `strict` resolves to `false`, so `handleDecode` is never handed
 * `undefined` for it.
 *
 * Throws a RangeError (raised by `handleDecode`) for an unsupported encoding.
 */
decode(arg, strict=false) {
  if (typeof arg === "string") {
    return handleDecode(arg, strict);
  }

  // Options form: fall back to the positional default when `strict` is omitted.
  const {encoding, strict: strictOption = strict} = arg;

  return handleDecode(encoding, strictOption);
},
/**
 * Encodes each value using the requested encoding by routing to the matching
 * `wrap` method name.
 * Throws a RangeError for any encoding other than "hex" or "base64".
 */
encode(encoding) {
  if (encoding === "hex") {
    return wrap("encodeHex");
  }
  if (encoding === "base64") {
    return wrap("encodeBase64");
  }

  throw new RangeError("supported encodings are 'hex' and 'base64'");
},
extract(pat: string | RegExp, groupIndex: number) {
return wrap("extract", {pat: regexToString(pat), groupIndex});
},
Expand Down

0 comments on commit 6bd5016

Please sign in to comment.