Skip to content

Commit

Permalink
feat(clean): support conversion into packed binary format in clean_ip
Browse files Browse the repository at this point in the history
  • Loading branch information
NoirTree committed May 26, 2021
1 parent c735cd9 commit 37a83b0
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 5 deletions.
10 changes: 8 additions & 2 deletions dataprep/clean/clean_ip.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def clean_ip(
- 'binary': binary representation ('00001100000000110000010000000101')
- 'hexa': hexadecimal representation ('0xc030405')
- 'integer': integer representation (201524229)
- 'packed': packed binary representation (big-endian, a bytes object)
(default: 'compressed')
inplace
Expand Down Expand Up @@ -98,10 +99,10 @@ def clean_ip(
f'input_format {input_format} is invalid, it needs to be "ipv4", "ipv6" or "auto"'
)

if output_format not in {"compressed", "full", "binary", "hexa", "integer"}:
if output_format not in {"compressed", "full", "binary", "hexa", "integer", "packed"}:
raise ValueError(
f'output_format {output_format} is invalid, it needs to be "compressed", "full", '
'"binary", "hexa" or "integer"'
'"binary", "hexa", "integer" or "packed"'
)

if not isinstance(inplace, bool):
Expand Down Expand Up @@ -192,6 +193,7 @@ def _format_ip(val: Any, input_format: str, output_format: str, errors: str) ->
2 := the value is cleaned and the cleaned value is DIFFERENT than the input value
3 := the value is cleaned and is THE SAME as the input value (no transformation)
"""
# pylint: disable=too-many-branches
address, status = _check_ip(val, input_format, True)

if status == "null":
Expand Down Expand Up @@ -221,6 +223,10 @@ def _format_ip(val: Any, input_format: str, output_format: str, errors: str) ->
elif output_format == "integer":
result = int(address)

# convert to packed binary format (big-endian)
elif output_format == "packed":
result = address.packed

# convert to full representation
else:
dlm = "." if address.version == 4 else ":" # delimiter
Expand Down
15 changes: 15 additions & 0 deletions dataprep/tests/clean/test_clean_ip.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,21 @@ def test_clean_output_binary(df_ips: pd.DataFrame) -> None:
assert df_check.equals(df_clean)


def test_clean_output_packed(df_ips: pd.DataFrame) -> None:
    """Verify that ``clean_ip`` with ``output_format="packed"`` matches the expected column."""
    # Expected ``messy_ip_clean`` values, one per fixture row: ``bytes``
    # objects of 16 or 4 bytes, with ``np.nan`` in the fourth and fifth rows.
    expected_packed = [
        b" \x01\r\xb8\x85\xa3\x00\x00\x00\x00\x8a.\x03ps4",
        b"\x0c\x03\x04\x05",
        b"\xe9\x05\x06\x00",
        np.nan,
        np.nan,
        b"\xb1\xc3\x94t",
        b"\xfd\xf8\xf5;\x82\xe4\x00\x00\x00\x00\x00\x00\x00\x00\x00S",
    ]

    # Run the cleaner first, then derive the reference frame from the fixture.
    actual = clean_ip(df_ips, column="messy_ip", output_format="packed")
    expected = df_ips.copy()
    expected["messy_ip_clean"] = expected_packed

    assert expected.equals(actual)


def test_validate_value() -> None:
assert validate_ip("2001:0db8:85a3:0000:0000:8a2e:0370:7334") == True
assert validate_ip("") == False
Expand Down
23 changes: 20 additions & 3 deletions docs/source/user_guide/clean/clean_ip.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
"* `full`: provides full version of the ip address,\n",
"* `binary`: provides binary representation of the ip address,\n",
"* `hexa`: provides hexadecimal representation of the ip address,\n",
"* `integer`: provides integer representation of the ip address.\n",
"* `integer`: provides integer representation of the ip address,\n",
"* `packed`: provides packed binary representation of the ip address.\n",
"\n",
"The default output format is `compressed`.\n",
"\n",
Expand Down Expand Up @@ -69,7 +70,7 @@
"df = pd.DataFrame({\n",
" \"ips\": [\n",
" \"00.000.0.0\", \"455.0.0.0\", None, 876234, {}, \"00.12.021.255\",\n",
" \"684D:1111:222:3333:4444:5555:6:77\"\n",
" \"684D:1111:222:3333:4444:5555:6:77\", b'\\xc9\\xdb\\x10\\x00'\n",
" ]\n",
"})\n",
"df"
Expand Down Expand Up @@ -264,6 +265,22 @@
"clean_ip(df, \"ips\", output_format=\"integer\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `packed`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_ip(df, \"ips\", output_format=\"packed\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -367,7 +384,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.7.4"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 37a83b0

Please sign in to comment.