Skip to content

Commit

Permalink
docs(clean): add API reference
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandon Lockhart committed Feb 2, 2021
1 parent 0b3f224 commit 68182f6
Show file tree
Hide file tree
Showing 15 changed files with 449 additions and 174 deletions.
119 changes: 87 additions & 32 deletions dataprep/clean/clean_country.py
@@ -1,9 +1,9 @@
"""
Implement clean_country function
Clean and validate a DataFrame column containing country names.
"""
import os
from functools import lru_cache
from operator import itemgetter
from os import path
from typing import Any, Union

import dask
Expand All @@ -15,7 +15,7 @@
from ..eda.progress_bar import ProgressBar
from .utils import NULL_VALUES, create_report_new, to_dask

COUNTRY_DATA_FILE = os.path.join(os.path.split(os.path.abspath(__file__))[0], "country_data.tsv")
COUNTRY_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "country_data.tsv")

DATA = pd.read_csv(COUNTRY_DATA_FILE, sep="\t", encoding="utf-8", dtype=str)

Expand All @@ -32,47 +32,82 @@ def clean_country(
fuzzy_dist: int = 0,
strict: bool = False,
inplace: bool = False,
report: bool = True,
errors: str = "coerce",
report: bool = True,
progress: bool = True,
) -> pd.DataFrame:
"""
This function cleans countries
Clean and standardize country names.
Read more in the :ref:`User Guide <country_userguide>`.
Parameters
----------
df
pandas or Dask DataFrame
A pandas or Dask DataFrame containing the data to be cleaned.
column
column name containing messy country data
The name of the column containing country names.
input_format
the ISO 3166 input format of the country:
"auto" (infers the input format), country name ("name"),
official state name ("official"), alpha-2 code ("alpha-2"),
alpha-3 code ("alpha-3"), numeric code ("numeric")
The ISO 3166 input format of the country.
- 'auto': infer the input format
- 'name': country name ('United States')
- 'official': official state name ('United States of America')
- 'alpha-2': alpha-2 code ('US')
- 'alpha-3': alpha-3 code ('USA')
- 'numeric': numeric code (840)
(default: 'auto')
output_format
the desired format of the country:
country name ("name"), official state name ("official"), alpha-2 code ("alpha-2"),
alpha-3 code ("alpha-3"), numeric code ("numeric")
The desired ISO 3166 format of the country:
- 'name': country name ('United States')
- 'official': official state name ('United States of America')
- 'alpha-2': alpha-2 code ('US')
- 'alpha-3': alpha-3 code ('USA')
- 'numeric': numeric code (840)
(default: 'name')
fuzzy_dist
The maximum edit distance (number of single character insertions, deletions
or substitutions required to change one word into the other) between a country value
and input that will count as a match. Only applies to "auto", "name" and "official" formats.
and input that will count as a match. Only applies to 'auto', 'name' and 'official'
input formats.
(default: 0)
strict
If True, matching for input formats "name" and "official" are done by looking
for a direct match, if False, matching is done by searching the input for a
regex match
If True, matching for input formats 'name' and 'official' is done by looking
for a direct match. If False, matching is done by searching the input for a
regex match.
(default: False)
inplace
If True, delete the given column with dirty data, else, create a new
column with cleaned data.
If True, delete the column containing the data that was cleaned. Otherwise,
keep the original column.
(default: False)
errors
How to handle parsing errors.
- 'raise': invalid parsing will raise an exception.
- 'coerce': invalid parsing will be set to null.
- 'ignore': invalid parsing will return the input.
(default: 'coerce')
report
If True, output the summary report. Otherwise, no report is outputted.
errors {‘ignore’, ‘raise’, ‘coerce’}, default 'coerce'
* If ‘raise’, then invalid parsing will raise an exception.
* If ‘coerce’, then invalid parsing will be set as NaN.
* If ‘ignore’, then invalid parsing will return the input.
(default: True)
progress
If True, enable the progress bar
If True, display a progress bar.
(default: True)
Examples
--------
>>> df = pd.DataFrame({'country': [' Canada ', 'US']})
>>> clean_country(df, 'country')
country country_clean
0 Canada Canada
1 US United States
"""
# pylint: disable=too-many-arguments

Expand Down Expand Up @@ -134,21 +169,41 @@ def validate_country(
x: Union[str, int, pd.Series], input_format: str = "auto", strict: bool = True
) -> Union[bool, pd.Series]:
"""
This function validates countries
Validate country names.
Read more in the :ref:`User Guide <country_userguide>`.
Parameters
----------
x
pandas Series of countries or str/int country value
input_format
the ISO 3166 input format of the country:
"auto" (infers the input format), country name ("name"),
official state name ("official"), alpha-2 code ("alpha-2"),
alpha-3 code ("alpha-3"), numeric code ("numeric")
The ISO 3166 input format of the country.
- 'auto': infer the input format
- 'name': country name ('United States')
- 'official': official state name ('United States of America')
- 'alpha-2': alpha-2 code ('US')
- 'alpha-3': alpha-3 code ('USA')
- 'numeric': numeric code (840)
(default: 'auto')
strict
If True, matching for input formats "name" and "official" are done by
If True, matching for input formats 'name' and 'official' is done by
looking for a direct match. If False, matching is done by searching
the input for a regex match
the input for a regex match.
(default: True)
Examples
--------
>>> validate_country('United States')
True
>>> df = pd.DataFrame({'country': ['Canada', 'NaN']})
>>> validate_country(df['country'])
0 True
1 False
Name: country, dtype: bool
"""

if isinstance(x, pd.Series):
Expand Down
3 changes: 3 additions & 0 deletions dataprep/clean/clean_email.py
Expand Up @@ -223,6 +223,9 @@ def clean_email(
) -> pd.DataFrame:
"""
This function cleans emails
Read more in the :ref:`User Guide <email_userguide>`.
Parameters
----------
df
Expand Down
105 changes: 68 additions & 37 deletions dataprep/clean/clean_ip.py
@@ -1,5 +1,5 @@
"""
Implement clean_ip functionality
Clean and validate a DataFrame column containing IP addresses.
"""
from ipaddress import ip_address
from operator import itemgetter
Expand All @@ -20,50 +20,66 @@ def clean_ip(
input_format: str = "auto",
output_format: str = "compressed",
inplace: bool = False,
report: bool = True,
errors: str = "coerce",
report: bool = True,
progress: bool = True,
) -> Union[pd.DataFrame, dd.DataFrame]:
"""
This function cleans a column of ip addresses in a Dataframe and formats them
into the desired format
Clean and standardize IP addresses.
Read more in the :ref:`User Guide <ip_userguide>`.
Parameters
----------
df
Pandas or Dask DataFrame
A pandas or Dask DataFrame containing the data to be cleaned.
column
Column name where the ip address are stored
The name of the column containing IP addresses.
input_format
Specify what format the data is in {'ipv4', 'ipv6', 'auto'}, default 'auto',
'ipv4': will only parse ipv4 addresses
'ipv6': will only parse ipv6 addresses
'auto': will parse both ipv4 and ipv6 addresses
The input format of the IP addresses.
- 'auto': parse both ipv4 and ipv6 addresses.
- 'ipv4': only parse ipv4 addresses.
- 'ipv6': only parse ipv6 addresses.
(default: 'auto')
output_format
Desired output format,
{'compressed', 'full', 'binary', 'hexa', 'integer'}, default is 'compressed'
'compressed': provides a compressed version of the ip address,
'full': provides full version of the ip address,
'binary': provides binary representation of the ip address,
'hexa': provides hexadecimal representation of the ip address,
'integer': provides integer representation of the ip address.
The desired output format of the IP addresses.
- 'compressed': compressed representation (12.3.4.5)
- 'full': full representation (0012.0003.0004.0005)
- 'binary': binary representation (00001100000000110000010000000101)
- 'hexa': hexadecimal representation (0xc030405)
- 'integer': integer representation (201524229)
(default: 'compressed')
inplace
If True, deletes the given column with dirty data, else, creates a new
column with cleaned data.
Default value is set to `False`
If True, delete the column containing the data that was cleaned. Otherwise,
keep the original column.
(default: False)
errors
How to handle parsing errors.
- 'raise': invalid parsing will raise an exception.
- 'coerce': invalid parsing will be set to null.
- 'ignore': invalid parsing will return the input.
(default: 'coerce')
report
Displays the cleaning report for ip addresses
Default value is set to `True`
errors {‘ignore’, ‘raise’, ‘coerce’}, default 'coerce'
* If ‘raise’, then invalid parsing will raise an exception.
* If ‘coerce’, then invalid parsing will be set as NaN.
* If ‘ignore’, then invalid parsing will return the input.
If True, output the summary report. Otherwise, no report is outputted.
(default: True)
progress
If True, enable the progress bar
If True, display a progress bar.
Returns
----------
A new Dataframe with the new relavant columns
(default: True)
Examples
--------
>>> df = pd.DataFrame({'ip': ['2001:0db8:85a3:0000:0000:8a2e:0370:7334', '233.5.6.000']})
>>> clean_ip(df, 'ip')
ip ip_clean
0 2001:0db8:85a3:0000:0000:8a2e:0370:7334 2001:db8:85a3::8a2e:370:7334
1 233.5.6.000 233.5.6.0
"""
# pylint: disable=too-many-arguments

Expand Down Expand Up @@ -130,17 +146,32 @@ def clean_ip(

def validate_ip(x: Union[str, pd.Series], input_format: str = "auto") -> Union[bool, pd.Series]:
"""
This function validates ip address, can be a series or a single value
Validate IP addresses.
Read more in the :ref:`User Guide <ip_userguide>`.
Parameters
----------
x
pandas Series of ip addresses or an ip address value
pandas Series of IP addresses or a str IP address value
input_format
Specify what format the data is in {'ipv4', 'ipv6', 'auto'}, default 'auto',
'ipv4': will only parse ipv4 addresses
'ipv6': will only parse ipv6 addresses
'auto': will parse both ipv4 and ipv6 addresses
The IP address format to validate.
- 'auto': validate both ipv4 and ipv6 addresses.
- 'ipv4': only validate ipv4 addresses.
- 'ipv6': only validate ipv6 addresses.
(default: 'auto')
Examples
--------
>>> validate_ip('fdf8:f53b:82e4::53')
True
>>> df = pd.DataFrame({'ip': ['fdf8:f53b:82e4::53', None]})
>>> validate_ip(df['ip'])
0 True
1 False
Name: ip, dtype: bool
"""
if isinstance(x, pd.Series):
return x.apply(_check_ip, args=(input_format, False))
Expand Down Expand Up @@ -190,7 +221,7 @@ def _format_ip(val: Any, input_format: str, output_format: str, errors: str) ->
# convert to full representation
else:
dlm = "." if address.version == 4 else ":" # delimiter
result = "".join(f"{'0' * (4 - len(x))}{x}{dlm}" for x in address.exploded.split(dlm))[:-1]
result = f"{dlm}".join(f"{'0' * (4 - len(x))}{x}" for x in address.exploded.split(dlm))

return result, 2 if result != val else 3

Expand Down

0 comments on commit 68182f6

Please sign in to comment.