Skip to content

Commit

Permalink
fix(clean): add comma after street suffix or name
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandon Lockhart committed Mar 12, 2021
1 parent 1c03d8b commit e7655db
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 13 deletions.
9 changes: 7 additions & 2 deletions dataprep/clean/clean_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def clean_address(
df: Union[pd.DataFrame, dd.DataFrame],
column: str,
output_format: str = "(building) house_number street_prefix_abbr "
"street_name street_suffix_abbr apartment, city, state_abbr zipcode",
"street_name street_suffix_abbr, apartment, city, state_abbr zipcode",
must_contain: Tuple[str, ...] = ("house_number", "street_name"),
split: bool = False,
inplace: bool = False,
Expand Down Expand Up @@ -66,7 +66,7 @@ def clean_address(
The output_format can contain '\\\\t' characters to specify how to split the output into
columns.
(default: '(building) house_number street_prefix_abbr street_name street_suffix_abbr
(default: '(building) house_number street_prefix_abbr street_name street_suffix_abbr,
apartment, city, state_abbr zipcode')
must_contain
A tuple containing parts of the address that must be included for the address to be
Expand Down Expand Up @@ -299,6 +299,11 @@ def _address_dict_to_string(address: Dict[str, str], output_format: str, split:
if split:
output_format = "\t".join(output_format.split())

# add a comma after the street name if there is no street suffix
# in address_items
if "street_suffix_abbr" not in address_items and not split:
output_format = output_format.replace("street_name", "street_name,")

# first split output_format into each column of the final output
# for each column split it into attributes and add the corresponding
# cleaned part of the address to the output for each attribute
Expand Down
18 changes: 9 additions & 9 deletions dataprep/tests/clean/test_clean_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def test_clean_default(df_addresses: pd.DataFrame) -> None:
df_check = df_addresses.copy()
df_check["messy_address_clean"] = [
"123 Pine Ave.",
"1234 W. Main Hts. 57033",
"1234 W. Main Hts., 57033",
np.nan,
"(Robie House) 789 N. Main St. Manhattan, NY",
"1111 S. Figueroa St. Los Angeles, CA 90015",
"(Staples Center) 1111 S. Figueroa St. Los Angeles",
"(Robie House) 789 N. Main St., Manhattan, NY",
"1111 S. Figueroa St., Los Angeles, CA 90015",
"(Staples Center) 1111 S. Figueroa St., Los Angeles",
np.nan,
np.nan,
np.nan,
Expand Down Expand Up @@ -133,11 +133,11 @@ def test_clean_must_contain(df_addresses: pd.DataFrame) -> None:
df_check["messy_address_clean"] = [
np.nan,
np.nan,
"S. Maple Rd. Apt 1, Manhattan",
"(Robie House) 789 N. Main St. Manhattan, NY",
"1111 S. Figueroa St. Los Angeles, CA 90015",
"(Staples Center) 1111 S. Figueroa St. Los Angeles",
"S. Figueroa Los Angeles",
"S. Maple Rd., Apt 1, Manhattan",
"(Robie House) 789 N. Main St., Manhattan, NY",
"1111 S. Figueroa St., Los Angeles, CA 90015",
"(Staples Center) 1111 S. Figueroa St., Los Angeles",
"S. Figueroa, Los Angeles",
np.nan,
np.nan,
np.nan,
Expand Down
4 changes: 2 additions & 2 deletions docs/source/user_guide/clean/clean_address.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"* state_full: ('California')\n",
"* zipcode: ('57903')\n",
"\n",
"The default `output_format` is \"(building) house_number street_prefix_abbr street_name street_suffix_abbr apartment,\n",
"The default `output_format` is \"(building) house_number street_prefix_abbr street_name street_suffix_abbr, apartment,\n",
" city, state_abbr zipcode\"\n",
" \n",
"The `must_contain` parameter takes a tuple containing parts of the address that must be included for the address to be successfully cleaned, the following keywords are supported.\n",
Expand Down Expand Up @@ -363,7 +363,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
"version": "3.9.1"
}
},
"nbformat": 4,
Expand Down

0 comments on commit e7655db

Please sign in to comment.