Skip to content

Commit

Permalink
feat(clean): support letters in clean_phone
Browse files Browse the repository at this point in the history
  • Loading branch information
atol committed Mar 27, 2021
1 parent fe8f5e7 commit 25d163b
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 11 deletions.
46 changes: 42 additions & 4 deletions dataprep/clean/clean_phone.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,45 @@
[-. (]*
(?P<area>\d{3})?
[-. )\/]*
(?P<office>\d{3})
(?:(?P<office>\d{3})
[-. \/]*
(?P<station>\d{4})
(?P<station>\d{4})|
(?P<letters>[0-9A-Z-. \/]{7,13}?))
(?:[ \t]*(?:\#|x[.:]?|[Ee]xt[.:]?|[Ee]xtension)[ \t]*(?P<ext>\d+))?
\s*$
""",
re.VERBOSE,
)

# Letter -> digit lookup following the telephone keypad layout, used to
# translate vanity numbers (e.g. "JUNK") into their numeric form.
# Built from the per-digit letter groups so each key's letters sit together.
ALPHA_NUM_MAP = {
    letter: digit
    for digit, letters in (
        ("2", "ABC"),
        ("3", "DEF"),
        ("4", "GHI"),
        ("5", "JKL"),
        ("6", "MNO"),
        ("7", "PQRS"),
        ("8", "TUV"),
        ("9", "WXYZ"),
    )
    for letter in letters
}


def clean_phone(
df: Union[pd.DataFrame, dd.DataFrame],
Expand Down Expand Up @@ -265,12 +295,20 @@ def _check_phone(phone: Any, clean: bool) -> Any:
return (None,) * 5 + ("unknown",) if clean else False
if mch.group("country") and not mch.group("area"):
return (None,) * 5 + ("unknown",) if clean else False
if mch.group("letters"):
# Check that there are 7 alphanumeric characters present
letters = re.sub(r"\W+", "", mch.group("letters"))
if len(letters) != 7:
return (None,) * 5 + ("unknown",) if clean else False
# Convert letters to numbers
numlist = [ALPHA_NUM_MAP[char] if char.isalpha() else char for char in letters]
numbers = "".join(numlist)

# Components for phone number
country_code = mch.group("country")
area_code = mch.group("area")
office_code = mch.group("office")
station_code = mch.group("station")
office_code = numbers[:3] if mch.group("letters") else mch.group("office")
station_code = numbers[3:] if mch.group("letters") else mch.group("station")
ext_num = mch.group("ext")

return (
Expand Down
107 changes: 107 additions & 0 deletions dataprep/tests/clean/test_clean_phone.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ def df_phone() -> pd.DataFrame:
"+1 (234) 567-8901 x. 1234",
"2345678901 extension 1234",
"2345678",
"800-299-JUNK",
"1-866-4ZIPCAR",
"1-800-G-O-T-J-U-N-K",
"123 ABC COMPANY",
"+66 91 889 8948",
"hello",
np.nan,
Expand All @@ -49,6 +53,10 @@ def test_clean_default(df_phone: pd.DataFrame) -> None:
"234-567-8901 ext. 1234",
"234-567-8901 ext. 1234",
"234-5678",
"800-299-5865",
"866-494-7227",
"800-468-5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -71,6 +79,10 @@ def test_clean_output_format(df_phone: pd.DataFrame) -> None:
"+12345678901 ext. 1234",
"+12345678901 ext. 1234",
"2345678",
"+18002995865",
"+18664947227",
"+18004685865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -87,6 +99,10 @@ def test_clean_output_format(df_phone: pd.DataFrame) -> None:
"(234) 567-8901 ext. 1234",
"(234) 567-8901 ext. 1234",
"234-5678",
"(800) 299-5865",
"(866) 494-7227",
"(800) 468-5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -110,6 +126,10 @@ def test_clean_split(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
"1",
"1",
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -124,6 +144,10 @@ def test_clean_split(df_phone: pd.DataFrame) -> None:
"234",
"234",
np.nan,
"800",
"866",
"800",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -139,6 +163,10 @@ def test_clean_split(df_phone: pd.DataFrame) -> None:
"567",
"567",
"234",
"299",
"494",
"468",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -154,6 +182,10 @@ def test_clean_split(df_phone: pd.DataFrame) -> None:
"8901",
"8901",
"5678",
"5865",
"7227",
"5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -173,6 +205,10 @@ def test_clean_split(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
]
assert df_check.equals(df_clean)

Expand All @@ -190,6 +226,10 @@ def test_clean_split_fix_missing(df_phone: pd.DataFrame) -> None:
"1",
"1",
np.nan,
"1",
"1",
"1",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -205,6 +245,10 @@ def test_clean_split_fix_missing(df_phone: pd.DataFrame) -> None:
"234",
"234",
np.nan,
"800",
"866",
"800",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -220,6 +264,10 @@ def test_clean_split_fix_missing(df_phone: pd.DataFrame) -> None:
"567",
"567",
"234",
"299",
"494",
"468",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -235,6 +283,10 @@ def test_clean_split_fix_missing(df_phone: pd.DataFrame) -> None:
"8901",
"8901",
"5678",
"5865",
"7227",
"5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -254,6 +306,10 @@ def test_clean_split_fix_missing(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
]
assert df_check.equals(df_clean)

Expand All @@ -272,6 +328,10 @@ def test_clean_inplace(df_phone: pd.DataFrame) -> None:
"234-567-8901 ext. 1234",
"234-567-8901 ext. 1234",
"234-5678",
"800-299-5865",
"866-494-7227",
"800-468-5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -297,6 +357,10 @@ def test_clean_split_inplace(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
"1",
"1",
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -311,6 +375,10 @@ def test_clean_split_inplace(df_phone: pd.DataFrame) -> None:
"234",
"234",
np.nan,
"800",
"866",
"800",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -326,6 +394,10 @@ def test_clean_split_inplace(df_phone: pd.DataFrame) -> None:
"567",
"567",
"234",
"299",
"494",
"468",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -341,6 +413,10 @@ def test_clean_split_inplace(df_phone: pd.DataFrame) -> None:
"8901",
"8901",
"5678",
"5865",
"7227",
"5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -360,6 +436,10 @@ def test_clean_split_inplace(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
],
}
)
Expand All @@ -380,6 +460,10 @@ def test_clean_split_inplace_fix_missing(df_phone: pd.DataFrame) -> None:
"1",
"1",
np.nan,
"1",
"1",
"1",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -395,6 +479,10 @@ def test_clean_split_inplace_fix_missing(df_phone: pd.DataFrame) -> None:
"234",
"234",
np.nan,
"800",
"866",
"800",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -410,6 +498,10 @@ def test_clean_split_inplace_fix_missing(df_phone: pd.DataFrame) -> None:
"567",
"567",
"234",
"299",
"494",
"468",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -425,6 +517,10 @@ def test_clean_split_inplace_fix_missing(df_phone: pd.DataFrame) -> None:
"8901",
"8901",
"5678",
"5865",
"7227",
"5865",
np.nan,
np.nan,
np.nan,
np.nan,
Expand All @@ -444,6 +540,10 @@ def test_clean_split_inplace_fix_missing(df_phone: pd.DataFrame) -> None:
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
],
}
)
Expand All @@ -456,6 +556,9 @@ def test_validate_value() -> None:
assert validate_phone("1 800 234 6789") == True
assert validate_phone("+44 7700 900077") == False
assert validate_phone("555-234-6789 ext 32") == True
assert validate_phone("1-866-4ZIPCAR") == True
assert validate_phone("1-800-G-O-T-J-U-N-K") == True
assert validate_phone("123 ABC COMPANY") == False


def test_validate_series(df_phone: pd.DataFrame) -> None:
Expand All @@ -471,6 +574,10 @@ def test_validate_series(df_phone: pd.DataFrame) -> None:
True,
True,
True,
True,
True,
True,
False,
False,
False,
False,
Expand Down

0 comments on commit 25d163b

Please sign in to comment.