In [None]:
import re

class Scrubber:
    """Replace common kinds of personally identifiable information in text.

    Each public method substitutes every match of one pattern with a fixed
    ``[SCRUBBED_*]`` placeholder and returns the new string; ``all_pii``
    chains all of them.  The patterns are compiled once as class-level
    constants and remain reachable as ``self.<NAME>_REGEX``.
    """

    # Borrowed from Gattaca which borrowed from textacy package, thanks!
    EMAIL_REGEX = re.compile(
        r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
        flags=re.IGNORECASE | re.UNICODE,
    )

    # Revised from the earlier textacy-style pattern, which captured credit
    # card numbers as phone numbers.  Accepts an optional "+<digit>" prefix,
    # a plain or parenthesised 3-digit area code, optional space/hyphen
    # separators, and an optional extension ("ext", "ext.", "#", "x" or "-").
    PHONE_REGEX = re.compile(
        r"((?:(?:(?:\+\d)?\d\d\d[\s-]?\d\d\d[\s-]?\d\d\d\d)|(?:(?:\+\d)?\(\d\d\d\)[\s-]?\d\d\d[\s-]?\d\d\d\d))(?:(?:ext\.?|[#x-])\s?\d{2,6})?)"
    )

    # Borrowed from scrubadub, thanks!
    # Raw string: "\S" is not a valid *string* escape, so the non-raw form
    # triggers an invalid-escape warning on modern Python.  "[\S ]" accepts
    # any single non-whitespace character or a space as the separator.
    SSN_REGEX = re.compile(
        r"[0-9][0-9][0-9][\S ][0-9][0-9][\S ][0-9][0-9][0-9][0-9]", re.VERBOSE
    )

    # Credit card numbers: four groups of four digits separated by a space
    # or a hyphen.
    # Captures: 1-the whole card number (the inner group is non-capturing)
    CREDIT_REGEX = re.compile(r"((?:\d{4}[ -]){3}\d{4})")

    # Latitude/longitude pairs, degree symbol not used.
    # First alternative: three digit runs, each followed by one or two
    # arbitrary characters (the "." is deliberately left as a wildcard
    # separator), ending in a hemisphere letter.
    # Second alternative: plain decimal degrees; optional whitespace is
    # allowed between the two values, matching the first alternative.
    # Captures: 1-Latitude 2-Longitude (first form)
    #           3-Latitude 4-Longitude (decimal form)
    LATLONG_REGEX = re.compile(
        r"(\d+.{1,2}\d+.{1,2}\d+.{1,2}[NS])\s*(\d+.{1,2}\d+.{1,2}\d+.{1,2}[EW])"
        r"|(\d+\.\d+[NS])\s*(\d+\.\d+[EW])"
    )

    # Addresses in one fairly specific layout; it will only catch posts that
    # happen to follow it.
    # Captures: 1-Street address, 2-city, 3-state, 4-zip code
    # Accepted format:  Street address followed by comma or new line
    #                   City name (comma, space) State (space) Zip code
    # NOTE(review): inside a character class "|" is a literal pipe, so
    # "[,|\n]" also accepts "|" as the separator.  Probably unintended, but
    # kept as-is so existing matches do not change.
    ADDRESS_REGEX = re.compile(
        r"([\w ]*)[,|\n]([a-zA-Z ]+), ([a-zA-Z]+) (\d+\-?\d*)"
    )

    def email(self, text):
        """Return *text* with each email address replaced by ``[SCRUBBED_EMAIL]``."""
        return self.EMAIL_REGEX.sub('[SCRUBBED_EMAIL]', text)

    def phone(self, text):
        """Return *text* with each phone number replaced by ``[SCRUBBED_PHONE]``."""
        return self.PHONE_REGEX.sub('[SCRUBBED_PHONE]', text)

    def ssn(self, text):
        """Return *text* with each SSN-shaped number replaced by ``[SCRUBBED_SSN]``."""
        return self.SSN_REGEX.sub('[SCRUBBED_SSN]', text)

    def latlong(self, text):
        """Return *text* with each lat/long pair replaced by ``[SCRUBBED_LATLONG]``."""
        return self.LATLONG_REGEX.sub('[SCRUBBED_LATLONG]', text)

    def credit(self, text):
        """Return *text* with each card number replaced by ``[SCRUBBED_CREDIT]``."""
        return self.CREDIT_REGEX.sub('[SCRUBBED_CREDIT]', text)

    def address(self, text):
        """Return *text* with each matched address replaced by ``[SCRUBBED_ADDRESS]``."""
        return self.ADDRESS_REGEX.sub('[SCRUBBED_ADDRESS]', text)

    def all_pii(self, text):
        """Apply every scrubber to *text* and return the result.

        The scrubbers run in a fixed order (email, phone, SSN, credit card,
        lat/long, address); each one sees the output of the previous one.
        """
        pipeline = (
            self.email,
            self.phone,
            self.ssn,
            self.credit,
            self.latlong,
            self.address,
        )
        for scrub_step in pipeline:
            text = scrub_step(text)
        return text

In [1]:
# Example: scrub a sentence containing a phone number, a credit card number,
# and an SSN-like number written with unusual separators ("=" and a space).
# The cell that defines Scrubber must be run first; otherwise this cell fails
# with "NameError: name 'Scrubber' is not defined".
text = 'Hi this is Sam at 800-213-9888. my credit card is very truly 5000-4400-9897-9999 and my social security number is 000=22 2222. Thx'
scrub = Scrubber()
# Left as a bare last expression so that the scrubbed string is the cell's value.
scrub.all_pii(text)

NameError: name 'Scrubber' is not defined