In [3]:
import pandas as pd
import random

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Larger pools of names and domains
first_names = [
    "John", "Jane", "Michael", "Sarah", "Chris", "Emily", "David", "Emma",
    "Daniel", "Sophia", "James", "Olivia", "Robert", "Isabella", "Thomas",
    "Charlotte", "Andrew", "Mia", "Joseph", "Amelia", "Ethan", "Harper"
]
last_names = [
    "Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee", "Clark", "Lewis",
    "Martinez", "Garcia", "Rodriguez", "Walker", "Hall", "Allen", "Young",
    "King", "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker"
]
domains = [
    "example.com", "mail.com", "test.com", "sample.com", "dummy.net",
    "placeholder.org", "fakemail.com", "testdomain.com", "mydomain.net", "demoemail.com"
]

# Masking functions
def mask_street(value):
    """Replace the street address with a random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"{random.randint(100, 999)} {random.choice(['Maple', 'Oak', 'Pine', 'Elm', 'Cedar'])} St"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    first = random.choice(first_names).lower()
    last = random.choice(last_names).lower()
    domain = random.choice(domains)
    return f"{first}.{last}@{domain}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return random.choice(name_list)

def mask_random_phone_number(value):
    """Generate a randomized phone number."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"XXX-XXX-{random.randint(1000, 9999)}"

def mask_full_random_phone(value):
    """Generate a completely random number with a random number of digits."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return ''.join([str(random.randint(0, 9)) for _ in range(random.randint(10, 15))])

def combine_name(first_name, last_name):
    """Combine FIRSTNAME and LASTNAME into NAME."""
    if pd.isnull(first_name) and pd.isnull(last_name):
        return None
    first_name = first_name if not pd.isnull(first_name) else ""
    last_name = last_name if not pd.isnull(last_name) else ""
    return f"{first_name.strip()} {last_name.strip()}".strip()

# Load the Excel file
try:
    df = pd.read_excel(input_file)

    # Apply masking to BILLING_STREET__PC if not null
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = df["BILLING_STREET__PC"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Apply masking to EMAIL__C if not null
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to FIRSTNAME and LASTNAME if not null
    if "FIRSTNAME" in df.columns:
        df["FIRSTNAME"] = df["FIRSTNAME"].apply(lambda x: mask_name(x, first_names) if not pd.isnull(x) else x)
    if "LASTNAME" in df.columns:
        df["LASTNAME"] = df["LASTNAME"].apply(lambda x: mask_name(x, last_names) if not pd.isnull(x) else x)

    # Combine FIRSTNAME and LASTNAME into NAME
    if "FIRSTNAME" in df.columns and "LASTNAME" in df.columns and "NAME" in df.columns:
        df["NAME"] = df.apply(lambda row: combine_name(row["FIRSTNAME"], row["LASTNAME"]), axis=1)

    # Apply masking to PERSONMOBILEPHONE with random digits
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(lambda x: mask_full_random_phone(x) if not pd.isnull(x) else x)

    # Apply masking to PERSONEMAIL with dummy emails
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to PHONE with random phone numbers
    if "PHONE" in df.columns:
        df["PHONE"] = df["PHONE"].apply(lambda x: mask_random_phone_number(x) if not pd.isnull(x) else x)

    # Apply masking to SHIPPINGSTREET if not null
    if "SHIPPINGSTREET" in df.columns:
        df["SHIPPINGSTREET"] = df["SHIPPINGSTREET"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    print(f"Error processing file: {e}")


Error processing file: 'bool' object has no attribute 'strip'


In [7]:
import pandas as pd
import random
import re

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Masking functions
def generate_random_phone(format_type=None):
    """Generate a random phone number in different formats."""
    random_digits = ''.join([str(random.randint(0, 9)) for _ in range(10)])
    
    # Randomly choose a format if none specified
    if format_type is None:
        format_type = random.choice(["plain", "dotted", "dashed", "parentheses"])
    
    if format_type == "plain":
        return f"{random_digits}"  # e.g., 8135058322
    elif format_type == "dotted":
        return f"{random_digits[:3]}.{random_digits[3:6]}.{random_digits[6:]}"  # e.g., 407.222.7038
    elif format_type == "dashed":
        return f"{random_digits[:3]}-{random_digits[3:6]}-{random_digits[6:]}"  # e.g., 901-485-7928
    elif format_type == "parentheses":
        return f"({random_digits[:3]}) {random_digits[3:6]}-{random_digits[6:]}"  # e.g., (330) 327-3890
    return random_digits  # Fallback

def mask_phone_number(value):
    """Mask phone numbers by generating random numbers in the same format."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    
    # Extract only digits
    digits = re.sub(r'\D', '', value)  # Remove all non-numeric characters
    if len(digits) < 10:  # If it's not a valid phone number, skip it
        return value

    # Generate random digits
    random_digits = ''.join([str(random.randint(0, 9)) for _ in range(len(digits))])

    # Preserve the original format of the phone number
    if re.match(r'^\d{10}$', value):  # Plain format e.g., 8135058322
        return random_digits
    elif re.match(r'^\d{3}\.\d{3}\.\d{4}$', value):  # Dotted format e.g., 407.222.7038
        return f"{random_digits[:3]}.{random_digits[3:6]}.{random_digits[6:]}"
    elif re.match(r'^\d{3}-\d{3}-\d{4}$', value):  # Dashed format e.g., 901-485-7928
        return f"{random_digits[:3]}-{random_digits[3:6]}-{random_digits[6:]}"
    elif re.match(r'^\(\d{3}\) \d{3}-\d{4}$', value):  # Parentheses format e.g., (330) 327-3890
        return f"({random_digits[:3]}) {random_digits[3:6]}-{random_digits[6:]}"
    else:
        # If the format is unrecognized, generate a random format
        return generate_random_phone()

def mask_street(value):
    """Replace the street address with a random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"{random.randint(100, 999)} {random.choice(['Maple', 'Oak', 'Pine', 'Elm', 'Cedar'])} St"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return random.choice(name_list)

def combine_name(first_name, last_name):
    """Combine FIRSTNAME and LASTNAME into NAME."""
    if pd.isnull(first_name) and pd.isnull(last_name):
        return None
    first_name = str(first_name).strip() if not pd.isnull(first_name) else ""
    last_name = str(last_name).strip() if not pd.isnull(last_name) else ""
    return f"{first_name} {last_name}".strip()

# Load the Excel file
try:
    df = pd.read_excel(input_file)

    # Apply masking to BILLING_STREET__PC if not null
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = df["BILLING_STREET__PC"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Apply masking to EMAIL__C if not null
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to FIRSTNAME and LASTNAME if not null
    if "FIRSTNAME" in df.columns:
        first_names = ["John", "Jane", "Michael", "Sarah", "Chris", "Emily"]
        df["FIRSTNAME"] = df["FIRSTNAME"].apply(lambda x: mask_name(x, first_names) if not pd.isnull(x) else x)
    if "LASTNAME" in df.columns:
        last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee"]
        df["LASTNAME"] = df["LASTNAME"].apply(lambda x: mask_name(x, last_names) if not pd.isnull(x) else x)

    # Combine FIRSTNAME and LASTNAME into NAME
    if "FIRSTNAME" in df.columns and "LASTNAME" in df.columns and "NAME" in df.columns:
        df["NAME"] = df.apply(lambda row: combine_name(row["FIRSTNAME"], row["LASTNAME"]), axis=1)

    # Apply masking to PERSONMOBILEPHONE with digit replacement in random format
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(lambda x: mask_phone_number(x) if not pd.isnull(x) else x)

    # PHONE remains unchanged

    # Apply masking to PERSONEMAIL with dummy emails
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to SHIPPINGSTREET if not null
    if "SHIPPINGSTREET" in df.columns:
        df["SHIPPINGSTREET"] = df["SHIPPINGSTREET"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Save the modified file to


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx


In [11]:
import pandas as pd
import random
import re

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked_2.xlsx"

# Masking functions
def mask_phone_number(value):
    """Mask phone numbers by generating random numbers and preserving the original format."""
    if pd.isnull(value) or not isinstance(value, str):
        return value

    # Extract digits from the phone number
    digits = re.sub(r'\D', '', value)  # Remove all non-digit characters
    if len(digits) < 10:  # If it's not a valid phone number, skip it
        return value

    # Generate random digits
    random_digits = iter(''.join([str(random.randint(0, 9)) for _ in range(len(digits))]))

    # Replace each digit in the original format with a random digit
    masked_number = re.sub(r'\d', lambda _: next(random_digits), value)
    return masked_number

def mask_street(value):
    """Replace the street address with a random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"{random.randint(100, 999)} {random.choice(['Maple', 'Oak', 'Pine', 'Elm', 'Cedar'])} St"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return random.choice(name_list)

def combine_name(first_name, last_name):
    """Combine FIRSTNAME and LASTNAME into NAME."""
    if pd.isnull(first_name) and pd.isnull(last_name):
        return None
    first_name = str(first_name).strip() if not pd.isnull(first_name) else ""
    last_name = str(last_name).strip() if not pd.isnull(last_name) else ""
    return f"{first_name} {last_name}".strip()

# Load the Excel file
try:
    df = pd.read_excel(input_file)

    # Apply masking to BILLING_STREET__PC if not null
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = df["BILLING_STREET__PC"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Apply masking to EMAIL__C if not null
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to FIRSTNAME and LASTNAME if not null
    if "FIRSTNAME" in df.columns:
        first_names = ["John", "Jane", "Michael", "Sarah", "Chris", "Emily"]
        df["FIRSTNAME"] = df["FIRSTNAME"].apply(lambda x: mask_name(x, first_names) if not pd.isnull(x) else x)
    if "LASTNAME" in df.columns:
        last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee"]
        df["LASTNAME"] = df["LASTNAME"].apply(lambda x: mask_name(x, last_names) if not pd.isnull(x) else x)

    # Combine FIRSTNAME and LASTNAME into NAME
    if "FIRSTNAME" in df.columns and "LASTNAME" in df.columns and "NAME" in df.columns:
        df["NAME"] = df.apply(lambda row: combine_name(row["FIRSTNAME"], row["LASTNAME"]), axis=1)

    # Apply masking to PERSONMOBILEPHONE with digit replacement in random format
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(lambda x: mask_phone_number(x) if not pd.isnull(x) else x)
    

    # Apply masking to PHONE with digit replacement in random format
    if "PHONE" in df.columns:
        df["PHONE"] = df["PHONE"].apply(lambda x: mask_phone_number(x) if not pd.isnull(x) else x)

    # Apply masking to PERSONEMAIL with dummy emails
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to SHIPPINGSTREET if not null
    if "SHIPPINGSTREET" in df.columns:
        df["SHIPPINGSTREET"] = df["SHIPPINGSTREET"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked_2.xlsx


In [13]:
import pandas as pd
import random
import re

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Masking functions
def mask_phone_number(value):
    """Mask phone numbers by generating random numbers and preserving the original format."""
    if pd.isnull(value) or not isinstance(value, str):
        return value.strip() if isinstance(value, str) else value

    # Clean up leading/trailing spaces
    value = value.strip()

    # Extract digits from the phone number
    digits = re.sub(r'\D', '', value)  # Remove all non-digit characters
    if len(digits) < 10:  # If it's not a valid phone number, skip it
        return value

    # Generate random digits
    random_digits = iter(''.join([str(random.randint(0, 9)) for _ in range(len(digits))]))

    # Replace each digit in the original format with a random digit
    masked_number = re.sub(r'\d', lambda _: next(random_digits), value)
    return masked_number

def mask_street(value):
    """Replace the street address with a random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"{random.randint(100, 999)} {random.choice(['Maple', 'Oak', 'Pine', 'Elm', 'Cedar'])} St"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return random.choice(name_list)

def combine_name(first_name, last_name):
    """Combine FIRSTNAME and LASTNAME into NAME."""
    if pd.isnull(first_name) and pd.isnull(last_name):
        return None
    first_name = str(first_name).strip() if not pd.isnull(first_name) else ""
    last_name = str(last_name).strip() if not pd.isnull(last_name) else ""
    return f"{first_name} {last_name}".strip()

# Load the Excel file
try:
    # Read the data and clean up column names
    df = pd.read_excel(input_file)
    df.columns = [col.strip() for col in df.columns]  # Clean column names

    # Apply masking to BILLING_STREET__PC if not null
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = df["BILLING_STREET__PC"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Apply masking to EMAIL__C if not null
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to FIRSTNAME and LASTNAME if not null
    if "FIRSTNAME" in df.columns:
        first_names = ["John", "Jane", "Michael", "Sarah", "Chris", "Emily"]
        df["FIRSTNAME"] = df["FIRSTNAME"].apply(lambda x: mask_name(x, first_names) if not pd.isnull(x) else x)
    if "LASTNAME" in df.columns:
        last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee"]
        df["LASTNAME"] = df["LASTNAME"].apply(lambda x: mask_name(x, last_names) if not pd.isnull(x) else x)

    # Combine FIRSTNAME and LASTNAME into NAME
    if "FIRSTNAME" in df.columns and "LASTNAME" in df.columns and "NAME" in df.columns:
        df["NAME"] = df.apply(lambda row: combine_name(row["FIRSTNAME"], row["LASTNAME"]), axis=1)

    # Apply masking to PERSONMOBILEPHONE with digit replacement in random format
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(
            lambda x: mask_phone_number(x.strip()) if not pd.isnull(x) else x
        )

    # Apply masking to PHONE with digit replacement in random format
    if "PHONE" in df.columns:
        df["PHONE"] = df["PHONE"].apply(lambda x: mask_phone_number(x.strip()) if not pd.isnull(x) else x)

    # Apply masking to PERSONEMAIL with dummy emails
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to SHIPPINGSTREET if not null
    if "SHIPPINGSTREET" in df.columns:
        df["SHIPPINGSTREET"] = df["SHIPPINGSTREET"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    print(f"Error processing file: {e}")


Error processing file: 'int' object has no attribute 'strip'


In [14]:
import pandas as pd
import random

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Masking functions
def generate_dummy_phone():
    """Generate a random 10-digit phone number."""
    return ''.join([str(random.randint(0, 9)) for _ in range(10)])

def mask_phone_column(value):
    """Mask phone numbers with a random 10-digit dummy number."""
    if pd.isnull(value):
        return value
    return generate_dummy_phone()

def mask_street(value):
    """Replace the street address with a random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    return f"{random.randint(100, 999)} {random.choice(['Maple', 'Oak', 'Pine', 'Elm', 'Cedar'])} St"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value):
        return value
    return random.choice(name_list)

def combine_name(first_name, last_name):
    """Combine FIRSTNAME and LASTNAME into NAME."""
    if pd.isnull(first_name) and pd.isnull(last_name):
        return None
    first_name = str(first_name).strip() if not pd.isnull(first_name) else ""
    last_name = str(last_name).strip() if not pd.isnull(last_name) else ""
    return f"{first_name} {last_name}".strip()

# Load the Excel file
try:
    # Read the data and clean up column names
    df = pd.read_excel(input_file)
    df.columns = [col.strip() for col in df.columns]  # Clean column names

    # Apply masking to BILLING_STREET__PC if it exists
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = df["BILLING_STREET__PC"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Apply masking to EMAIL__C if it exists
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to FIRSTNAME and LASTNAME if they exist
    if "FIRSTNAME" in df.columns:
        first_names = ["John", "Jane", "Michael", "Sarah", "Chris", "Emily"]
        df["FIRSTNAME"] = df["FIRSTNAME"].apply(lambda x: mask_name(x, first_names) if not pd.isnull(x) else x)
    if "LASTNAME" in df.columns:
        last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee"]
        df["LASTNAME"] = df["LASTNAME"].apply(lambda x: mask_name(x, last_names) if not pd.isnull(x) else x)

    # Combine FIRSTNAME and LASTNAME into NAME
    if "FIRSTNAME" in df.columns and "LASTNAME" in df.columns and "NAME" in df.columns:
        df["NAME"] = df.apply(lambda row: combine_name(row["FIRSTNAME"], row["LASTNAME"]), axis=1)

    # Mask PERSONMOBILEPHONE with dummy phone numbers if the column exists
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(mask_phone_column)

    # Mask PHONE with dummy phone numbers if the column exists
    if "PHONE" in df.columns:
        df["PHONE"] = df["PHONE"].apply(mask_phone_column)

    # Apply masking to PERSONEMAIL with dummy emails if the column exists
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(lambda x: mask_email(x) if not pd.isnull(x) else x)

    # Apply masking to SHIPPINGSTREET if it exists
    if "SHIPPINGSTREET" in df.columns:
        df["SHIPPINGSTREET"] = df["SHIPPINGSTREET"].apply(lambda x: mask_street(x) if not pd.isnull(x) else x)

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx


In [18]:
import pandas as pd

# File paths
original_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
modified_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master_data_masked_files_25112024\Account_Filtered_Data_Masked.xlsx"

# Columns to check for modifications
columns_to_check = ["PHONE", "PERSONMOBILEPHONE", "EMAIL__C", "BILLING_STREET__PC", "PERSONEMAIL"]

# Load the files
try:
    # Read original and modified datasets
    original_df = pd.read_excel(original_file)
    modified_df = pd.read_excel(modified_file)

    # Clean column names
    original_df.columns = [col.strip() for col in original_df.columns]
    modified_df.columns = [col.strip() for col in modified_df.columns]

    # Initialize results dictionary
    comparison_results = {}

    # Compare each column
    for column in columns_to_check:
        if column in original_df.columns and column in modified_df.columns:
            # Ensure both columns are strings and fill NaN with "NULL" for comparison
            original_values = original_df[column].fillna("NULL").astype(str).str.strip()
            modified_values = modified_df[column].fillna("NULL").astype(str).str.strip()

            # Check if the columns are different
            if (original_values != modified_values).any():  # If any value is different
                comparison_results[column] = "Modified"
            else:
                comparison_results[column] = "Not Modified"
        else:
            comparison_results[column] = "Column not found in both files"

    # Print results
    print("Comparison Results:")
    for col, status in comparison_results.items():
        print(f"{col}: {status}")

except Exception as e:
    print(f"Error processing files: {e}")


Comparison Results:
PHONE: Modified
PERSONMOBILEPHONE: Modified
EMAIL__C: Modified
BILLING_STREET__PC: Modified
PERSONEMAIL: Modified


In [19]:
import pandas as pd

# File paths
original_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
# modified_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master_data_masked_files_25112024\Account_Filtered_Data_Masked.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Columns to check for modifications
columns_to_check = ["PHONE", "PERSONMOBILEPHONE", "EMAIL__C", "BILLING_STREET__PC", "PERSONEMAIL"]

# Load the files
try:
    # Read original and modified datasets
    original_df = pd.read_excel(original_file)
    modified_df = pd.read_excel(modified_file)

    # Clean column names
    original_df.columns = [col.strip() for col in original_df.columns]
    modified_df.columns = [col.strip() for col in modified_df.columns]

    # Initialize results dictionary
    comparison_results = {}

    # Compare each column
    for column in columns_to_check:
        if column in original_df.columns and column in modified_df.columns:
            # Ensure both columns are strings and fill NaN with "NULL" for comparison
            original_values = original_df[column].fillna("NULL").astype(str).str.strip()
            modified_values = modified_df[column].fillna("NULL").astype(str).str.strip()

            # Check for rows where both original and modified values are non-NULL
            non_null_mask = (original_values != "NULL") & (modified_values != "NULL")
            original_non_null = original_values[non_null_mask]
            modified_non_null = modified_values[non_null_mask]

            # Compare non-NULL rows
            differences = original_non_null != modified_non_null

            # Validate if all non-NULL rows are different
            if differences.all():
                comparison_results[column] = "Modified and Valid"
            elif differences.any():
                comparison_results[column] = "Modified but Not Fully Valid"
            else:
                comparison_results[column] = "Not Modified"
        else:
            comparison_results[column] = "Column not found in both files"

    # Print results
    print("Final Comparison Results:")
    for col, status in comparison_results.items():
        print(f"{col}: {status}")

except Exception as e:
    print(f"Error processing files: {e}")


Final Comparison Results:
PHONE: Modified and Valid
PERSONMOBILEPHONE: Modified and Valid
EMAIL__C: Modified and Valid
BILLING_STREET__PC: Modified but Not Fully Valid
PERSONEMAIL: Modified and Valid


In [20]:
import pandas as pd
import random
import uuid

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Masking functions
def generate_dummy_phone():
    """Generate a random 10-digit phone number."""
    return ''.join([str(random.randint(0, 9)) for _ in range(10)])

def mask_phone_column(value):
    """Mask phone numbers with a random 10-digit dummy number."""
    if pd.isnull(value):
        return value
    return generate_dummy_phone()

def mask_street(value, row_index):
    """Replace the street address with a unique random string."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    streets = ["Maple", "Oak", "Pine", "Elm", "Cedar"]
    unique_suffix = str(uuid.uuid4())[:8]  # Add a unique suffix
    return f"{random.randint(100, 999)} {random.choice(streets)} St, ID: {row_index}_{unique_suffix}"

def mask_email(value):
    """Generate a random email address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

# Load the Excel file
try:
    # Read the data and clean up column names
    df = pd.read_excel(input_file)
    df.columns = [col.strip() for col in df.columns]  # Clean column names

    # Apply masking to BILLING_STREET__PC if it exists
    if "BILLING_STREET__PC" in df.columns:
        df["BILLING_STREET__PC"] = [
            mask_street(value, idx) for idx, value in enumerate(df["BILLING_STREET__PC"])
        ]

    # Apply masking to EMAIL__C if it exists
    if "EMAIL__C" in df.columns:
        df["EMAIL__C"] = df["EMAIL__C"].apply(mask_email)

    # Mask PERSONMOBILEPHONE with dummy phone numbers if the column exists
    if "PERSONMOBILEPHONE" in df.columns:
        df["PERSONMOBILEPHONE"] = df["PERSONMOBILEPHONE"].apply(mask_phone_column)

    # Mask PHONE with dummy phone numbers if the column exists
    if "PHONE" in df.columns:
        df["PHONE"] = df["PHONE"].apply(mask_phone_column)

    # Apply masking to PERSONEMAIL with dummy emails if the column exists
    if "PERSONEMAIL" in df.columns:
        df["PERSONEMAIL"] = df["PERSONEMAIL"].apply(mask_email)

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx


In [21]:
import pandas as pd

# File paths
original_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account Filtered Data.xlsx"
modified_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Account_Filtered_Data_Masked.xlsx"

# Columns to check
columns_to_check = ["PHONE", "PERSONMOBILEPHONE", "EMAIL__C", "BILLING_STREET__PC", "PERSONEMAIL"]

# Load the files
try:
    # Read original and modified datasets
    original_df = pd.read_excel(original_file)
    modified_df = pd.read_excel(modified_file)

    # Clean column names
    original_df.columns = [col.strip() for col in original_df.columns]
    modified_df.columns = [col.strip() for col in modified_df.columns]

    # Initialize results dictionary
    comparison_results = {}

    # Compare each column
    for column in columns_to_check:
        if column in original_df.columns and column in modified_df.columns:
            # Ensure both columns are strings and fill NaN with "NULL" for comparison
            original_values = original_df[column].fillna("NULL").astype(str).str.strip()
            modified_values = modified_df[column].fillna("NULL").astype(str).str.strip()

            # Check for rows where both original and modified values are non-NULL
            non_null_mask = (original_values != "NULL") & (modified_values != "NULL")
            original_non_null = original_values[non_null_mask]
            modified_non_null = modified_values[non_null_mask]

            # Compare non-NULL rows
            differences = original_non_null != modified_non_null

            # Store results
            modified_count = differences.sum()  # Count of modified rows
            total_non_null = non_null_mask.sum()  # Total non-NULL rows
            modified_percentage = (modified_count / total_non_null) * 100 if total_non_null > 0 else 0

            # Prepare a sample of differences for display
            difference_indices = original_non_null[differences].index.tolist()
            difference_sample = pd.DataFrame({
                "Original": original_non_null[differences].head(5).values,
                "Modified": modified_non_null[differences].head(5).values,
            })

            comparison_results[column] = {
                "Modified Count": modified_count,
                "Total Non-NULL Rows": total_non_null,
                "Modified Percentage": modified_percentage,
                "Sample Differences": difference_sample,
            }
        else:
            comparison_results[column] = "Column not found in both files"

    # Print detailed results
    print("\nDetailed Comparison Results:")
    for col, result in comparison_results.items():
        if isinstance(result, dict):
            print(f"\nColumn: {col}")
            print(f"  Modified Count: {result['Modified Count']}")
            print(f"  Total Non-NULL Rows: {result['Total Non-NULL Rows']}")
            print(f"  Modified Percentage: {result['Modified Percentage']:.2f}%")
            print(f"  Sample Differences:\n{result['Sample Differences']}")
        else:
            print(f"\nColumn: {col} - {result}")

except Exception as e:
    print(f"Error processing files: {e}")



Detailed Comparison Results:

Column: PHONE
  Modified Count: 184804
  Total Non-NULL Rows: 184804
  Modified Percentage: 100.00%
  Sample Differences:
     Original      Modified
0  8135058322   818511946.0
1  7708458382  2625350112.0
2  9014912384   153832924.0
3  5029300727  2713250577.0
4  2698326758  8901641728.0

Column: PERSONMOBILEPHONE
  Modified Count: 4076
  Total Non-NULL Rows: 4076
  Modified Percentage: 100.00%
  Sample Differences:
         Original      Modified
0      4238835029  2169958035.0
1      5049157550  7416314007.0
2      3187297376  7221777949.0
3  (274) 442-9665  5763595339.0
4  (713) 319-5252  8724394970.0

Column: EMAIL__C
  Modified Count: 81936
  Total Non-NULL Rows: 81936
  Modified Percentage: 100.00%
  Sample Differences:
                 Original                   Modified
0     mpark@mpjonline.com     pigvd.saiiski@mail.com
1       lbm@austin.rr.com    pmhiw.zrhanoh@dummy.net
2      moalassi@yahoo.com     fpqrt.ytakpcq@mail.com
3       foxxy1215@ma