In [2]:
!pip install pandas openpyxl


Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     -------------------------- ------------- 41.0/60.8 kB 2.0 MB/s eta 0:00:01
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Collecting six>=1.5 (from python-dateutil>


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd
import random

# File paths
# NOTE(review): absolute, machine-specific locations; adjust before running elsewhere.
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx"

# Pools the masked first/last names are drawn from
first_names = [
    "John", "Jane", "Michael", "Sarah", "Chris", "Emily", "David", "Emma",
    "Daniel", "Sophia", "James", "Olivia", "Robert", "Isabella", "Thomas",
    "Charlotte", "Andrew", "Mia", "Joseph", "Amelia", "Ethan", "Harper"
]

last_names = [
    "Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee", "Clark", "Lewis",
    "Martinez", "Garcia", "Rodriguez", "Walker", "Hall", "Allen", "Young",
    "King", "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker"
]

# Domains used for generated e-mail addresses.
# Only names reserved for documentation/testing (example.com, example.net,
# example.org and their subdomains, RFC 2606) are used, so a masked address
# can never belong to a real mailbox. Registrable domains such as mail.com or
# test.com must not be added here.
domains = [
    "example.com", "example.net", "example.org", "mail.example.com", "test.example.com",
    "sample.example.net", "dummy.example.net", "placeholder.example.org",
    "demo.example.org", "contacts.example.com"
]

# Masking functions
def generate_readable_name(value, name_pool):
    """Swap a real name for a randomly chosen entry of ``name_pool``.

    Null inputs are handed back untouched so that empty cells stay empty.
    """
    return value if pd.isnull(value) else random.choice(name_pool)

def generate_email_from_name(first_name, last_name):
    """Build a ``first.last@domain`` address from two (already masked) names.

    Returns None when either name is null. Otherwise both names are
    lower-cased, joined with a dot, and paired with a domain picked at random
    from the module-level ``domains`` list.
    """
    if any(pd.isnull(part) for part in (first_name, last_name)):
        return None
    # Arguments are evaluated left to right: names first, then the random domain.
    return "{}.{}@{}".format(first_name.lower(), last_name.lower(), random.choice(domains))

def shuffle_phone_number(value):
    """Return the digits of a phone number in random order.

    Null values are returned unchanged, and so are values that contain no
    digits at all. Otherwise the result is a string made of the same digits
    as the input in a random order; formatting characters ("+", "-", spaces,
    parentheses, ...) are dropped, so it is the digit count that is preserved.

    Numeric cells need care: a numeric column that contains blanks is loaded
    as float64, so 5551234567 arrives as 5551234567.0 and str() would
    contribute a spurious trailing "0" digit. Integral floats are therefore
    converted to int before the digits are extracted.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # 5551234567.0 -> 5551234567, avoids the extra "0"
    digits = ''.join(filter(str.isdigit, str(value)))  # Extract digits only
    if digits:
        return ''.join(random.sample(digits, len(digits)))
    return value  # Return as-is if no digits

# Load the Excel file
try:
    # Read the workbook, mask the PII columns that are present, save a copy.
    df = pd.read_excel(input_file)

    # Replace every non-null first/last name with a random pool entry.
    for column, pool in (("FirstName", first_names), ("LastName", last_names)):
        if column in df.columns:
            df[column] = df[column].apply(lambda x: generate_readable_name(x, pool))
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Build the masked addresses once, from the masked names.
    # A list comprehension is used instead of df.apply(axis=1): on a frame
    # with zero rows apply() returns a DataFrame rather than a Series, which
    # makes the column assignment fail.
    masked_emails = None
    if "FirstName" in df.columns and "LastName" in df.columns:
        masked_emails = [
            generate_email_from_name(first, last)
            for first, last in zip(df["FirstName"], df["LastName"])
        ]

    # Generate emails based on masked names
    if masked_emails is not None and "Email" in df.columns:
        df["Email"] = masked_emails
    else:
        print("Warning: Columns 'FirstName', 'LastName', or 'Email' not found in the dataset.")

    # Shuffle phone numbers
    if "Phone" in df.columns:
        df["Phone"] = df["Phone"].apply(shuffle_phone_number)
    else:
        print("Warning: Column 'Phone' not found in the dataset.")

    # Update Secondary_Email__c: rows that had a secondary address receive the
    # masked address, empty cells stay empty. This uses the generated list
    # rather than df["Email"], so it neither fails when the Email column is
    # absent nor copies an unmasked Email value.
    if "Secondary_Email__c" in df.columns:
        if masked_emails is not None:
            df["Secondary_Email__c"] = [
                masked if pd.notnull(current) else current
                for masked, current in zip(masked_emails, df["Secondary_Email__c"])
            ]
        else:
            print("Warning: 'Secondary_Email__c' left unmasked because 'FirstName' or 'LastName' is missing.")
    else:
        print("Warning: Column 'Secondary_Email__c' not found in the dataset.")

    # Save the modified file
    df.to_excel(output_file, index=False, engine='openpyxl')  # Save as Excel file
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # NOTE(review): the error is reported but not re-raised, so an output file
    # left over from an earlier run may still be on disk after a failure.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx


In [11]:
import pandas as pd
import random

# File paths
# NOTE(review): absolute, machine-specific locations; adjust before running elsewhere.
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx"

# Pools the masked first/last names are drawn from
first_names = [
    "John", "Jane", "Michael", "Sarah", "Chris", "Emily", "David", "Emma",
    "Daniel", "Sophia", "James", "Olivia", "Robert", "Isabella", "Thomas",
    "Charlotte", "Andrew", "Mia", "Joseph", "Amelia", "Ethan", "Harper"
]

last_names = [
    "Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee", "Clark", "Lewis",
    "Martinez", "Garcia", "Rodriguez", "Walker", "Hall", "Allen", "Young",
    "King", "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker"
]

# Domains used for generated e-mail addresses.
# Only names reserved for documentation/testing (example.com, example.net,
# example.org and their subdomains, RFC 2606) are used, so a masked address
# can never belong to a real mailbox. Registrable domains such as mail.com or
# test.com must not be added here.
domains = [
    "example.com", "example.net", "example.org", "mail.example.com", "test.example.com",
    "sample.example.net", "dummy.example.net", "placeholder.example.org",
    "demo.example.org", "contacts.example.com"
]

# Masking functions
def generate_readable_name(value, name_pool):
    """Pick a random replacement from ``name_pool`` for a non-null name.

    A null ``value`` is returned as received, which keeps blank cells blank.
    """
    is_missing = pd.isnull(value)
    if not is_missing:
        return random.choice(name_pool)
    return value

def generate_email_from_name(first_name, last_name):
    """Compose a lower-case ``first.last@domain`` address.

    None is returned if either name is null; otherwise the domain part is
    drawn at random from the module-level ``domains`` list.
    """
    if pd.isnull(first_name):
        return None
    if pd.isnull(last_name):
        return None
    local_part = first_name.lower() + "." + last_name.lower()
    chosen_domain = random.choice(domains)
    return local_part + "@" + chosen_domain

def shuffle_phone_number(value):
    """Return the digits of a phone number in random order.

    Null values are returned unchanged, and so are values that contain no
    digits at all. Otherwise the result is a string made of the same digits
    as the input in a random order; formatting characters ("+", "-", spaces,
    parentheses, ...) are dropped, so it is the digit count that is preserved.

    Numeric cells need care: a numeric column that contains blanks is loaded
    as float64, so 5551234567 arrives as 5551234567.0 and str() would
    contribute a spurious trailing "0" digit. Integral floats are therefore
    converted to int before the digits are extracted.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # 5551234567.0 -> 5551234567, avoids the extra "0"
    digits = ''.join(filter(str.isdigit, str(value)))  # Extract digits only
    if digits:
        return ''.join(random.sample(digits, len(digits)))
    return value  # Return as-is if no digits

# Load the Excel file
try:
    # Read the workbook, mask the PII columns that are present, save a copy.
    df = pd.read_excel(input_file)

    # Replace every non-null first/last name with a random pool entry.
    for column, pool in (("FirstName", first_names), ("LastName", last_names)):
        if column in df.columns:
            df[column] = df[column].apply(lambda x: generate_readable_name(x, pool))
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Build the masked addresses once, from the masked names.
    # A list comprehension is used instead of df.apply(axis=1): on a frame
    # with zero rows apply() returns a DataFrame rather than a Series, which
    # makes the column assignment fail.
    masked_emails = None
    if "FirstName" in df.columns and "LastName" in df.columns:
        masked_emails = [
            generate_email_from_name(first, last)
            for first, last in zip(df["FirstName"], df["LastName"])
        ]

    # Generate emails based on masked names
    if masked_emails is not None and "Email" in df.columns:
        df["Email"] = masked_emails
    else:
        print("Warning: Columns 'FirstName', 'LastName', or 'Email' not found in the dataset.")

    # Shuffle phone numbers in the Phone and MobilePhone columns
    for column in ("Phone", "MobilePhone"):
        if column in df.columns:
            df[column] = df[column].apply(shuffle_phone_number)
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Update Secondary_Email__c: rows that had a secondary address receive the
    # masked address, empty cells stay empty. This uses the generated list
    # rather than df["Email"], so it neither fails when the Email column is
    # absent nor copies an unmasked Email value.
    if "Secondary_Email__c" in df.columns:
        if masked_emails is not None:
            df["Secondary_Email__c"] = [
                masked if pd.notnull(current) else current
                for masked, current in zip(masked_emails, df["Secondary_Email__c"])
            ]
        else:
            print("Warning: 'Secondary_Email__c' left unmasked because 'FirstName' or 'LastName' is missing.")
    else:
        print("Warning: Column 'Secondary_Email__c' not found in the dataset.")

    # Save the modified file
    df.to_excel(output_file, index=False, engine='openpyxl')  # Save as Excel file
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # NOTE(review): the error is reported but not re-raised, so an output file
    # left over from an earlier run may still be on disk after a failure.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx


In [12]:
import pandas as pd
import random

# File paths
# NOTE(review): absolute, machine-specific locations; adjust before running elsewhere.
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx"

# Pools the masked first/last names are drawn from
first_names = [
    "John", "Jane", "Michael", "Sarah", "Chris", "Emily", "David", "Emma",
    "Daniel", "Sophia", "James", "Olivia", "Robert", "Isabella", "Thomas",
    "Charlotte", "Andrew", "Mia", "Joseph", "Amelia", "Ethan", "Harper"
]

last_names = [
    "Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee", "Clark", "Lewis",
    "Martinez", "Garcia", "Rodriguez", "Walker", "Hall", "Allen", "Young",
    "King", "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker"
]

# Domains used for generated e-mail addresses.
# Only names reserved for documentation/testing (example.com, example.net,
# example.org and their subdomains, RFC 2606) are used, so a masked address
# can never belong to a real mailbox. Registrable domains such as mail.com or
# test.com must not be added here.
domains = [
    "example.com", "example.net", "example.org", "mail.example.com", "test.example.com",
    "sample.example.net", "dummy.example.net", "placeholder.example.org",
    "demo.example.org", "contacts.example.com"
]

# Masking functions
def generate_readable_name(value, name_pool):
    """Return a random member of ``name_pool`` in place of a real name.

    Missing values (None / NaN) pass through unchanged.
    """
    if pd.notnull(value):
        return random.choice(name_pool)
    return value

def generate_email_from_name(first_name, last_name):
    """Derive a dummy address of the form ``first.last@domain``.

    Both names are lower-cased and the domain is a random element of the
    module-level ``domains`` list. If either name is null the result is None.
    """
    if pd.isnull(first_name) or pd.isnull(last_name):
        return None
    lowered_first = first_name.lower()
    lowered_last = last_name.lower()
    picked_domain = random.choice(domains)
    return f"{lowered_first}.{lowered_last}@{picked_domain}"

def shuffle_phone_number(value):
    """Return the digits of a phone number in random order.

    Null values are returned unchanged, and so are values that contain no
    digits at all. Otherwise the result is a string made of the same digits
    as the input in a random order; formatting characters ("+", "-", spaces,
    parentheses, ...) are dropped, so it is the digit count that is preserved.

    Numeric cells need care: a numeric column that contains blanks is loaded
    as float64, so 5551234567 arrives as 5551234567.0 and str() would
    contribute a spurious trailing "0" digit. Integral floats are therefore
    converted to int before the digits are extracted.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # 5551234567.0 -> 5551234567, avoids the extra "0"
    digits = ''.join(filter(str.isdigit, str(value)))  # Extract digits only
    if digits:
        return ''.join(random.sample(digits, len(digits)))
    return value  # Return as-is if no digits

# Load the Excel file
try:
    # Read the workbook, mask the PII columns that are present, save a copy.
    df = pd.read_excel(input_file)

    # Replace every non-null first/last name with a random pool entry.
    for column, pool in (("FirstName", first_names), ("LastName", last_names)):
        if column in df.columns:
            df[column] = df[column].apply(lambda x: generate_readable_name(x, pool))
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Update Name column based on masked FirstName and LastName.
    # Null parts are skipped so a missing first or last name does not end up
    # as the literal text "nan"; if both parts are null the cell stays empty.
    if "Name" in df.columns and "FirstName" in df.columns and "LastName" in df.columns:
        df["Name"] = [
            " ".join(str(part) for part in (first, last) if pd.notnull(part)) or None
            for first, last in zip(df["FirstName"], df["LastName"])
        ]
    else:
        print("Warning: Columns 'Name', 'FirstName', or 'LastName' not found in the dataset.")

    # Build the masked addresses once, from the masked names.
    # A list comprehension is used instead of df.apply(axis=1): on a frame
    # with zero rows apply() returns a DataFrame rather than a Series, which
    # makes the column assignment fail.
    masked_emails = None
    if "FirstName" in df.columns and "LastName" in df.columns:
        masked_emails = [
            generate_email_from_name(first, last)
            for first, last in zip(df["FirstName"], df["LastName"])
        ]

    # Generate emails based on masked names
    if masked_emails is not None and "Email" in df.columns:
        df["Email"] = masked_emails
    else:
        print("Warning: Columns 'FirstName', 'LastName', or 'Email' not found in the dataset.")

    # Shuffle phone numbers in the Phone and MobilePhone columns
    for column in ("Phone", "MobilePhone"):
        if column in df.columns:
            df[column] = df[column].apply(shuffle_phone_number)
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Update Secondary_Email__c: rows that had a secondary address receive the
    # masked address, empty cells stay empty. This uses the generated list
    # rather than df["Email"], so it neither fails when the Email column is
    # absent nor copies an unmasked Email value.
    if "Secondary_Email__c" in df.columns:
        if masked_emails is not None:
            df["Secondary_Email__c"] = [
                masked if pd.notnull(current) else current
                for masked, current in zip(masked_emails, df["Secondary_Email__c"])
            ]
        else:
            print("Warning: 'Secondary_Email__c' left unmasked because 'FirstName' or 'LastName' is missing.")
    else:
        print("Warning: Column 'Secondary_Email__c' not found in the dataset.")

    # Save the modified file
    df.to_excel(output_file, index=False, engine='openpyxl')  # Save as Excel file
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # NOTE(review): the error is reported but not re-raised, so an output file
    # left over from an earlier run may still be on disk after a failure.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\FilteredContact1_Masked.xlsx
