In [11]:
import pandas as pd
import random

# File paths
# Paths used by an earlier run (kept for reference):
# input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\Property Sorted.csv"
# output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\Property_Sorted_Masked.csv"

# Current data folder (this line used to sit here as bare text, which is a SyntaxError):
# D:\Jupyter_projects\Salesforce_dataset\master_data_25112024

# CSV that is read, and the CSV the masked copy is written to.
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property Sorted.csv"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property_Sorted_Masked.csv"


# Pools that the masking helpers draw dummy values from.
# Each pool is written as one whitespace-separated string and split into a list.

# Given names used for generated dummy names.
first_names = (
    "John Jane Michael Sarah Chris Emily David Emma "
    "Daniel Sophia James Olivia Robert Isabella Thomas "
    "Charlotte Andrew Mia Joseph Amelia Ethan Harper"
).split()

# Family names used for generated dummy names.
last_names = (
    "Smith Johnson Brown Taylor Anderson Lee Clark Lewis "
    "Martinez Garcia Rodriguez Walker Hall Allen Young "
    "King Wright Lopez Hill Scott Green Adams Baker"
).split()

# Domains used for generated dummy email addresses.
domains = (
    "example.com mail.com test.com sample.com dummy.net "
    "placeholder.org fakemail.com testdomain.com mydomain.net demoemail.com"
).split()

# Masking functions
def generate_readable_name(value):
    """Return a random "First Last" dummy name in place of ``value``.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged. For every other input the original content is ignored and a
    name is assembled from one random entry of ``first_names`` and one of
    ``last_names``.
    """
    if not pd.isnull(value):
        given = random.choice(first_names)
        family = random.choice(last_names)
        return f"{given} {family}"
    return value

def generate_email_from_name(name):
    """Build a dummy email address from a (masked) name.

    Parameters
    ----------
    name : str or missing value
        Usually a "First Last" string produced by ``generate_readable_name``.

    Returns
    -------
    str, None, or the original missing value
        * Missing input (``pd.isnull``) is returned unchanged.
        * Non-string input, and strings with no non-whitespace characters,
          return ``None`` because no username can be derived from them.
        * Otherwise ``"<first>.<second>@<domain>"`` in lower case, with the
          domain picked at random from ``domains``. A single-word name yields
          ``"<word>@<domain>"``; words after the second are ignored.
    """
    if pd.isnull(name):
        return name
    if not isinstance(name, str):
        return None

    name_parts = name.lower().split()
    if not name_parts:
        # Empty or whitespace-only name: indexing the parts used to raise IndexError.
        return None

    # Use at most the first two words so single-word names no longer raise IndexError.
    username = ".".join(name_parts[:2])
    domain = random.choice(domains)
    return f"{username}@{domain}"

def shuffle_phone_number(value):
    """Mask a phone number by shuffling its digits.

    The digits found in ``value`` are randomly permuted and regrouped as
    ``AAA-BBB-REST``. The original punctuation and spacing are not kept.
    Groups that would be empty are omitted, so short inputs never produce
    trailing dashes.

    Parameters
    ----------
    value : str, int, float or missing value
        Phone number as read from the dataset. Numeric values are accepted
        because pandas may parse an all-digit column as numbers.

    Returns
    -------
    str or the original value
        The masked number, or ``value`` unchanged when it is missing or
        contains no digits.
    """
    if pd.isnull(value):
        return value

    # An integral float such as 5551234567.0 would otherwise contribute a spurious "0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    # filter(str.isdigit, value) raised TypeError on non-string input; go through str().
    text = value if isinstance(value, str) else str(value)
    digits = ''.join(ch for ch in text if ch.isdigit())
    if not digits:
        return value  # Return as-is if no digits

    shuffled = ''.join(random.sample(digits, len(digits)))
    groups = (shuffled[:3], shuffled[3:6], shuffled[6:])
    return '-'.join(group for group in groups if group)

# Columns to mask
# Maps each column that is masked cell-by-cell to the function applied to it.
columns_to_mask = {
    "Account__r.Name": generate_readable_name,        # Generate readable dummy name
    "Account__r.Phone": shuffle_phone_number          # Shuffle phone digits with formatting
}

# Load the CSV file with encoding handling
try:
    df = pd.read_csv(input_file, encoding='ISO-8859-1')

    # Apply every per-cell masker registered above (names, then phone numbers).
    for column, mask_value in columns_to_mask.items():
        if column in df.columns:
            df[column] = df[column].apply(mask_value)
        else:
            print(f"Warning: Column '{column}' not found in the dataset.")

    # Emails are derived from the already-masked names so both columns agree.
    # Rows whose name is missing therefore get a missing email as well.
    if "Account__r.PersonEmail" in df.columns:
        if "Account__r.Name" in df.columns:
            df["Account__r.PersonEmail"] = df["Account__r.Name"].apply(generate_email_from_name)
        else:
            # Without a name column the emails used to be written out unmasked.
            # Build them from fresh dummy names instead so no real address is saved.
            print("Warning: Column 'Account__r.Name' not found; generating emails from new dummy names.")
            df["Account__r.PersonEmail"] = (
                df["Account__r.PersonEmail"]
                .apply(generate_readable_name)
                .apply(generate_email_from_name)
            )
    else:
        print("Warning: Column 'Account__r.PersonEmail' not found in the dataset.")

    # Save the modified file
    df.to_csv(output_file, index=False, encoding='ISO-8859-1')  # Save with same encoding
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_filtering\Property_Sorted_Masked.csv


In [22]:
import pandas as pd
import random
import re

# Workbook that is read and the workbook the masked copy is written to.
input_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\Property Sorted.xlsx"
)
output_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\Property_Sorted_Masked.xlsx"
)

# Masking functions
def shuffle_property_name(value):
    """Return ``value`` with its characters in random order.

    Only non-missing strings are touched; the result has the same length and
    the same characters as the input. Missing values and non-string values
    are returned unchanged.
    """
    if not pd.isnull(value) and isinstance(value, str):
        reordered = random.sample(value, len(value))
        return ''.join(reordered)
    return value

def completely_mask_pool_company(value):
    """Replace a Pool_Company__c value with a fabricated company description.

    Missing values and non-string values are returned unchanged. For a string
    the original text is discarded and the result has the form::

        <company>, Contact: <email>, Phone: XXX-XXX-<4 digits>, Service Schedule: <schedule>.

    The company name, contact address, last four phone digits and schedule
    are each chosen at random, independently of one another.
    """
    if pd.isnull(value) or not isinstance(value, str):
        return value

    company_names = (
        "Pluto Aqua Services",
        "Neptune Pool Masters",
        "Mars Waterworks",
        "Jupiter Cleaners",
        "Saturn Pool Solutions",
        "Mercury AquaGuard",
        "Venus Pool Professionals",
        "Andromeda Maintenance Co.",
        "Orion Pool Specialists",
        "Galactic Aqua Systems",
    )
    contacts = (
        "contact@plutoaqua.com",
        "support@neptunepools.net",
        "info@marswaterworks.org",
        "services@jupitercleaners.com",
        "help@saturnpools.co",
        "assist@mercuryaquaguard.com",
        "connect@venuspools.net",
        "reach@andromedamaintenance.com",
        "inquiry@orionpoolspecialists.org",
        "hello@galacticaqua.com",
    )

    # Draw order is phone digits, schedule, company, contact.
    last_four = random.randint(1000, 9999)
    schedule = random.choice(("Weekly", "Bi-weekly", "Monthly", "Custom Schedule"))
    company = random.choice(company_names)
    contact = random.choice(contacts)

    return ", ".join([
        company,
        f"Contact: {contact}",
        f"Phone: XXX-XXX-{last_four}",
        f"Service Schedule: {schedule}.",
    ])

def completely_mask_property_nickname(value, nickname_counts):
    """Swap a Property_Nickname__c value for a fabricated nickname.

    Parameters
    ----------
    value : str or other
        Original nickname. Missing or non-string values are returned unchanged
        and leave ``nickname_counts`` untouched.
    nickname_counts : dict
        Running tally of how often each base nickname has been handed out.
        It is updated in place on every masked value.

    Returns
    -------
    str or the original value
        The first use of a base nickname is returned as-is. Later uses get the
        running count appended (``"Lunar Haven 2"``, ``"Lunar Haven 3"``, ...).
    """
    if pd.isnull(value) or not isinstance(value, str):
        return value

    nickname_options = (
        "Aurora Retreat", "Lunar Haven", "Solar Bliss", "Starlight Escape",
        "Nebula Residence", "Comet Cottage", "Meteor Villa", "Cosmic Abode",
        "Galaxy Hideaway", "Orbit Mansion", "Astral Homestead", "Celestial Suite",
    )
    base = random.choice(nickname_options)

    # Record this use, remembering whether the base name had been issued before.
    already_issued = base in nickname_counts
    nickname_counts[base] = nickname_counts.get(base, 0) + 1

    if already_issued:
        return f"{base} {nickname_counts[base]}"
    return base

# Read the workbook, mask the three property columns, and write the result.
try:
    df = pd.read_excel(input_file)

    # Property_Name__c: shuffle the characters of every value
    if "Property_Name__c" not in df.columns:
        print("Warning: Column 'Property_Name__c' not found in the dataset.")
    else:
        shuffled_names = df["Property_Name__c"].apply(shuffle_property_name)
        df["Property_Name__c"] = shuffled_names

    # Pool_Company__c: replace every value with a fabricated description
    if "Pool_Company__c" not in df.columns:
        print("Warning: Column 'Pool_Company__c' not found in the dataset.")
    else:
        fabricated_companies = df["Pool_Company__c"].apply(completely_mask_pool_company)
        df["Pool_Company__c"] = fabricated_companies

    # Property_Nickname__c: replace with numbered dummy nicknames
    if "Property_Nickname__c" not in df.columns:
        print("Warning: Column 'Property_Nickname__c' not found in the dataset.")
    else:
        nickname_counts = {}  # shared tally so repeated nicknames get a running number
        df["Property_Nickname__c"] = df["Property_Nickname__c"].apply(
            lambda nickname: completely_mask_property_nickname(nickname, nickname_counts)
        )

    # Write the masked frame to the output workbook
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property_Sorted_Masked.xlsx


In [23]:
import pandas as pd

# Original workbook and the masked copy it is compared against.
source_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\Property Sorted.xlsx"
)
masked_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\master_data_masked_files_25112024\Property_Sorted_Masked.xlsx"
)

# Columns that are expected to differ between the two workbooks.
columns_to_check = ["Property_Name__c", "Pool_Company__c", "Property_Nickname__c"]

# For each column, report whether at least one value differs between the files.
try:
    source_df = pd.read_excel(source_file)
    masked_df = pd.read_excel(masked_file)

    # Strip stray whitespace from the headers of both frames
    for frame in (source_df, masked_df):
        frame.columns = [header.strip() for header in frame.columns]

    comparison_results = {}

    for column in columns_to_check:
        if column not in source_df.columns or column not in masked_df.columns:
            comparison_results[column] = "Column not found in both files"
            continue

        # Missing cells become the text "NULL" so they compare as equal strings
        source_values = source_df[column].fillna("NULL").astype(str).str.strip()
        masked_values = masked_df[column].fillna("NULL").astype(str).str.strip()

        # A single differing row is enough to call the column modified
        any_changed = (source_values != masked_values).any()
        comparison_results[column] = "Modified" if any_changed else "Not Modified"

    # Report one line per checked column
    print("Comparison Results:")
    for checked_column, verdict in comparison_results.items():
        print(f"{checked_column}: {verdict}")

except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error processing files: {e}")


Comparison Results:
Property_Name__c: Modified
Pool_Company__c: Modified
Property_Nickname__c: Modified


Final Comparison Results:
Property_Name__c: Modified and Valid
Pool_Company__c: Modified and Valid
Property_Nickname__c: Modified and Valid


In [34]:
import pandas as pd
import random
import uuid

# Workbook that is read and the workbook the masked copy is written to.
input_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\Property Sorted.xlsx"
)
output_file = (
    r"D:\Jupyter_projects\Salesforce_dataset"
    r"\master_data_25112024\Property_Sorted_Masked.xlsx"
)

# Masking functions
def fully_randomize_property_name(value):
    """Replace a Property_Name__c value with ``"Prop_"`` plus a random suffix.

    Missing values and non-string values are returned unchanged. The suffix
    is the first 8 hexadecimal characters of a fresh UUID4, so values are
    random per call but not strictly guaranteed to be distinct.
    """
    if not pd.isnull(value) and isinstance(value, str):
        token = uuid.uuid4().hex[:8]  # first 8 hex digits of a new UUID4
        return "Prop_" + token
    return value

def completely_mask_pool_company(value):
    """Replace a Pool_Company__c value with ``"<company>_<suffix>"``.

    Missing values and non-string values are returned unchanged. The company
    is picked at random from a fixed list and the suffix is the first 8
    hexadecimal characters of a fresh UUID4. Values are random per call but
    not strictly guaranteed to be distinct.
    """
    if not pd.isnull(value) and isinstance(value, str):
        company_names = (
            "Pluto Aqua Services", "Neptune Pool Masters", "Mars Waterworks",
            "Jupiter Cleaners", "Saturn Pool Solutions", "Mercury AquaGuard",
            "Venus Pool Professionals", "Andromeda Maintenance Co.",
            "Orion Pool Specialists", "Galactic Aqua Systems",
        )
        token = uuid.uuid4().hex[:8]  # first 8 hex digits of a new UUID4
        company = random.choice(company_names)
        return f"{company}_{token}"
    return value

def fully_unique_property_nickname(value, row_index):
    """Replace a Property_Nickname__c value with ``"<nickname>_<row_index>_<suffix>"``.

    Parameters
    ----------
    value : any
        Original cell. Missing values are returned unchanged; every other
        value (string or not) is replaced.
    row_index : int or str
        Embedded verbatim in the result, so rows given different indices get
        different nicknames.

    Returns
    -------
    str or the original missing value
        A randomly chosen base nickname, the row index, and the first 8
        hexadecimal characters of a fresh UUID4, joined by underscores.
    """
    if not pd.isnull(value):
        nickname_options = (
            "Aurora Retreat", "Lunar Haven", "Solar Bliss", "Starlight Escape",
            "Nebula Residence", "Comet Cottage", "Meteor Villa", "Cosmic Abode",
            "Galaxy Hideaway", "Orbit Mansion", "Astral Homestead", "Celestial Suite",
        )
        token = uuid.uuid4().hex[:8]  # first 8 hex digits of a new UUID4
        base = random.choice(nickname_options)
        return "_".join((base, str(row_index), token))
    return value

# Read the workbook, replace the three property columns with random values,
# and write the result. A column that is absent is reported with a warning
# (it used to be skipped silently) and the remaining columns are still processed.
try:
    df = pd.read_excel(input_file)

    # Apply masking to Property_Name__c
    if "Property_Name__c" in df.columns:
        df["Property_Name__c"] = df["Property_Name__c"].apply(fully_randomize_property_name)
    else:
        print("Warning: Column 'Property_Name__c' not found in the dataset.")

    # Apply masking to Pool_Company__c
    if "Pool_Company__c" in df.columns:
        df["Pool_Company__c"] = df["Pool_Company__c"].apply(completely_mask_pool_company)
    else:
        print("Warning: Column 'Pool_Company__c' not found in the dataset.")

    # Apply masking to Property_Nickname__c
    if "Property_Nickname__c" in df.columns:
        # enumerate() supplies the positional row number embedded in each nickname.
        df["Property_Nickname__c"] = [
            fully_unique_property_nickname(value, idx) for idx, value in enumerate(df["Property_Nickname__c"])
        ]
    else:
        print("Warning: Column 'Property_Nickname__c' not found in the dataset.")

    # Save the modified file to Excel
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")
except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property_Sorted_Masked.xlsx


In [36]:
import pandas as pd

# File paths
source_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property Sorted.xlsx"
masked_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\Property_Sorted_Masked.xlsx"

# Columns to check
columns_to_check = ["Property_Name__c", "Pool_Company__c", "Property_Nickname__c"]

# Validate the masked workbook against the source, column by column.
# Only rows where both files hold a value are compared. Possible verdicts:
#   "Modified and Valid"            - every compared row differs and the masked values are unique
#   "Modified but Not Fully Valid"  - at least one row differs, but some rows are unchanged
#                                     or the masked values contain duplicates
#   "Not Modified"                  - no compared row differs
#   "No non-NULL rows to compare"   - nothing could be compared for this column
#   "Column not found in both files"
try:
    # Read source and masked files
    source_df = pd.read_excel(source_file)
    masked_df = pd.read_excel(masked_file)

    # Clean column names
    source_df.columns = [col.strip() for col in source_df.columns]
    masked_df.columns = [col.strip() for col in masked_df.columns]

    # Initialize a dictionary to store comparison results
    comparison_results = {}

    # Compare each column
    for column in columns_to_check:
        if column in source_df.columns and column in masked_df.columns:
            # Fill NaN values and convert to strings for comparison
            source_values = source_df[column].fillna("NULL").astype(str).str.strip()
            masked_values = masked_df[column].fillna("NULL").astype(str).str.strip()

            # Check for rows where both source and masked values are non-NULL
            non_null_mask = (source_values != "NULL") & (masked_values != "NULL")
            non_null_source_values = source_values[non_null_mask]
            non_null_masked_values = masked_values[non_null_mask]

            # With nothing to compare, .all() on the empty selection is True and
            # the column used to be reported as "Modified and Valid".
            if not non_null_mask.any():
                comparison_results[column] = "No non-NULL rows to compare"
                continue

            # Compare non-NULL rows
            differences = non_null_source_values != non_null_masked_values

            # Validate uniqueness in masked data
            unique_masked = non_null_masked_values.is_unique

            if differences.all() and unique_masked:  # All non-NULL rows differ and masked data is unique
                comparison_results[column] = "Modified and Valid"
            elif differences.any():  # Some rows unchanged, or all changed but masked values repeat
                comparison_results[column] = "Modified but Not Fully Valid"
            else:  # No differences in non-NULL rows
                comparison_results[column] = "Not Modified"
        else:
            comparison_results[column] = "Column not found in both files"

    # Print final results
    print("Final Comparison Results:")
    for col, result in comparison_results.items():
        print(f"{col}: {result}")

except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error processing files: {e}")


Final Comparison Results:
Property_Name__c: Modified and Valid
Pool_Company__c: Modified and Valid
Property_Nickname__c: Modified and Valid
