In [1]:
import pandas as pd
import random

# File paths
input_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master data account new.xlsx"
output_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master_data_account_new_Masked.xlsx"

# Masking functions
def generate_dummy_phone():
    """Generate a random 10-digit phone number."""
    return ''.join([str(random.randint(0, 9)) for _ in range(10)])

def mask_email(value):
    """Generate a completely random email address."""
    if pd.isnull(value):
        return value
    domains = ["example.com", "mail.com", "test.com", "dummy.net", "sample.com"]
    first = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    last = ''.join(random.choices("abcdefghijklmnopqrstuvwxyz", k=7))
    return f"{first}.{last}@{random.choice(domains)}"

def mask_name(value, name_list):
    """Replace a name with a random first or last name."""
    if pd.isnull(value):
        return value
    return random.choice(name_list)

def mask_address(value):
    """Replace the address with a dummy address."""
    if pd.isnull(value) or not isinstance(value, str):
        return value
    streets = ["Maple", "Oak", "Pine", "Elm", "Cedar", "Birch", "Willow", "Ash", "Chestnut", "Walnut"]
    return f"{random.randint(100, 999)} {random.choice(streets)} St, City, State, ZIP"

# Extended name lists
first_names = [
    "John", "Jane", "Michael", "Sarah", "Chris", "Emily", "Matthew", "Sophia", "Daniel", "Emma",
    "David", "Olivia", "James", "Isabella", "Andrew", "Mia", "Joshua", "Charlotte", "Ryan", "Amelia",
    "Jacob", "Abigail", "Noah", "Elizabeth", "Alexander", "Madison", "Anthony", "Chloe", "Tyler", "Ella"
]

last_names = [
    "Smith", "Johnson", "Brown", "Taylor", "Anderson", "Lee", "Martin", "Thompson", "Garcia", "Martinez",
    "Davis", "Harris", "Clark", "Lewis", "Robinson", "Walker", "Young", "Allen", "King", "Wright",
    "Scott", "Hill", "Green", "Adams", "Baker", "Nelson", "Carter", "Mitchell", "Perez", "Roberts"
]

# Load the Excel file
try:
    # Read the Excel file
    df = pd.read_excel(input_file)
    df.columns = [col.strip() for col in df.columns]  # Clean column names

    # Mask FirstName
    if "FirstName" in df.columns:
        df["FirstName"] = df["FirstName"].apply(lambda x: mask_name(x, first_names))

    # Mask LastName
    if "LastName" in df.columns:
        df["LastName"] = df["LastName"].apply(lambda x: mask_name(x, last_names))

    # Combine FirstName and LastName into Name
    if "FirstName" in df.columns and "LastName" in df.columns:
        df["Name"] = df["FirstName"] + " " + df["LastName"]

    # Mask Email__c
    if "Email__c" in df.columns:
        df["Email__c"] = df["Email__c"].apply(mask_email)

    # Mask PersonMobilePhone
    if "PersonMobilePhone" in df.columns:
        df["PersonMobilePhone"] = df["PersonMobilePhone"].apply(lambda x: generate_dummy_phone())

    # Mask PersonEmail
    if "PersonEmail" in df.columns:
        df["PersonEmail"] = df["PersonEmail"].apply(mask_email)

    # Mask Phone
    if "Phone" in df.columns:
        df["Phone"] = df["Phone"].apply(lambda x: generate_dummy_phone())

    # Mask Billing_Street__pc
    if "Billing_Street__pc" in df.columns:
        df["Billing_Street__pc"] = df["Billing_Street__pc"].apply(mask_address)

    # Save the masked data
    df.to_excel(output_file, index=False)
    print(f"Masked file saved to: {output_file}")

except Exception as e:
    print(f"Error processing file: {e}")


Masked file saved to: D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master_data_account_new_Masked.xlsx


In [2]:
import pandas as pd

# File paths
original_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master data account new.xlsx"
modified_file = r"D:\Jupyter_projects\Salesforce_dataset\master_data_25112024\master_data_account_new_Masked.xlsx"

# Columns to check
columns_to_check = ["Phone", "PersonMobilePhone", "Email__c", "Billing_Street__pc", "PersonEmail"]

# Load the files
try:
    original_df = pd.read_excel(original_file)
    modified_df = pd.read_excel(modified_file)

    original_df.columns = [col.strip() for col in original_df.columns]
    modified_df.columns = [col.strip() for col in modified_df.columns]

    comparison_results = {}

    for column in columns_to_check:
        if column in original_df.columns and column in modified_df.columns:
            original_values = original_df[column].fillna("NULL").astype(str).str.strip()
            modified_values = modified_df[column].fillna("NULL").astype(str).str.strip()

            non_null_mask = (original_values != "NULL") & (modified_values != "NULL")
            original_non_null = original_values[non_null_mask]
            modified_non_null = modified_values[non_null_mask]

            differences = original_non_null != modified_non_null

            modified_count = differences.sum()
            total_non_null = non_null_mask.sum()
            modified_percentage = (modified_count / total_non_null) * 100 if total_non_null > 0 else 0

            difference_sample = pd.DataFrame({
                "Original": original_non_null[differences].head(5).values,
                "Modified": modified_non_null[differences].head(5).values,
            })

            comparison_results[column] = {
                "Modified Count": modified_count,
                "Total Non-NULL Rows": total_non_null,
                "Modified Percentage": modified_percentage,
                "Sample Differences": difference_sample,
            }
        else:
            comparison_results[column] = "Column not found in both files"

    print("\nDetailed Comparison Results:")
    for col, result in comparison_results.items():
        if isinstance(result, dict):
            print(f"\nColumn: {col}")
            print(f"  Modified Count: {result['Modified Count']}")
            print(f"  Total Non-NULL Rows: {result['Total Non-NULL Rows']}")
            print(f"  Modified Percentage: {result['Modified Percentage']:.2f}%")
            print(f"  Sample Differences:\n{result['Sample Differences']}")
        else:
            print(f"\nColumn: {col} - {result}")

except Exception as e:
    print(f"Error processing files: {e}")



Detailed Comparison Results:

Column: Phone
  Modified Count: 2358
  Total Non-NULL Rows: 2358
  Modified Percentage: 100.00%
  Sample Differences:
         Original    Modified
0  (713) 838-8318  1143919141
1      5126953072  6392983724
2    630-362-4211  9885086944
3      5024173174  5958071452
4      7132561004  5711244183

Column: PersonMobilePhone
  Modified Count: 1565
  Total Non-NULL Rows: 1565
  Modified Percentage: 100.00%
  Sample Differences:
       Original    Modified
0  706-536-6133  1953517785
1  630-362-4211  4007187732
2  214-533-1220  4625919674
3  214-212-3679  5140149136
4  601-757-5148  2833833710

Column: Email__c
  Modified Count: 1280
  Total Non-NULL Rows: 1280
  Modified Percentage: 100.00%
  Sample Differences:
                   Original                   Modified
0  ayonke@americanortho.com     chsny.fpooosx@test.com
1        flexned@runbox.com     pvmsu.strmqmo@test.com
2         rlyons@qideas.org   qujox.rgyoqzo@sample.com
3     kaymantooth@yahoo.com   