Click [here](https://medium.com/@morihosseini/get-started-with-data-anonymization-40ee967152fd) to access the associated Medium article.

# Setup


In [30]:
!pip install -q pandas numpy cryptography

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

# Randomization


In [10]:
import pandas as pd
import numpy as np

# Build the synthetic dataset row by row: (name, age, salary, phone number)
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "555-1234"),
        ("Bob", 35, 60000, "555-2345"),
        ("Charlie", 45, 70000, "555-3456"),
        ("David", 55, 80000, "555-4567"),
        ("Eve", 65, 90000, "555-5678"),
    ],
    columns=["Name", "Age", "Salary", "Phone Number"],
)


# Define a function that shuffles the characters of a single value
def randomize_values(col_values, rng=None):
    """Return the characters of one value in a random order.

    Despite the plural parameter name, this function handles a single cell
    value; it is meant to be applied element-wise, e.g.
    ``series.apply(randomize_values)``.

    Parameters
    ----------
    col_values : str or iterable of str
        The value whose characters are shuffled.
    rng : numpy.random.Generator, optional
        Random generator used for the shuffle. Pass a seeded generator
        (``np.random.default_rng(seed)``) to get reproducible output. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    str
        A string made of the same characters as the input, reordered.
    """
    chars = list(col_values)  # strings are immutable, so shuffle a list copy
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    shuffle(chars)  # in-place shuffle
    return "".join(chars)


# Shuffle the characters of every entry in the chosen column
column_to_randomize = "Name"
shuffled_column = df[column_to_randomize].apply(randomize_values)
df[column_to_randomize] = shuffled_column

# Show the anonymized dataset
print(df)

      Name  Age  Salary Phone Number
0    Ailec   25   50000     555-1234
1      obB   35   60000     555-2345
2  aClireh   45   70000     555-3456
3    iadvD   55   80000     555-4567
4      veE   65   90000     555-5678


# Generalization


In [11]:
import pandas as pd

# Build the synthetic dataset row by row: (name, age, salary, phone number)
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "555-1234"),
        ("Bob", 35, 60000, "555-2345"),
        ("Charlie", 45, 70000, "555-3456"),
        ("David", 55, 80000, "555-4567"),
        ("Eve", 65, 90000, "555-5678"),
    ],
    columns=["Name", "Age", "Salary", "Phone Number"],
)


# Define a function to generalize column values
def generalize_values(col_values, bin_width=10):
    """Replace each numeric value with the label of the range it falls in.

    With the default ``bin_width`` of 10, 25 becomes ``"20-29"`` and 30
    becomes ``"30-39"``.

    Parameters
    ----------
    col_values : pandas.Series
        Numeric column to generalize.
    bin_width : int, optional
        Width of each range; must be positive. Defaults to 10.

    Returns
    -------
    pandas.Series
        Series with the same index holding ``"<lower>-<upper>"`` labels.
        Missing values are passed through unchanged.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be a positive number")

    def _to_range(value):
        if pd.isna(value):
            return value  # nothing to generalize; int() would fail on NaN
        # Floor division (rather than int(value / width), which truncates
        # toward zero) keeps negative values in the correct bin.
        lower = int(value // bin_width) * bin_width
        return f"{lower}-{lower + bin_width - 1}"

    return col_values.apply(_to_range)


# Swap the exact values of the chosen column for their range labels
column_to_generalize = "Age"
generalized_column = generalize_values(df[column_to_generalize])
df[column_to_generalize] = generalized_column

# Show the anonymized dataset
print(df)

      Name    Age  Salary Phone Number
0    Alice  20-29   50000     555-1234
1      Bob  30-39   60000     555-2345
2  Charlie  40-49   70000     555-3456
3    David  50-59   80000     555-4567
4      Eve  60-69   90000     555-5678


# Masking


In [12]:
import pandas as pd

# Build the synthetic dataset row by row: (name, age, salary, phone number)
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "555-1234"),
        ("Bob", 35, 60000, "555-2345"),
        ("Charlie", 45, 70000, "555-3456"),
        ("David", 55, 80000, "555-4567"),
        ("Eve", 65, 90000, "555-5678"),
    ],
    columns=["Name", "Age", "Salary", "Phone Number"],
)


# Define a function to mask column values
def mask_values(col_values, n_masked=4, mask_char="*"):
    """Hide the trailing characters of every value in a column.

    The last ``n_masked`` characters of each value are dropped and replaced
    by ``n_masked`` copies of ``mask_char``. Values shorter than ``n_masked``
    are fully replaced, so the result never reveals fewer than ``n_masked``
    mask characters.

    Parameters
    ----------
    col_values : pandas.Series
        Column to mask. Non-string values are converted with ``str()``.
    n_masked : int, optional
        Number of trailing characters to hide. Defaults to 4.
    mask_char : str, optional
        Character used for the mask. Defaults to ``"*"``.

    Returns
    -------
    pandas.Series
        Series with the same index holding the masked strings. Missing
        values are passed through unchanged.

    Raises
    ------
    ValueError
        If ``n_masked`` is negative.
    """
    if n_masked < 0:
        raise ValueError("n_masked must be non-negative")
    mask = mask_char * n_masked

    def _mask(value):
        if pd.isna(value):
            return value  # nothing to mask
        text = str(value)
        # Slice by explicit length: text[:-n_masked] would return "" for
        # n_masked == 0 instead of the whole string.
        visible = text[: max(len(text) - n_masked, 0)]
        return visible + mask

    return col_values.apply(_mask)


# Hide the trailing characters of every entry in the chosen column
column_to_mask = "Phone Number"
masked_column = mask_values(df[column_to_mask])
df[column_to_mask] = masked_column

# Show the anonymized dataset
print(df)

      Name  Age  Salary Phone Number
0    Alice   25   50000     555-****
1      Bob   35   60000     555-****
2  Charlie   45   70000     555-****
3    David   55   80000     555-****
4      Eve   65   90000     555-****


# Perturbation


In [13]:
import pandas as pd
import numpy as np

# Build the synthetic dataset row by row: (name, age, salary, phone number)
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "555-1234"),
        ("Bob", 35, 60000, "555-2345"),
        ("Charlie", 45, 70000, "555-3456"),
        ("David", 55, 80000, "555-4567"),
        ("Eve", 65, 90000, "555-5678"),
    ],
    columns=["Name", "Age", "Salary", "Phone Number"],
)


# Define a function to perturb column values
def perturb_values(col_values, low=0, high=5, rng=None):
    """Add independent random integer noise to every value in a column.

    Each value receives its own noise term drawn uniformly from the
    half-open interval ``[low, high)``.

    Note that the default range ``[0, 5)`` is one-sided: values can only stay
    the same or increase. Pass a symmetric range such as ``low=-2, high=3``
    for noise centred on zero.

    Parameters
    ----------
    col_values : pandas.Series
        Numeric column to perturb. It is not modified.
    low : int, optional
        Smallest noise value (inclusive). Defaults to 0.
    high : int, optional
        Upper bound of the noise (exclusive). Defaults to 5.
    rng : numpy.random.Generator, optional
        Random generator used to draw the noise. Pass a seeded generator
        (``np.random.default_rng(seed)``) for reproducible output. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.Series
        New series with the same index holding the perturbed values.

    Raises
    ------
    ValueError
        If ``low`` is not strictly less than ``high``.
    """
    if low >= high:
        raise ValueError("low must be strictly less than high")
    draw = np.random.randint if rng is None else rng.integers
    # Draw all noise terms in one call instead of one call per row.
    noise = draw(low, high, size=len(col_values))
    return col_values + noise


# Add random noise to every entry in the chosen column
column_to_perturb = "Age"
perturbed_column = perturb_values(df[column_to_perturb])
df[column_to_perturb] = perturbed_column

# Show the anonymized dataset
print(df)

      Name  Age  Salary Phone Number
0    Alice   25   50000     555-1234
1      Bob   35   60000     555-2345
2  Charlie   46   70000     555-3456
3    David   58   80000     555-4567
4      Eve   67   90000     555-5678


# Encryption


In [201]:
import pandas as pd
from cryptography.fernet import Fernet

# Build the synthetic dataset row by row: (name, age, salary, phone number)
df = pd.DataFrame(
    [
        ("Alice", 25, 50000, "555-1234"),
        ("Bob", 35, 60000, "555-2345"),
        ("Charlie", 45, 70000, "555-3456"),
        ("David", 55, 80000, "555-4567"),
        ("Eve", 65, 90000, "555-5678"),
    ],
    columns=["Name", "Age", "Salary", "Phone Number"],
)


# Define a function to encrypt column values
def encrypt_values(col_values, key=None):
    """Encrypt every value of a column with Fernet symmetric encryption.

    Parameters
    ----------
    col_values : pandas.Series
        Column to encrypt. Each value is converted with ``str()`` and
        UTF-8 encoded before encryption.
    key : bytes or str, optional
        Fernet key (as produced by ``Fernet.generate_key()``) used for every
        value. Keep this key to be able to decrypt the column later with
        ``Fernet(key).decrypt(token.encode())``. When omitted, a throwaway
        key is generated and discarded, which makes the returned tokens
        impossible to decrypt.

    Returns
    -------
    list of str
        One Fernet token (text) per input value, in input order.
    """
    # One cipher for the whole column: generating a new key per value is
    # wasteful and leaves no practical way to decrypt the data afterwards.
    fernet = Fernet(Fernet.generate_key() if key is None else key)
    return [
        fernet.encrypt(str(value).encode()).decode()
        for value in col_values.values
    ]


# Generate the key once and keep it: without it the encrypted column can
# never be decrypted. Store it securely and separately from the data, and do
# not print it in the notebook output.
encryption_key = Fernet.generate_key()

# Apply the function to the desired column(s)
column_to_encrypt = ["Name"]
df[column_to_encrypt] = df[column_to_encrypt].apply(
    encrypt_values, key=encryption_key
)

# Print the anonymized dataset
print(df)

                                                Name  Age  Salary Phone Number
0  gAAAAABlPtvAPJ2unH6ofQ4gIzyxmuODlSyxk70o7bH1bX...   25   50000     555-1234
1  gAAAAABlPtvAZA4xYTcsp6H0PBrfZJUw1-gX1jd24keyp6...   35   60000     555-2345
2  gAAAAABlPtvAD377dSN7-lwu8VFCTHhWyKe-rzyZvSyWNH...   45   70000     555-3456
3  gAAAAABlPtvA_9rEOvFsHtdc1J_Hj9Rjmdeu9umdnUAJ6d...   55   80000     555-4567
4  gAAAAABlPtvASkzkeIhovEAB8X96MDjpEEjdFdCmZrn-82...   65   90000     555-5678
