In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker for generating fake data
fake = Faker()

# Define number of entries
num_entries = 100

# Generate dataset
data = pd.DataFrame({
    'customer_id': ['CUST' + str(i+1).zfill(5) for i in range(num_entries)],
    'name': [fake.name() for _ in range(num_entries)],                # Fake customer names
    'email': [fake.email() for _ in range(num_entries)],              # Fake email addresses
    'purchase_amount': np.round(np.random.uniform(10, 1000, num_entries), 2),  # Random purchase amounts
    'purchase_date': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_entries)],  # Dates within the last year
    'product_category': [random.choice(['Electronics', 'Clothing', 'Books', 'Groceries', 'Home']) for _ in range(num_entries)]
})

# Display the first few rows of the dataset
data.head()


Unnamed: 0,customer_id,name,email,purchase_amount,purchase_date,product_category
0,CUST00001,Jesse Davis,johnfarmer@example.com,772.15,2024-07-25,Books
1,CUST00002,Peter Ferrell,lmartinez@example.com,552.96,2024-05-25,Home
2,CUST00003,Stacey Wells,ejones@example.com,802.21,2024-03-15,Clothing
3,CUST00004,David Powell,kelly69@example.org,34.75,2024-02-04,Books
4,CUST00005,Renee Ellis,gerald71@example.com,808.14,2024-08-11,Groceries


In [5]:
import pandas as pd
import numpy as np

# Load sample data (replace 'sample_data.csv' with your actual dataset)
# Assume this is a customer transaction dataset with sensitive information
try:
    data = pd.read_csv('sample_data.csv')
except FileNotFoundError:
    # Create a sample dataset if you don't have one
    data = pd.DataFrame({
        'customer_id': np.arange(1, 11),
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Isaac', 'Jack'],
        'email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com', 'eve@example.com', 
                  'frank@example.com', 'grace@example.com', 'hannah@example.com', 'isaac@example.com', 'jack@example.com'],
        'purchase_amount': np.random.uniform(20, 500, 10),
        'purchase_date': pd.date_range(start='2023-01-01', periods=10, freq='M')
    })

data.head()


Unnamed: 0,customer_id,name,email,purchase_amount,purchase_date
0,1,Alice,alice@example.com,45.445419,2023-01-31
1,2,Bob,bob@example.com,216.656872,2023-02-28
2,3,Charlie,charlie@example.com,106.04978,2023-03-31
3,4,David,david@example.com,437.202585,2023-04-30
4,5,Eve,eve@example.com,33.637118,2023-05-31


In [6]:
# Anonymize Sensitive Data

In [7]:
# Drop or anonymize sensitive columns (e.g., names and emails)
data = data.drop(columns=['name', 'email'])
data['customer_id'] = 'CUST' + data['customer_id'].astype(str)  # Mask customer IDs

# Show the anonymized data
data.head()


Unnamed: 0,customer_id,purchase_amount,purchase_date
0,CUST1,45.445419,2023-01-31
1,CUST2,216.656872,2023-02-28
2,CUST3,106.04978,2023-03-31
3,CUST4,437.202585,2023-04-30
4,CUST5,33.637118,2023-05-31


In [None]:
# Save Anonymized Data for Sharing

In [8]:
# Save anonymized data for sharing with collaborators
data.to_csv('anonymized_data.csv', index=False)
print("Anonymized data saved as 'anonymized_data.csv'")


Anonymized data saved as 'anonymized_data.csv'


In [3]:

# Read and display the contents of the saved CSV file
data = pd.read_csv('anonymized_data.csv')
print(data)

  customer_id  purchase_amount purchase_date
0       CUST1        45.445419    2023-01-31
1       CUST2       216.656872    2023-02-28
2       CUST3       106.049780    2023-03-31
3       CUST4       437.202585    2023-04-30
4       CUST5        33.637118    2023-05-31
5       CUST6       351.912292    2023-06-30
6       CUST7       246.876709    2023-07-31
7       CUST8       313.934855    2023-08-31
8       CUST9       377.118913    2023-09-30
9      CUST10        94.174128    2023-10-31
