Click [here](https://medium.com/@morihosseini/synthetic-data-unleashing-possibilities-42eadd5c2fbb) to access the associated Medium article.

# Faker

In [56]:
!pip install -q faker

In [5]:
from faker import Faker

# Create the generator, then seed Faker's shared random source so the same
# fake record is produced every time the notebook is re-run from a fresh kernel.
faker = Faker()
Faker.seed(42)

# Draw one fake value for each field of a minimal contact record.
name = faker.name()
address = faker.address()
phone_number = faker.phone_number()

# The generated address can contain a line break of its own, so it may print
# on more than one line.
print(f"Name: {name}\nAddress: {address}\nPhone Number: {phone_number}")

Name: Lee Thomas
Address: 3351 Travis Hill
Garciaberg, NE 94330
Phone Number: 595-467-8251x37667


# GPT-4

In [55]:
!pip install -q openai

In [42]:
import os

import openai

# Read the key from the environment so a real secret never has to be pasted
# into the notebook; the original placeholder is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

# Adjacent string literals are joined with nothing in between, so the first
# fragment needs its own trailing space to keep the two sentences separated.
prompt = (
    "Generate a synthetic dataset with 5 records of customer orders. "
    "The dataset should have the following columns: customer_id, "
    "order_id, order_date, product_id, product_name."
)
messages = [{"role": "user", "content": prompt}]

# openai>=1.0 removed `openai.ChatCompletion`; the module-level client
# (`openai.chat.completions`) replaces it and still honours `openai.api_key`.
# The legacy call is kept for environments pinned to openai<1.0.
if hasattr(openai, "OpenAI"):
    response = openai.chat.completions.create(model="gpt-4", messages=messages)
else:
    response = openai.ChatCompletion.create(model="gpt-4", messages=messages)

# Both API generations expose the reply text at the same attribute path.
synthetic_text = response.choices[0].message.content

print(synthetic_text)

customer_id | order_id | order_date | product_id | product_name | product_price
--- | --- | --- | --- | --- | ---
1 | 201001 | 15-03-2022 | P101 | Apple iPhone 13 | 799
2 | 201002 | 16-03-2022 | P102 | Lenovo ThinkPad L340 | 500
3 | 201003 | 17-03-2022 | P103 | Samsung Galaxy S21 | 699
4 | 201004 | 18-03-2022 | P104 | Dell Inspiron 15 | 550
5 | 201005 | 19-03-2022 | P105 | MacBook Pro       | 2400


# Mimicking Data Distribution

In [61]:
!pip install -q numpy

In [64]:
import numpy as np

# Parameters of the normal distribution used as a stand-in for the real data.
mean_real = 0.5
std_real = 0.1
num_samples = 10

# A dedicated, seeded Generator makes the draw repeatable on every run and
# leaves NumPy's global random state untouched for the rest of the notebook.
sampling_rng = np.random.default_rng(42)

# Sample synthetic data with a similar distribution to the real data.
synthetic_data = sampling_rng.normal(mean_real, std_real, size=num_samples)

print(synthetic_data)

[0.49302535 0.39866691 0.42177036 0.3342057  0.63561163 0.69563605
 0.54134615 0.5724373  0.32113683 0.43635857]


# Incorporating Noise

In [73]:
import random

# Baseline values that will be perturbed.
original_data_points = [2, 3, 4, 5]

# A private, seeded generator keeps the jitter reproducible across runs and
# leaves the module-level `random` state untouched.
noise_rng = random.Random(42)

# Shift every point by an independent uniform draw from [-0.1, 0.1].
noisy_data_points = [
    point + noise_rng.uniform(-0.1, 0.1) for point in original_data_points
]

print(noisy_data_points)

[1.9029237560973817, 2.997350308120952, 4.0929803121832435, 4.912912456195437]


# Validating the Quality of Synthetic Data

In [99]:
!pip install -q scipy

In [98]:
from scipy.stats import ttest_ind

# A reference sample and a synthetic sample whose first and last values
# differ from the reference.
real_data = [1, 2, 3, 4, 5]
synthetic_data = [10, 2, 3, 4, 50]

# Independent two-sample t-test: compares the means of the two samples.
ttest_result = ttest_ind(real_data, synthetic_data)
t_statistic = ttest_result.statistic
p_value = ttest_result.pvalue

# Report the outcome against the 5% significance level.
is_significant = p_value < 0.05
print(
    "Significant difference detected!"
    if is_significant
    else "No significant difference detected!"
)

No significant difference detected!


# Protecting Privacy

In [105]:
!pip install -q numpy scikit-learn

In [131]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

sensitive_data = ["John", "Mary", "Chris", "Sarah"]

# Replace each name with an integer class label.
le = LabelEncoder()
sensitive_labels = le.fit_transform(sensitive_data)

# Jitter every label with Gaussian noise (standard deviation 0.5).
label_noise = np.random.normal(scale=0.5, size=len(sensitive_labels))
noisy_labels = sensitive_labels + label_noise

# Shuffle the noisy values in place.
np.random.shuffle(noisy_labels)

# Scale by 0.75, then clamp every value into the [0, 1] interval.
noisy_labels = (noisy_labels * 0.75).clip(0, 1)

print(noisy_labels)

[0.22967237 0.3528305  0.62191547 1.        ]
