In [None]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

# Fixed seed so that Restart Kernel -> Run All regenerates the same dataset.
# Faker, the stdlib `random` module and NumPy's legacy global RNG each keep
# their own state, so all three are seeded.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Initialize Faker for generating realistic names, emails, etc.
fake = Faker()

# Number of samples
n = 1000

In [None]:
# Generate Customer Data
customer_ids = np.arange(1, n + 1)
customers = [fake.name() for _ in range(n)]

# Generate realistic emails with common domains.
# Generated names can contain more than letters and spaces (titles or
# suffixes with a dot, apostrophes, hyphens). Only letters and digits are kept
# for the local part so that an address never ends up like "...jr.@gmail.com".
email_domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com']
email_local_parts = [
    ''.join(ch for ch in name.lower() if ch.isalnum()) for name in customers]
emails = [
    f"{local_part}@{random.choice(email_domains)}" for local_part in email_local_parts]

# Create customer data dictionary
customer_data = {
    'customer_id': customer_ids,
    'customer_name': customers,
    'email': emails,
    'loyalty_points': np.random.randint(0, 1000, n)  # integers in [0, 999]
}

# Create the Customers DataFrame
df_customers = pd.DataFrame(customer_data)

# Introduce Missing Data for `email` (Simulating customers without email).
# Sampling without replacement cannot draw more rows than exist, so the
# count is capped at the table size.
n_missing_emails = min(50, len(df_customers))
missing_email_indices = np.random.choice(
    df_customers.index, size=n_missing_emails, replace=False)
df_customers.loc[missing_email_indices, 'email'] = np.nan

# Introduce Duplicate Data (Duplicate some customers)
# Randomly sample 50 rows to duplicate; `replace=True` means the same
# customer may be picked more than once.
duplicates = df_customers.sample(n=50, replace=True)
# ignore_index gives the combined frame a clean 0..len-1 index instead of
# repeating the index labels of the duplicated rows.
df_customers = pd.concat([df_customers, duplicates], ignore_index=True)

In [None]:
# Generate Product Data (every product name combined with every brand)
product_names = [
    'Laptop', 'Smartphone', 'Tablet', 'Headphones', 'Smartwatch', 'TV', 'Washing Machine',
    'Refrigerator', 'Shoes', 'T-shirt', 'Jeans', 'Jacket', 'Air Conditioner', 'Blender',
    'Microwave', 'Couch', 'Desk Chair', 'Coffee Maker', 'Toaster'
]
# Each brand is listed exactly once: a repeated brand would yield the same
# (product, brand) pair twice, each with its own product_id and price.
product_brands = ['Apple', 'Samsung', 'Nike', 'Sony',
                  'LG', 'Dell', 'Bosch', 'Philips', 'Adidas']

# Cross every product name with every brand -> list of (product, brand) pairs
products = [(product, brand)
            for product in product_names
            for brand in product_brands]
assert len(set(products)) == len(products), "duplicate (product, brand) pair"

# One sequential id per (product, brand) pair, starting at 1
product_ids = np.arange(1, len(products) + 1)

# Random price generation for each product (uniform over [50, 2000))
product_price = np.random.uniform(50, 2000, len(products))

# Random product ratings for each product (uniform over [1, 5))
product_rating = np.random.uniform(1, 5, len(products))

# Creating the DataFrame with the products
product_data = {
    'product_id': product_ids,
    'product_name': [item[0] for item in products],
    'product_brand': [item[1] for item in products],
    'product_price': product_price,
    'product_rating': product_rating
}

df_products = pd.DataFrame(product_data)

# Introduce Missing Data for `product_rating` (Simulating missing product ratings).
# Sampling without replacement cannot draw more rows than exist, so the
# count is capped at the table size.
n_missing_ratings = min(30, len(df_products))
missing_rating_indices = np.random.choice(
    df_products.index, size=n_missing_ratings, replace=False)
df_products.loc[missing_rating_indices, 'product_rating'] = np.nan

In [None]:
# Generate Orders Data (Linking Customers to Products)
# In the case of composite key, we'll use customer_id and product_id together as the primary key.
# To keep that key unique before duplicates are injected on purpose, the base
# orders are drawn as distinct (customer, product) pairs: every integer in
# [0, n_customers * n_products) encodes exactly one pair, and n distinct
# codes are sampled without replacement (requires n <= number of pairs).
n_products = len(product_ids)
pair_codes = np.random.choice(
    len(customer_ids) * n_products, size=n, replace=False)
# Decode each code into positions within customer_ids / product_ids
customer_pos, product_pos = np.divmod(pair_codes, n_products)

# Units per order: integers in [1, 4]
order_quantity = np.random.randint(1, 5, n)

orders_data = {
    'customer_id': customer_ids[customer_pos],
    'product_id': product_ids[product_pos],
    'quantity': order_quantity,
    # product_price is positionally aligned with product_ids, so the total is
    # the ordered product's unit price times the quantity.
    'total_price': product_price[product_pos] * order_quantity,
    'order_status': np.random.choice(['Pending', 'Shipped', 'Delivered', 'Returned'], n),
    'feedback_rating': np.random.choice(['Very Satisfied', 'Satisfied', 'Neutral', 'Unsatisfied'], n, p=[0.4, 0.3, 0.2, 0.1])
}

# `customer_id` and `product_id` together form the composite key
df_orders = pd.DataFrame(orders_data)

# Introduce Missing Data for `order_status` (Simulating missing statuses).
# Sampling without replacement cannot draw more rows than exist, so the
# count is capped at the table size.
n_missing_statuses = min(40, len(df_orders))
missing_status_indices = np.random.choice(
    df_orders.index, size=n_missing_statuses, replace=False)
df_orders.loc[missing_status_indices, 'order_status'] = np.nan

# Introduce Duplicate Data for Orders (Simulate duplicate orders for the same product by the same customer)
order_duplicates = df_orders.sample(n=30, replace=True)  # Duplicate 30 orders
# The original index labels are kept on purpose: duplicated rows share the
# label of the order they were copied from.
df_orders = pd.concat([df_orders, order_duplicates])

In [None]:
# Build the shipping table: one row per shipment, each pointing at an order.
shipping_ids = np.arange(1, n + 1)

# The random draws below are made in column order (order_id, method, cost,
# discount) so the sequence of RNG calls stays fixed.
# NOTE(review): order_id holds index labels of df_orders; no order_id column
# is visible in the orders table here - confirm how consumers join the two.
shipment_order_refs = np.random.choice(df_orders.index, size=n)
shipment_methods = np.random.choice(
    ['Standard', 'Expedited', 'Two-day'], size=n)
shipment_costs = np.random.uniform(low=5, high=50, size=n)
shipment_discounts = np.random.uniform(low=0, high=0.2, size=n)

shipping_data = dict(
    shipping_id=shipping_ids,
    order_id=shipment_order_refs,
    shipping_method=shipment_methods,
    shipping_cost=shipment_costs,
    discount_applied=shipment_discounts,
)
df_shipping = pd.DataFrame(shipping_data)

In [None]:

# Build the customer behavior table: one row per observation, each tied to
# a randomly chosen customer.
behavior_ids = np.arange(1, n + 1)

# Draws are made in column order (customer, abandonment flag, segment) so the
# sequence of RNG calls stays fixed.
observed_customers = np.random.choice(customer_ids, size=n)
abandonment_flags = np.random.choice([True, False], size=n)
segments = np.random.choice(['VIP', 'Regular', 'New User'], size=n)

behavior_data = dict(
    behavior_id=behavior_ids,
    customer_id=observed_customers,
    cart_abandonment=abandonment_flags,
    customer_segment=segments,
)
df_behavior = pd.DataFrame(behavior_data)

In [None]:
# Exporting data to CSV files
output_dir = Path('data')
# to_csv does not create missing directories, so make sure the target exists
# (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

# File name -> table; every table is written without its index column.
csv_exports = {
    'Customers.csv': df_customers,
    'Products.csv': df_products,
    'Orders.csv': df_orders,
    'Shipping.csv': df_shipping,
    'CustomerBehavior.csv': df_behavior,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(output_dir / csv_name, index=False)