In [0]:
%pip install faker
dbutils.library.restartPython()

In [0]:

from faker import Faker
import random
from datetime import datetime, timedelta
import pandas as pd
import uuid
import time

import numpy as np


In [0]:

class SyntheticDataGenerator:
    """Build synthetic e-commerce order rows and modified copies of existing rows.

    The generator holds a ``Faker`` instance for names, addresses and similar
    free-form values, plus a ``reference_data`` dictionary of the fixed
    vocabularies (transaction types, shipping modes, categories, ...) that the
    generated columns are drawn from.
    """

    def __init__(self):
        self.fake = Faker()
        self.reference_data = {
            'TRANSACTION_TYPES': ["DEBIT", "CASH", "PAYMENT", "TRANSFER"],
            'DELIVERY_STATUSES': ["Advance shipping", "Late delivery", "Shipping canceled", "Shipping on time"],
            'CUSTOMER_SEGMENTS': ["Consumer", "Corporate", "Home Office"],
            'MARKETS': ["Africa", "Europe", "LATAM", "Pacific Asia", "USCA"],
            'ORDER_STATUSES': ["COMPLETE", "PENDING", "CLOSED", "PENDING_PAYMENT", "CANCELED",
                               "PROCESSING", "SUSPECTED_FRAUD", "ON_HOLD", "PAYMENT_REVIEW"],
            'SHIPPING_MODES': ["Standard Class", "First Class", "Second Class", "Same Day"],
            'CATEGORIES': {
                1: "Electronics", 2: "Clothing", 3: "Books", 4: "Home & Garden",
                5: "Sports & Outdoors", 6: "Toys & Games", 7: "Health & Beauty",
                8: "Automotive", 9: "Grocery", 10: "Office Supplies"
            },
            'DEPARTMENTS': {
                1: "Sales", 2: "Marketing", 3: "Operations", 4: "Customer Service",
                5: "Warehouse", 6: "IT", 7: "Finance"
            },
            'ORDER_REGIONS': [
                "Southeast Asia", "South Asia", "Oceania", "Eastern Asia", "West Asia",
                "West of USA", "US Center", "West Africa", "Central Africa", "North Africa",
                "Western Europe", "Northern Europe", "Caribbean", "South America", "East Africa",
                "Southern Europe", "East of USA", "Canada", "Southern Africa", "Central Asia",
                "Central America", "Eastern Europe", "South of USA"
            ],
            'PRODUCT_NAMES': {
                "Electronics": ["Laptop", "Smartphone", "Tablet", "Headphones", "Smart Watch"],
                "Clothing": ["T-Shirt", "Jeans", "Dress", "Jacket", "Shoes"],
                "Books": ["Fiction Novel", "Textbook", "Cookbook", "Biography"],
                "Home & Garden": ["Furniture Set", "Garden Tools", "Kitchen Appliance", "Lamp"],
                "Sports & Outdoors": ["Bicycle", "Tent", "Yoga Mat", "Running Shoes"],
                "Toys & Games": ["Board Game", "Action Figure", "Puzzle", "Doll"],
                "Health & Beauty": ["Skincare Set", "Makeup Kit", "Vitamins", "Perfume"],
                "Automotive": ["Car Parts", "Tools", "GPS Device", "Tire"],
                "Grocery": ["Snack Pack", "Beverage Set", "Organic Food", "Coffee"],
                "Office Supplies": ["Desk Chair", "Notebook Set", "Printer", "Pen Set"]
            }
        }

    def generate_order_date(self, year=2018):
        """Return a random timestamp within ``year`` formatted as ``MM/DD/YYYY HH:MM``.

        Args:
            year: Calendar year to draw the timestamp from. Defaults to 2018.

        Returns:
            The timestamp as a string; seconds are drawn but not included in
            the formatted output.
        """
        start = datetime(year, 1, 1, 0, 0, 0)
        end = datetime(year, 12, 31, 23, 59, 59)
        span_seconds = int((end - start).total_seconds())
        offset = random.randint(0, span_seconds)
        return (start + timedelta(seconds=offset)).strftime("%m/%d/%Y %H:%M")

    def generate_shipping_date(self, order_date_str, days):
        """Return the order date shifted forward by ``days`` days.

        Args:
            order_date_str: Order timestamp in ``MM/DD/YYYY HH:MM`` format.
            days: Number of days to add.

        Returns:
            The shipping timestamp in the same ``MM/DD/YYYY HH:MM`` format.

        Raises:
            ValueError: If ``order_date_str`` does not match the expected format.
        """
        order_date = datetime.strptime(order_date_str, "%m/%d/%Y %H:%M")
        return (order_date + timedelta(days=days)).strftime("%m/%d/%Y %H:%M")

    def generate_record(self, i, product_price_override=None, customer_segment_override=None, customer_zipcode_override=None):
        """Generate a single synthetic order record.

        Args:
            i: Sequence number; the order id is ``10000 + i`` and the order
                item id is ``20000 + i``.
            product_price_override: Product price to use instead of a random
                one. Ignored when ``None``.
            customer_segment_override: Customer segment to use instead of a
                random one. Ignored when ``None``.
            customer_zipcode_override: Customer zipcode to use instead of a
                random one. Ignored when ``None``.

        Returns:
            A dict mapping output column names to values for one row.
        """
        # Reference vocabularies used below
        TRANSACTION_TYPES = self.reference_data['TRANSACTION_TYPES']
        CUSTOMER_SEGMENTS = self.reference_data['CUSTOMER_SEGMENTS']
        MARKETS = self.reference_data['MARKETS']
        SHIPPING_MODES = self.reference_data['SHIPPING_MODES']
        CATEGORIES = self.reference_data['CATEGORIES']
        DEPARTMENTS = self.reference_data['DEPARTMENTS']
        ORDER_REGIONS = self.reference_data['ORDER_REGIONS']
        PRODUCT_NAMES = self.reference_data['PRODUCT_NAMES']

        # Customer info using Faker
        customer_id = self.fake.random_int(min=1000, max=9999)
        customer_fname = self.fake.first_name()
        customer_lname = self.fake.last_name()
        customer_email = self.fake.email()
        customer_password = self.fake.password(length=12)
        # Overrides are compared against None (like the price override) so
        # that falsy but explicit values are still honoured.
        if customer_segment_override is not None:
            customer_segment = customer_segment_override
        else:
            customer_segment = random.choice(CUSTOMER_SEGMENTS)
        customer_city = self.fake.city()
        customer_state = self.fake.state()
        customer_country = "United States"
        customer_street = self.fake.street_address()
        if customer_zipcode_override is not None:
            customer_zipcode = customer_zipcode_override
        else:
            customer_zipcode = int(self.fake.zipcode()[:5])

        # Store/Department
        dept_id = random.choice(list(DEPARTMENTS.keys()))
        dept_name = DEPARTMENTS[dept_id]
        latitude = round(self.fake.latitude(), 4)
        longitude = round(self.fake.longitude(), 4)

        # Product
        category_id = random.choice(list(CATEGORIES.keys()))
        category_name = CATEGORIES[category_id]
        product_card_id = self.fake.random_int(min=1000, max=9999)
        product_name = random.choice(PRODUCT_NAMES[category_name])
        product_description = f"{product_name} - High quality {category_name.lower()} product"
        product_image = f"http://example.com/images/product_{product_card_id}.jpg"

        # Use overridden product price if provided, otherwise generate normally
        if product_price_override is not None:
            product_price = product_price_override
        else:
            product_price = round(random.uniform(10.0, 2000.0), 2)

        product_status = random.choice([0, 1])

        # Order
        order_id = 10000 + i
        order_item_id = 20000 + i
        order_customer_id = customer_id
        order_date = self.generate_order_date()

        # Order location
        order_city = self.fake.city()
        order_state = self.fake.state()
        order_country = "United States"
        order_region = random.choice(ORDER_REGIONS)
        order_zipcode = int(self.fake.zipcode()[:5])
        market = random.choice(MARKETS)

        # Order details: the discount is at most 30% of the unit price
        order_item_quantity = random.randint(1, 10)
        order_item_product_price = product_price
        order_item_discount = round(random.uniform(0, product_price * 0.3), 2)
        # Guard against division by zero when an override supplies a non-positive price
        order_item_discount_rate = round(order_item_discount / product_price, 4) if product_price > 0 else 0
        order_item_total = round((product_price - order_item_discount) * order_item_quantity, 2)

        # Profit
        sales = order_item_total
        order_item_profit_ratio = round(random.uniform(0.1, 0.4), 4)
        order_profit_per_order = round(order_item_total * order_item_profit_ratio, 2)
        benefit_per_order = order_profit_per_order
        sales_per_customer = round(random.uniform(100, 5000), 2)

        # Shipping: scheduled days depend on the shipping mode
        shipping_mode = random.choice(SHIPPING_MODES)
        if shipping_mode == "Same Day":
            days_scheduled = 1
        elif shipping_mode == "First Class":
            days_scheduled = random.randint(2, 3)
        elif shipping_mode == "Second Class":
            days_scheduled = random.randint(4, 6)
        else:
            days_scheduled = random.randint(5, 10)

        # Actual days vary from one day early to three days late, never below 1
        days_actual = max(1, days_scheduled + random.randint(-1, 3))

        late_delivery_risk = 1 if days_actual > days_scheduled else 0

        # NOTE(review): "Advance shipping" is assigned at random among non-late
        # orders, independent of whether days_actual < days_scheduled — confirm
        # this is intended.
        if late_delivery_risk == 1:
            delivery_status = "Late delivery"
        elif random.random() < 0.1:
            delivery_status = random.choice(["Shipping canceled", "Advance shipping"])
        else:
            delivery_status = "Shipping on time"

        shipping_date = self.generate_shipping_date(order_date, days_actual)

        # Order status follows from the delivery status
        if delivery_status == "Shipping canceled":
            order_status = "CANCELED"
        elif delivery_status == "Late delivery":
            order_status = random.choice(["PENDING", "PROCESSING", "ON_HOLD"])
        else:
            # "COMPLETE" is listed three times to weight it more heavily
            order_status = random.choice(["COMPLETE", "COMPLETE", "COMPLETE", "PENDING", "CLOSED"])

        transaction_type = random.choice(TRANSACTION_TYPES)

        # Return dictionary for DataFrame
        return {
            'Type': transaction_type,
            'Days for shipping (real)': days_actual,
            'Days for shipment (scheduled)': days_scheduled,
            'Benefit per order': benefit_per_order,
            'Sales per customer': sales_per_customer,
            'Delivery Status': delivery_status,
            'Late_delivery_risk': late_delivery_risk,
            'Category Id': category_id,
            'Category Name': category_name,
            'Customer City': customer_city,
            'Customer Country': customer_country,
            'Customer Email': customer_email,
            'Customer Fname': customer_fname,
            'Customer Id': customer_id,
            'Customer Lname': customer_lname,
            'Customer Password': customer_password,
            'Customer Segment': customer_segment,
            'Customer State': customer_state,
            'Customer Street': customer_street,
            'Customer Zipcode': customer_zipcode,
            'Department Id': dept_id,
            'Department Name': dept_name,
            'Latitude': latitude,
            'Longitude': longitude,
            'Market': market,
            'Order City': order_city,
            'Order Country': order_country,
            'Order Customer Id': order_customer_id,
            'order date (DateOrders)': order_date,
            'Order Id': order_id,
            'Order Item Cardprod Id': product_card_id,
            'Order Item Discount': order_item_discount,
            'Order Item Discount Rate': order_item_discount_rate,
            'Order Item Id': order_item_id,
            'Order Item Product Price': order_item_product_price,
            'Order Item Profit Ratio': order_item_profit_ratio,
            'Order Item Quantity': order_item_quantity,
            'Sales': sales,
            'Order Item Total': order_item_total,
            'Order Profit Per Order': order_profit_per_order,
            'Order Region': order_region,
            'Order State': order_state,
            'Order Status': order_status,
            'Order Zipcode': order_zipcode,
            'Product Card Id': product_card_id,
            'Product Category Id': category_id,
            'Product Description': product_description,
            'Product Image': product_image,
            'Product Name': product_name,
            'Product Price': product_price,
            'Product Status': product_status,
            'shipping date (DateOrders)': shipping_date,
            'Shipping Mode': shipping_mode
        }

    def modify_record(self, record):
        """Return a copy of ``record`` with three fields changed.

        The copy gets a flipped ``Product Status`` (0 becomes 1, anything else
        becomes 0), a ``Customer Segment`` different from the current one, and
        a freshly drawn integer ``Customer Zipcode``. ``record`` itself is not
        modified.

        Args:
            record: A dict or pandas Series exposing the keys
                ``'Product Status'`` and ``'Customer Segment'``.

        Returns:
            The modified copy, of the same type ``record.copy()`` returns.
        """
        new_product_status = 1 if record['Product Status'] == 0 else 0
        other_segments = [
            seg for seg in self.reference_data['CUSTOMER_SEGMENTS']
            if seg != record['Customer Segment']
        ]
        new_customer_segment = random.choice(other_segments)
        # Drawn once and stored as an int, matching generate_record
        new_customer_zipcode = int(self.fake.zipcode()[:5])

        modified_record = record.copy()
        modified_record['Customer Segment'] = new_customer_segment
        modified_record['Customer Zipcode'] = new_customer_zipcode
        modified_record['Product Status'] = new_product_status
        return modified_record

    def generate_synthetic_data(self, num_records=1000):
        """Generate ``num_records`` synthetic records as a DataFrame.

        Args:
            num_records: Number of rows to generate; rows are numbered from 1.

        Returns:
            A DataFrame with one row per generated record.
        """
        print(f"Generating {num_records} base records...")
        data = [self.generate_record(i) for i in range(1, num_records + 1)]
        return pd.DataFrame(data)

    def generate_modified_data(self, original_df, num_modified_records=10):
        """Modify the rows of randomly sampled customers from ``original_df``.

        Distinct non-null ``Customer Id`` values are sampled without
        replacement, and every row belonging to a sampled customer is passed
        through ``modify_record``.

        NOTE(review): because all rows of a sampled customer are modified, the
        result can contain more than ``num_modified_records`` rows, and rows of
        the same customer receive independently drawn segment/zipcode values —
        confirm this is intended.

        Args:
            original_df: DataFrame containing at least the columns
                ``'Customer Id'``, ``'Customer Segment'`` and
                ``'Product Status'``.
            num_modified_records: Number of distinct customers to sample. It is
                capped at the number of distinct customers available.

        Returns:
            A DataFrame of modified rows. When no customer can be sampled, an
            empty DataFrame with the columns of ``original_df``.
        """
        print(f"Generating {num_modified_records} modified records...")
        customers = original_df['Customer Id'].dropna().unique()
        # Sampling without replacement fails if more items are requested than
        # exist, so cap the request at the available population.
        sample_size = max(0, min(num_modified_records, len(customers)))
        if sample_size == 0:
            return original_df.iloc[0:0].copy()

        sample_customers = np.random.choice(customers, sample_size, replace=False)
        sample_df = original_df.loc[original_df['Customer Id'].isin(sample_customers)]
        modified_records = [self.modify_record(record) for _, record in sample_df.iterrows()]
        return pd.DataFrame(modified_records)

    def save_data(self, df, output_path="/Volumes/ecomm_e2e/ecomm_raw/raw_volume/raw_data/synthetic_incremental_load.csv"):
        """Write ``df`` to ``output_path`` as CSV with a header row and no index column.

        Args:
            df: DataFrame to write.
            output_path: Destination file path.
        """
        df.to_csv(output_path, index=False, header=True)


def main():
    """Produce one base file and one modified-records file sharing a UUID prefix.

    Generates 1000 synthetic rows, reads the DataCo supply-chain CSV, derives
    modified rows for 5 sampled customers from it, and writes both results to
    the raw-data volume. The second file is written 30 seconds after the first
    (presumably so the two land as separate increments — confirm).
    """
    gen = SyntheticDataGenerator()
    base_df = gen.generate_synthetic_data(1000)

    source_df = pd.read_csv(
        '/Volumes/ecomm_e2e/ecomm_raw/raw_volume/raw_data/DataCoSupplyChainDataset.csv',
        encoding='latin1',
    )
    changed_df = gen.generate_modified_data(original_df=source_df, num_modified_records=5)

    # Both output files share the same random identifier
    idx = uuid.uuid4()
    base_path = f"/Volumes/ecomm_e2e/ecomm_raw/raw_volume/raw_data/{idx}_synthetic_incremental_load_.csv"
    scd2_path = f"/Volumes/ecomm_e2e/ecomm_raw/raw_volume/raw_data/{idx}_synthetic_incremental_load_scd2.csv"

    gen.save_data(base_df, output_path=base_path)
    time.sleep(30)
    gen.save_data(changed_df, output_path=scd2_path)

    print("Data generation completed")


# Entry point: run the generation only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()