# Synthetic Dataset Creation
This notebook generates a synthetic dataset of customer purchases for a retail scenario.
The CSV file has the following columns: Order ID, Customer ID, Product ID, Product Category, Purchase Amount, Purchase Date.


1. Import Necessary Libraries

In [4]:
import csv
import random
import os
from datetime import datetime, timedelta

2. Generate Products

    Create 50 products across 5 categories.

In [2]:
def generate_products():
    """Build the product catalogue: 5 categories with 10 products each.

    Returns:
        A list of 50 dicts, each holding a "Product ID" ("P1" through "P50",
        numbered consecutively in category order) and its "Category".
    """
    category_names = ("Electronics", "Clothing", "Home & Kitchen", "Books", "Sports")
    per_category = 10

    catalogue = []
    for position, name in enumerate(category_names):
        # IDs continue where the previous category stopped (1-10, 11-20, ...).
        first_number = position * per_category + 1
        catalogue.extend(
            {"Product ID": f"P{number}", "Category": name}
            for number in range(first_number, first_number + per_category)
        )
    return catalogue


3. Generate Customers

    Create 500 customer IDs (C1 to C500).

In [5]:
def generate_customers():
    """Return the customer ID strings "C1" through "C500" in ascending order."""
    customer_count = 500
    customer_ids = []
    for number in range(1, customer_count + 1):
        customer_ids.append(f"C{number}")
    return customer_ids

4. Generate Random Dates
    
    Generate random dates between two dates.

In [6]:
def random_date(start_date, end_date):
    """Pick a random day between start_date and end_date, both inclusive.

    The result is start_date shifted forward by a whole number of days drawn
    with random.randint, so any time-of-day carried by start_date is kept.
    """
    span_in_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_in_days))
    return start_date + offset

5. Generate Purchases 

    Generate purchase records containing the order ID, customer ID, product ID, product category, purchase amount, and purchase date.


In [7]:
def generate_purchases(customers, products, start_date, end_date, num_orders=5000):
    """Create line-item purchase records grouped into orders.

    Each order gets one randomly chosen customer, one random date from
    random_date(start_date, end_date), and between 1 and 6 line items. Every
    line item picks a product independently with random.choice, so the same
    product can appear more than once in an order.

    Args:
        customers: Sequence of customer IDs to draw from.
        products: Sequence of dicts with "Product ID" and "Category" keys.
        start_date: Earliest order date, passed to random_date.
        end_date: Latest order date, passed to random_date.
        num_orders: Number of orders to generate (order IDs run O1..O<num_orders>).

    Returns:
        A list with one dict per line item, keyed by "Order ID",
        "Customer ID", "Product ID", "Product Category", "Purchase Amount"
        and "Purchase Date" (formatted as YYYY-MM-DD).
    """
    # (low, high) bounds of the uniform price draw for each category.
    price_bounds = {
        "Electronics": (100, 500),
        "Clothing": (50, 200),
        "Home & Kitchen": (30, 300),
        "Books": (10, 100),
    }
    # Used for "Sports" and for any category not listed above.
    fallback_bounds = (80, 400)

    records = []
    for order_number in range(1, num_orders + 1):
        buyer = random.choice(customers)
        date_text = random_date(start_date, end_date).strftime("%Y-%m-%d")
        item_count = random.randint(1, 6)
        order_label = f"O{order_number}"  # shared by every line item of this order

        for _item in range(item_count):
            item = random.choice(products)
            item_category = item["Category"]
            low, high = price_bounds.get(item_category, fallback_bounds)
            price = round(random.uniform(low, high), 2)

            records.append({
                "Order ID": order_label,
                "Customer ID": buyer,
                "Product ID": item["Product ID"],
                "Product Category": item_category,
                "Purchase Amount": price,
                "Purchase Date": date_text,
            })

    return records

6. Save Dataset to CSV
 
    Save the generated purchase records to a CSV file.

In [8]:
def save_to_csv(purchases, filename):
    """Write purchase records to `filename` as CSV, header row first.

    Args:
        purchases: Iterable of dicts keyed by the column names below.
        filename: Path of the file to create or overwrite.
    """
    columns = [
        "Order ID",
        "Customer ID",
        "Product ID",
        "Product Category",
        "Purchase Amount",
        "Purchase Date",
    ]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in purchases:
            csv_writer.writerow(record)

7. Main Function

    Generate the dataset and save it to a CSV file.

In [9]:
def main(output_path="data/customer_purchases3.csv", seed=None):
    """Generate the synthetic purchase dataset and write it to a CSV file.

    Args:
        output_path: Destination CSV path. Its parent directory is created
            when missing. Defaults to the notebook's original location.
        seed: Optional value passed to random.seed() before generation so
            the same dataset can be reproduced. When None (the default) the
            random module's current state is used unchanged.
    """
    if seed is not None:
        random.seed(seed)

    # Generate base data
    products = generate_products()
    customers = generate_customers()

    # Date range for purchases (entire 2023)
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 12, 31)

    # Create the output directory if needed. A bare filename has no directory
    # part, and os.makedirs("") would raise, so skip it in that case.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Generate purchase records
    purchases = generate_purchases(customers, products, start_date, end_date)

    # Save to CSV; the message reuses the same path so the two cannot disagree.
    save_to_csv(purchases, output_path)
    print(f"Dataset generated and saved to '{output_path}'.")


Dataset generated and saved to 'data/customer_purchases2.csv'.


8. Run the Script

    Execute the main function to generate the dataset.

In [None]:
# Run the generator only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()