In [1]:
# ----- CONFIG --
# Paths are relative to the notebook's working directory.
# JSON file with the base sales records; they are kept in the output and also
# reused as templates for the generated records.
SALES_INPUT_PATH = "../data/sales_data.json"
# JSON file with customer records; their "customer_id" values are the ids
# assigned to generated sales.
CUSTOMERS_INPUT_PATH = "../data/customer_data.json"
# Destination JSON file for the base records plus the generated ones.
OUTPUT_PATH = "../data/raw/sales_data_scaled.json"
# Number of records to GENERATE. They are appended to the base records, so the
# output holds len(base records) + TARGET_RECORDS entries.
TARGET_RECORDS = 10   # currently a small value; the original note suggests 500–1000 for a scaled dataset

In [2]:
# ----- IMPORTS --
import json
import os
import random
import uuid
from datetime import datetime, timedelta

In [3]:
# ----- LOAD DATA --

# Read both inputs as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII JSON text.
with open(SALES_INPUT_PATH, "r", encoding="utf-8") as f:
    # Base sales records: copied into the output and sampled as templates
    # by the generation cell.
    base_sales = json.load(f)

with open(CUSTOMERS_INPUT_PATH, "r", encoding="utf-8") as f:
    # Customer records; the visible cells only read their "customer_id" key.
    customers = json.load(f)

In [4]:
# Generate new sales only for valid customer_ids:
# a customer counts as valid when its "customer_id" is present and truthy
# (missing, None, empty and 0 ids are all dropped).
valid_customers = list(
    filter(lambda entry: entry.get("customer_id"), customers)
)
# Ids that generated sales may reference.
customer_ids = [entry["customer_id"] for entry in valid_customers]
# Region labels assigned at random to generated sales.
regions = ["North", "South", "East", "West"]

In [None]:
# ----- HELPER FUNCTIONS
def generate_transaction_id(i):
    """Build a transaction id from an integer sequence number.

    Args:
        i: Integer sequence number.

    Returns:
        The letter "T" followed by ``i`` zero-padded to at least six digits,
        e.g. ``7 -> "T000007"``. Numbers wider than six digits are not
        truncated.
    """
    padded_number = format(i, "06d")
    return "T" + padded_number

def random_date(start="2023-01-01", end="2023-12-31"):
    """Return a random timestamp string between two calendar dates.

    Args:
        start: First eligible day, formatted as ``YYYY-MM-DD``.
        end: Last eligible day, formatted as ``YYYY-MM-DD``.

    Returns:
        A ``"YYYY-MM-DD HH:MM:SS"`` string. The day is drawn from ``start``
        through ``end`` inclusive and the time of day from 00:00:00 through
        23:59:59, so the result may fall anywhere on the ``end`` day.

    Raises:
        ValueError: If either date does not match ``YYYY-MM-DD``, or if
            ``end`` is earlier than ``start`` (empty range for the draw).
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.strptime(start, day_format)
    last_day = datetime.strptime(end, day_format)
    span_days = (last_day - first_day).days

    # Draw order (day offset first, then second-of-day) is kept stable so a
    # seeded `random` yields the same timestamps.
    offset = timedelta(
        days=random.randint(0, span_days),
        seconds=random.randint(0, 86399),  # 86400 seconds in a day, zero-based
    )
    moment = first_day + offset
    return moment.strftime("%Y-%m-%d %H:%M:%S")


In [6]:
# ----- GENERATING ADDITIONAL SALES

# Start from the base records so the original transactions are kept in every run.
# .copy() is shallow, which is enough here: the loop only appends new dicts and
# never modifies a base record.
scaled_sales = base_sales.copy()

# Generated transaction numbers continue after the base records.
# NOTE(review): assumes the base ids already occupy T000001..T<len(base)>, so
# generated ids do not collide with them — confirm against the input file.
start_index = len(scaled_sales) + 1

# Candidate pool for "customer_id": every valid id plus two None entries, so
# some generated rows carry a null customer. The pool does not change between
# iterations, so it is built once instead of on every pass through the loop.
customer_id_pool = customer_ids + [None] * 2

# NOTE(review): no random seed is set in the visible cells, so every run writes
# different synthetic rows — confirm that this is intended.
for i in range(TARGET_RECORDS):
    # Each generated sale borrows its product from a randomly chosen base sale.
    template = random.choice(base_sales)
    template_product = template["product"]

    record = {
        "transaction_id": generate_transaction_id(start_index + i),
        "customer_id": random.choice(customer_id_pool),  # introduce some nulls
        "product": {
            "id": template_product["id"],
            "name": template_product["name"],
            "category": template_product["category"],
            # Jitter the template price by up to ±10%, rounded to 2 decimals.
            "price": round(
                template_product["price"] * random.uniform(0.9, 1.1), 2
            )
        },
        "quantity": random.choice([1, 2, 3, 4, 5, -1]),  # intentional bad data (-1)
        "discount": random.choice([0, 0.05, 0.1, 0.15, None]),  # None = missing discount
        "date": random_date(),
        "region": random.choice(regions)
    }

    scaled_sales.append(record)

In [None]:
# ----- WRITE TO FINAL FILE
# Create the output directory first: open(..., "w") raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
# `or "."` covers an OUTPUT_PATH that has no directory component.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)

# Explicit encoding keeps the written file independent of the platform locale.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(scaled_sales, f, indent=4)

print(f"Generated {len(scaled_sales)} sales records → {OUTPUT_PATH}")


Generated 15 sales records → ../data/raw/sales_data_scaled.json
