In [3]:
import pandas as pd
import random
import faker
from datetime import datetime, timedelta
import os

# Initialize Faker for generating random names
fake = faker.Faker()

# Seed both random sources so that Restart & Run All regenerates the same
# customers, products, stores and order lines. Change SEED to get a
# different (but still repeatable) dataset.
# NOTE: order dates are still bounded by the current date at run time, so
# the OrderDate column can differ between runs made on different days.
SEED = 42
random.seed(SEED)
faker.Faker.seed(SEED)

# Street names used to build customer addresses
street_names = [
    "Main St",
    "Elm St",
    "Maple Ave",
    "Oak St",
    "Pine St",
    "Cedar St",
    "Washington St",
    "Highland Ave",
    "Broadway",
    "Park St",
]

# City/state pairs (MA, NH and MN), expanded from a per-state table into
# one {"City": ..., "State": ...} record per city
cities_states = [
    {"City": city, "State": state}
    for state, cities in (
        ("MA", ("Boston", "Cambridge", "Worcester")),
        ("NH", ("Manchester", "Nashua", "Concord")),
        ("MN", ("Minneapolis", "Saint Paul", "Rochester")),
    )
    for city in cities
]

# Building and electrical installation tools used as product names
tool_names = [
    "Hammer", "Screwdriver Set", "Cordless Drill", "Pliers", "Wrench Set",
    "Tape Measure", "Utility Knife", "Level", "Stud Finder", "Circular Saw",
    "Jigsaw", "Angle Grinder", "Power Sander", "Wire Stripper", "Voltage Tester",
    "Multimeter", "Conduit Bender", "Cable Cutter", "Heat Gun", "Flashlight",
    "Toolbox", "Safety Glasses", "Work Gloves", "Extension Cord", "Ladder",
]

# Generate Customers
def generate_customers(num_customers=100):
    """Build a DataFrame of synthetic customers.

    Each customer gets a sequential ``CustomerID`` starting at 1, a name from
    Faker, an address made of a random house number (1-9999) plus a street
    picked from ``street_names``, and a city/state pair picked from
    ``cities_states``.

    Args:
        num_customers: Number of customer rows to generate.

    Returns:
        pandas.DataFrame with the columns CustomerID, Name, Address, City,
        State and ZipCode (one row per customer).
    """
    customers = []
    for i in range(1, num_customers + 1):
        city_state = random.choice(cities_states)
        customers.append({
            "CustomerID": i,
            "Name": fake.name(),
            "Address": f"{random.randint(1, 9999)} {random.choice(street_names)}",
            "City": city_state["City"],
            "State": city_state["State"],
            # Draw the ZIP code from the selected state's range so that the
            # State and ZipCode columns agree; a plain fake.zipcode() is
            # unrelated to the state picked above.
            "ZipCode": fake.zipcode_in_state(state_abbr=city_state["State"])
        })
    return pd.DataFrame(customers)

# Generate Products
def generate_products(num_products=25):
    """Build a DataFrame of synthetic products.

    Product names are drawn from ``tool_names`` without replacement, so a
    name is never repeated while there are still unused names. Only when
    more products are requested than there are names do repeats appear.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        pandas.DataFrame with the columns ProductID (sequential from 1),
        ProductName and Price (a random value between 5 and 500, rounded to
        two decimals).
    """
    count = max(num_products, 0)
    unique_count = min(count, len(tool_names))

    # Sampling without replacement avoids the same tool being listed under
    # several ProductIDs with different prices.
    names = random.sample(tool_names, unique_count)
    if count > unique_count:
        # Ran out of distinct names: pad the remainder with random repeats.
        names += random.choices(tool_names, k=count - unique_count)

    products = [
        {
            "ProductID": product_id,
            "ProductName": name,
            "Price": round(random.uniform(5, 500), 2)
        }
        for product_id, name in enumerate(names, start=1)
    ]
    return pd.DataFrame(products)

# Generate Stores
def generate_stores(num_stores=10):
    """Build a DataFrame of synthetic stores.

    Every store is placed in a city picked at random from ``cities_states``
    and branded with one of two randomly chosen chain names.

    Args:
        num_stores: Number of store rows to generate.

    Returns:
        pandas.DataFrame with the columns StoreID (sequential from 1),
        StoreName ("<brand> <city>") and Location ("<city>, <state>").
    """
    brands = ["Johnny's Hardware", "ACME"]
    rows = []
    for store_id in range(1, num_stores + 1):
        # Keep the draw order: location first, then brand.
        place = random.choice(cities_states)
        brand = random.choice(brands)
        city, state = place["City"], place["State"]
        rows.append(
            dict(
                StoreID=store_id,
                StoreName=f"{brand} {city}",
                Location=f"{city}, {state}",
            )
        )
    return pd.DataFrame(rows)

# Generate Sales Orders
def generate_sales_orders(num_orders=10000, num_customers=100, num_stores=10):
    """Build a DataFrame of synthetic sales order headers.

    Each order references a random customer and store by ID and carries a
    random date between 2023-01-01 and the day the function is run.

    Args:
        num_orders: Number of order rows to generate.
        num_customers: Upper bound (inclusive) for the random CustomerID.
        num_stores: Upper bound (inclusive) for the random StoreID.

    Returns:
        pandas.DataFrame with the columns OrderID (sequential from 1),
        CustomerID, StoreID and OrderDate (a "YYYY-MM-DD" string).
    """
    first_day = datetime(2023, 1, 1)
    span_days = (datetime.now() - first_day).days

    def _make_order(order_id):
        # Draw order matters for reproducibility: customer, store, date.
        customer_id = random.randint(1, num_customers)
        store_id = random.randint(1, num_stores)
        order_day = first_day + timedelta(days=random.randint(0, span_days))
        return {
            "OrderID": order_id,
            "CustomerID": customer_id,
            "StoreID": store_id,
            "OrderDate": order_day.strftime("%Y-%m-%d"),
        }

    return pd.DataFrame([_make_order(n) for n in range(1, num_orders + 1)])

# Generate Sales Details
def generate_sales_details(num_details=15000, num_orders=10000, num_products=25):
    """Build a DataFrame of synthetic sales order lines.

    Each line references a random order and product by ID and has a random
    quantity from 1 to 10. ``TotalPrice`` is emitted as 0 and is meant to be
    filled in by the caller once product prices are known.

    Args:
        num_details: Number of detail rows to generate.
        num_orders: Upper bound (inclusive) for the random OrderID.
        num_products: Upper bound (inclusive) for the random ProductID.

    Returns:
        pandas.DataFrame with the columns DetailID (sequential from 1),
        OrderID, ProductID, Quantity and TotalPrice.
    """
    line_items = [
        {
            "DetailID": detail_id,
            "OrderID": random.randint(1, num_orders),
            "ProductID": random.randint(1, num_products),
            "Quantity": random.randint(1, 10),
            "TotalPrice": 0,  # placeholder until prices are joined in
        }
        for detail_id in range(1, num_details + 1)
    ]
    return pd.DataFrame(line_items)

# Save DataFrame as line-delimited JSON (one record per line)
def save_as_json(df, filename):
    """Write ``df`` to ``filename`` as JSON with one record per line.

    The file is opened in text write mode, so an existing file at that path
    is overwritten.

    Args:
        df: pandas.DataFrame to serialise.
        filename: Destination path; its directory must already exist.
    """
    with open(filename, "w") as out_file:
        records_text = df.to_json(orient="records", lines=True)
        out_file.write(records_text)


# Generate data
customers = generate_customers()
products = generate_products()
stores = generate_stores()
sales_orders = generate_sales_orders()
sales_details = generate_sales_details()

# Calculate TotalPrice in sales details.
# The price lookup is done column-wise (map + multiply) instead of a
# row-by-row apply, and the result is rounded to cents so that float
# artefacts such as 59.970000000000006 do not end up in the JSON output.
product_prices = products.set_index("ProductID")["Price"].to_dict()
sales_details["TotalPrice"] = (
    sales_details["Quantity"] * sales_details["ProductID"].map(product_prices)
).round(2)
# map() yields NaN for an unknown ProductID; fail loudly rather than export it.
assert sales_details["TotalPrice"].notna().all(), "sales detail references an unknown ProductID"

# Output directory shared by every generated file
directory = "../resources/generated/sales_demo"
if not os.path.exists(directory):
    # Create the directory and all intermediate levels; exist_ok guards
    # against another process creating it between the check and the call.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created.")

# Save each table as <directory>/<table name>.json (one JSON record per line)
for table_name, frame in {
    "customers": customers,
    "products": products,
    "stores": stores,
    "sales_orders": sales_orders,
    "sales_details": sales_details,
}.items():
    save_as_json(frame, f"{directory}/{table_name}.json")

print("Data generation complete. Files saved as single-line JSON.")




Directory '../resources/generated/sales_demo' created.
Data generation complete. Files saved as single-line JSON.


In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session whose warehouse directory
# is ../spark-data.
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .config("spark.sql.warehouse.dir", "../spark-data")
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext

# Show the Spark version and the address of the Spark web UI
print(f"spark {spark.version} {sc.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [21]:
def create_sales_demo_table(table_name, database="sales_demo",
                            data_dir="../../resources/generated/sales_demo"):
    """(Re)create a JSON-backed catalog table for one generated file.

    If ``<database>.<table_name>`` already exists it is dropped first, then
    the table is created over ``<data_dir>/<table_name>.json`` with schema
    inference enabled. Progress is reported with ``print``.

    Args:
        table_name: Unqualified table name; also the base name of the JSON
            file and the table description.
        database: Catalog database/schema that holds the table.
        data_dir: Directory containing the JSON file. The default is the
            relative path this notebook has always used.
            NOTE(review): a relative path appears to be resolved by Spark
            rather than against the notebook's working directory; confirm
            before changing it.

    Returns:
        None.
    """
    qualified_name = f"{database}.{table_name}"

    if spark.catalog.tableExists(qualified_name):
        print(f"DROP TABLE {qualified_name}")
        # IF EXISTS keeps the drop from failing should the table disappear
        # between the existence check and this statement.
        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")

    print(f"CREATE TABLE {qualified_name}")
    spark.catalog.createTable(
        tableName=qualified_name,
        source="json",
        description=table_name,
        path=f"{data_dir}/{table_name}.json",
        inferSchema="true",
        inferTimestamp="true",
        timestampFormat="yyyy-MM-dd[HH:mm:ss.SSSSSS]",
    )

# Make sure the target schema exists; getDatabase raises if it still cannot
# be found after the CREATE.
spark.sql("CREATE SCHEMA IF NOT EXISTS sales_demo")
spark.catalog.getDatabase("sales_demo")

# Register one table per generated JSON file, in the original order
for demo_table in ("customers", "products", "stores", "sales_orders", "sales_details"):
    create_sales_demo_table(demo_table)



DROP TABLE sales_demo.customers
CREATE TABLE sales_demo.customers
DROP TABLE sales_demo.products
CREATE TABLE sales_demo.products
DROP TABLE sales_demo.stores
CREATE TABLE sales_demo.stores
DROP TABLE sales_demo.sales_orders
CREATE TABLE sales_demo.sales_orders
DROP TABLE sales_demo.sales_details
CREATE TABLE sales_demo.sales_details
