In [0]:
%sql
-- Recreate the demo table so every run of the notebook starts from an empty table.
drop table if exists orders_managed;

-- The format is stated explicitly: the delta.* properties below, the Delta append
-- done by the insert cell, and the later OPTIMIZE / DESC HISTORY cells all require
-- a Delta table, so the DDL should not depend on the workspace's default format.
create table orders_managed (
  order_id long,
  sku string,
  product_name string,
  category string,
  quantity long,
  price double
)
using delta
tblproperties(
  -- Both auto-optimize features are switched off for this table
  -- (presumably so the manual OPTIMIZE later in the notebook has work to do -- confirm).
  delta.autoOptimize.optimizeWrite = false,
  delta.autoOptimize.autoCompact = false
)


In [0]:
%sql
-- Inspect the table-level details of the freshly created table.
DESCRIBE DETAIL orders_managed

In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import lit
import random
import string
import time

# Random ASCII-letter string generator
def random_string(length=8):
    """Return a string of ``length`` characters drawn from ``string.ascii_letters``.

    Each character is picked independently with ``random.choice``; a
    non-positive ``length`` yields an empty string.
    """
    alphabet = string.ascii_letters
    chars = [random.choice(alphabet) for _ in range(length)]
    return ''.join(chars)

# Random product category picker
def random_category():
    """Return one of the five fixed product category names, chosen at random."""
    categories = ("Electronics", "Grocery", "Toys", "Sports", "Books")
    return random.choice(categories)

# Random SKU generator
def random_sku():
    """Return a SKU of the form ``SKU-`` followed by six random decimal digits."""
    digits = []
    for _ in range(6):
        digits.append(random.choice(string.digits))
    return "SKU-" + ''.join(digits)

# -------------------------------------------------------------------
# Function to generate random order rows and append them to orders_managed
# -------------------------------------------------------------------

def insert_orders(num_rows=100):
    """Generate ``num_rows`` random orders and append them to ``orders_managed``.

    Expects ``spark`` and ``display`` to be available as notebook globals and
    relies on the ``random_sku``, ``random_string`` and ``random_category``
    helpers defined in this notebook.

    Args:
        num_rows: Number of rows to generate. Zero or a negative value
            produces an empty batch rather than an error.

    Side effects:
        Displays the generated DataFrame, appends it to the ``orders_managed``
        Delta table, and prints the number of rows written.
    """
    # Explicit schema mirroring the table DDL. Without it Spark has to infer
    # the schema from the data, which fails when the batch is empty.
    schema = (
        "order_id long, sku string, product_name string, "
        "category string, quantity long, price double"
    )

    # Read the clock once so the ids of one batch form a contiguous range
    # (base_id + i). Ids are unique within a call; two calls made fewer than
    # num_rows milliseconds apart can still produce overlapping ids.
    base_id = int(time.time() * 1000)

    # Tuples are listed in schema column order.
    rows = [
        (
            base_id + i,
            random_sku(),
            random_string(10),
            random_category(),
            random.randint(1, 10),
            round(random.uniform(10, 500), 2),
        )
        for i in range(num_rows)
    ]

    df = spark.createDataFrame(rows, schema=schema)

    # Show the generated batch before writing
    display(df)

    # Append to Delta table
    df.write.format("delta").mode("append").saveAsTable("orders_managed")

    # Report the number of rows actually built (equals num_rows for num_rows >= 0).
    print(f"{len(rows)} rows inserted into orders_managed.")

# -------------------------------------------------------------------
# Populate orders_managed with one batch of 100 generated rows
# -------------------------------------------------------------------

insert_orders(num_rows=100)


In [0]:
%sql
-- Row count check after the insert.
SELECT COUNT(*) FROM orders_managed;

In [0]:
%sql
-- Column list plus extended table metadata.
DESCRIBE EXTENDED orders_managed

In [0]:
%sql
-- Delta transaction history of the table after the insert.
DESCRIBE HISTORY orders_managed

In [0]:
%sql
-- List the data files backing the table, one row per file with the number of
-- records stored in it. Selecting _metadata.file_path without aggregation
-- returns the same path once per record, which hides the actual file layout.
select _metadata.file_path, count(*) as row_count
from orders_managed
group by _metadata.file_path

In [0]:
%sql
-- Point lookup by order_id. The ids are generated from the wall clock at
-- insert time, so a literal id copied from an earlier run matches nothing
-- after the table is recreated; look up an id that exists in the current data.
select * from orders_managed
where order_id = (select min(order_id) from orders_managed);

In [0]:
# Average price over the whole table, run before OPTIMIZE; the result rows
# are collected to the driver and printed.
query = "select avg(price) as avg_price from orders_managed"
avg_price_df = spark.sql(query)
res = avg_price_df.collect()
print(res)

In [0]:
%sql
-- Transaction history once more, immediately before running OPTIMIZE.
DESCRIBE HISTORY orders_managed;

In [0]:
%sql
-- Manually optimize the table's data files (auto-optimize is disabled in the table properties).
optimize orders_managed;
    

In [0]:
# Post-OPTIMIZE: same average-price aggregate as before, for comparison.
query = "select avg(price) as avg_price from orders_managed"
avg_price_df = spark.sql(query)
res = avg_price_df.collect()
print(res)