# Setup Environment for Lakeflow Declarative Pipelines

This notebook sets up the environment for the Lakeflow Declarative Pipelines course.

It will:
- Create catalog: `cetpa_external_catalog`
- Create schema: `ldp_schema`
- Create volume: `raw` within the schema
- Create folders (customers, orders, status) in the volume
- Generate sample JSON data files in the volume


In [0]:
# Unity Catalog object names used throughout this notebook.
# The %sql cells spell these names out literally, so keep both in sync.
CATALOG_NAME = 'cetpa_external_catalog'
SCHEMA_NAME = 'ldp_schema'
VOLUME_NAME = 'raw'

# Root path of the volume: /Volumes/<catalog>/<schema>/<volume>
VOLUME_PATH = '/'.join(['/Volumes', CATALOG_NAME, SCHEMA_NAME, VOLUME_NAME])

# Echo the configuration so the targets can be confirmed before anything is created
print('Catalog: ' + CATALOG_NAME)
print('Schema: ' + SCHEMA_NAME)
print('Volume: ' + VOLUME_NAME)
print('Volume Path: ' + VOLUME_PATH)


## Step 1: Create Catalog


In [0]:
%sql
-- Create the catalog used by this notebook if it doesn't exist yet.
-- IF NOT EXISTS turns the statement into a no-op when the catalog is already there.
-- The name is written literally and matches CATALOG_NAME in the configuration cell.
 CREATE CATALOG IF NOT EXISTS cetpa_external_catalog;


## Step 2: Create Schema


In [0]:
%sql
-- Create the schema inside the catalog if it doesn't exist yet.
-- The two-part name matches CATALOG_NAME and SCHEMA_NAME in the configuration cell.
CREATE SCHEMA IF NOT EXISTS cetpa_external_catalog.ldp_schema;


## Step 3: Create Volume


In [0]:
%sql
-- Create the volume inside the schema if it doesn't exist yet.
-- The three-part name matches CATALOG_NAME, SCHEMA_NAME and VOLUME_NAME in the
-- configuration cell, which is what VOLUME_PATH is built from.
CREATE VOLUME IF NOT EXISTS cetpa_external_catalog.ldp_schema.raw;


## Step 4: Create Directories in Volume


In [0]:
def create_directory_in_volume(volume_path: str, folder_names: list):
    '''
    Creates multiple directories in the specified volume path using dbutils.fs.

    Each target path is probed with dbutils.fs.ls. If the listing succeeds the
    directory is left untouched; if it fails with an ordinary error the
    directory is assumed to be missing and is created with dbutils.fs.mkdirs.

    Parameters:
    - volume_path (str): The base volume path
    - folder_names (list): A list of folder names to create

    Returns:
    - None. Progress is reported with print().
    '''
    print('----------------------------------------------------------------------------------------')
    for folder in folder_names:
        folder_path = f'{volume_path}/{folder}'
        try:
            # Listing the path is the existence check
            dbutils.fs.ls(folder_path)
        except Exception:
            # Only ordinary errors count as "missing". KeyboardInterrupt and
            # SystemExit are not caught, so cancelling the cell really stops it.
            dbutils.fs.mkdirs(folder_path)
            print(f'Creating folder: {folder_path}')
        else:
            print(f'Directory {folder_path} already exists. No action taken.')
    print('----------------------------------------------------------------------------------------\n')

# Provision one landing folder per source dataset inside the volume
create_directory_in_volume(
    volume_path=VOLUME_PATH,
    folder_names=['customers', 'orders', 'status'],
)


## Step 5: Delete Existing Files (if resetting)


In [0]:
def delete_source_files(source_path: str):
    """
    Deletes every entry directly under the specified source directory.

    The directory is listed with os.listdir (entries are processed in sorted
    order) and each entry is removed with dbutils.fs.rm. Sub-directories are
    removed recursively so that a reset does not fail partway on a non-empty
    folder. The source directory itself is kept.

    Parameters:
    - source_path: The path to the volume folder containing the files to delete

    Returns:
    - None. Progress is reported with print().
    """
    import os

    print(f'\nSearching for files in {source_path} to delete...')
    # A missing directory is handled the same way as an empty one
    entries = sorted(os.listdir(source_path)) if os.path.exists(source_path) else []

    if not entries:
        print(f'No files found in {source_path}.\n')
        return

    for entry in entries:
        file_to_delete = os.path.join(source_path, entry)
        print(f'Deleting file: {file_to_delete}')
        if os.path.isdir(file_to_delete):
            # rm needs recurse=True (second argument) to remove a directory with contents
            dbutils.fs.rm(file_to_delete, True)
        else:
            dbutils.fs.rm(file_to_delete)

# Reset step: empty each data folder so the sample files are regenerated from scratch
for folder in ('customers', 'orders', 'status'):
    delete_source_files(f'{VOLUME_PATH}/{folder}/')


## Step 6: Create Sample JSON Data Files

This step creates sample JSON files programmatically based on the course data structure. The files are created directly in your volume without needing to copy from external sources.


In [0]:
def _build_sample_orders(rng, count=174):
    """Build `count` order records spaced five minutes apart, cycling through 50 customer ids."""
    from datetime import datetime, timedelta

    base_date = datetime(2024, 1, 15, 10, 0, 0)
    return [
        {
            "order_id": 75000 + i,
            "order_timestamp": (base_date + timedelta(minutes=i * 5)).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "customer_id": 1000 + (i % 50),  # Cycle through 50 customers
            "notifications": rng.choice([True, False]),
        }
        for i in range(count)
    ]


def _build_sample_status(orders):
    """Build four status updates for each of the first 50 orders plus five extra 'canceled' records."""
    from datetime import datetime, timedelta

    ts_format = "%Y-%m-%dT%H:%M:%SZ"
    statuses = []

    # placed -> preparing -> on the way -> delivered, two hours apart
    for order in orders[:50]:
        placed_at = datetime.strptime(order["order_timestamp"], ts_format)
        for idx, status in enumerate(['placed', 'preparing', 'on the way', 'delivered']):
            statuses.append({
                "order_id": order["order_id"],
                "order_status": status,
                "status_timestamp": (placed_at + timedelta(hours=idx * 2)).strftime(ts_format),
            })

    # Every tenth order (indexes 0, 10, 20, 30, 40) also gets a 'canceled' record one hour in
    for i in range(5):
        order = orders[i * 10]
        placed_at = datetime.strptime(order["order_timestamp"], ts_format)
        statuses.append({
            "order_id": order["order_id"],
            "order_status": "canceled",
            "status_timestamp": (placed_at + timedelta(hours=1)).strftime(ts_format),
        })

    return statuses


def _build_sample_customers(rng, count=939):
    """Build `count` customer records with random names and addresses, all with operation 'NEW'."""
    from datetime import datetime, timedelta

    first_names = ["John", "Jane", "Bob", "Alice", "Charlie", "Diana", "Eve", "Frank", "Grace", "Henry",
                   "Ivy", "Jack", "Kate", "Liam", "Mary", "Noah", "Olivia", "Paul", "Quinn", "Rachel"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
                  "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez", "Moore", "Martin", "Jackson", "Thompson", "White"]
    cities_states = [
        ("New York", "NY"), ("Los Angeles", "CA"), ("Chicago", "IL"), ("Houston", "TX"),
        ("Phoenix", "AZ"), ("Philadelphia", "PA"), ("San Antonio", "TX"), ("San Diego", "CA"),
        ("Dallas", "TX"), ("San Jose", "CA"), ("Austin", "TX"), ("Jacksonville", "FL"),
        ("Fort Worth", "TX"), ("Columbus", "OH"), ("Charlotte", "NC"), ("San Francisco", "CA"),
        ("Indianapolis", "IN"), ("Seattle", "WA"), ("Denver", "CO"), ("Boston", "MA")
    ]
    street_names = ["Main", "Oak", "Pine", "Elm", "Maple", "Park", "First", "Second", "Third", "Fourth"]

    base_customer_date = datetime(2024, 1, 15, 10, 0, 0)
    customers = []
    for i in range(count):
        first_name = rng.choice(first_names)
        last_name = rng.choice(last_names)
        city, state = rng.choice(cities_states)
        street_num = rng.randint(100, 9999)
        street = rng.choice(street_names)
        customers.append({
            "customer_id": 1000 + i,
            "name": f"{first_name} {last_name}",
            # The index suffix keeps e-mail addresses unique even when names repeat
            "email": f"{first_name.lower()}.{last_name.lower()}{i}@example.com",
            "address": f"{street_num} {street} St",
            "city": city,
            "state": state,
            "operation": "NEW",  # First file has all NEW operations
            "timestamp": (base_customer_date + timedelta(seconds=i * 10)).strftime("%Y-%m-%dT%H:%M:%SZ"),
        })
    return customers


def _write_json_lines(path, records):
    """Serialize `records` as newline-delimited JSON and write them to `path` with dbutils.fs.put."""
    import json

    content = '\n'.join(json.dumps(record) for record in records)
    dbutils.fs.put(path, content, overwrite=True)


def create_sample_json_files(seed=None):
    """
    Create sample JSON files based on the course data structure.

    Three newline-delimited JSON files (one object per line) are written under
    VOLUME_PATH:
    - orders/00.json: 174 records (order_id, order_timestamp, customer_id, notifications)
    - status/00.json: status records for the first 50 orders plus five
      'canceled' records (order_id, order_status, status_timestamp)
    - customers/00.json: 939 records (customer_id, name, email, address, city,
      state, operation, timestamp)

    Parameters:
    - seed: Optional seed for the random fields (notifications, names, addresses).
      When None (the default) the shared `random` module state is used, as before;
      pass a value to get identical files on every run.

    Returns:
    - True when all three files were written, False when writing failed
      (the error and its traceback are printed).
    """
    import random
    import traceback

    print("\n----------------Creating sample JSON files----------------")

    # A private generator is only used when a seed is requested, so the default
    # call keeps drawing from the module-level generator.
    rng = random if seed is None else random.Random(seed)

    sample_orders = _build_sample_orders(rng)
    sample_status = _build_sample_status(sample_orders)
    sample_customers = _build_sample_customers(rng)

    # Write files using dbutils.fs.put (works with volumes, no DBFS root needed)
    try:
        datasets = (
            ('orders', sample_orders),
            ('status', sample_status),
            ('customers', sample_customers),
        )
        for label, records in datasets:
            target_file = f'{VOLUME_PATH}/{label}/00.json'
            _write_json_lines(target_file, records)
            print(f'✅ Created {label} file: {target_file} ({len(records)} records)')
        return True
    except Exception as e:
        # Best effort: report the failure and let the caller print manual instructions
        print(f'❌ Error creating sample files: {e}')
        print('Note: You may need to create JSON files manually.')
        traceback.print_exc()
        return False

# Generate the sample JSON files and report the outcome.
# The status markers below were mis-decoded UTF-8 (mojibake) and are restored to the intended emoji.
print('\n📝 Creating sample JSON files for the course...')
sample_created = create_sample_json_files()

if sample_created:
    print('\n✅ Successfully created all sample JSON files!')
    print(f'\nFiles created in: {VOLUME_PATH}')
    print('  - orders/00.json (174 orders)')
    print('  - status/00.json (multiple status records)')
    print('  - customers/00.json (939 customers)')
else:
    # Fall back to manual instructions when the files could not be written
    print('\n❌ Could not create sample files automatically.')
    print('\nPlease manually add JSON files to:')
    print(f'  - {VOLUME_PATH}/customers/')
    print(f'  - {VOLUME_PATH}/orders/')
    print(f'  - {VOLUME_PATH}/status/')


## Step 7: Verify Setup


In [0]:
%sql
-- Verify catalog exists: lists catalogs whose name matches the pattern.
-- An empty result means the catalog is not visible to the current user.
SHOW CATALOGS LIKE 'cetpa_external_catalog';


In [0]:
%sql
-- Verify schema exists: lists schemas in the catalog whose name matches the pattern.
-- An empty result means the schema is missing from cetpa_external_catalog.
SHOW SCHEMAS IN cetpa_external_catalog LIKE 'ldp_schema';


In [0]:
%sql
-- Verify volume exists: lists volumes in the schema whose name matches the pattern.
-- An empty result means the volume is missing from cetpa_external_catalog.ldp_schema.
SHOW VOLUMES IN cetpa_external_catalog.ldp_schema LIKE 'raw';


In [0]:
# Walk the three data folders and report how many files each one holds
print('\nVerifying volume structure:')
print(f'Volume path: {VOLUME_PATH}')
print('\nFolders:')
for folder in ('customers', 'orders', 'status'):
    folder_path = '/'.join((VOLUME_PATH, folder))
    try:
        files = dbutils.fs.ls(folder_path)
        # Sub-directories are ignored; only plain files are counted
        file_list = [entry.name for entry in files if not entry.isDir()]
        print(f'  {folder}/: {len(file_list)} file(s)')
        # Preview at most the first three file names, in sorted order
        for file in sorted(file_list)[:3]:
            print(f'    - {file}')
    except Exception as e:
        # A failed listing is reported per folder instead of stopping the check
        print(f'  {folder}/: NOT FOUND or ERROR - {e}')


## Setup Complete!

Your environment is now configured with:
- Catalog: `cetpa_external_catalog`
- Schema: `ldp_schema`
- Volume: `raw` at `/Volumes/cetpa_external_catalog/ldp_schema/raw`
- Folders: `customers`, `orders`, `status`

All tables will be created in the `cetpa_external_catalog.ldp_schema` schema.

You can now proceed with the course notebooks.
