In [None]:
%%configure -f 
{
  "defaultLakehouse": {
    "name": "<Lakehouse-Name>",
    "id": "<Lakehouse-ID>",
    "workspaceId": "<Workspace-ID>"
  }
}

In [None]:
%pip install -U openai

In [None]:
import openai
import os
import pandas as pd
import json
import re
import time
import requests
from pyspark.sql import SparkSession

In [None]:
# Set up OpenAI: API version exported through the environment
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# OUTPUT configuration (where to save results)
output_workspace_id = '<WorkspaceId>'  # Change this to your output workspace ID
output_lakehouse_id = '<LakehouseId>'  # Change this to your output lakehouse ID
output_table_name = "<OutputTableName>"  # Name of the Delta table that receives the results
output_csv_full_path = "abfss://<>@onelake.dfs.fabric.microsoft.com/<LHName>.Lakehouse/Files/<Name>.csv" # Change this to your desired output CSV path in Fabric LH

# Table selection - modify these variables to change processing behavior.
# tables_to_process can be: "table_name", ["table1", "table2"], or "ALL".
# Use the plain string "ALL" (not a list) to select every table: the
# reporting code further down compares against that exact value.
tables_to_process = "ALL"
tables_to_skip = []  # List of tables to skip: ["temp_table", "staging_data"]

In [None]:
# Resolve the attached (default) lakehouse for this session.
# NOTE: any failure is only printed; input_workspace_id / input_lakehouse_id
# are then left unassigned by this cell.
try:
    # The Spark catalog reports the database/lakehouse currently in use
    current_db = spark.catalog.currentDatabase()
    print(f"Current database/lakehouse: {current_db}")

    # Workspace and lakehouse IDs come from the notebook runtime context
    # (the values supplied through the %%configure cell above)
    input_workspace_id = notebookutils.runtime.context.get('defaultLakehouseWorkspaceId')
    input_lakehouse_id = notebookutils.runtime.context.get('defaultLakehouseId')

    print(
        "Using attached lakehouse:",
        f"  Workspace ID: {input_workspace_id}",
        f"  Lakehouse ID: {input_lakehouse_id}",
        sep="\n",
    )

except Exception as e:
    print(f"Error getting attached lakehouse info: {e}")

In [None]:
# Helper functions required for the metadata generator
def _fallback_display_name(kind, item_id):
    """Placeholder name used when the real display name cannot be retrieved.

    Returns "<kind>_<first 8 chars of the id>", or "Unknown_<kind>" when no
    id is available (None / empty), so a missing id never raises.
    """
    if not item_id:
        return f"Unknown_{kind}"
    return f"{kind}_{str(item_id)[:8]}"


def _fetch_display_name(kind, url, headers, item_id, timeout):
    """GET one Fabric item and return its 'displayName', or the fallback name on any failure."""
    fallback = _fallback_display_name(kind, item_id)
    label = kind.lower()
    if not item_id:
        # Without an id the URL would be meaningless; skip the round trip.
        print(f"  Warning: Could not get {label} name (no {label} id available)")
        return fallback
    try:
        # A timeout keeps an unresponsive endpoint from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            name = response.json().get('displayName', fallback)
            print(f"  Retrieved {label} name: {name}")
            return name
        print(f"  Warning: Could not get {label} name (status: {response.status_code})")
    except Exception as e:
        print(f"  Warning: Error getting {label} name: {e}")
    return fallback


def get_workspace_and_lakehouse_names(workspace_id, lakehouse_id, request_timeout=30):
    """Get workspace and lakehouse names from their IDs using Fabric REST APIs.

    Args:
        workspace_id: Fabric workspace id (string); may be None.
        lakehouse_id: Fabric lakehouse id (string); may be None.
        request_timeout: Seconds allowed for each HTTP request.

    Returns:
        (workspace_name, lakehouse_name). Each name falls back to
        "Workspace_<id[:8]>" / "Lakehouse_<id[:8]>" (or "Unknown_Workspace" /
        "Unknown_Lakehouse" when the id is missing) if the token cannot be
        acquired, the request fails, or the response is not HTTP 200.
        This function never raises.
    """
    try:
        auth_token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/")
        headers = {
            'Authorization': f'Bearer {auth_token}',
            'Content-Type': 'application/json'
        }
        workspace_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}"
        lakehouse_url = f"{workspace_url}/lakehouses/{lakehouse_id}"
        workspace_name = _fetch_display_name(
            "Workspace", workspace_url, headers, workspace_id, request_timeout)
        lakehouse_name = _fetch_display_name(
            "Lakehouse", lakehouse_url, headers, lakehouse_id, request_timeout)
        return workspace_name, lakehouse_name
    except Exception as e:
        print(f"Error getting workspace/lakehouse names via API: {e}")
        return (
            _fallback_display_name("Workspace", workspace_id),
            _fallback_display_name("Lakehouse", lakehouse_id),
        )

def get_current_timestamp_ist():
    """Return the current time at UTC+05:30 as 'YYYY-MM-DD HH:MM:SS IST'."""
    import datetime as _dt

    # Fixed +05:30 offset, labelled "IST" in the formatted output
    ist_offset = _dt.timedelta(hours=5, minutes=30)
    now_in_ist = _dt.datetime.now(_dt.timezone(ist_offset))
    return now_in_ist.strftime('%Y-%m-%d %H:%M:%S IST')

def get_column_datatypes(df):
    """Map every column name in df's schema to the string form of its data type.

    Returns an empty dict (after printing a warning) if the schema cannot be read.
    """
    try:
        datatype_by_column = {}
        for schema_field in df.schema.fields:
            datatype_by_column[schema_field.name] = str(schema_field.dataType)
    except Exception as e:
        print(f"  Warning: Could not get column datatypes: {e}")
        return {}
    return datatype_by_column

def get_efficient_sample(df, table_name, min_sample_size=500, max_sample_size=1000):
    """Return a bounded sample of df for analysis.

    Strategy:
      * empty table              -> df.limit(0)
      * <= max_sample_size rows  -> the whole table
      * larger tables            -> union of four seeded random samples, plus
        one extra sample taken over the first date/timestamp column when one
        exists, de-duplicated and capped at max_sample_size. If the result is
        smaller than the target, one plain random sample replaces it.

    Args:
        df: Spark DataFrame to sample.
        table_name: Name used only in progress messages.
        min_sample_size: Lower bound for the target sample size.
        max_sample_size: Upper bound for the target sample size.

    Returns:
        A DataFrame. If sampling raises, falls back to df.limit(min_sample_size),
        then to df.limit(0).
    """
    try:
        row_count = df.count()
        print(f"  Table {table_name} has {row_count:,} rows")
        if row_count == 0:
            print(f"  Warning: Table {table_name} is empty")
            return df.limit(0)
        # Target is 10% of the table, clamped to [min_sample_size, max_sample_size]
        target_sample_size = max(min_sample_size, min(max_sample_size, int(row_count * 0.1)))
        if row_count <= min_sample_size:
            sample_df = df
            print(f"  Table is small ({row_count} rows), taking all data for analysis")
        elif row_count <= max_sample_size:
            sample_df = df
            print(f"  Table is medium-sized ({row_count} rows), taking all data for comprehensive analysis")
        else:
            fraction = max(target_sample_size / row_count, min_sample_size / row_count)
            fraction = min(fraction, 1.0)
            print(f"  Large table detected - using extensive randomized sampling")
            print(f"  Target sample size: {target_sample_size:,} rows (fraction: {fraction:.4f})")
            samples = []
            # Four independent seeded draws, each about a quarter of the target
            for seed in [42, 123, 456, 789]:
                try:
                    seed_sample = df.sample(fraction=fraction/4, seed=seed).limit(target_sample_size//4)
                    samples.append(seed_sample)
                except Exception as e:
                    print(f"    Warning: Seed {seed} sampling failed: {e}")
                    continue
            try:
                date_cols = [field.name for field in df.schema.fields if 'timestamp' in str(field.dataType).lower() or 'date' in str(field.dataType).lower()]
                if date_cols and samples:
                    print(f"    Found date/time columns: {date_cols[:2]} - ensuring temporal diversity")
                    date_col = date_cols[0]
                    temporal_sample = df.orderBy(date_col).sample(fraction=fraction/2, seed=999).limit(target_sample_size//2)
                    samples.append(temporal_sample)
            except Exception as e:
                print(f"    Note: Temporal sampling not available: {e}")
            if samples:
                sample_df = samples[0]
                for s in samples[1:]:
                    sample_df = sample_df.union(s)
                # The draws can overlap, so drop rows that are exact duplicates.
                # De-duplicating on a column that merely has "id" in its name
                # (e.g. "paid", "country_id") would collapse the sample to one
                # row per distinct value, so whole rows are compared instead.
                try:
                    sample_df = sample_df.dropDuplicates()
                except Exception as e:
                    # e.g. column types that do not support de-duplication;
                    # keep the combined sample as it is.
                    print(f"    Note: Could not de-duplicate combined sample: {e}")
                sample_df = sample_df.limit(max_sample_size)
            else:
                print(f"    Using fallback random sampling")
                sample_df = df.sample(fraction=fraction, seed=42).limit(target_sample_size)
        final_sample_size = sample_df.count()
        coverage_percent = (final_sample_size / row_count) * 100
        print(f"  ✓ Extensive sampling complete: {final_sample_size:,} rows ({coverage_percent:.2f}% coverage)")
        # Guarantee minimum target_sample_size if possible
        if final_sample_size < target_sample_size and row_count > target_sample_size:
            print(f"  ⚠ Sample size {final_sample_size} below target {target_sample_size}, taking random {target_sample_size} rows as fallback")
            sample_df = df.sample(fraction=target_sample_size/row_count, seed=9999).limit(target_sample_size)
            final_sample_size = sample_df.count()
            print(f"  ✓ Fallback sample size: {final_sample_size}")
        if final_sample_size >= target_sample_size or final_sample_size == row_count:
            print(f"  ✓ Sample meets target size requirement ({target_sample_size} records)")
        else:
            print(f"  ⚠ Warning: Sample size {final_sample_size} below recommended target of {target_sample_size}")
        return sample_df
    except Exception as e:
        print(f"  Error sampling table {table_name}: {e}")
        print(f"  Attempting basic fallback sampling...")
        try:
            basic_sample = df.limit(min_sample_size)
            print(f"  Fallback successful: {basic_sample.count()} rows")
            return basic_sample
        except Exception as e2:
            print(f"  Fallback also failed: {e2}")
            return df.limit(0)

def get_column_samples(sample_df, max_samples=8):
    """Collect a few representative, stringified values for every column.

    For each column up to ``max_samples * 2`` distinct non-null values are
    fetched. The minimum, quartile and maximum values (by sort order) are
    taken first, remaining slots are filled with random picks, and duplicates
    are removed while keeping order.

    Args:
        sample_df: Spark DataFrame to read values from.
        max_samples: Maximum number of sample strings returned per column.

    Returns:
        dict mapping column name -> list of strings. A column with no
        non-null values maps to ["(all null values)"]; a column whose query
        fails maps to ["(error: <first 50 chars of the error>)"].
    """
    import random

    # Fixed seed so the random picks are the same on every run.
    rng = random.Random(42)
    column_samples = {}
    for col in sample_df.columns:
        try:
            # Backtick-quote the name so columns containing spaces, dots or
            # reserved words are treated as a single identifier.
            quoted = "`" + col.replace("`", "``") + "`"
            rows = (
                sample_df.select(quoted)
                .where(f"{quoted} IS NOT NULL")
                .distinct()
                .limit(max_samples * 2)
                .collect()
            )
            # Exactly one column is selected, so the value is at position 0.
            values = [row[0] for row in rows if row[0] is not None]
            if not values:
                column_samples[col] = ["(all null values)"]
                continue
            try:
                sorted_vals = sorted(values)
            except TypeError:
                # Values without a natural ordering (mixed or nested types):
                # order by their string form instead of failing the column.
                sorted_vals = sorted(values, key=str)
            n = len(sorted_vals)
            sample_values = [
                str(sorted_vals[i])
                for i in (0, n // 4, n // 2, 3 * n // 4, n - 1)
                if i < n
            ]
            remaining_slots = max_samples - len(sample_values)
            if remaining_slots > 0:
                picks = rng.sample(values, min(remaining_slots, len(values)))
                sample_values.extend(str(val) for val in picks)
            # De-duplicate while preserving first-seen order.
            unique_samples = list(dict.fromkeys(sample_values))
            column_samples[col] = unique_samples[:max_samples]
        except Exception as e:
            column_samples[col] = [f"(error: {str(e)[:50]})"]
    return column_samples

# Module-level cache of Spark catalog metadata, keyed by table name
table_metadata_cache = {}

def get_all_tables_in_lakehouse_attached():
    """List table names from the Spark catalog and rebuild table_metadata_cache.

    The cache is reset on every call and then filled with one entry per table
    (catalog, dotted namespace, table type, temporary flag, description).
    Returns the table names in catalog order, or [] if the lookup fails.
    """
    global table_metadata_cache
    table_metadata_cache = {}
    try:
        print("Discovering tables using Spark catalog...")
        catalog_entries = spark.catalog.listTables()
        discovered_names = []
        print(f"Found {len(catalog_entries)} tables in Spark catalog")
        for entry in catalog_entries:
            discovered_names.append(entry.name)
            has_namespace = bool(entry.namespace)
            table_metadata_cache[entry.name] = dict(
                catalog=entry.catalog,
                namespace='.'.join(entry.namespace) if has_namespace else '',
                tableType=entry.tableType,
                isTemporary=entry.isTemporary,
                description=entry.description,
            )
            # Same dotted namespace for display, shown as 'default' when absent
            shown_namespace = '.'.join(entry.namespace) if entry.namespace else 'default'
            print(f"  Found table: {entry.name} ({entry.tableType}) in {entry.catalog}.{shown_namespace}")
        return discovered_names
    except Exception as e:
        print(f"Error getting tables from attached lakehouse: {e}")
        return []

def _is_all_tables_keyword(value):
    """True when value is the "ALL" keyword in any letter case (e.g. "All", " all ")."""
    return isinstance(value, str) and value.strip().upper() == "ALL"


def determine_tables_to_process(tables_to_process, tables_to_skip):
    """Resolve the configured table selection against the attached lakehouse.

    Args:
        tables_to_process: "ALL" (any letter case), a single table name, or a
            list of table names. A list that contains the "ALL" keyword
            selects every table. A real table whose name matches the keyword
            is treated as a table, not as the keyword.
        tables_to_skip: Table names to exclude; may be empty or None.

    Returns:
        List of table names to process, with the skip list applied and tables
        whose catalog type is 'VIEW' removed. Returns [] when nothing is
        available, nothing matches, or the configuration has an unsupported
        type.
    """
    global table_metadata_cache
    all_available_tables = get_all_tables_in_lakehouse_attached()
    if not all_available_tables:
        print("❌ No tables found in attached lakehouse!")
        return []
    if tables_to_process == "ALL":
        selected_tables = all_available_tables
    elif isinstance(tables_to_process, list):
        selected_tables = []
        select_everything = False
        for table in tables_to_process:
            if table in all_available_tables:
                selected_tables.append(table)
            elif _is_all_tables_keyword(table):
                # e.g. ["All"]: the keyword was wrapped in a list
                select_everything = True
            else:
                print(f"❌ Table '{table}' not found in attached lakehouse")
        if select_everything:
            selected_tables = list(all_available_tables)
        elif not selected_tables:
            print("❌ None of the specified tables were found!")
            return []
    elif isinstance(tables_to_process, str):
        if tables_to_process in all_available_tables:
            selected_tables = [tables_to_process]
        elif _is_all_tables_keyword(tables_to_process):
            # e.g. "All" / "all": same meaning as "ALL"
            selected_tables = all_available_tables
        else:
            print(f"❌ Table '{tables_to_process}' not found in attached lakehouse")
            return []
    else:
        print("❌ Invalid tables_to_process configuration")
        return []
    if tables_to_skip:
        selected_tables = [t for t in selected_tables if t not in tables_to_skip]
        print(f"Tables after applying skip list: {selected_tables}")
    # Views are reported and dropped; only non-view tables are processed.
    tables_before_view_filter = selected_tables.copy()
    selected_tables = []
    view_tables_skipped = []
    for table_name in tables_before_view_filter:
        table_metadata = table_metadata_cache.get(table_name)
        if table_metadata and table_metadata.get('tableType') == 'VIEW':
            view_tables_skipped.append(table_name)
            print(f"⚠ Skipping VIEW table: {table_name}")
        else:
            selected_tables.append(table_name)
    if view_tables_skipped:
        print(f"Skipped {len(view_tables_skipped)} VIEW tables: {view_tables_skipped}")
    print(f"Final tables to process: {selected_tables}")
    return selected_tables

In [None]:
def _extract_json_object(text):
    """Return the span from the first '{' to the last '}' in text, or text itself if none."""
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match:
        return match.group(0)
    return text


def _repair_truncated_json(s):
    """Best-effort repair of a cut-off JSON object: drop a trailing comma, then close it."""
    s = s.strip()
    if s.endswith(','):
        s = s[:-1]
    if s.count('{') > s.count('}') and not s.endswith('}'):
        s += '"}'
    elif not s.endswith('}'):
        s += '}'
    return s


def _parse_model_json(content):
    """Parse the model reply; returns (data, was_repaired). Raises if still unparseable."""
    candidate = _extract_json_object(content)
    try:
        return json.loads(candidate), False
    except Exception:
        return json.loads(_repair_truncated_json(candidate)), True


def process_table_attached(table_name):
    """Process a single table from attached lakehouse and return column descriptions.

    Loads the table through Spark, gathers per-column data types and sample
    values, asks the chat model for a table description plus a description and
    sensitivity label per column, and flattens the reply into one record per
    column.

    Reads the module-level names input_workspace_name, input_workspace_id,
    input_lakehouse_name and input_lakehouse_id, and reads/updates
    table_metadata_cache.

    Args:
        table_name: Name of the table in the attached lakehouse.

    Returns:
        List of dicts (one per described column). Returns [] when the table
        cannot be processed or the model reply cannot be parsed; errors are
        printed, never raised.
    """
    global table_metadata_cache
    print(f"\nProcessing table: {table_name}")
    try:
        # Load the table using Spark table() method for attached lakehouse
        df = spark.table(table_name)
        # Get table metadata from cache, or fetch directly if not cached
        table_metadata = table_metadata_cache.get(table_name)
        if not table_metadata:
            print(f"  Metadata not in cache, fetching directly for {table_name}")
            try:
                tables = spark.catalog.listTables()
                for table in tables:
                    if table.name == table_name:
                        table_metadata = {
                            'catalog': table.catalog,
                            'namespace': '.'.join(table.namespace) if table.namespace else '',
                            'tableType': table.tableType,
                            'isTemporary': table.isTemporary,
                            'description': table.description
                        }
                        table_metadata_cache[table_name] = table_metadata
                        break
            except Exception as e:
                print(f"  Warning: Could not fetch metadata for {table_name}: {e}")
        if not table_metadata:
            table_metadata = {
                'catalog': 'unknown',
                'namespace': 'unknown',
                'tableType': 'unknown',
                'isTemporary': False,
                'description': None
            }
        column_datatypes = get_column_datatypes(df)
        columns = df.columns
        column_samples = get_column_samples(df)
        print(f"  Analyzing {len(columns)} columns with sample data and datatypes")
        prompt = f"""You are a metadata documentation assistant for an enterprise data catalog. For the table '{table_name}', you need to:
                1. **Entity (Table) Description** — a detailed summary of what data this table has (e.g., employee master data, payroll transactions, job history, etc.), based on the sample data.
                2. For each column, generate a description, that describes the values in column, if data is Not confidential data, share a minimum of 4 example values! And assign a sensitivity label!
                Sensitivity Labels:
                - General: Non-sensitive data like IDs, dates, categories, technical codes
                - Confidential: Business data that could impact operations if disclosed
                - Highly Confidential: Financial data, proprietary business information, customer data
                - HR Confidential: Employee personal information, salary, performance data
                Table: {table_name}
                Columns with datatypes and sample data:
            """
        for col, samples in column_samples.items():
            datatype = column_datatypes.get(col, "unknown")
            prompt += f"Column: {col} (DataType: {datatype})\nSample values: {samples}\n"
        prompt += """
            Return ONLY a valid JSON object with the following structure:
            {
            "entity_description": "A detailed summary of what data this table has, as observed in the sample data.",
            "columns": {
                "column_name": {
                "description": "Explain everything observed in the sample data, including patterns, formats, and examples. If data is not confidential, share a minimum of 4 example values in column!",
                "sensitivity_label": ""
                }
            }
            }

            The sensitivity_label must be one of: "General", "Confidential", "Highly Confidential", "HR Confidential".
            DO NOT INCLUDE ANY EXTRA TEXT, EXPLANATION, MARKDOWN, OR FORMATTING. OUTPUT ONLY VALID JSON OBJECT.
            """
        response = openai.chat.completions.create(
            model='gpt-4.1',
            messages=[
                {"role": "system", "content": "You are a metadata documentation assistant for an enterprise data catalog. I will provide sample records from a table used. Your task is to analyze the data and generate detailed metadata documentation. "},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=8192
        )
        # The message content can be None; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()
        try:
            response_data, was_repaired = _parse_model_json(content)
        except Exception as parse_error:
            print(f'  Failed to parse response for {table_name}: {parse_error}')
            return []
        if was_repaired:
            print(f"  Warning: JSON was auto-corrected for {table_name}")
        if not isinstance(response_data, dict):
            print(f'  Failed to parse response for {table_name}: expected a JSON object, got {type(response_data).__name__}')
            return []

        default_entity_description = f'Table containing data related to {table_name}'
        entity_description = response_data.get('entity_description')
        if not isinstance(entity_description, str) or not entity_description.strip():
            # Missing, null or non-text description: use the generic one.
            entity_description = default_entity_description
        if 'columns' in response_data:
            descriptions = response_data['columns']
        else:
            # Reply without a "columns" wrapper: every other key is a column.
            descriptions = {k: v for k, v in response_data.items() if k != 'entity_description'}
            if entity_description == default_entity_description:
                entity_description = f'Data table with {len(descriptions)} columns including {", ".join(list(descriptions.keys())[:3])}'
        if not isinstance(descriptions, dict):
            print(f'  Failed to parse response for {table_name}: "columns" is not a JSON object')
            return []

        generated_timestamp = get_current_timestamp_ist()
        generated_by = "OpenAI gpt-4.1"
        table_data = []
        for col, details in descriptions.items():
            if isinstance(details, str):
                desc = details
                sensitivity = "General"
            elif isinstance(details, dict):
                desc = details.get('description', 'No description provided')
                sensitivity = details.get('sensitivity_label', 'General')
            else:
                # null / list / number for a column: keep the column, with defaults
                desc = None
                sensitivity = None
            # Every record must carry plain strings so the later DataFrame
            # schema inference sees one consistent type per field.
            desc = 'No description provided' if desc is None else str(desc)
            sensitivity = 'General' if sensitivity is None else str(sensitivity)
            datatype = column_datatypes.get(col, "unknown")
            table_data.append({
                'WorkspaceName': input_workspace_name,
                'WorkspaceId': input_workspace_id,
                'LakehouseName': input_lakehouse_name,
                'LakehouseId': input_lakehouse_id,
                'Entity': table_name,
                'EntityDescription': entity_description,
                'Catalog': table_metadata.get('catalog', 'unknown'),
                'Namespace': table_metadata.get('namespace', 'unknown'),
                'TableType': table_metadata.get('tableType', 'unknown'),
                'IsTemporary': table_metadata.get('isTemporary', False),
                'Attribute': col,
                'AttributeDescription': desc,
                'DataType': datatype,
                'SensitivityLabel': sensitivity,
                'GeneratedTimestampIST': generated_timestamp,
                'GeneratedBy': generated_by
            })
        print(f"  Successfully processed {len(table_data)} columns for {table_name}")
        print(f"  Entity description: {entity_description[:100]}...")
        print(f"  Table metadata: {table_metadata['tableType']} in {table_metadata['catalog']}.{table_metadata['namespace']}")
        print(f"  Generated at: {generated_timestamp}")
        return table_data
    except Exception as e:
        error_msg = str(e)
        print(f"  ❌ Error processing table {table_name}: {error_msg[:100]}...")
        return []

In [None]:
# Get input workspace and lakehouse names automatically
print("Retrieving input workspace and lakehouse names...")
input_workspace_name, input_lakehouse_name = get_workspace_and_lakehouse_names(input_workspace_id, input_lakehouse_id)
print(f"Input Workspace: {input_workspace_name}")
print(f"Input Lakehouse: {input_lakehouse_name}")
print(f"Using attached lakehouse for table discovery")

# Get output workspace and lakehouse names automatically
print("\nRetrieving output workspace and lakehouse names...")
output_workspace_name, output_lakehouse_name = get_workspace_and_lakehouse_names(output_workspace_id, output_lakehouse_id)
print(f"Output Workspace: {output_workspace_name}")
print(f"Output Lakehouse: {output_lakehouse_name}")
print(f"Output Table: {output_table_name}")

# Show current lakehouse info (informational only, so a failure is reported
# and execution continues)
try:
    current_db = spark.catalog.currentDatabase()
    print(f"\nCurrent Spark database: {current_db}")
except Exception as e:
    # `except Exception` instead of a bare `except` so that KeyboardInterrupt
    # (kernel interrupt) and SystemExit are not swallowed.
    print(f"\nCould not retrieve current Spark database: {e}")

# OneLake path of the Delta table that receives the results
output_path = f"abfss://{output_workspace_id}@onelake.dfs.fabric.microsoft.com/{output_lakehouse_id}/Tables/{output_table_name}"
print(f"Output path: {output_path}")

# Main execution
print("\n=== OpenAI Table Analysis Started (Attached Lakehouse Mode ONLY) ===")

# Discover every table up front so the report can cover the whole lakehouse
print("\n=== Table Discovery and Processing Summary ===")
all_available_tables = get_all_tables_in_lakehouse_attached()

if all_available_tables:
    print(f"\n📊 Table Discovery Summary:")
    print(f"Total tables found in lakehouse: {len(all_available_tables)}")
    print(f"Available tables: {sorted(all_available_tables)}")

    # Echo the selection settings that drive the run
    print(f"\n⚙️ Processing Configuration:")
    if tables_to_process == "ALL":
        print(f"  • Target tables: ALL TABLES")
    else:
        # A list and a single name are both printed as configured
        print(f"  • Target tables: {tables_to_process}")

    print(f"  • Tables to skip: {tables_to_skip if tables_to_skip else 'None'}")
else:
    print("❌ No tables found in attached lakehouse!")

# Resolve the final list of tables (selection, skip list and view filter)
tables = determine_tables_to_process(tables_to_process, tables_to_skip)

# Detailed report of which tables will be processed, skipped, or were not found
def _print_table_listing(table_names, reason=""):
    """Print a numbered, alphabetically sorted listing with cached type/catalog/namespace."""
    for position, name in enumerate(sorted(table_names), 1):
        cached = table_metadata_cache.get(name, {})
        kind = cached.get('tableType', 'unknown')
        location = f"{cached.get('catalog', 'unknown')}.{cached.get('namespace', 'unknown')}"
        print(f"  {position:2d}. {name} ({kind}) in {location}{reason}")


if all_available_tables:
    print(f"\n📋 Detailed Table Processing Report:")

    # Tables that will be processed
    if tables:
        print(f"\n✅ Tables TO BE PROCESSED ({len(tables)}):")
        _print_table_listing(tables)

    # Tables left out because the configuration did not name them
    if isinstance(tables_to_process, list):
        skipped_by_config = [t for t in all_available_tables if t not in tables_to_process]
    elif isinstance(tables_to_process, str) and tables_to_process != "ALL":
        skipped_by_config = [t for t in all_available_tables if t != tables_to_process]
    else:
        skipped_by_config = []

    if skipped_by_config:
        print(f"\n⏭️ Tables SKIPPED (not in target list) ({len(skipped_by_config)}):")
        _print_table_listing(skipped_by_config)

    # Tables named in the skip list
    explicitly_skipped = [t for t in all_available_tables if t in tables_to_skip]
    if explicitly_skipped:
        print(f"\n🚫 Tables EXPLICITLY SKIPPED ({len(explicitly_skipped)}):")
        _print_table_listing(explicitly_skipped, " - Reason: In skip list")

    # Views: every entry here has tableType 'VIEW', so the listing prints (VIEW)
    view_tables = [t for t in all_available_tables if table_metadata_cache.get(t, {}).get('tableType') == 'VIEW']
    if view_tables:
        print(f"\n👁️ Tables SKIPPED (VIEW type) ({len(view_tables)}):")
        _print_table_listing(view_tables, " - Reason: Views are not processed")

    # Requested names that do not exist in the lakehouse
    if isinstance(tables_to_process, list):
        not_found_tables = [t for t in tables_to_process if t not in all_available_tables]
        if not_found_tables:
            print(f"\n❌ Tables REQUESTED but NOT FOUND ({len(not_found_tables)}):")
            for position, missing_name in enumerate(sorted(not_found_tables), 1):
                print(f"  {position:2d}. {missing_name} - Reason: Table does not exist in lakehouse")
    elif isinstance(tables_to_process, str) and tables_to_process != "ALL":
        if tables_to_process not in all_available_tables:
            print(f"\n❌ Table REQUESTED but NOT FOUND:")
            print(f"  1. {tables_to_process} - Reason: Table does not exist in lakehouse")

if not tables:
    print("\n❌ No tables to process! Check your lakehouse attachment and table availability.")
    if all_available_tables:
        print(f"\n💡 Suggestions:")
        print(f"  • Available tables you could process: {sorted(all_available_tables)}")
        print(f"  • Update tables_to_process to include one of these tables")
        print(f"  • Or set tables_to_process = 'ALL' to process all tables")
else:
    # Process all tables using ONLY attached lakehouse method
    print(f"\n🚀 Starting processing of {len(tables)} table(s)...")
    all_results = []
    processed_successfully = []
    failed_tables = []
    empty_tables = []
    start_time = time.time()

    for i, table_name in enumerate(tables, 1):
        print(f"\n[{i}/{len(tables)}] Processing table: {table_name}")
        try:
            table_results = process_table_attached(table_name)
            if table_results:
                all_results.extend(table_results)
                processed_successfully.append(table_name)
                print(f"  ✓ Added {len(table_results)} columns from {table_name}")
            else:
                empty_tables.append(table_name)
                print(f"  ⚠ No results for {table_name} - table may be empty or inaccessible")
        except Exception as e:
            failed_tables.append((table_name, str(e)))
            print(f"  ❌ Failed to process {table_name}: {str(e)}")
        # Short pause between tables (skipped after the last one)
        if i < len(tables):
            time.sleep(1)

    # Final processing summary
    print(f"\n=== Final Processing Summary ===")
    print(f"Processing time: {time.time() - start_time:.1f} seconds")
    print(f"Total tables attempted: {len(tables)}")
    print(f"Successfully processed: {len(processed_successfully)}")
    print(f"Empty/inaccessible tables: {len(empty_tables)}")
    print(f"Failed tables: {len(failed_tables)}")

    # Group the generated records by table once; reused by both summaries below
    records_by_entity = {}
    for record in all_results:
        records_by_entity.setdefault(record['Entity'], []).append(record)

    if processed_successfully:
        print(f"\n✅ Successfully processed tables ({len(processed_successfully)}):")
        for table_name in sorted(processed_successfully):
            table_columns = len(records_by_entity.get(table_name, []))
            print(f"  • {table_name}: {table_columns} columns")

    if empty_tables:
        print(f"\n⚠️ Empty/inaccessible tables ({len(empty_tables)}):")
        for table_name in sorted(empty_tables):
            print(f"  • {table_name}")

    if failed_tables:
        print(f"\n❌ Failed tables ({len(failed_tables)}):")
        for table_name, error in failed_tables:
            print(f"  • {table_name}: {error[:100]}...")

    if all_results:
        print(f"\n=== Saving Results ===")
        print(f"Total columns processed: {len(all_results)}")
        print(f"Tables with data: {len(records_by_entity)}")

        # Convert to Spark DataFrame for saving
        spark_result_df = spark.createDataFrame(all_results)

        # Save to Delta table in the specified output lakehouse
        print(f"\nSaving results to output location...")
        print(f"Output Workspace: {output_workspace_name}")
        print(f"Output Lakehouse: {output_lakehouse_name}")
        print(f"Output Table: {output_table_name}")

        # Best-effort removal of a previous table; the reason is printed so a
        # real failure is not mistaken for "nothing to drop".
        try:
            notebookutils.fs.rm(output_path, True)
            print(f"Existing table dropped: {output_path}")
        except Exception as e:
            print(f"No existing table to drop at: {output_path} ({e})")

        spark_result_df.write.format("delta").mode("overwrite").save(output_path)
        print(f"Results saved to: {output_path}")

        # Save results as a single CSV file
        print(f"Saving results as a single CSV file...")

        # Parse the CSV path to separate directory and filename
        csv_directory = os.path.dirname(output_csv_full_path)
        csv_filename = os.path.basename(output_csv_full_path)

        print(f"CSV Directory: {csv_directory}")
        print(f"CSV Filename: {csv_filename}")

        # First, remove any existing CSV file
        try:
            notebookutils.fs.rm(output_csv_full_path, True)
            print(f"Existing CSV file removed: {output_csv_full_path}")
        except Exception as e:
            print(f"No existing CSV file to remove at: {output_csv_full_path} ({e})")

        # Create a temporary directory for Spark to write to
        temp_csv_dir = f"{csv_directory}/temp_csv_output"

        try:
            notebookutils.fs.rm(temp_csv_dir, True)
        except Exception:
            # Nothing to clean up (or not removable); the overwrite-mode write
            # below replaces the directory contents anyway.
            pass

        # Coalesce to a single partition and write as CSV to temp directory
        single_partition_df = spark_result_df.coalesce(1)
        single_partition_df.write.mode("overwrite").option("header", True).csv(temp_csv_dir)

        # Find the actual CSV file (Spark creates a part-xxxxx.csv file)
        temp_csv_files = notebookutils.fs.ls(temp_csv_dir)
        part_csv_file = None
        for file in temp_csv_files:
            if file.name.startswith("part-") and file.name.endswith(".csv"):
                part_csv_file = file.path
                break

        if part_csv_file:
            # Move the part file to the final location with the specified filename
            try:
                notebookutils.fs.mv(part_csv_file, output_csv_full_path)
                print(f"✓ Results saved as single CSV file: {output_csv_full_path}")

                # Clean up temporary directory
                try:
                    notebookutils.fs.rm(temp_csv_dir, True)
                    print(f"Temporary directory cleaned up: {temp_csv_dir}")
                except Exception as e:
                    print(f"Note: Could not clean up temporary directory: {temp_csv_dir} ({e})")

            except Exception as e:
                # The temp directory is kept so the part file stays available
                print(f"Error moving CSV file: {e}")
                print(f"CSV file is available at: {part_csv_file}")
        else:
            print(f"Error: Could not find the generated CSV file in: {temp_csv_dir}")
            print(f"Files in temp directory: {[f.name for f in temp_csv_files]}")

        # Per-table recap, computed from the in-memory records instead of
        # launching several Spark jobs per table.
        print("\nEntities processed:")
        for entity in sorted(records_by_entity):
            entity_records = records_by_entity[entity]
            count = len(entity_records)
            datatypes = len({r['DataType'] for r in entity_records})
            # Table-level metadata is identical on every record of an entity
            first_record = entity_records[0]
            table_type = first_record['TableType']
            catalog = first_record['Catalog']
            namespace = first_record['Namespace']
            is_temporary = first_record['IsTemporary']
            print(f"  {entity} ({table_type}): {count} columns, {datatypes} unique datatypes")
            print(f"    Location: {catalog}.{namespace}, Temporary: {is_temporary}")
    else:
        print("\n❌ No results generated - check table names and data availability in attached lakehouse")
        print("Ensure your lakehouse is properly attached and contains accessible tables")