In [None]:
%%configure -f
{
  "conf": {
    "spark.app.name": "Tabular Support Logs Monitoring",

    "master": "yarn",
    "deployMode": "client",
    "executor.instances": "1",
    "executor.memory": "1G",
    "executor.cores": "1",
    "driver.memory": "1G",
    "driver.cores": "1",
    "dynamicAllocation.enabled": "false",

    "spark.dynamicAllocation.minExecutors": "1",

    "spark.sql.catalog.my_warehouse": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.my_warehouse.catalog-impl": "org.apache.iceberg.rest.RESTCatalog",
    "spark.sql.catalog.my_warehouse.credential": "t-asdf:asdf-asdf",
    "spark.sql.catalog.my_warehouse.region": "us-east-1",
    "spark.sql.catalog.my_warehouse.uri": "https://api.tabular.io/ws/",
    "spark.sql.catalog.my_warehouse.warehouse": "my_warehouse",

    "spark.sql.defaultCatalog": "my_warehouse"
  }
}

In [None]:
from datetime import datetime

def _format_duration(duration):
    """Render a ``timedelta`` as ``"<h>h <m>m <s>s"`` using whole, truncated units."""
    hours, remainder = divmod(int(duration.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours}h {minutes}m {seconds}s"


def monitor(table="tabular_support.logs"):
    """Print a summary and the time-ordered messages of the most recent batch.

    The "most recent" batch is the ``batch_id`` group whose newest ``event_ts``
    is the greatest. The report shows the batch id, its first and last event
    timestamps, the elapsed time between them, and then every
    ``event_message`` of that batch ordered by ``event_ts``. If the table
    yields no batch, ``No batches found.`` is printed instead.

    Expects a ``spark`` session object to already exist in the notebook's
    global namespace; it is not created here.

    Args:
        table: Name of the log table to inspect. It is interpolated into the
            SQL text as an identifier, so pass only trusted table names.

    Returns:
        None. All output goes to stdout.
    """
    # Find the latest batch together with its first/last event timestamps.
    result = spark.sql(f"""
        with latest_batch as (
            select 
                batch_id,
                max(event_ts) as last_message_received_at,
                min(event_ts) as first_message_received_at
            from
                {table}
            group by 
                batch_id
            order by 
                last_message_received_at desc
            limit 1
        )
        select 
            batch_id,
            cast(last_message_received_at as timestamp) as last_message_received_at,
            cast(first_message_received_at as timestamp) as first_message_received_at
        from 
            latest_batch
    """).collect()

    if not result:
        print("No batches found.")
        return

    batch_id, last_message_received_at, first_message_received_at = result[0]

    # A batch whose rows all have a NULL event_ts yields NULL min/max; report
    # that instead of failing on None - None.
    if first_message_received_at is None or last_message_received_at is None:
        duration_text = "unknown"
    else:
        duration_text = _format_duration(
            last_message_received_at - first_message_received_at
        )

    separator = "=" * 80
    print(separator)
    print(f"Batch ID:   {batch_id}")
    print(f"Start Time: {first_message_received_at}")
    print(f"End Time:   {last_message_received_at}")
    print(f"Duration:   {duration_text}")
    print(separator + "\n")

    # Select the batch with a column expression rather than splicing the value
    # into SQL text: quotes inside batch_id cannot break the query, and
    # eqNullSafe also matches a NULL batch_id.
    logs = spark.table(table)
    batch_messages = (
        logs
        .where(logs["batch_id"].eqNullSafe(batch_id))
        .orderBy("event_ts")
        .select("event_message")
    )

    # Stream rows to the driver instead of materialising the whole batch.
    for row in batch_messages.toLocalIterator():
        print(row[0])

    print(separator)

# Print the latest-batch report when this cell is executed.
monitor()