# Query Execution Statistics Analysis

This notebook analyzes query execution statistics from Databricks System Tables.

**Data Source:** `system.query.history`

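**Metrics Collected:**
- Query execution duration (how long queries ran)
- Longest running SQL queries
- Data scanning metrics (bytes and rows scanned)
- Query execution by users
- Compute resource utilization (by compute type and warehouse)
- Failed and canceled queries
- Optimization candidates (queries with heavy scans, long runtimes, or no file pruning)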

---

## Setup and Configuration

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
import pandas as pd
from datetime import datetime, timedelta

# Tunable parameters for the Python cells of this notebook.
# NOTE: the %sql cells further down use literal values (INTERVAL 30 DAYS,
# LIMIT 20) and do not read these constants, so keep them in sync by hand.
ANALYSIS_DAYS = 30  # Size of the look-back window, in days
TOP_N_QUERIES = 20  # How many rows the "top N" listings should show

# Echo the effective settings so the run is self-describing.
print(
    f"Analysis Period: Last {ANALYSIS_DAYS} days",
    f"Top N Queries: {TOP_N_QUERIES}",
    f"Analysis Run Time: {datetime.now()}",
    sep="\n",
)

## Data Loading and Basic Statistics

In [0]:
# Load the query history system table.
query_history_df = spark.table("system.query.history")

# Lower bound of the analysis window: the calendar date ANALYSIS_DAYS ago
# (taken from the Python process clock), formatted as YYYY-MM-DD and compared
# against start_time as a SQL string literal.
analysis_start_date = (datetime.now() - timedelta(days=ANALYSIS_DAYS)).strftime('%Y-%m-%d')

filtered_queries = query_history_df.filter(
    f"start_time >= '{analysis_start_date}'"
)

# Compute the total and the per-status counts in ONE aggregation instead of
# four separate .count() actions, each of which would re-scan the table.
# A global aggregate always yields exactly one row, so .first() is safe even
# when the window contains no queries.
status_summary = filtered_queries.selectExpr(
    "COUNT(*) AS total_queries",
    "COUNT_IF(execution_status = 'FINISHED') AS successful_queries",
    "COUNT_IF(execution_status = 'FAILED') AS failed_queries",
    "COUNT_IF(execution_status = 'CANCELED') AS canceled_queries",
).first()

total_queries = status_summary["total_queries"]
successful_queries = status_summary["successful_queries"]
failed_queries = status_summary["failed_queries"]
canceled_queries = status_summary["canceled_queries"]

# Avoid ZeroDivisionError when the window is empty: every count is then 0,
# so dividing by 1 reports 0.0% for each status.
pct_denominator = total_queries if total_queries > 0 else 1

print(f"\n{'='*60}")
print(f"QUERY EXECUTION OVERVIEW (Last {ANALYSIS_DAYS} days)")
print(f"{'='*60}")
print(f"Total Queries:      {total_queries:,}")
print(f"Successful:         {successful_queries:,} ({successful_queries/pct_denominator*100:.1f}%)")
print(f"Failed:             {failed_queries:,} ({failed_queries/pct_denominator*100:.1f}%)")
print(f"Canceled:           {canceled_queries:,} ({canceled_queries/pct_denominator*100:.1f}%)")
print(f"{'='*60}\n")

## 1. Query Duration Analysis

Analyzing how long queries run, including:
- Total execution time
- Waiting time for compute
- Actual execution time
- Compilation time

In [0]:
%sql
-- Query Duration Summary Statistics
-- One row per duration component (total, execution, waiting for compute,
-- compilation) with avg / median / p95 / p99 / max in seconds, computed over
-- queries that FINISHED in the trailing 30 days.
-- NOTE: the 30-day window is a literal here; it does not read ANALYSIS_DAYS
-- from the Python configuration cell.
WITH finished_queries AS (
  -- Shared filter, written once so all metrics use the same population.
  SELECT
    total_duration_ms,
    execution_duration_ms,
    waiting_for_compute_duration_ms,
    compilation_duration_ms
  FROM system.query.history
  WHERE start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
    AND execution_status = 'FINISHED'
)

SELECT
  'Total Duration' AS metric,
  ROUND(AVG(total_duration_ms)/1000, 2) AS avg_seconds,
  ROUND(PERCENTILE(total_duration_ms, 0.5)/1000, 2) AS median_seconds,
  ROUND(PERCENTILE(total_duration_ms, 0.95)/1000, 2) AS p95_seconds,
  ROUND(PERCENTILE(total_duration_ms, 0.99)/1000, 2) AS p99_seconds,
  ROUND(MAX(total_duration_ms)/1000, 2) AS max_seconds
FROM finished_queries

UNION ALL

SELECT
  'Execution Only' AS metric,
  ROUND(AVG(execution_duration_ms)/1000, 2) AS avg_seconds,
  ROUND(PERCENTILE(execution_duration_ms, 0.5)/1000, 2) AS median_seconds,
  ROUND(PERCENTILE(execution_duration_ms, 0.95)/1000, 2) AS p95_seconds,
  ROUND(PERCENTILE(execution_duration_ms, 0.99)/1000, 2) AS p99_seconds,
  ROUND(MAX(execution_duration_ms)/1000, 2) AS max_seconds
FROM finished_queries

UNION ALL

SELECT
  'Waiting for Compute' AS metric,
  ROUND(AVG(waiting_for_compute_duration_ms)/1000, 2) AS avg_seconds,
  ROUND(PERCENTILE(waiting_for_compute_duration_ms, 0.5)/1000, 2) AS median_seconds,
  ROUND(PERCENTILE(waiting_for_compute_duration_ms, 0.95)/1000, 2) AS p95_seconds,
  ROUND(PERCENTILE(waiting_for_compute_duration_ms, 0.99)/1000, 2) AS p99_seconds,
  ROUND(MAX(waiting_for_compute_duration_ms)/1000, 2) AS max_seconds
FROM finished_queries

UNION ALL

-- Compilation time is listed in the section introduction, so report it too.
SELECT
  'Compilation' AS metric,
  ROUND(AVG(compilation_duration_ms)/1000, 2) AS avg_seconds,
  ROUND(PERCENTILE(compilation_duration_ms, 0.5)/1000, 2) AS median_seconds,
  ROUND(PERCENTILE(compilation_duration_ms, 0.95)/1000, 2) AS p95_seconds,
  ROUND(PERCENTILE(compilation_duration_ms, 0.99)/1000, 2) AS p99_seconds,
  ROUND(MAX(compilation_duration_ms)/1000, 2) AS max_seconds
FROM finished_queries

### Query Duration Over Time

In [0]:
%sql
-- Per-day duration profile of finished queries over the trailing 30 days,
-- newest day first: volume, average / p95 / max duration in seconds, and the
-- summed duration expressed in hours.
WITH finished_queries AS (
  SELECT
    DATE(start_time) AS query_date,
    total_duration_ms
  FROM system.query.history
  WHERE execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  query_date,
  COUNT(*)                                           AS total_queries,
  ROUND(AVG(total_duration_ms) / 1000, 2)            AS avg_duration_seconds,
  ROUND(PERCENTILE(total_duration_ms, 0.95) / 1000, 2) AS p95_duration_seconds,
  ROUND(MAX(total_duration_ms) / 1000, 2)            AS max_duration_seconds,
  ROUND(SUM(total_duration_ms) / 3600000, 2)         AS total_hours
FROM finished_queries
GROUP BY query_date
ORDER BY query_date DESC

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

## 2. Longest Running SQL Queries

Identifying the queries that take the most time to execute

In [0]:
%sql
-- The 20 finished queries with the largest total duration in the trailing
-- 30 days. Statement text longer than 200 characters is cut to a 200-character
-- preview followed by '...'.
-- NOTE: the window and the row limit are literals in this cell; they do not
-- read ANALYSIS_DAYS / TOP_N_QUERIES from the Python configuration cell.
SELECT
  statement_id,
  workspace_id,
  executed_by,
  ROUND(total_duration_ms / 1000, 2)  AS duration_seconds,
  ROUND(total_duration_ms / 60000, 2) AS duration_minutes,
  execution_status,
  start_time,
  end_time,
  statement_type,
  client_application,
  IF(
    LENGTH(statement_text) > 200,
    CONCAT(LEFT(statement_text, 200), '...'),
    statement_text
  ) AS query_preview,
  compute.warehouse_id,
  read_bytes,
  read_rows
FROM system.query.history
WHERE execution_status = 'FINISHED'
  AND total_duration_ms IS NOT NULL
  AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
ORDER BY total_duration_ms DESC
LIMIT 20

### Longest Running Queries by Statement Type

In [0]:
%sql
-- Duration profile per statement type for finished queries in the trailing
-- 30 days, ordered by the summed duration (hours) so the most time-consuming
-- statement types come first. Rows without a statement type are excluded.
WITH typed_finished_queries AS (
  SELECT
    statement_type,
    total_duration_ms
  FROM system.query.history
  WHERE statement_type IS NOT NULL
    AND execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  statement_type,
  COUNT(*)                                             AS query_count,
  ROUND(AVG(total_duration_ms) / 1000, 2)              AS avg_duration_seconds,
  ROUND(PERCENTILE(total_duration_ms, 0.95) / 1000, 2) AS p95_duration_seconds,
  ROUND(MAX(total_duration_ms) / 1000, 2)              AS max_duration_seconds,
  ROUND(SUM(total_duration_ms) / 3600000, 2)           AS total_hours
FROM typed_finished_queries
GROUP BY statement_type
ORDER BY total_hours DESC

## 3. Data Scanning Metrics

Analyzing how much data queries are scanning (bytes and rows)

In [0]:
%sql
-- Single-row overview of data read by finished queries in the trailing
-- 30 days. Byte figures are converted with 1024-based divisors (MB / GB);
-- row figures are rendered as thousands-separated strings via FORMAT_NUMBER.
WITH finished_queries AS (
  SELECT
    read_bytes,
    read_rows
  FROM system.query.history
  WHERE execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  COUNT(*)                                               AS total_queries,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2)         AS total_gb_scanned,
  ROUND(AVG(read_bytes) / 1024 / 1024, 2)                AS avg_mb_per_query,
  ROUND(PERCENTILE(read_bytes, 0.95) / 1024 / 1024, 2)   AS p95_mb_per_query,
  ROUND(MAX(read_bytes) / 1024 / 1024 / 1024, 2)         AS max_gb_scanned,
  FORMAT_NUMBER(SUM(read_rows), 0)                       AS total_rows_scanned,
  FORMAT_NUMBER(AVG(read_rows), 0)                       AS avg_rows_per_query,
  FORMAT_NUMBER(MAX(read_rows), 0)                       AS max_rows_scanned
FROM finished_queries

### Top Data Scanning Queries

In [0]:
%sql
-- The 20 finished queries that read the most bytes in the trailing 30 days.
-- Queries that read nothing (read_bytes <= 0 or NULL) are excluded, and the
-- statement text is shortened to a 200-character preview.
SELECT
  statement_id,
  executed_by,
  ROUND(read_bytes / 1024 / 1024 / 1024, 2) AS gb_scanned,
  FORMAT_NUMBER(read_rows, 0)               AS rows_scanned,
  ROUND(read_bytes / 1024 / 1024, 2)        AS mb_scanned,
  read_files,
  ROUND(total_duration_ms / 1000, 2)        AS duration_seconds,
  statement_type,
  start_time,
  IF(
    LENGTH(statement_text) > 200,
    CONCAT(LEFT(statement_text, 200), '...'),
    statement_text
  ) AS query_preview
FROM system.query.history
WHERE read_bytes > 0
  AND execution_status = 'FINISHED'
  AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
ORDER BY read_bytes DESC
LIMIT 20

### Daily Data Scanning Trends

In [0]:
%sql
-- Per-day scan volume of finished queries over the trailing 30 days, newest
-- day first: query count, GB read in total, MB read per query on average, and
-- total / average rows read (formatted as strings).
WITH finished_queries AS (
  SELECT
    DATE(start_time) AS query_date,
    read_bytes,
    read_rows
  FROM system.query.history
  WHERE execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  query_date,
  COUNT(*)                                       AS total_queries,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2) AS total_gb_scanned,
  ROUND(AVG(read_bytes) / 1024 / 1024, 2)        AS avg_mb_per_query,
  FORMAT_NUMBER(SUM(read_rows), 0)               AS total_rows_scanned,
  FORMAT_NUMBER(AVG(read_rows), 0)               AS avg_rows_per_query
FROM finished_queries
GROUP BY query_date
ORDER BY query_date DESC

Databricks visualization. Run in Databricks to view.

## 4. Query Execution by Users

Analyzing who is running queries and their usage patterns

In [0]:
%sql
-- The 20 most active users (by number of queries, any status) in the trailing
-- 30 days, with their finished / failed counts, average duration, summed
-- duration in hours, and data read.
-- Note: total_compute_hours is SUM(total_duration_ms) converted to hours.
WITH recent_queries AS (
  SELECT
    executed_by,
    executed_by_user_id,
    execution_status,
    total_duration_ms,
    read_bytes,
    read_rows
  FROM system.query.history
  WHERE start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  executed_by,
  executed_by_user_id,
  COUNT(*)                                       AS total_queries,
  COUNT_IF(execution_status = 'FINISHED')        AS successful_queries,
  COUNT_IF(execution_status = 'FAILED')          AS failed_queries,
  ROUND(AVG(total_duration_ms) / 1000, 2)        AS avg_duration_seconds,
  ROUND(SUM(total_duration_ms) / 3600000, 2)     AS total_compute_hours,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2) AS total_gb_scanned,
  FORMAT_NUMBER(SUM(read_rows), 0)               AS total_rows_scanned
FROM recent_queries
GROUP BY executed_by, executed_by_user_id
ORDER BY total_queries DESC
LIMIT 20

### User Activity by Application

In [0]:
%sql
-- The 30 busiest (user, client application) pairs in the trailing 30 days,
-- ranked by query count. Rows without a client application are ignored.
WITH app_queries AS (
  SELECT
    executed_by,
    client_application,
    total_duration_ms
  FROM system.query.history
  WHERE client_application IS NOT NULL
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  executed_by,
  client_application,
  COUNT(*)                                   AS query_count,
  ROUND(AVG(total_duration_ms) / 1000, 2)    AS avg_duration_seconds,
  ROUND(SUM(total_duration_ms) / 3600000, 2) AS total_hours
FROM app_queries
GROUP BY executed_by, client_application
ORDER BY query_count DESC
LIMIT 30

### Hourly User Activity Pattern

In [0]:
%sql
-- Load profile by hour of day (0-23) across the trailing 30 days: query
-- volume, distinct users, average duration and GB read, for queries of any
-- status. The hour is whatever HOUR(start_time) yields in this session.
WITH recent_queries AS (
  SELECT
    HOUR(start_time) AS hour_of_day,
    executed_by,
    total_duration_ms,
    read_bytes
  FROM system.query.history
  WHERE start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  hour_of_day,
  COUNT(*)                                       AS query_count,
  COUNT(DISTINCT executed_by)                    AS unique_users,
  ROUND(AVG(total_duration_ms) / 1000, 2)        AS avg_duration_seconds,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2) AS total_gb_scanned
FROM recent_queries
GROUP BY hour_of_day
ORDER BY hour_of_day

## 6. Compute Resource Utilization

In [0]:
%sql
-- Finished-query workload per compute target (compute.type plus
-- compute.warehouse_id) over the trailing 30 days, busiest target first.
-- Note: total_compute_hours is SUM(total_duration_ms) converted to hours.
WITH finished_queries AS (
  SELECT
    compute.type         AS compute_type,
    compute.warehouse_id AS warehouse_id,
    executed_by,
    total_duration_ms,
    read_bytes
  FROM system.query.history
  WHERE execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  compute_type,
  warehouse_id,
  COUNT(*)                                       AS query_count,
  COUNT(DISTINCT executed_by)                    AS unique_users,
  ROUND(AVG(total_duration_ms) / 1000, 2)        AS avg_duration_seconds,
  ROUND(SUM(total_duration_ms) / 3600000, 2)     AS total_compute_hours,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2) AS total_gb_scanned
FROM finished_queries
GROUP BY compute_type, warehouse_id
ORDER BY query_count DESC

## 7. Failed and Canceled Queries Analysis

In [0]:
%sql
-- The 20 most frequent (error preview, status) combinations among FAILED and
-- CANCELED queries in the trailing 30 days.
-- The preview is the error message cut to 100 characters (plus '...'), or the
-- label 'No Error' when no message was recorded. It is computed once in the
-- CTE so the SELECT list and the GROUP BY cannot drift apart.
WITH unsuccessful_queries AS (
  SELECT
    CASE
      WHEN error_message IS NULL THEN 'No Error'
      WHEN LENGTH(error_message) > 100 THEN CONCAT(LEFT(error_message, 100), '...')
      ELSE error_message
    END AS error_preview,
    execution_status,
    executed_by,
    total_duration_ms
  FROM system.query.history
  WHERE execution_status IN ('FAILED', 'CANCELED')
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  error_preview,
  execution_status,
  COUNT(*)                                AS occurrence_count,
  COUNT(DISTINCT executed_by)             AS affected_users,
  ROUND(AVG(total_duration_ms) / 1000, 2) AS avg_duration_before_failure
FROM unsuccessful_queries
GROUP BY error_preview, execution_status
ORDER BY occurrence_count DESC
LIMIT 20

In [0]:
# Fallback for out-of-order execution: if the configuration cell has not been
# run in this session, define ANALYSIS_DAYS so the banner below can be printed.
# An explicit membership test on globals() replaces the previous bare
# `except:`, which would have hidden any unrelated error.
if "ANALYSIS_DAYS" not in globals():
    ANALYSIS_DAYS = 30
    print(f"Note: ANALYSIS_DAYS not set, defaulting to {ANALYSIS_DAYS} days")

print(f"\n{'='*80}")
print(f"OPTIMIZATION ANALYSIS - Last {ANALYSIS_DAYS} days")
print(f"{'='*80}\n")

In [0]:
%sql
-- (Re)create the temporary view `optimization_candidates`: finished
-- SELECT / INSERT / UPDATE / DELETE / MERGE / CREATE statements from the
-- trailing 30 days, each tagged with an optimization_priority.
-- The CASE branches are evaluated top to bottom, so a query matching several
-- rules gets the FIRST matching label:
--   NO_PRUNING     - more than 100 files read and pruned_files = 0
--   HIGH_DATA_SCAN - more than 10737418240 bytes (10 GiB) read
--   LONG_RUNNING   - total duration above 300000 ms (5 minutes)
--   HIGH_ROW_SCAN  - more than 100000000 rows read
--   NORMAL         - none of the above
CREATE OR REPLACE TEMPORARY VIEW optimization_candidates AS
WITH finished_statements AS (
  SELECT
    statement_id,
    executed_by,
    statement_text,
    statement_type,
    total_duration_ms,
    read_bytes,
    read_rows,
    read_files,
    pruned_files,
    read_partitions,
    start_time
  FROM system.query.history
  WHERE statement_text IS NOT NULL
    AND statement_type IN ('SELECT', 'INSERT', 'UPDATE', 'DELETE', 'MERGE', 'CREATE')
    AND execution_status = 'FINISHED'
    AND start_time >= CURRENT_DATE() - INTERVAL 30 DAYS
)
SELECT
  statement_id,
  executed_by,
  statement_text,
  statement_type,
  total_duration_ms,
  read_bytes,
  read_rows,
  read_files,
  pruned_files,
  read_partitions,
  ROUND(read_bytes / 1024 / 1024 / 1024, 2) AS gb_scanned,
  ROUND(total_duration_ms / 1000, 2)        AS duration_seconds,
  CASE
    WHEN pruned_files = 0 AND read_files > 100 THEN 'NO_PRUNING'
    WHEN read_bytes > 10737418240             THEN 'HIGH_DATA_SCAN'
    WHEN total_duration_ms > 300000           THEN 'LONG_RUNNING'
    WHEN read_rows > 100000000                THEN 'HIGH_ROW_SCAN'
    ELSE 'NORMAL'
  END AS optimization_priority,
  start_time
FROM finished_statements;

In [0]:
%sql
-- Summary of optimization candidates, one row per non-NORMAL priority label,
-- largest total scan volume first.
-- Aggregates are computed from the raw read_bytes / total_duration_ms columns
-- of the view and rounded once at the end. Aggregating the view's gb_scanned /
-- duration_seconds columns would sum values already rounded to two decimals
-- per query, so small queries would count as 0.00 GB and rounding error would
-- accumulate.
SELECT
  optimization_priority,
  COUNT(*)                                       AS query_count,
  COUNT(DISTINCT executed_by)                    AS unique_users,
  ROUND(AVG(read_bytes) / 1024 / 1024 / 1024, 2) AS avg_gb_scanned,
  ROUND(AVG(total_duration_ms) / 1000, 2)        AS avg_duration_seconds,
  ROUND(SUM(read_bytes) / 1024 / 1024 / 1024, 2) AS total_gb_scanned,
  ROUND(SUM(total_duration_ms) / 3600000, 2)     AS total_hours
FROM optimization_candidates
WHERE optimization_priority != 'NORMAL'
GROUP BY optimization_priority
ORDER BY total_gb_scanned DESC