## Listing and querying Kafka topics as Iceberg tables

In [None]:
# Create (or reuse) the Spark session that the cells below run against

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Streambased")
    .getOrCreate()
)

# Leave the session as the cell's last expression so its summary is displayed
spark

In [None]:
# In Streambased, a Kafka cluster is equivalent to an Iceberg namespace. We only have one in this demo, but
# it could operate over many.

In [None]:
%%sql
-- Make isk.isk the current namespace so the cells below can refer to
-- tables such as transactions without qualifying them.

USE isk.isk

In [None]:
# List Kafka topics that are represented as Iceberg tables

In [None]:
%%sql
-- Each table listed here corresponds to a Kafka topic in the current namespace

SHOW TABLES;

In [None]:
# Describe a topic/table - these descriptions are driven by Schema Registry but could come from other sources

In [None]:
%%sql
-- Show the column names and types of the transactions topic/table

DESCRIBE TABLE transactions;

In [None]:
# Iceberg gives us the ability to inspect the data files that make up a table's population. 
# Note the naming convention that indicates these represent chunks of Kafka offsets 

# Also note the partitioning applied: in this case the table is partitioned by hour, and we have chosen to represent
# each hour as one file.

In [None]:
%%sql
-- Query the files metadata table of transactions to list its data files

SELECT * FROM isk.isk.transactions.files;

In [None]:
# We can now query our Kafka data directly. This query fetches data for a single day and performs a common aggregation on it.

In [None]:
%%sql
-- Total taken in each currency for a single day (2025-04-21).
-- The day is written as a half-open range, from the start of the day up to but not
-- including the start of the next day. An inclusive upper bound of 23:59:59 would
-- drop rows stamped with fractional seconds in the final second of the day.

select 
    round(sum(t.amount),2) as total_taken,
    p.currency  
from 
    transactions t 
join 
    payment_terms p 
on 
    t.paymenttermcode=p.termcode
where 
    t.transactiontime >= '2025-04-21 00:00:00'
    and t.transactiontime < '2025-04-22 00:00:00'
group by 
    p.currency 
order by 
    p.currency asc;

In [None]:
# After the above query has executed please navigate to: http://localhost:4041/SQL/ and look at the details for the 
# latest completed query. You should see only 24 files were read for the job "BatchScan isk.isk.transactions". This demonstrates 
# that partitioning is working correctly (1 file per hour for 1 day = 24 files required to be read).

# Now we will remove the time bounds and run the query again

In [None]:
%%sql
-- Total taken in each currency across the whole table.
-- There is deliberately no filter on transactiontime here, so every
-- hourly partition of transactions has to be scanned.

SELECT
    ROUND(SUM(txn.amount), 2) AS total_taken,
    terms.currency
FROM transactions AS txn
INNER JOIN payment_terms AS terms
    ON txn.paymenttermcode = terms.termcode
GROUP BY terms.currency
ORDER BY terms.currency ASC;

In [None]:
# Returning to the latest query at http://localhost:4041/SQL/, you will see that this query read a far greater number of files.

In [None]:
# Please return to the demo script.