# Lectures 13-15
- extracts from lectures 13-15
- lecture 16 will be covered in the next course
- good to have some extra tools in your toolkit
- connecting DuckDB to Python

In [50]:
# Load the ingestion script as text. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform's default locale.
with open("sql/ingestion.sql", "r", encoding="utf-8") as sql_script:
    sql_code = sql_script.read()

sql_code

"CREATE SCHEMA IF NOT EXISTS staging;\n\nCREATE TABLE\n    IF NOT EXISTS staging.students AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/students.csv')\n    );\n\nCREATE TABLE\n    IF NOT EXISTS staging.teachers AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/teachers.csv')\n    );\n\nCREATE TABLE\n    IF NOT EXISTS staging.housing_sales AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/housing_sales.csv')\n    );\n\nCREATE TABLE\n    IF NOT EXISTS staging.neighborhoods AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/neighborhoods.csv')\n    );\n\nCREATE TABLE\n    IF NOT EXISTS staging.categories AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/categories.csv')\n    );\n\nCREATE TABLE\n    IF NOT EXISTS staging.sales AS (\n        SELECT\n            *\n        FROM\n            read_csv_auto ('data/sales.csv'

### Using a `with` statement to connect to DuckDB
- `with` manages the connection and closes it properly when the block ends
- if the database file doesn't exist, DuckDB creates it for us

In [51]:
import duckdb

# Open the file-backed database (created on first use) and let the context
# manager close the connection once the block exits.
with duckdb.connect("data/lecture_13_15.duckdb") as connection:
    # Run the ingestion script held in `sql_code`.
    connection.execute(sql_code)
    # Capture the catalog listing as a DataFrame while the connection is open.
    describe = connection.execute("DESC;").df()

describe

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,lecture_13_15,staging,average_price_type,"[property_type, avg_price]","[VARCHAR, INTEGER]",False
1,lecture_13_15,staging,categories,"[category_id, category_name, parent_category_id]","[BIGINT, VARCHAR, BIGINT]",False
2,lecture_13_15,staging,housing_sales,"[sale_id, neighborhood_id, address, date_of_sa...","[VARCHAR, BIGINT, VARCHAR, DATE, VARCHAR, BIGI...",False
3,lecture_13_15,staging,neighborhoods,"[neighborhood_id, name, city, postal_code, med...","[BIGINT, VARCHAR, VARCHAR, BIGINT, BIGINT, BIG...",False
4,lecture_13_15,staging,sales,"[product_id, product_name, category_id, quanti...","[VARCHAR, VARCHAR, BIGINT, BIGINT, BIGINT]",False
5,lecture_13_15,staging,students,"[student_id, grade_math, class_name]","[VARCHAR, BIGINT, VARCHAR]",False
6,lecture_13_15,staging,teachers,"[class_name, teacher_name]","[VARCHAR, VARCHAR]",False


In [52]:
# This doesn't work: the connection opened in the `with` block above is already closed.
# NOTE(review): presumably duckdb.sql() also targets DuckDB's default in-memory
# database rather than data/lecture_13_15.duckdb - confirm.
# duckdb.sql("FROM staging.students;")

In [53]:
from utils import query_database

# Preview the first five rows of the staged students table.
query_database("FROM staging.students;").head(5)

Unnamed: 0,student_id,grade_math,class_name
0,S001,92,A
1,S002,85,A
2,S003,78,B
3,S004,88,B
4,S005,95,C


## Subquery

- a query nested inside another query
- returns a temporary result set that the outer query can use


In [54]:
# Compute the average math grade across all students and keep the result in
# `avg` so later cells can reuse it.
avg = query_database("""--sql
    SELECT
        AVG(grade_math)
    FROM
        staging.students;
    """)

### find all rows where score is higher than average

In [55]:
# The inner SELECT computes the average grade; the outer query keeps only the
# students whose grade is strictly greater than that average.
query_database("""--sql
    SELECT
    *
    FROM
        staging.students
    WHERE grade_math > (
        SELECT
            AVG(grade_math)
        FROM
            staging.students
    );
""")

Unnamed: 0,student_id,grade_math,class_name
0,S001,92,A
1,S002,85,A
2,S004,88,B
3,S005,95,C
4,S009,89,E
5,S010,93,E


### solve without subquery

In [56]:
# Extract the scalar mean from the one-row result stored in `avg`.
avg["avg(grade_math)"].to_numpy()[0]

np.float64(84.2)

In [57]:
# Same filter as the subquery version, but the average computed earlier in
# Python (stored in `avg`) is interpolated into the SQL text as a literal.
# The value comes from our own database; prefer bound parameters for any
# value that originates from user input.
query_database(f"""
    SELECT
        *
    FROM 
        staging.students
    WHERE grade_math > {avg['avg(grade_math)'].values[0]}
""")

Unnamed: 0,student_id,grade_math,class_name
0,S001,92,A
1,S002,85,A
2,S004,88,B
3,S005,95,C
4,S009,89,E
5,S010,93,E


### Purposes of Views

In [58]:
# Preview the first five rows of the raw housing sales data.
query_database("""
    FROM staging.housing_sales;

""").head(5)

Unnamed: 0,sale_id,neighborhood_id,address,date_of_sale,property_type,rooms,living_area_sqm,year_built,sale_price_usd
0,S-24001,1,124 Maple Crest Ave,2024-10-09,Apartment,2,58,2018,245575
1,S-24002,1,9 Birch Hollow Ln,2025-02-21,Townhouse,4,112,2006,454575
2,S-24003,2,77 Riverwalk Way,2025-03-18,Detached,5,168,1999,757625
3,S-24004,2,18 Marina Bluff Ct,2025-07-05,Apartment,3,86,2015,412775
4,S-24005,3,401 Quarry Gate Rd,2024-12-12,Semi-Detached,4,124,1987,381425


Pick out the average price per property type and put it into a view to give to downstream users.

- if we store this as a table and the underlying data changes (e.g. more houses are added), the "average sales table" becomes stale and wrong

- with a view, the SQL query runs whenever a downstream user queries the view, so the results are always fresh

In [59]:
# Define (or redefine) a view with the average sale price per property type.
# CREATE OR REPLACE lets this cell be re-run without failing when the view
# already exists; the average is cast to an integer.
query_database("""--sql
CREATE OR REPLACE VIEW staging.average_price_type AS
SELECT
    property_type,
    AVG(sale_price_usd):: INT AS avg_price
FROM staging.housing_sales
GROUP BY property_type


""")

Unnamed: 0,Count


In [60]:
# Query the view like a table to see the average price per property type.
query_database("FROM staging.average_price_type;")

Unnamed: 0,property_type,avg_price
0,Terraced,350075
1,Loft,316112
2,Semi-Detached,391875
3,Bungalow,423225
4,Detached,699105
5,Apartment,290361
6,Townhouse,477913
7,Condo,312803


In [61]:
# List the views registered in the staging schema of the lecture_13_15 catalog.
# NOTE(review): there is no space between 'lecture_13_15' and AND; the query
# runs as written, but the SQL would read better with one.
query_database("""--sql
        FROM information_schema.views 
        WHERE table_catalog = 'lecture_13_15'AND table_schema = 'staging';
""")

Unnamed: 0,table_catalog,table_schema,table_name,view_definition,check_option,is_updatable,is_insertable_into,is_trigger_updatable,is_trigger_deletable,is_trigger_insertable_into
0,lecture_13_15,staging,average_price_type,CREATE VIEW staging.average_price_type AS SELE...,NONE,NO,NO,NO,NO,NO


## CTE - Common table expression

- uses the WITH statement
- simplifies complex queries and makes them more readable
- a temporary result set that you can reuse later in the query - like a "variable"

In [62]:
# Preview the first five rows of the staged sales table.
query_database("FROM staging.sales;").head(5)


Unnamed: 0,product_id,product_name,category_id,quantity,unit_price_sek
0,P-0001,"Ultrabook 13\"" 8GB/256GB",102,2,11990
1,P-0002,"Gaming Laptop 15\"" 16GB/512GB",102,1,15990
2,P-0003,Office Desktop i5,103,3,7990
3,P-0004,"All-in-One PC 24\""",103,1,9990
4,P-0005,"Android Tablet 10\""",104,2,3490


In [71]:
# Preview the first five rows of the staged categories table.
query_database("FROM staging.categories;").head(5)

Unnamed: 0,category_id,category_name,parent_category_id
0,100,Electronics,
1,101,Computers,100.0
2,102,Laptops,101.0
3,103,Desktops,101.0
4,104,Tablets,101.0


In [74]:
# The CTE `avg_sales` computes the average sale value (quantity * unit price,
# cast to an integer) per category_id. The main query joins it to the
# categories table to get readable names and sorts from highest to lowest.
query_database("""--sql
WITH avg_sales AS (
    SELECT 
        category_id,
        AVG(quantity*unit_price_sek)::INT AS avg_sales_sek
    FROM staging.sales
    GROUP BY category_id
)
SELECT 
    c.category_name, 
    avg_sales_sek 
FROM avg_sales a
JOIN staging.categories c ON c.category_id = a.category_id
ORDER BY avg_sales_sek DESC;
""")

Unnamed: 0,category_name,avg_sales_sek
0,Laptops,18820
1,Desktops,15485
2,Phones,7189
3,Furniture,3240
4,Tablets,2929
5,Electronics,2790
6,Appliances,2590
7,Fitness,1563
8,Camping,1059
9,Home & Kitchen,1035
