In [1]:
import requests
import zipfile
import duckdb
import os
import time

# Remote ZIP archive to download (hosted on stats.govt.nz).
url1 = (
    "https://www.stats.govt.nz/assets/Uploads/Business-employment-data/"
    "Business-employment-data-March-2024-quarter/Download-data/"
    "business-employment-data-march-2024-quarter.zip"
)

# Local locations: where the archive is saved and where it is unpacked.
local_zip_path1 = './business_employment_data.zip'
extracted_dir = './extracted/'

# Name of the CSV to load, and its path under the extraction directory.
csv_file_name1 = 'machine-readable-business-employment-data-mar-2024-quarter.csv'
csv_file_path1 = os.path.join(extracted_dir, csv_file_name1)


# Download zip file 1 and extract it.
# The timeout keeps the script from hanging indefinitely on a stalled
# connection, and raise_for_status() stops an HTTP error page from being saved
# to disk as the archive (which would otherwise only surface later as a
# confusing zipfile.BadZipFile).
response = requests.get(url1, timeout=60)
response.raise_for_status()
with open(local_zip_path1, 'wb') as f:
    f.write(response.content)

with zipfile.ZipFile(local_zip_path1, 'r') as zip_ref:
    # Extract all files
    zip_ref.extractall(extracted_dir)


# Open the DuckDB database stored in the file "employment.db".
conn = duckdb.connect("employment.db")

# (Re)create the business_employment table from the extracted CSV, letting
# DuckDB infer the column names and types.
create_table_query1 = f"""
CREATE OR REPLACE TABLE  business_employment AS
SELECT * FROM read_csv_auto('{csv_file_path1}');
"""

# Filled-jobs figures for territorial-authority groups, with the sum and
# average of data_value per (Period, group_name, Series_title_1), ordered by
# the average, highest first.
# NOTE: the grouping is on all three columns, not on Period alone as the
# in-SQL "Step 2" comment suggests. The pattern 'Filled jobs' contains no
# wildcard, so that LIKE matches only the exact string.
filled_jobs_query = """
-- Step 1: Filter records to include only those with 'Filled jobs' in Series_title_1
-- and 'Territorial authority' in the 'Group' field, and where 'data_value' is not null.
WITH FilteredRecords AS (
    SELECT Period,"Group" as group_name, Series_title_1, data_value
    FROM business_employment
    WHERE Series_title_1 LIKE 'Filled jobs'
      AND "Group" LIKE 'Territorial authority%'
      AND data_value IS NOT NULL
)

-- Step 2: Aggregate the filtered records by 'Period', calculating the sum and average of 'data_value'.
SELECT
    Period, group_name, Series_title_1,
    SUM(data_value) AS total_data_value,
    AVG(data_value) AS avg_data_value
FROM FilteredRecords
GROUP BY Period, group_name, Series_title_1

-- Step 3: Order the results by the average 'data_value' in descending order.
ORDER BY avg_data_value DESC;

"""

# try/finally guarantees the connection is closed even if loading the CSV or
# running a query raises.
try:
    # Create Tables
    conn.execute(create_table_query1)

    conn.sql("SHOW ALL TABLES").show()

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    cur_time = time.perf_counter()
    conn.sql(filled_jobs_query).show()
    print(f"time2: {time.perf_counter() - cur_time}")
finally:
    conn.close()

┌────────────┬─────────┬─────────────────────┬──────────────────────┬──────────────────────────────────────┬───────────┐
│  database  │ schema  │        name         │     column_names     │             column_types             │ temporary │
│  varchar   │ varchar │       varchar       │      varchar[]       │              varchar[]               │  boolean  │
├────────────┼─────────┼─────────────────────┼──────────────────────┼──────────────────────────────────────┼───────────┤
│ employment │ main    │ business_employment │ [Series_reference,…  │ [VARCHAR, DOUBLE, DOUBLE, VARCHAR,…  │ false     │
└────────────┴─────────┴─────────────────────┴──────────────────────┴──────────────────────────────────────┴───────────┘

┌─────────┬──────────────────────────────────────────────┬────────────────┬──────────────────┬────────────────────┐
│ Period  │                  group_name                  │ Series_title_1 │ total_data_value │   avg_data_value   │
│ double  │                   varchar    