In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ridership Open Lakehouse Demo (Part 1): Load data to BigQuery Iceberg tables

This notebook demonstrates a strategy for implementing an open lakehouse on GCP, using Apache Iceberg
as an open source standard for managing data while still leveraging GCP-native capabilities. This demo uses
BigQuery Managed Iceberg Tables, Managed Apache Kafka and Apache Kafka Connect to ingest streaming data, Vertex AI for Generative AI queries on top of the data, and Dataplex to govern tables.

This notebook will load data into BigQuery, backed by Parquet files, in the Apache Iceberg specification.

All data in this notebook was prepared in the previous `part0` notebook.

## Set up the environment

In [None]:
import os
USER_AGENT = "cloud-solutions/data-to-ai-nb-v3"

PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
BQ_DATASET = "ridership_lakehouse"
BUCKET_NAME = f"{PROJECT_ID}-ridership-lakehouse"
LOCATION = "us-central1"
BQ_CONNECTION_NAME = "cloud-resources-connection"

print(PROJECT_ID)
print(BUCKET_NAME)

In [None]:
from google.cloud import bigquery, storage
from google.api_core.client_info import ClientInfo

# Each client receives its own ClientInfo instance carrying the demo's
# user-agent string.
_bq_client_info = ClientInfo(user_agent=USER_AGENT)
_gcs_client_info = ClientInfo(user_agent=USER_AGENT)

# BigQuery client bound to the demo project and location.
bigquery_client = bigquery.Client(
    project=PROJECT_ID, location=LOCATION, client_info=_bq_client_info
)

# Cloud Storage client bound to the demo project.
storage_client = storage.Client(project=PROJECT_ID, client_info=_gcs_client_info)

## Create the tables and load data

In [None]:
# Recreate the `bus_stations` table from scratch as an Iceberg-format table
# (Parquet files) stored under `bus_stops_uri`.
bus_stops_uri = f"gs://{BUCKET_NAME}/iceberg_data/bus_stations/"
bus_stations_table_id = f"{BQ_DATASET}.bus_stations"
connection_id = f"{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}"

# Drop any previous copy and wait for the job to finish before re-creating.
drop_job = bigquery_client.query(f"DROP TABLE IF EXISTS {bus_stations_table_id};")
drop_job.result()

# NOTE(review): the `longtitude` column spelling is kept exactly as-is;
# renaming it would change the table schema.
query = f"""
CREATE TABLE {bus_stations_table_id}
(
  bus_stop_id INTEGER,
  address STRING,
  school_zone BOOLEAN,
  seating BOOLEAN,
  borough STRING,
  latitude FLOAT64,
  longtitude FLOAT64
)
WITH CONNECTION `{connection_id}`
OPTIONS (
  file_format = 'PARQUET',
  table_format = 'ICEBERG',
  storage_uri = '{bus_stops_uri}');
"""
create_job = bigquery_client.query(query)
create_job.result()

In [None]:
# Recreate the `bus_lines` table from scratch as an Iceberg-format table
# (Parquet files) stored under `bus_lines_uri`.
bus_lines_uri = f"gs://{BUCKET_NAME}/iceberg_data/bus_lines/"
bus_lines_table_id = f"{BQ_DATASET}.bus_lines"
connection_id = f"{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}"

# Drop any previous copy and wait for the job to finish before re-creating.
drop_job = bigquery_client.query(f"DROP TABLE IF EXISTS {bus_lines_table_id};")
drop_job.result()

_create_table_stmt = f"""
    CREATE TABLE {bus_lines_table_id} (
        bus_line_id INTEGER,
        bus_line STRING,
        number_of_stops INTEGER,
        stops ARRAY<INTEGER>,
        frequency_minutes INTEGER
    )
    WITH CONNECTION `{connection_id}`
    OPTIONS (
        file_format = 'PARQUET',
        table_format = 'ICEBERG',
        storage_uri = '{bus_lines_uri}'
    );
"""
create_job = bigquery_client.query(_create_table_stmt)
create_job.result()

In [None]:
# Recreate the `ridership` table from scratch as an Iceberg-format table
# (Parquet files) stored under `ridership_uri`.
ridership_uri = f"gs://{BUCKET_NAME}/iceberg_data/ridership/"
ridership_table_id = f"{BQ_DATASET}.ridership"
connection_id = f"{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}"

# Drop any previous copy and wait for the job to finish before re-creating.
drop_job = bigquery_client.query(f"DROP TABLE IF EXISTS {ridership_table_id};")
drop_job.result()

_create_table_stmt = f"""
    CREATE TABLE {ridership_table_id} (
        transit_timestamp TIMESTAMP,
        station_id INTEGER,
        ridership INTEGER
    )
    WITH CONNECTION `{connection_id}`
    OPTIONS (
        file_format = 'PARQUET',
        table_format = 'ICEBERG',
        storage_uri = '{ridership_uri}'
    );
"""
create_job = bigquery_client.query(_create_table_stmt)
create_job.result()

In [None]:
# Load the staged `bus_lines.json` file from Cloud Storage into the
# `bus_lines` table.
#
# The dataset reference is built with the `DatasetReference` constructor:
# `Client.dataset()` is deprecated in google-cloud-bigquery in favor of it.
# The project is the same one the client was created with.
dataset_ref = bigquery.DatasetReference(PROJECT_ID, BQ_DATASET)
table_ref = dataset_ref.table("bus_lines")

# BQ tables for Apache Iceberg do not support load with truncating, so we will truncate manually, and then load
truncate = bigquery_client.query(f"DELETE FROM {BQ_DATASET}.bus_lines WHERE TRUE")
truncate.result()

# Append newline-delimited JSON rows to the (now empty) table.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)

job = bigquery_client.load_table_from_uri(
    f"gs://{BUCKET_NAME}/mta_staging_data/bus_lines.json",
    table_ref,
    job_config=job_config,
)

# Block until the load job completes.
job.result()

In [None]:
# Load the staged `bus_stations.csv` file from Cloud Storage into the
# `bus_stations` table.
table_ref = dataset_ref.table("bus_stations")

# Loads into BQ tables for Apache Iceberg can't use a truncating write
# disposition, so the table is emptied with a DELETE first and then appended to.
truncate_job = bigquery_client.query(
    f"DELETE FROM {BQ_DATASET}.bus_stations WHERE TRUE"
)
truncate_job.result()

# CSV load that appends rows and skips one leading row.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

bus_stations_source_uri = f"gs://{BUCKET_NAME}/mta_staging_data/bus_stations.csv"
load_job = bigquery_client.load_table_from_uri(
    bus_stations_source_uri, table_ref, job_config=job_config
)

# Block until the load job completes.
load_job.result()

In [None]:
# Load every staged CSV file matching `ridership/*.csv` from Cloud Storage
# into the `ridership` table.
table_ref = dataset_ref.table("ridership")

# Loads into BQ tables for Apache Iceberg can't use a truncating write
# disposition, so the table is emptied with a DELETE first and then appended to.
truncate_job = bigquery_client.query(
    f"DELETE FROM {BQ_DATASET}.ridership WHERE TRUE"
)
truncate_job.result()

# CSV load that appends rows and skips one leading row.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

ridership_source_uri = f"gs://{BUCKET_NAME}/mta_staging_data/ridership/*.csv"
load_job = bigquery_client.load_table_from_uri(
    ridership_source_uri, table_ref, job_config=job_config
)

# Block until the load job completes.
load_job.result()

## Basic Analytics
After loading the data into our open data lakehouse, we will demonstrate some basic analytics, repeating the process with several different engines:
- BigQuery
- Spark (serverless?)
- Dataflow