============================================================================
# BRONZE LAYER - Raw Data Ingestion
============================================================================


In [1]:
# Create the bronze database; IF NOT EXISTS keeps the statement harmless on re-runs.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# List the databases so the presence of bronze can be checked by eye.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

25/10/24 10:45:45 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+---------+
|namespace|
+---------+
|   bronze|
|  default|
|     gold|
|   silver|
+---------+



In [2]:
# DDL for bronze.transactions_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
transactions_raw = """
                   CREATE TABLE IF NOT EXISTS bronze.transactions_raw
                   (
                       transaction_id
                       STRING,
                       customer_id
                       STRING,
                       transaction_timestamp
                       STRING,
                       channel
                       STRING, -- online, mobile, in-store, marketplace
                       store_id
                       STRING,
                       payment_method
                       STRING,
                       payment_status
                       STRING,
                       subtotal
                       STRING,
                       tax_amount
                       STRING,
                       shipping_cost
                       STRING,
                       discount_amount
                       STRING,
                       total_amount
                       STRING,
                       currency
                       STRING,
                       loyalty_points_earned
                       STRING,
                       loyalty_points_redeemed
                       STRING,
                       coupon_codes
                       STRING, -- JSON array

                       -- Metadata
                       _source_system
                       STRING,
                       _ingestion_timestamp
                       TIMESTAMP,
                       _file_name
                       STRING,
                       _record_offset
                       LONG
                   )
                       USING PARQUET
                       PARTITIONED BY
                   (
                       _ingestion_timestamp
                   )
                       LOCATION 's3a://data/bronze/transactions_raw' \
                   """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(transactions_raw).show()

25/10/24 10:45:47 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


++
||
++
++



In [3]:
# DDL for bronze.transaction_items_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
transaction_items_raw = """
                        CREATE TABLE IF NOT EXISTS bronze.transaction_items_raw
                        (
                            transaction_item_id
                            STRING,
                            transaction_id
                            STRING,
                            product_id
                            STRING,
                            variant_id
                            STRING,
                            quantity
                            STRING,
                            unit_price
                            STRING,
                            discount_percentage
                            STRING,
                            tax_rate
                            STRING,
                            return_quantity
                            STRING,
                            return_reason
                            STRING,
                            fulfillment_status
                            STRING,
                            warehouse_id
                            STRING,

                            -- Metadata
                            _source_system
                            STRING,
                            _ingestion_timestamp
                            TIMESTAMP,
                            _file_name
                            STRING,
                            _record_offset
                            LONG
                        )
                            USING PARQUET
                            PARTITIONED BY
                        (
                            _ingestion_timestamp
                        )
                            LOCATION 's3a://data/bronze/transaction_items_raw' \
                        """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(transaction_items_raw).show()

++
||
++
++



In [4]:
# DDL for bronze.subscriptions_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
subscriptions_raw = """
                    CREATE TABLE IF NOT EXISTS bronze.subscriptions_raw
                    (
                        subscription_id
                        STRING,
                        customer_id
                        STRING,
                        subscription_type
                        STRING,
                        plan_id
                        STRING,
                        start_date
                        STRING,
                        end_date
                        STRING,
                        status
                        STRING, -- active, cancelled, paused, expired
                        billing_frequency
                        STRING, -- monthly, quarterly, annual
                        subscription_amount
                        STRING,
                        next_billing_date
                        STRING,
                        auto_renewal
                        STRING,
                        cancellation_date
                        STRING,
                        cancellation_reason
                        STRING,

                        -- Metadata
                        _source_system
                        STRING,
                        _ingestion_timestamp
                        TIMESTAMP,
                        _file_name
                        STRING,
                        _record_offset
                        LONG
                    )
                        USING PARQUET
                        PARTITIONED BY
                    (
                        _ingestion_timestamp
                    )
                        LOCATION 's3a://data/bronze/subscriptions_raw' \
                    """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(subscriptions_raw).show()

++
||
++
++



In [5]:
# DDL for bronze.customer_interactions_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
customer_interactions_raw = """
                            CREATE TABLE IF NOT EXISTS bronze.customer_interactions_raw
                            (
                                interaction_id
                                STRING,
                                customer_id
                                STRING,
                                interaction_timestamp
                                STRING,
                                interaction_type
                                STRING, -- support_ticket, chat, email, phone, social_media
                                channel
                                STRING,
                                agent_id
                                STRING,
                                category
                                STRING,
                                subcategory
                                STRING,
                                sentiment_score
                                STRING,
                                resolution_time_minutes
                                STRING,
                                satisfaction_rating
                                STRING,
                                notes
                                STRING,

                                -- Metadata
                                _source_system
                                STRING,
                                _ingestion_timestamp
                                TIMESTAMP,
                                _file_name
                                STRING,
                                _record_offset
                                LONG
                            )
                                USING PARQUET
                                PARTITIONED BY
                            (
                                _ingestion_timestamp
                            )
                                LOCATION 's3a://data/bronze/customer_interactions_raw' \
                            """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(customer_interactions_raw).show()

++
||
++
++



In [6]:
# DDL for bronze.product_catalog_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
product_catalog_raw = """
                      CREATE TABLE IF NOT EXISTS bronze.product_catalog_raw
                      (
                          product_id
                          STRING,
                          product_name
                          STRING,
                          category_level1
                          STRING,
                          category_level2
                          STRING,
                          category_level3
                          STRING,
                          brand
                          STRING,
                          manufacturer
                          STRING,
                          unit_cost
                          STRING,
                          list_price
                          STRING,
                          margin_percentage
                          STRING,
                          supplier_id
                          STRING,
                          lead_time_days
                          STRING,
                          weight_kg
                          STRING,
                          dimensions
                          STRING, -- JSON
                          tags
                          STRING, -- JSON array
                          launch_date
                          STRING,
                          discontinuation_date
                          STRING,

                          -- Metadata
                          _source_system
                          STRING,
                          _ingestion_timestamp
                          TIMESTAMP,
                          _file_name
                          STRING,
                          _record_offset
                          LONG
                      )
                          USING PARQUET
                          PARTITIONED BY
                      (
                          _ingestion_timestamp
                      )
                          LOCATION 's3a://data/bronze/product_catalog_raw' \
                      """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(product_catalog_raw).show()

++
||
++
++



In [7]:
# DDL for bronze.inventory_snapshots_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
inventory_snapshots_raw = """
                          CREATE TABLE IF NOT EXISTS bronze.inventory_snapshots_raw
                          (
                              snapshot_id
                              STRING,
                              snapshot_timestamp
                              STRING,
                              product_id
                              STRING,
                              variant_id
                              STRING,
                              warehouse_id
                              STRING,
                              quantity_on_hand
                              STRING,
                              quantity_reserved
                              STRING,
                              quantity_available
                              STRING,
                              reorder_point
                              STRING,
                              reorder_quantity
                              STRING,

                              -- Metadata
                              _source_system
                              STRING,
                              _ingestion_timestamp
                              TIMESTAMP,
                              _file_name
                              STRING,
                              _record_offset
                              LONG
                          )
                              USING PARQUET
                              PARTITIONED BY
                          (
                              _ingestion_timestamp
                          )
                              LOCATION 's3a://data/bronze/inventory_snapshots_raw' \
                          """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(inventory_snapshots_raw).show()

++
||
++
++



In [8]:
# DDL for bronze.marketing_campaigns_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# The backslash before the closing quotes is a line continuation inside the
# string literal, so the newline after the LOCATION clause is not part of the
# SQL text.
marketing_campaigns_raw = """
                          CREATE TABLE IF NOT EXISTS bronze.marketing_campaigns_raw
                          (
                              campaign_id
                              STRING,
                              campaign_name
                              STRING,
                              campaign_type
                              STRING, -- email, social, display, search, affiliate
                              channel
                              STRING,
                              start_date
                              STRING,
                              end_date
                              STRING,
                              budget
                              STRING,
                              target_audience
                              STRING, -- JSON
                              creative_id
                              STRING,

                              -- Metadata
                              _source_system
                              STRING,
                              _ingestion_timestamp
                              TIMESTAMP,
                              _file_name
                              STRING,
                              _record_offset
                              LONG
                          )
                              USING PARQUET
                              PARTITIONED BY
                          (
                              _ingestion_timestamp
                          )
                              LOCATION 's3a://data/bronze/marketing_campaigns_raw' \
                          """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(marketing_campaigns_raw).show()

++
||
++
++



In [9]:
# DDL for bronze.campaign_events_raw.
# Every source column is declared as STRING; only the metadata columns
# _ingestion_timestamp (TIMESTAMP) and _record_offset (LONG) are typed.
# The table is stored as Parquet at the given s3a:// LOCATION and is
# partitioned by _ingestion_timestamp.
# NOTE(review): partitioning on a full TIMESTAMP presumably yields one
# partition per distinct value - confirm the loader writes a single
# timestamp per ingestion batch.
# spark.sql() takes exactly one statement, so the text carries no ';'
# terminator. The backslash before the closing quotes is a line continuation
# inside the string literal, so the newline after the LOCATION clause is not
# part of the SQL text.
campaign_events_raw = """
                      CREATE TABLE IF NOT EXISTS bronze.campaign_events_raw
                      (
                          event_id
                          STRING,
                          campaign_id
                          STRING,
                          customer_id
                          STRING,
                          event_timestamp
                          STRING,
                          event_type
                          STRING, -- impression, click, conversion, unsubscribe
                          device_type
                          STRING,
                          location
                          STRING,
                          attributed_revenue
                          STRING,

                          -- Metadata
                          _source_system
                          STRING,
                          _ingestion_timestamp
                          TIMESTAMP,
                          _file_name
                          STRING,
                          _record_offset
                          LONG
                      )
                          USING PARQUET
                          PARTITIONED BY
                      (
                          _ingestion_timestamp
                      )
                          LOCATION 's3a://data/bronze/campaign_events_raw' \
                      """
# spark.sql() runs the DDL; .show() prints its (empty) result.
spark.sql(campaign_events_raw).show()

++
||
++
++



In [10]:
# List the tables registered in the bronze database.
# truncate=False keeps the table names from being cut off in the printout.
bronze_tables_df = spark.sql("SHOW TABLES IN bronze")
bronze_tables_df.show(truncate=False)

+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bronze   |campaign_events_raw      |false      |
|bronze   |customer_interactions_raw|false      |
|bronze   |inventory_snapshots_raw  |false      |
|bronze   |marketing_campaigns_raw  |false      |
|bronze   |product_catalog_raw      |false      |
|bronze   |subscriptions_raw        |false      |
|bronze   |transaction_items_raw    |false      |
|bronze   |transactions_raw         |false      |
+---------+-------------------------+-----------+



In [1]:
# Row-count sanity check: print one (table name, row count) row per bronze table.
tables = [
    'campaign_events_raw',
    'customer_interactions_raw',
    'inventory_snapshots_raw',
    'marketing_campaigns_raw',
    'product_catalog_raw',
    'subscriptions_raw',
    'transaction_items_raw',
    'transactions_raw',
]

# One SELECT per table. The names come from the hard-coded list above, so
# interpolating them into the SQL text is safe here.
count_queries = [
    f"select '{table}' as tt, count(*) as cnt from bronze.{table}"
    for table in tables
]

# UNION ALL keeps every row, matching DataFrame.union, so a single query
# replaces accumulating the result DataFrame by DataFrame in a loop.
df = spark.sql("\nunion all\n".join(count_queries))

# truncate=False prints the full table names; the default cuts long strings
# (e.g. 'customer_interact...').
df.show(truncate=False)

25/10/24 11:13:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------------------+---+
|                  tt|cnt|
+--------------------+---+
| campaign_events_raw| 45|
|customer_interact...| 20|
|inventory_snapsho...| 40|
|marketing_campaig...| 15|
| product_catalog_raw| 20|
|   subscriptions_raw| 20|
|transaction_items...| 25|
|    transactions_raw| 20|
+--------------------+---+

