In [2]:
from pyspark.shell import spark
from pyspark.sql import DataFrame
from pyspark.sql.types import LongType

# Settings for this ingestion run: the target table's name, the source CSV
# file that feeds it, and the database the table belongs to.
table_config = dict(
    name="transactions_raw",
    csv_file="bronze_transactions_raw.csv",
    database="bronze",
)

In [3]:
# Pull the individual settings out of table_config and derive the
# database-qualified table name used by the later Spark SQL / write cells.
database = table_config["database"]
table_name = table_config["name"]
csv_file = table_config["csv_file"]
full_table_name = ".".join((database, table_name))

# Echo the resolved source file and target table, one per line.
print(csv_file, full_table_name, sep="\n")

bronze_transactions_raw.csv
bronze.transactions_raw


In [75]:
# Full s3a:// URI of the source CSV for this table.
input_path = f"s3a://data/source_data/{csv_file}"

# Load the CSV with its first line as the header and schema inference turned
# off, so every column arrives as a string; double quotes delimit quoted fields.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "false")
    .option("quote", "\"")
    .csv(input_path)
)

In [76]:
# Preview the freshly loaded rows; truncate=False keeps long cell values intact.
df.show(truncate=False)

+--------------+-----------+---------------------+-----------+--------+--------------+--------------+--------+----------+-------------+---------------+------------+--------+---------------------+-----------------------+--------------+---------------+--------------------+--------------------------+--------------+
|transaction_id|customer_id|transaction_timestamp|channel    |store_id|payment_method|payment_status|subtotal|tax_amount|shipping_cost|discount_amount|total_amount|currency|loyalty_points_earned|loyalty_points_redeemed|coupon_codes  |_source_system |_ingestion_timestamp|_file_name                |_record_offset|
+--------------+-----------+---------------------+-----------+--------+--------------+--------------+--------+----------+-------------+---------------+------------+--------+---------------------+-----------------------+--------------+---------------+--------------------+--------------------------+--------------+
|TXN001        |CUST10001  |2025-10-20 14:32:15  |online  

In [77]:
from pyspark.sql.functions import to_timestamp, col
from pyspark.sql import DataFrame

# Convert the two ingestion-metadata columns from their raw CSV strings to
# typed columns in a single projection, rather than adding "*_casted" helper
# columns and then dropping/renaming them back.
#   _ingestion_timestamp -> timestamp, parsed with "yyyy-MM-dd HH:mm:ss"
#   _record_offset       -> bigint
# All other columns pass through unchanged and keep their relative order; the
# two converted columns follow them (_ingestion_timestamp, then _record_offset).
casted_names = ("_ingestion_timestamp", "_record_offset")
untouched = [name for name in df.columns if name not in casted_names]

df: DataFrame = df.select(
    *untouched,
    to_timestamp(col("_ingestion_timestamp"), "yyyy-MM-dd HH:mm:ss").alias("_ingestion_timestamp"),
    col("_record_offset").cast("bigint").alias("_record_offset"),
)

In [78]:
# Check the schema after the casts: _ingestion_timestamp should now be a
# timestamp and _record_offset a long, with every other column still a string.
df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- transaction_timestamp: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- payment_status: string (nullable = true)
 |-- subtotal: string (nullable = true)
 |-- tax_amount: string (nullable = true)
 |-- shipping_cost: string (nullable = true)
 |-- discount_amount: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- loyalty_points_earned: string (nullable = true)
 |-- loyalty_points_redeemed: string (nullable = true)
 |-- coupon_codes: string (nullable = true)
 |-- _source_system: string (nullable = true)
 |-- _file_name: string (nullable = true)
 |-- _ingestion_timestamp: timestamp (nullable = true)
 |-- _record_offset: long (nullable = true)



In [82]:
# Collect the target table's column names, in the order SHOW COLUMNS returns them.
# The loop variable is named `row` so it does not shadow the imported `col` function.
column_rows = spark.sql(f"SHOW COLUMNS IN {full_table_name}").select('col_name').collect()
cols = [row.col_name for row in column_rows]

In [83]:
# Show the target table's column order that the DataFrame will be aligned to.
print(cols)

['transaction_id', 'customer_id', 'transaction_timestamp', 'channel', 'store_id', 'payment_method', 'payment_status', 'subtotal', 'tax_amount', 'shipping_cost', 'discount_amount', 'total_amount', 'currency', 'loyalty_points_earned', 'loyalty_points_redeemed', 'coupon_codes', '_source_system', '_file_name', '_record_offset', '_ingestion_timestamp']


In [84]:
# Reorder the DataFrame's columns to follow the target table's column list.
df = df.select(*cols)

In [85]:
# Confirm the reordered schema now lists columns in the same order as `cols`.
df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- transaction_timestamp: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- payment_status: string (nullable = true)
 |-- subtotal: string (nullable = true)
 |-- tax_amount: string (nullable = true)
 |-- shipping_cost: string (nullable = true)
 |-- discount_amount: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- loyalty_points_earned: string (nullable = true)
 |-- loyalty_points_redeemed: string (nullable = true)
 |-- coupon_codes: string (nullable = true)
 |-- _source_system: string (nullable = true)
 |-- _file_name: string (nullable = true)
 |-- _record_offset: long (nullable = true)
 |-- _ingestion_timestamp: timestamp (nullable = true)



In [86]:
# Write the rows into the existing target table.
# insertInto resolves columns by position rather than by name, so this depends
# on df having already been reordered to the table's column order.
# NOTE(review): no overwrite flag is passed, so re-running this cell presumably
# appends the same rows again — confirm that is intended.
df.write.insertInto(full_table_name)

25/10/23 15:11:21 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/10/23 15:11:21 WARN RetryingMetaStoreClient: MetaStoreClient lost connection. Attempting to reconnect (1 of 1) after 1s. listPartitionsWithAuthInfo
org.apache.thrift.transport.TTransportException: Cannot write to null outputStream
	at org.apache.thrift.transport.TIOStreamTransport.write(TIOStreamTransport.java:142)
	at org.apache.thrift.protocol.TBinaryProtocol.writeI32(TBinaryProtocol.java:185)
	at org.apache.thrift.protocol.TBinaryProtocol.writeMessageBegin(TBinaryProtocol.java:116)
	at org.apache.thrift.TServiceClient.sendBase(TServiceClient.java:70)
	at org.apache.thrift.TServiceClient.sendBase(TServiceClient.java:62)
	at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.send_get_partitions_ps_with_auth(ThriftHiveMetastore.java:2562)
	at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.get_

In [87]:
# Read the target table back and display it to check the inserted rows.
table_contents = spark.sql(f"SELECT * FROM {full_table_name}")
table_contents.show()

+--------------+-----------+---------------------+-----------+--------+--------------+--------------+--------+----------+-------------+---------------+------------+--------+---------------------+-----------------------+--------------+---------------+--------------------+--------------+--------------------+
|transaction_id|customer_id|transaction_timestamp|    channel|store_id|payment_method|payment_status|subtotal|tax_amount|shipping_cost|discount_amount|total_amount|currency|loyalty_points_earned|loyalty_points_redeemed|  coupon_codes| _source_system|          _file_name|_record_offset|_ingestion_timestamp|
+--------------+-----------+---------------------+-----------+--------+--------------+--------------+--------+----------+-------------+---------------+------------+--------+---------------------+-----------------------+--------------+---------------+--------------------+--------------+--------------------+
|        TXN004|  CUST10004|  2025-10-20 09:23:45|     online|    NULL|   cr