### 1. Preprocess the JSON string to fix the data quality issues.

### 2. Transform JSON string to JSON object.

### 3. Write transformed data to Silver schema.



In [0]:
# Load every row of the raw orders table from the Bronze layer and preview it.
bronze_orders_query = "select * from gizmobox.bronze.py_orders"
df_orders_raw = spark.sql(bronze_orders_query)
display(df_orders_raw)

### 1. Preprocess the JSON string to fix the data quality issues.

In [0]:
from pyspark.sql import functions as F

# The raw JSON carries order_date as a bare yyyy-mm-dd token (no quotes), which
# is not valid JSON. Wrap the captured date in double quotes so the string can
# be parsed later. Values that are already quoted do not match the pattern.
unquoted_order_date = '"order_date": (\\d{4}-\\d{2}-\\d{2})'
quoted_order_date = '"order_date": "$1"'

fixed_value_col = F.regexp_replace(
    "value", unquoted_order_date, quoted_order_date
).alias('fixed_value')

# Only the repaired JSON string is kept; it is exposed as `fixed_value`.
df_orders_raw_fixed = df_orders_raw.select(fixed_value_col)
display(df_orders_raw_fixed)

### 2. Transform JSON string to JSON object.

In [0]:
# Ask Spark to infer the schema (as a DDL string) of the repaired JSON so it
# can be inspected; only the first row is displayed.
# NOTE(review): the hard-coded orders_schema used for parsing presumably was
# copied from this output - confirm when the source data changes.
inferred_schema_col = F.schema_of_json(F.col("fixed_value")).alias("order_json_schema")
orders_json_schema = df_orders_raw_fixed.select(inferred_schema_col)
display(orders_json_schema.limit(1))

In [0]:
# DDL schema used to parse each order JSON string. It is written as adjacent
# string literals (one field per line) purely for readability; the pieces
# concatenate to a single-line DDL string.
orders_schema = (
    "STRUCT<"
    "customer_id: BIGINT, "
    "items: ARRAY<STRUCT<"
    "category: STRING, "
    "details: STRUCT<brand: STRING, color: STRING>, "
    "item_id: BIGINT, "
    "name: STRING, "
    "price: BIGINT, "
    "quantity: BIGINT"
    ">>, "
    "order_date: STRING, "
    "order_id: BIGINT, "
    "order_status: STRING, "
    "payment_method: STRING, "
    "total_amount: BIGINT, "
    "transaction_timestamp: STRING"
    ">"
)

In [0]:
# Parse the repaired JSON string into a single struct column named
# `orders_json`, typed according to orders_schema.
parsed_order_col = F.from_json(F.col("fixed_value"), orders_schema).alias('orders_json')
orders_json = df_orders_raw_fixed.select(parsed_order_col)

display(orders_json)

### 3. Write transformed data to Silver schema.

In [0]:
# Persist the parsed orders to the Silver layer; createOrReplace() creates the
# table if it is missing and replaces it otherwise.
silver_orders_json_writer = orders_json.writeTo("gizmobox.silver.py_orders_json")
silver_orders_json_writer.createOrReplace()

In [0]:
%sql
-- Preview the parsed orders that were written to the Silver table.
SELECT * FROM gizmobox.silver.py_orders_json;

### 4. Normalise and explode the items array.

In [0]:

# Read the parsed orders back from the Silver table and preview them.
# The table name is fully qualified (catalog.schema.table), matching every
# other table reference in this notebook, so the query does not depend on the
# session's current catalog/schema being set to gizmobox.silver.
df_orders_normalised = spark.sql('select * from gizmobox.silver.py_orders_json')
display(df_orders_normalised)

In [0]:
from pyspark.sql import functions as F

# Produce one row per distinct item of each order: duplicate entries inside the
# items array are removed before the array is exploded into rows.
exploded_item_col = F.explode(F.array_distinct('orders_json.items')).alias('items')

# Order-level fields that follow the exploded item in the output.
trailing_order_fields = [
    'orders_json.order_date',
    'orders_json.order_id',
    'orders_json.order_status',
    'orders_json.payment_method',
    'orders_json.total_amount',
    'orders_json.transaction_timestamp',
]

df_orders_normalised_exploded = orders_json.select(
    'orders_json.customer_id',
    exploded_item_col,
    *trailing_order_fields,
)
display(df_orders_normalised_exploded)

In [0]:
# Flatten the exploded `items` struct (including its nested `details` struct)
# into top-level columns next to the order-level fields. The list order is the
# output column order.
final_order_columns = [
    'customer_id',
    'items.category',
    'items.item_id',
    'items.details.brand',
    'items.details.color',
    'items.name',
    'items.price',
    'items.quantity',
    'order_date',
    'order_id',
    'order_status',
    'payment_method',
    'total_amount',
    'transaction_timestamp',
]
df_orders_final = df_orders_normalised_exploded.select(*final_order_columns)
display(df_orders_final)

In [0]:
# Persist the flattened, one-row-per-item orders to the Silver layer;
# createOrReplace() creates the table if it is missing and replaces it otherwise.
silver_orders_final_writer = df_orders_final.writeTo('gizmobox.silver.py_orders_final')
silver_orders_final_writer.createOrReplace()

In [0]:
%sql
-- Preview the flattened orders that were written to the Silver table.
SELECT * FROM gizmobox.silver.py_orders_final;