In [0]:
# Location of the raw sales-orders CSV that every read in this notebook uses.
path = "/Volumes/databricks_simulated_retail_customer_data/v01/source_files/sales_orders.csv"

In [0]:
from pyspark.sql.functions import * 
from pyspark.sql.types import * 

In [0]:
# First-pass read: use the first row as the header and let Spark infer column types.
first_pass_options = {"header": True, "inferSchema": True}
df = spark.read.options(**first_pass_options).csv(path)

# Preview a few rows of the result.
display(df.limit(10))

In [0]:
# Re-read the CSV with explicit quoting rules: '"' is both the quote and the
# escape character, and records are allowed to span several lines. No schema
# inference is requested here, so the columns are loaded as strings.
df = (
    spark.read
    .options(header='true', quote='"', escape='"', multiline='true')
    .csv(path)
)

display(df.limit(10))

In [0]:
# Print the column names and data types of the DataFrame read above.
df.printSchema()

In [0]:
# Look at the three nested-payload columns on their own before any parsing.
nested_columns = ['ordered_products', 'clicked_items', 'promo_info']
df_temp = df.select(*nested_columns)

display(df_temp.limit(3))

In [0]:
# Schema used to parse `ordered_products`: an array of product structs whose
# fields are all read as (nullable) strings.
_product_field_names = ["curr", "id", "name", "price", "promotion_info", "qty", "unit"]

ordered_products_schema = ArrayType(
    StructType(
        [StructField(field_name, StringType()) for field_name in _product_field_names]
    )
)

In [0]:
# Parse the JSON text in `ordered_products` into an array of structs and keep
# it next to the original columns as `ordered_products_parsed`.
parsed_products = from_json(col("ordered_products"), ordered_products_schema)
df_ordered_products = df.withColumn('ordered_products_parsed', parsed_products)

display(df_ordered_products.limit(3))

In [0]:
# Produce one row per ordered product. `explode` emits no row for a null or
# empty array, so orders whose product list failed to parse drop out here.
one_product_per_row = explode(col('ordered_products_parsed'))
df_explode = df_ordered_products.withColumn('product_ult', one_product_per_row)

display(df_explode.limit(3))

In [0]:
# Promote each field of the exploded `product_ult` struct to a top-level column.
# Keys are the new column names, values are the struct fields they come from;
# insertion order matches the order in which the columns are added.
product_field_by_column = {
    'currency': 'curr',
    'product_id': 'id',
    'name': 'name',
    'product_price': 'price',
    'product_qty': 'qty',
    'product_unit': 'unit',
    'promotion_info': 'promotion_info',
}

df = df_explode
for column_name, struct_field in product_field_by_column.items():
    df = df.withColumn(column_name, col(f"product_ult.{struct_field}"))

display(df.limit(5))

In [0]:
# Remove the intermediate parsing columns plus the raw payload columns that are
# not carried forward.
columns_to_remove = [
    'product_ult',
    'ordered_products_parsed',
    'ordered_products',
    'clicked_items',
    'promotion_info',
]
df = df.drop(*columns_to_remove)

display(df.limit(10))

In [0]:
# Schema for one promo object inside `promo_info`.
# Every field is read as a string: Spark's JSON parser treats a quoted number
# such as "2" as malformed for IntegerType instead of coercing it, so numeric
# fields are cast explicitly after parsing.
promo_schema = StructType([
    StructField("promo_disc", StringType()),
    StructField("promo_id", StringType()),
    StructField("promo_item", StringType()),
    StructField("promo_qty", StringType())
])

# This cell previously read from `df2` and a `promo` column, neither of which is
# defined anywhere in the notebook, so it failed on a fresh kernel. The missing
# step is rebuilt here directly from `promo_info`.
# NOTE(review): assumes `promo_info` holds a JSON array of promo objects (the
# original drop of "promo_info_array" suggests so) — confirm against the data.
df = (
    df.withColumn(
        "promo_info_array",
        from_json(col("promo_info"), ArrayType(promo_schema))
    )
    # explode_outer (not explode) keeps order lines whose promo list is null or
    # empty; their promo_* columns are simply null.
    .withColumn("promo_struct", explode_outer(col("promo_info_array")))
    .withColumn("promo_disc", col("promo_struct.promo_disc"))
    .withColumn("promo_id", col("promo_struct.promo_id"))
    .withColumn("promo_item", col("promo_struct.promo_item"))
    # Cast after parsing so promo_qty is still an integer column.
    .withColumn("promo_qty", col("promo_struct.promo_qty").cast("int"))
    .drop("promo_info_array", "promo_struct")
)

display(df.limit(10))

In [0]:
# Scratch note: a sample promo JSON object. promo_disc is an unquoted number,
# while promo_id and promo_qty are quoted strings.
# NOTE(review): presumably copied from the promo data — confirm the source.
"""{"promo_disc":0.03,
 "promo_id":"0",
 "promo_item":"AVpfMVD-ilAPnD_xW6bu",
 "promo_qty":"2"}"""

In [0]:
# Drop the raw promo columns by name now that their fields have been extracted.
raw_promo_columns = ["promo_info", 'promo']
df = df.drop(*raw_promo_columns)

display(df.limit(10))

In [0]:
# Count the rows currently in df; the result is shown as the cell output.
df.count()

In [0]:
# Tidy the customer name: remove commas, then capitalise each word.
tidy_customer_name = initcap(regexp_replace(col("customer_name"), ",", ""))

# Derive the calendar date from `order_datetime`, which is cast to long and
# interpreted as a Unix timestamp in seconds.
order_date_from_epoch = to_date(from_unixtime(col("order_datetime").cast("long")))

# Target type for each column that is still a string; applied in this order.
cast_type_by_column = {
    "order_number": "int",
    "customer_id": "int",
    "number_of_line_items": "int",
    "product_price": "double",
    "product_qty": "int",
}

df = (
    df.withColumn("customer_name", tidy_customer_name)
    .withColumn("order_date", order_date_from_epoch)
)
for column_name, target_type in cast_type_by_column.items():
    df = df.withColumn(column_name, col(column_name).cast(target_type))

display(df.limit(10))

In [0]:
# Print the schema again to check the column types after the casts above.
df.printSchema()

In [0]:
# Count the rows in df once more before it is written out.
df.count()

In [0]:
# Save the final DataFrame as a table, overwriting whatever the table held before.
target_table = 'analytics_star_schema.deltashare.sales_orders'
df.write.saveAsTable(target_table, mode="overwrite")