In [None]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructField, StructType, IntegerType,
    StringType, DateType, DecimalType
)

from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

import logging

# Set log level

In [None]:
# Configure the root logger at INFO level so the progress messages logged by
# main() (logging.info / logging.error) are emitted.
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers attached — confirm INFO records actually show up on this runtime.
logging.basicConfig(level=logging.INFO)

# Create parameters

In [None]:
# Register one text widget per notebook parameter; each one defaults to '0'.
for widget_name in ('storage_account', 'year', 'month', 'day'):
    dbutils.widgets.text(widget_name, '0')

In [None]:
# Read the notebook parameters back from their widgets (values are strings).
storage_account, year, month, day = (
    dbutils.widgets.get(parameter_name)
    for parameter_name in ('storage_account', 'year', 'month', 'day')
)

# Bronze input folder, data quality checks file and silver output folder;
# the bronze and silver folders are partitioned by year/month/day.
bronze_file_path = f'abfss://bronze@{storage_account}.dfs.core.windows.net/raw_data/{year}/{month}/{day}/'
checks_file_path = '/pipeline_project/check/checks_silver.yml'
silver_storage_path = f'abfss://silver@{storage_account}.dfs.core.windows.net/transformed_data/{year}/{month}/{day}/'

# Data quality engine built on a Databricks workspace client.
dq_engine = DQEngine(WorkspaceClient())

# Define schema

In [None]:
# Schema of the bronze data, built from (column name, Spark type) pairs.
# Every column is nullable. 'Amount' is read as a string; clean_data strips
# '$' and ',' from it and casts it to an integer.
bronze_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('Sales_Person_ID', IntegerType()),
        ('Sales_Person', StringType()),
        ('Country', StringType()),
        ('Product_ID', IntegerType()),
        ('Product', StringType()),
        ('Date', DateType()),
        ('Amount', StringType()),
        ('Boxes_Shipped', IntegerType()),
    )
])

# Run common functions

In [None]:
%run ./utils/common_functions

# Define extra functions

In [None]:
def clean_data(df_bronze: DataFrame) -> DataFrame:
    """
    Clean data before adding new columns.

    Steps, in order:
        1. Drop rows that contain a null in any column.
        2. Trim whitespace from the string columns.
        3. Replace the country abbreviations 'UK' and 'USA' with full names.
        4. Drop duplicate rows (same sales person, country, product, date
           and boxes shipped).
        5. Strip '$' and ',' from 'Amount', cast it to integer and rename
           it to 'Revenue'.
        6. Keep only rows whose 'Revenue' and 'Boxes_Shipped' are positive.

    Parameter:
        df_bronze: Dataframe containing bronze data.

    Return:
        Cleaned dataframe.
    """

    # drop rows with a null in any column
    df_cleaned = df_bronze.dropna()

    # remove whitespace first, so that the country mapping and the duplicate
    # check below compare normalised values (' UK' would not match 'UK')
    for column_name in ('Sales_Person', 'Country', 'Product', 'Amount'):
        df_cleaned = df_cleaned.withColumn(column_name, F.trim(column_name))

    # change country name
    df_cleaned = df_cleaned.withColumn(
        'Country',
        F.when(F.col('Country') == 'UK', 'United Kingdom')
        .when(F.col('Country') == 'USA', 'United States')
        .otherwise(F.col('Country'))
    )

    # drop duplicates after normalisation, so rows that only differed by
    # whitespace or country spelling are recognised as the same record
    df_cleaned = df_cleaned.dropDuplicates(
        ['Sales_Person_ID', 'Country', 'Product_ID', 'Date', 'Boxes_Shipped']
    )

    # change column type and name
    df_cleaned = df_cleaned \
        .withColumn('Amount', F.regexp_replace('Amount', '[$,]', '').cast(IntegerType())) \
        .withColumnRenamed('Amount', 'Revenue')

    # remove zero and negative values; rows whose Revenue could not be cast
    # (null) are dropped as well because the comparison evaluates to null
    df_cleaned = df_cleaned.filter((F.col('Revenue') > 0) & (F.col('Boxes_Shipped') > 0))

    return df_cleaned

In [None]:
def add_columns(df_cleaned: DataFrame) -> DataFrame:
    """
    Enrich the cleaned dataframe with name, revenue-per-box and date columns.

    Parameter:
        df_cleaned: Cleaned dataframe.

    Return:
        Dataframe with the derived columns appended.
    """

    # space-separated tokens of the sales person's name
    name_parts = F.split(F.col('Sales_Person'), ' ')

    # revenue per shipped box, rounded to 2 decimal places
    revenue_per_box = F.round(F.col('Revenue') / F.col('Boxes_Shipped'), 2).cast(DecimalType(10, 2))

    # derived columns, listed in the order they are appended
    derived_columns = {
        'First_Name': name_parts[0],
        'Last_Name': name_parts[1],
        'Revenue_Per_Box': revenue_per_box,
        'Date_Key': F.date_format('Date', 'yyyyMMdd'),
        'Year': F.year('Date'),
        'Quarter': F.quarter('Date'),
        'Month': F.month('Date'),
        'Day': F.dayofmonth('Date'),
        'Start_Of_Year': F.trunc('Date', 'year'),
        'Start_Of_Quarter': F.trunc('Date', 'quarter'),
        'Start_Of_Month': F.trunc('Date', 'month'),
    }

    df_added = df_cleaned
    for column_name, expression in derived_columns.items():
        df_added = df_added.withColumn(column_name, expression)

    return df_added

In [None]:
def write_data(df: DataFrame, silver_storage_path: str) -> None:
    """
    Persist the dataframe to the silver layer as parquet.

    Parameter:
        df: Target dataframe.
        silver_storage_path: Path to silver layer storage.

    Return:
        None.
    """

    # 'overwrite' mode replaces whatever already exists at the target path
    parquet_writer = df.write.format('parquet').mode('overwrite')
    parquet_writer.save(silver_storage_path)

# Define main function

In [None]:
def main() -> None:
    """
    Run the bronze-to-silver transformation end to end.

    Reads the bronze data, cleans it, adds the derived columns, runs the
    data quality checks and writes the result to the silver layer. Any
    exception is logged and then re-raised to the caller.

    read_data and data_quality_checks are not defined in this notebook —
    presumably they come from the ./utils/common_functions notebook that is
    run earlier (confirm).

    Parameter:
        None.

    Return:
        None.
    """

    try:
        logging.info('Start transforming data.')

        # bronze -> cleaned -> enriched
        bronze_df = read_data(spark, 'parquet', bronze_schema, bronze_file_path)
        silver_df = add_columns(clean_data(bronze_df))

        # check the enriched data, then persist it to silver
        data_quality_checks(dq_engine, checks_file_path, silver_df)
        write_data(silver_df, silver_storage_path)

        logging.info('Wrote data to silver layer.')

    except Exception as error:
        logging.error(f'Error occured: {error}')
        raise

# Run

In [None]:
# Run the pipeline only when this notebook is executed as the main program.
if __name__ == '__main__':
    main()