In [0]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnull, count, when
from pyspark.sql.functions import count, when, col,split,to_timestamp
from pyspark.sql.types import IntegerType, DateType, FloatType, TimestampType

In [0]:
# Build (or reuse) the Spark session for this notebook. The time-parser
# policy is switched to LEGACY at session level, so it applies to every
# date/timestamp parse executed through this session.
spark = (
    SparkSession.builder
    .appName("online_retail")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [0]:
# Load the landing table `destination.online_retail` over JDBC.
#
# Credentials are taken from the environment instead of being committed in
# the notebook source:
#   DESTINATION_DB_PASSWORD  (required) - a missing value raises KeyError
#                            before any connection attempt is made.
#   DESTINATION_DB_USER      (optional) - defaults to the login this notebook
#                            has always used for reading.
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:sqlserver://destination-landing-db.database.windows.net:1433;database=destination")
    .option("dbtable", "destination.online_retail")
    .option("user", os.environ.get("DESTINATION_DB_USER", "adminuser@destination-landing-db"))
    .option("password", os.environ["DESTINATION_DB_PASSWORD"])
    .load()
)

In [0]:
def data_cleaning(df):
  """Return a cleaned version of the raw retail DataFrame.

  The steps run in this order (the order matters: filling first can turn
  rows into duplicates that the last step then removes):
    1. Null ``Description`` values are filled with the string 'na'.
    2. Rows with a null ``CustomerID`` are dropped.
    3. Fully duplicated rows are removed.

  Args:
    df: DataFrame exposing ``Description`` and ``CustomerID`` columns.

  Returns:
    A new DataFrame with the three steps applied; ``df`` itself is not modified.
  """
  # fillna/dropna are the DataFrame-level aliases of na.fill/na.drop.
  return (
    df.fillna('na', subset=['Description'])
      .dropna(subset=['CustomerID'])
      .dropDuplicates()
  )

In [0]:
def data_transformation(df):
  """Derive invoice fields, keep the curated columns and cast their types.

  Derived columns:
    * ``InvoiceNumber`` - when ``InvoiceNo`` starts with 'C', the second
      element of ``InvoiceNo`` split on 'C' (the text after the leading 'C');
      otherwise ``InvoiceNo`` unchanged.
    * ``Cancellation`` - 'C' when ``InvoiceNo`` starts with 'C', else 'NC'.

  The result keeps, in this order: InvoiceNumber, Cancellation, StockCode,
  Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country.
  Quantity, CustomerID and InvoiceNumber are cast to integer, UnitPrice to
  float and InvoiceDate to timestamp.

  Args:
    df: DataFrame with the columns InvoiceNo, StockCode, Description,
      Quantity, InvoiceDate, UnitPrice, CustomerID and Country.

  Returns:
    A new DataFrame with the derived, selected and cast columns.
  """
  starts_with_c = col('InvoiceNo').startswith('C')

  flagged = df.withColumn(
    'InvoiceNumber',
    when(starts_with_c, split('InvoiceNo', 'C')[1]).otherwise(col('InvoiceNo')),
  ).withColumn(
    'Cancellation',
    when(starts_with_c, 'C').otherwise('NC'),
  )

  curated = flagged.select(
    'InvoiceNumber', 'Cancellation', 'StockCode', 'Description', 'Quantity',
    'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country',
  )

  # NOTE(review): an explicit to_timestamp parse with the pattern
  # 'MM/dd/yyyy H:mm' used to sit here, commented out. The plain cast below
  # presumably relies on InvoiceDate arriving in a directly castable form;
  # confirm against the source table.
  target_types = (
    ("Quantity", IntegerType()),
    ("CustomerID", IntegerType()),
    ("UnitPrice", FloatType()),
    ("InvoiceNumber", IntegerType()),
    ("InvoiceDate", TimestampType()),
  )
  for column_name, spark_type in target_types:
    curated = curated.withColumn(column_name, col(column_name).cast(spark_type))

  return curated


In [0]:
def write_cleaned_data(df_column_selected):
    """Append the transformed rows to ``curated_layer.transformed_data`` over JDBC.

    Credentials are read from the environment at call time rather than being
    stored in the notebook source:
      * ``DESTINATION_DB_PASSWORD`` (required)
      * ``DESTINATION_DB_USER`` (optional, defaults to 'adminuser')

    Args:
        df_column_selected: DataFrame to persist; its ``write`` interface is
            used with the "jdbc" format.

    Returns:
        None.

    Raises:
        KeyError: if ``DESTINATION_DB_PASSWORD`` is not set. Raised before
            the writer is touched.
    """
    # Resolve credentials first so a missing secret fails before any write.
    db_password = os.environ["DESTINATION_DB_PASSWORD"]
    db_user = os.environ.get("DESTINATION_DB_USER", "adminuser")

    # The save mode is set once through .mode("append"). The former
    # .option("mode", "append") was not a save-mode setting and was dropped.
    (
        df_column_selected.write.format("jdbc")
        .mode("append")
        .option("url", "jdbc:sqlserver://destination-landing-db.database.windows.net:1433;database=destination")
        .option("dbtable", "curated_layer.transformed_data")
        .option("user", db_user)
        .option("password", db_password)
        .save()
    )


In [0]:
# Run the pipeline end to end: clean -> transform -> persist.
# `df` is rebound to the cleaned frame here, so the raw frame loaded earlier
# is no longer reachable under that name after this cell runs.
df = data_cleaning(df)
df_column_selected = data_transformation(df)
# write_cleaned_data saves with mode "append": executing this cell again
# appends the same rows to the target table a second time.
write_cleaned_data(df_column_selected)