In [0]:
# Notebook parameters: the SQL login is supplied at run time through two text
# widgets, each of which starts out as an empty string.
# NOTE(review): a text widget shows its value in clear text; consider reading the
# password from a secret scope (dbutils.secrets.get) instead.
dbutils.widgets.text(name="jdbcUsername", defaultValue="", label="JDBC Username")
dbutils.widgets.text(name="jdbcPassword", defaultValue="", label="JDBC Password")


In [0]:
# Connection settings for the SQL Server database that holds the sales data.
jdbcHostname = "server34x.database.windows.net"
jdbcPort = 1433
jdbcDatabase = "sqldb"

# Credentials are read from the notebook widgets rather than being hard-coded.
jdbcUsername = dbutils.widgets.get("jdbcUsername")
jdbcPassword = dbutils.widgets.get("jdbcPassword")

# The widgets default to an empty string. Stop here with a clear message instead
# of letting the JDBC read fail later with a less obvious login error.
if not jdbcUsername or not jdbcPassword:
    raise ValueError(
        "Set the 'JDBC Username' and 'JDBC Password' widgets before running this notebook."
    )

# JDBC URL in the form expected by the Microsoft SQL Server driver.
jdbcUrl = f"jdbc:sqlserver://{jdbcHostname}:{jdbcPort};database={jdbcDatabase}"

# Properties handed to spark.read.jdbc: login plus the driver class to load.
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}

In [0]:
# Load the "sales_orders" table over JDBC into a Spark DataFrame.
salesdf = spark.read.jdbc(
    jdbcUrl,
    "sales_orders",
    properties=connectionProperties,
)

# Preview only the first ten rows rather than the whole table.
display(salesdf.limit(10))

Category,Product,Region,Date,Sales,Quantity
Home Decor,Cushion,East,2022-04-13,388.1,8.0
Clothing,Jacket,east,2023-12-02,278.27,3.0
Books,Fiction,South,2022-08-03,-18.05,8.0
Clothing,Jeans,West,2023-01-08,280.61,8.0
Home Decor,Lamp,North,2022-05-11,-38.68,2.0
Sports,Football,South,2022-12-10,407.84,6.0
Home Decor,Cushion,South,2023-01-21,50.0,5.0
,,east,2022-06-10,,6.0
,,West,2022-09-10,187.57,1.0
Home Decor,Curtain,West,2023-04-20,286.52,3.0


In [0]:
# Import Spark's aggregate under an alias so it does not replace Python's
# built-in sum() for every cell that runs after this one.
from pyspark.sql.functions import col, sum as spark_sum

# Count the nulls in each column: isNull() yields a boolean, which is cast to
# 0/1 and summed, producing a single row with one count per column.
missing_values = salesdf.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in salesdf.columns]
)

# Display the DataFrame with missing values count
display(missing_values)

Category,Product,Region,Date,Sales,Quantity
160,210,0,46,94,45


In [0]:
# Reshape the single-row null counts into one (Column, MissingCount) row per
# source column using Spark SQL's stack() generator.

def _sql_string(name):
    """Return `name` as a single-quoted Spark SQL string literal."""
    return "'" + name.replace("\\", "\\\\").replace("'", "\\'") + "'"


def _sql_identifier(name):
    """Return `name` as a backtick-quoted Spark SQL column reference."""
    return "`" + name.replace("`", "``") + "`"


# Each column contributes a (label, value) pair. Quoting both parts keeps the
# expression valid for column names containing spaces, quotes or other
# characters that are not legal in a bare identifier.
stack_args = ", ".join(
    f"{_sql_string(c)}, {_sql_identifier(c)}" for c in missing_values.columns
)
transposed_missing_values = missing_values.selectExpr(
    f"stack({len(missing_values.columns)}, {stack_args}) as (Column, MissingCount)"
)

# Display the transposed DataFrame
display(transposed_missing_values)

Column,MissingCount
Category,160
Product,210
Region,0
Date,46
Sales,94
Quantity,45


Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import when, col, lit, expr

# Replace missing Category / Product values with explicit placeholder labels.
salesdf = salesdf.withColumn("Category", when(col("Category").isNull(), "Unknown").otherwise(col("Category")))
salesdf = salesdf.withColumn("Product", when(col("Product").isNull(), "Unknown product").otherwise(col("Product")))

# Drop rows with missing Date
salesdf = salesdf.filter(col("Date").isNotNull())

# Fill missing Quantity with the median of the remaining rows.
# approxQuantile ignores nulls and returns an empty list when there is no
# non-null value to work with, so guard the lookup instead of indexing blindly.
quantity_quantiles = salesdf.approxQuantile("Quantity", [0.5], 0.0)
if quantity_quantiles:
    median_quantity = quantity_quantiles[0]
    salesdf = salesdf.withColumn(
        "Quantity",
        when(col("Quantity").isNull(), lit(median_quantity)).otherwise(col("Quantity")),
    )
else:
    # Nothing to impute from: leave Quantity untouched and say so.
    median_quantity = None
    print("No non-null Quantity values found; skipping median imputation.")

# NOTE(review): missing Sales values are not handled in this cell and stay null
# in the result - confirm that this is intended.

# Display the updated DataFrame
display(salesdf)

Category,Product,Region,Date,Sales,Quantity
Home Decor,Cushion,East,2022-04-13,388.1,8.0
Clothing,Jacket,east,2023-12-02,278.27,3.0
Books,Fiction,South,2022-08-03,-18.05,8.0
Clothing,Jeans,West,2023-01-08,280.61,8.0
Home Decor,Lamp,North,2022-05-11,-38.68,2.0
Sports,Football,South,2022-12-10,407.84,6.0
Home Decor,Cushion,South,2023-01-21,50.0,5.0
Unknown,Unknown product,east,2022-06-10,,6.0
Unknown,Unknown product,West,2022-09-10,187.57,1.0
Home Decor,Curtain,West,2023-04-20,286.52,3.0


In [0]:
# Count the rows of salesdf as it stands after the cleaning cell; the result is
# kept in row_count so the final cell can return it from the notebook.
row_count = salesdf.count()

Out[17]: 954

In [0]:
# Persist the cleaned sales data as Parquet on DBFS, replacing any earlier output
# at the same path.
salesdf.write.parquet(
    "dbfs:/FileStore/data/sales_data_cleaned.parquet",
    mode="overwrite",
)

In [0]:
# End the notebook run and hand the row count back to the caller.
# dbutils.notebook.exit is documented to take a string value, so convert the
# integer count explicitly instead of relying on implicit conversion.
dbutils.notebook.exit(str(row_count))