In [3]:
from pyspark.sql.functions import to_date, trim, col, coalesce, lit

# Opt into the legacy (pre-Spark-3.0) datetime parser before any date parsing
# happens; the "dd-MMM-yy" pattern used for hiredate below relies on it.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Read the employee CSV: first row is the header, column types are inferred.
emp_raw_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("Files/Landing/emp.csv")

# Derive the cleaned frame. Each withColumn sees the result of the previous
# one, so TotalSal is computed from the already null-filled sal and comm.
emp_df = emp_raw_df.withColumn(
    "hiredate",
    to_date(trim(col("hiredate")), "dd-MMM-yy")   # strip padding, then parse to a date
).withColumn(
    "sal",
    coalesce(col("sal"), lit(0))       # fill missing salary with 0
).withColumn(
    "comm",
    coalesce(col("comm"), lit(0))      # fill missing commission with 0
).withColumn(
    "TotalSal",
    # Fixed: previously referenced "comm5", a column this cell never creates.
    col("sal") + col("comm")           # total salary = sal + comm
)

# Show schema and sample data
emp_df.printSchema()
display(emp_df.limit(5))

StatementMeta(, c4d0f7f6-134c-4798-9519-72d87cd91f6d, 5, Finished, Available, Finished)

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: date (nullable = true)
 |-- sal: integer (nullable = false)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)
 |-- comm5: integer (nullable = false)
 |-- TotalSal: integer (nullable = false)



SynapseWidget(Synapse.DataFrame, 8f07f524-adbf-4112-bc23-dec8632fcd43)

In [2]:
# IPython introspection: shows the help/docstring for `col`.
# NOTE(review): appears to be leftover interactive exploration rather than part
# of the pipeline — consider removing this cell before sharing the notebook.
col?

StatementMeta(, c4d0f7f6-134c-4798-9519-72d87cd91f6d, 4, Finished, Available, Finished)

In [6]:
from pyspark.sql.functions import to_date, trim, col, coalesce, lit

# Switch Spark's datetime parsing to the legacy policy (the original note
# describes this as the fix for Spark 3.x date parsing).
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Load the employee CSV with a header row and inferred column types.
emp_raw_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Files/Landing/emp.csv")
)

# Column expressions, named up front so the chain below reads as a summary.
hiredate_parsed = to_date(trim(col("hiredate")), "dd-MMM-yy")  # trimmed text -> date
comm_filled = coalesce(col("comm"), lit(0))                    # null commission -> 0

# Apply the steps in order: TotalSal is added last, so its col("comm")
# resolves to the null-filled commission rather than the raw one.
emp_df = (
    emp_raw_df
    .withColumn("hiredate", hiredate_parsed)
    .withColumn("comm", comm_filled)
    .withColumn("TotalSal", col("sal") + col("comm"))
)

# Inspect the resulting schema and the first five rows.
emp_df.printSchema()
emp_df.show(5)


StatementMeta(, c4d0f7f6-134c-4798-9519-72d87cd91f6d, 8, Finished, Available, Finished)

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: date (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = false)
 |-- deptno: integer (nullable = true)
 |-- TotalSal: integer (nullable = true)

+-----+------+--------+----+----------+----+----+------+--------+
|empno| ename|     job| mgr|  hiredate| sal|comm|deptno|TotalSal|
+-----+------+--------+----+----------+----+----+------+--------+
| 7369| SMITH|   CLERK|7902|1980-12-17| 800|   0|    20|     800|
| 7900| JAMES|   CLERK|7698|1981-12-03| 950|   0|    30|     950|
| 7876| ADAMS|   CLERK|7788|1987-05-23|1000|   0|    20|    1000|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|    1750|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250|1400|    30|    2650|
+-----+------+--------+----+----------+----+----+------+--------+
only showing top 5 rows

