# Read CSV File from Lakehouse

In [None]:
# Load emp.csv with the CSV reader's default options. No header option is
# set here, so Spark assigns generated column names (_c0, _c1, ...) and
# reads every column as a string.
emp_raw_df = (
    spark.read
    .format("csv")
    .load("Files/emp.csv")
)

# Print the resulting schema, then preview only the first 5 rows.
emp_raw_df.printSchema()
display(emp_raw_df.limit(5))

StatementMeta(, b69b93c2-715e-48dc-9b8e-5d6ed9b2783f, 12, Finished, Available, Finished)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



SynapseWidget(Synapse.DataFrame, ecbad3a1-16fb-462e-b605-356f5ed44964)

# Read CSV File from Lakehouse with header

In [None]:
# Re-read emp.csv, this time treating the first line as the column names.
# Schema inference is not enabled, so the columns are still typed as strings.
emp_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/emp.csv")
)

# Print the resulting schema, then preview only the first 5 rows.
emp_raw_df.printSchema()
display(emp_raw_df.limit(5))

StatementMeta(, 77264554-00d5-4d73-90bb-adc46d2d82c9, 58, Finished, Available, Finished)

root
 |-- empno: string (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: string (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: string (nullable = true)
 |-- comm: string (nullable = true)
 |-- deptno: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 8f413f9e-4c6f-4bc0-a24e-6c7524b2e181)

# Read CSV File from Lakehouse with header and inferSchema options

In [None]:
# Re-read emp.csv using the first line as column names and letting Spark
# infer each column's data type from the file contents.
emp_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .load("Files/emp.csv")
)

# Print the inferred schema, then preview only the first 5 rows.
emp_raw_df.printSchema()
display(emp_raw_df.limit(5))

StatementMeta(, 77264554-00d5-4d73-90bb-adc46d2d82c9, 59, Finished, Available, Finished)

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



SynapseWidget(Synapse.DataFrame, 556b4784-00a6-4992-af02-f7fc8d5c5a36)

# Read CSV File from Lakehouse and apply the following transformations
1. Datatype conversion
2. Replace null values

In [None]:
from pyspark.sql.functions import coalesce, col, lit, to_date, trim

# Switch the Spark session to the legacy (pre-Spark-3.0) datetime parser
# before the hiredate column is parsed below.
# NOTE(review): presumably required because the hiredate strings do not parse
# as intended with the default Spark 3.x parser and the "dd-MMM-yy" pattern;
# confirm against the data. The setting stays in effect for the rest of the
# session, so it also applies to cells that run after this one.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Read emp.csv with a header row and inferred column types.
emp_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Files/emp.csv")
)

# 1. Datatype conversion: strip surrounding whitespace from hiredate and
#    parse it into a date column using the "dd-MMM-yy" pattern.
emp_raw_df = emp_raw_df.withColumn(
    "hiredate", to_date(trim(col("hiredate")), "dd-MMM-yy")
)

# 2. Replace null values: a missing commission becomes 0.
emp_raw_df = emp_raw_df.withColumn("comm", coalesce(col("comm"), lit(0)))

# Print the transformed schema, then preview only the first 5 rows.
emp_raw_df.printSchema()
display(emp_raw_df.limit(5))


StatementMeta(, b69b93c2-715e-48dc-9b8e-5d6ed9b2783f, 6, Finished, Available, Finished)

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: date (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = false)
 |-- deptno: integer (nullable = true)



SynapseWidget(Synapse.DataFrame, e14513d9-b8de-4a76-879e-b875af5111d9)

# Read Department CSV File (dept.csv) from Lakehouse

In [None]:
# Load dept.csv using the first line as column names and letting Spark
# infer each column's data type from the file contents.
dept_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .load("Files/dept.csv")
)

# Print the inferred schema and render the DataFrame (no row limit applied).
dept_raw_df.printSchema()
display(dept_raw_df)

StatementMeta(, b69b93c2-715e-48dc-9b8e-5d6ed9b2783f, 8, Finished, Available, Finished)

root
 |-- deptno: integer (nullable = true)
 |-- dname: string (nullable = true)
 |-- loc: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 7c40d00c-7663-4b80-846f-63336b24c63c)