# Download the file from GitHub and upload it into the Lakehouse

In [2]:
import requests

# 1️⃣ Paths
# Destination: absolute OneLake (abfss) URI of emp.xml in the Lakehouse Files area.
# NOTE(review): later cells read the relative path "Files/emp.xml", which assumes
# this Lakehouse is the notebook's default one — confirm.
lakehouse_path = "abfss://55732739-60eb-445b-94c4-65725b7190fa@onelake.dfs.fabric.microsoft.com/69019a9b-1026-430c-a874-1f18f5c21aa6/Files/emp.xml"
# Source: raw GitHub URL of the sample employee XML file.
github_url = "https://raw.githubusercontent.com/rritec/Microsoft-Fabric/refs/heads/main/Labdata/emp.xml"

# 2️⃣ Download XML from GitHub
# The timeout keeps the cell from hanging forever if GitHub is unreachable;
# raise_for_status() aborts on HTTP 4xx/5xx so an error page is never stored.
response = requests.get(github_url, timeout=30)
response.raise_for_status()
data = response.text

# 3️⃣ Write XML to Lakehouse (overwrite=True replaces any existing copy)
mssparkutils.fs.put(lakehouse_path, data, overwrite=True)

print("✅ XML successfully written to Lakehouse!")

StatementMeta(, 0da083a2-ac04-497b-ae3f-dfe53716692a, 4, Finished, Available, Finished)

✅ JSON successfully written to Lakehouse!


# Read XML File

In [3]:
# Load each <employee> element of the uploaded XML file as one DataFrame row.
# No schema is supplied, so the column types are whatever the reader infers.
emp_df = (
    spark.read
    .format("xml")
    .option("rowTag", "employee")
    .load("Files/emp.xml")
)

# Preview the rows, then print the inferred schema.
emp_df.show()
emp_df.printSchema()



StatementMeta(, 0da083a2-ac04-497b-ae3f-dfe53716692a, 5, Finished, Available, Finished)

+----+------+-----+------+---------+---------+----+----+
|comm|deptno|empno| ename| hiredate|      job| mgr| sal|
+----+------+-----+------+---------+---------+----+----+
|NULL|    20| 7369| SMITH|17-Dec-80|    CLERK|7902| 800|
|NULL|    30| 7900| JAMES| 3-Dec-81|    CLERK|7698| 950|
|NULL|    20| 7876| ADAMS|23-May-87|    CLERK|7788|1000|
| 500|    30| 7521|  WARD|22-Feb-81| SALESMAN|7698|1250|
|1400|    30| 7654|MARTIN|28-Sep-81| SALESMAN|7698|1250|
|NULL|    10| 7934|MILLER|23-Jan-82|    CLERK|7782|1300|
|   0|    30| 7844|TURNER| 8-Sep-81| SALESMAN|7698|1500|
| 300|    30| 7499| ALLEN|20-Feb-81| SALESMAN|7698|1600|
|NULL|    10| 7782| CLARK| 9-Jun-81|  MANAGER|7839|2450|
|NULL|    30| 7698| BLAKE| 1-May-81|  MANAGER|7839|2850|
|NULL|    20| 7566| JONES| 2-Apr-81|  MANAGER|7839|2975|
|NULL|    20| 7788| SCOTT|19-Apr-87|  ANALYST|7566|3000|
|NULL|    20| 7902|  FORD| 3-Dec-81|  ANALYST|7566|3000|
|NULL|    10| 7839|  KING|17-Nov-81|PRESIDENT|NULL|5000|
|NULL|    50| 7839|   Ram|17-No

# Best way to read the file: load only the required columns by defining an explicit schema with data types

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema: only these four columns are loaded, each declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("empno", IntegerType()),
        ("ename", StringType()),
        ("sal", IntegerType()),
        ("deptno", IntegerType()),
    )
])

# Read each <employee> element as a row, applying the schema above
# instead of letting the reader infer column types.
emp_df = (
    spark.read
    .format("xml")
    .option("rowTag", "employee")
    .schema(schema)
    .load("Files/emp.xml")
)

# Preview the rows, then confirm the declared column types.
emp_df.show()
emp_df.printSchema()

StatementMeta(, 0da083a2-ac04-497b-ae3f-dfe53716692a, 9, Finished, Available, Finished)

+-----+------+----+------+
|empno| ename| sal|deptno|
+-----+------+----+------+
| 7369| SMITH| 800|    20|
| 7900| JAMES| 950|    30|
| 7876| ADAMS|1000|    20|
| 7521|  WARD|1250|    30|
| 7654|MARTIN|1250|    30|
| 7934|MILLER|1300|    10|
| 7844|TURNER|1500|    30|
| 7499| ALLEN|1600|    30|
| 7782| CLARK|2450|    10|
| 7698| BLAKE|2850|    30|
| 7566| JONES|2975|    20|
| 7788| SCOTT|3000|    20|
| 7902|  FORD|3000|    20|
| 7839|  KING|5000|    10|
| 7839|   Ram|5000|    50|
+-----+------+----+------+

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- deptno: integer (nullable = true)



In [9]:
# Save the employee DataFrame in Delta format at Tables/dbo/emp_delta_xml;
# "overwrite" mode replaces whatever an earlier run stored there.
(
    emp_df.write
    .format("delta")
    .mode("overwrite")
    .save("Tables/dbo/emp_delta_xml")
)


StatementMeta(, 0da083a2-ac04-497b-ae3f-dfe53716692a, 11, Finished, Available, Finished)

# Q&A

## MCQs: Reading XML Data in Microsoft Fabric

1. **Which Microsoft Fabric engine natively supports reading XML files?**  
   - A. SQL Analytics Endpoint  
   - B. Data Warehouse  
   - C. Spark (Lakehouse Notebook)  
   - D. KQL Database  
   **Answer:** C  

2. **Which option is mandatory when reading XML files using Spark in Microsoft Fabric?**  
   - A. rootTag  
   - B. rowTag  
   - C. inferSchema  
   - D. header  
   **Answer:** B  

3. **Where should XML files be stored before being read by Spark in Fabric?**  
   - A. Azure SQL Database  
   - B. OneLake Lakehouse `Files` area  
   - C. SQL Analytics tables  
   - D. Power BI datasets  
   **Answer:** B  

4. **Which Spark API is used to read XML files in Microsoft Fabric?**  
   - A. `spark.read.csv()`  
   - B. `spark.read.json()`  
   - C. `spark.read.format("xml")`  
   - D. `spark.read.loadXml()`  
   **Answer:** C  

5. **Can the SQL Analytics Endpoint directly parse and read XML files from OneLake?**  
   - A. Yes  
   - B. No  
   - C. Only using external tables  
   - D. Only using views  
   **Answer:** B  

6. **What is the recommended Fabric approach to make XML data available for SQL Analytics?**  
   - A. Load XML directly into SQL tables  
   - B. Convert XML to CSV manually  
   - C. Parse XML using Spark and write as Delta tables  
   - D. Use Power BI to transform XML  
   **Answer:** C  

7. **Which file format is preferred in Microsoft Fabric after parsing XML for analytics?**  
   - A. XML  
   - B. CSV  
   - C. JSON  
   - D. Delta  
   **Answer:** D  

8. **Why is providing an explicit schema recommended when reading XML in Fabric Spark?**  
   - A. XML does not support schema  
   - B. It improves security  
   - C. It improves performance and avoids schema drift  
   - D. It is mandatory in Fabric  
   **Answer:** C  

9. **Which Fabric component can ingest XML files using a no-code approach but cannot parse them?**  
   - A. SQL Analytics Endpoint  
   - B. Data Pipeline  
   - C. Power BI  
   - D. KQL Database  
   **Answer:** B  

10. **What happens if the `rowTag` option is not specified while reading an XML file in Spark?**  
    - A. Spark automatically infers it  
    - B. Only the first record is read  
    - C. Spark throws an error  
    - D. XML is treated as JSON  
    **Answer:** C  
