# Write a file to the Lakehouse and read it back

In [None]:
import requests

# 1️⃣ Paths
# Destination file in the Lakehouse (OneLake ABFSS URI) and the source URL on GitHub.
lakehouse_path = "abfss://55732739-60eb-445b-94c4-65725b7190fa@onelake.dfs.fabric.microsoft.com/69019a9b-1026-430c-a874-1f18f5c21aa6/Files/nobel-prize-winners-by-year.json"
github_url = "https://raw.githubusercontent.com/sharmadhiraj/free-json-datasets/refs/heads/master/datasets/nobel-prize-winners-by-year.json"

# 2️⃣ Download JSON from GitHub
# An explicit timeout is required: without one, requests waits forever on a
# stalled connection and the notebook cell never finishes.
response = requests.get(github_url, timeout=30)
# Fail fast on HTTP 4xx/5xx so an error page is never written to the Lakehouse.
response.raise_for_status()
data = response.text

# 3️⃣ Write JSON to Lakehouse
# overwrite=True replaces any existing file at the destination path.
mssparkutils.fs.put(lakehouse_path, data, overwrite=True)

print("✅ JSON successfully written to Lakehouse!")


# Read the JSON back in Spark

In [None]:
# Load the JSON file written earlier; the multiLine option lets Spark parse
# JSON records that span more than one line.
df = (
    spark.read
    .option("multiLine", "true")
    .json(lakehouse_path)
)

# Print the inferred schema, then render the DataFrame in the notebook output.
df.printSchema()
display(df)


# Flatten nested winners

In [None]:
from pyspark.sql.functions import explode, col

# Step 1: explode the top-level "winners" array so each of its elements
# becomes its own row, kept alongside the year.
categories_per_year = df.select(
    col("year"),
    explode(col("winners")).alias("category_struct"),
)

# Step 2: pull the category out of each element and explode its nested
# "winners" array, giving one row per individual winner.
winners_per_category = categories_per_year.select(
    col("year"),
    col("category_struct.category"),
    explode(col("category_struct.winners")).alias("winner"),
)

# Step 3: promote the winner struct's fields to flat, prefixed columns.
df_flat = winners_per_category.select(
    col("year"),
    col("category"),
    col("winner.name").alias("winner_name"),
    col("winner.country").alias("winner_country"),
    col("winner.achievement").alias("winner_achievement"),
)

# Preview the first few flattened rows.
display(df_flat.limit(5))
