## 1. Custom ingestion of data from a REST API into the data lake (DBFS/S3)

In [0]:
import requests
import json
from datetime import datetime

# Endpoint requested by fetch_get_save_data().
# NOTE(review): this is plain HTTP, so the payload travels unencrypted — confirm whether an HTTPS endpoint exists.
api_url = "http://inceptezlabs.com/api.php"

def fetch_get_save_data(timeout=30):
    """Fetch the JSON payload from ``api_url`` and save it as a timestamped file.

    The function issues a GET request to ``api_url``, stops on any non-200
    status, decodes the body as JSON, and writes the re-serialised payload to
    ``.../soundhar_volume/apidata/posts_<YYYYmmddHHMMSS>.json`` using
    ``dbutils.fs.put`` (overwriting a file of the same name).

    Args:
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Defaults to 30 so an unresponsive server cannot block
            the notebook forever.

    Returns:
        None. Every outcome (success, HTTP error, invalid JSON, missing
        ``dbutils``, any other exception) is reported with ``print``; no
        exception is propagated to the caller.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely for a reply.
        resp = requests.get(api_url, timeout=timeout)
        if resp.status_code != 200:
            print(f"failed to fetch the data,error code is: {resp.status_code}")
            print(f"Response: {resp.text}")
            return

        try:
            data = resp.json()
        except ValueError:
            # The decode error raised by resp.json() is not always the stdlib
            # json.JSONDecodeError (e.g. when requests uses simplejson), but it
            # is always a ValueError subclass.
            print("The response is not valid json data ")
            return

        # Kept outside the decode try-block so only resp.json() failures are
        # reported as invalid JSON.
        print("Data has been received successfully")
        print(data)

        # Timestamp (local clock) makes each run land in its own file.
        ts = datetime.now().strftime("%Y%m%d%H%M%S")
        output_path = f"/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/apidata/posts_{ts}.json"

        try:
            dbutils.fs.put(
                output_path,
                json.dumps(data),
                overwrite=True,
            )
            print("Data has been successfully wrote to output path")
        except NameError:
            # dbutils is injected by the Databricks runtime and is absent elsewhere.
            print("Error: 'dbutils' is not defined. This code must run in a Databricks notebook.")
    except Exception as e:
        # Best-effort notebook step: report the failure instead of raising.
        print(f"A unexpected error occured: {e}")

# Run the ingestion only when this code executes as the main module.
if __name__ == "__main__":
    fetch_get_save_data()

        


## 2. Auto Loader from the data lake to the bronze layer (Data Lake and Lakehouse)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import functions as F

# Schema applied by from_json to the JSON text held in the `data` column:
# a top-level uid plus a nested `user` struct. Every leaf is read as a string.
user_schema = (
    StructType()
    .add("uid", StringType())
    .add(
        "user",
        StructType()
        .add("uid", StringType())
        .add("name", StringType())
        .add("email", StringType())
        .add("location", StringType())
        .add("registered", StringType()),
    )
)


# Streaming reader (cloudFiles source) over the JSON files landed under apidata/.
# No explicit schema is passed to the reader; an explicit .schema(user_schema)
# and cloudFiles.schemaEvolutionMode=addNewColumns were tried and left disabled.
df_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(
        **{
            "cloudFiles.format": "json",
            "cloudFiles.schemaLocation": "/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/bronze/streamwrite41/_schema",
            # Limit each micro-batch to a single input file.
            "cloudFiles.maxFilesPerTrigger": 1,
        }
    )
    .load("/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/apidata/")
)
# df_raw will contain rows like the one below:
# status     data (a string column holding JSON text, so its fields cannot be accessed with dot notation)
#success    {"uid": "7e168858-cb80-46a6-a480-7b2a54ca61d9","user": {"name": "Afsheen Williams","email": "afsheen@example.com","location": "Dubai","registered": "2026-02-07T07:05:27+00:00"}}

# Turn the JSON text in `data` into a semi-structured (struct) column.
# from_json parses the string with user_schema, so nested fields become
# addressable hierarchically, e.g. data.uid or data.user.email.
# The error below was recorded in the original notebook, presumably from
# selecting data.* fields while `data` was still a plain string:
#   Can't extract a value from "data". Need a complex type [STRUCT, ARRAY, MAP] but got "STRING". SQLSTATE: 42000
parsed_df = df_raw.withColumn("data", F.from_json("data", user_schema))

# Flatten the hierarchical `data` struct into a plain tabular layout:
# one output column per user attribute, plus the time the row was ingested.
user_columns = [
    F.col("data.uid").alias("uid"),
    F.col("data.user.name").alias("user_name"),
    F.col("data.user.email").alias("user_email"),
    F.col("data.user.location").alias("user_location"),
    # `registered` is read as a string by user_schema; cast it to a timestamp here.
    F.to_timestamp(F.col("data.user.registered")).alias("user_registered_ts"),
    F.current_timestamp().alias("ingestion_ts"),
]
df_user = parsed_df.select(*user_columns)

# Sink 1: append the flattened user rows to a Delta location addressed by path.
# trigger(availableNow=True) processes everything currently available and then stops.
path_sink_query = (
    df_user.writeStream
    .format("delta")
    .trigger(availableNow=True)
    .option(
        "checkpointLocation",
        "/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/bronze/streamwrite41/_checkpoint",
    )
    .start("/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/bronze/streamwrite45")
)

# start() returns as soon as the query is launched. Block until the run has
# finished so later cells never execute while this stream is still writing.
path_sink_query.awaitTermination()

# Sink 2: append the flattened user rows to the managed bronze table.
# A streaming query must own its checkpoint: reusing the checkpoint of another
# query makes this one resume from offsets the other query already committed,
# so the table would miss those files (or the two queries would collide).
# This sink therefore uses a checkpoint directory of its own.
# NOTE(review): a new checkpoint means the first run re-reads every input file;
# if the table already holds rows from earlier runs, confirm duplicates are acceptable.
table_sink_query = (
    df_user.writeStream
    .format("delta")
    .trigger(availableNow=True)
    .option(
        "checkpointLocation",
        "/Volumes/soundhar_catalog/soundhar_schema/soundhar_volume/bronze/bronze_user_api2/_checkpoint",
    )
    .toTable("soundhar_catalog.soundhar_schema.bronze_user_api2")
)

# toTable() starts the query and returns immediately. Wait for the availableNow
# run to finish so the validation query reads a fully written table.
table_sink_query.awaitTermination()

## 3. Validate the data

In [0]:
# Show the bronze table, ordered by user_registered_ts descending.
validation_sql = "select * from soundhar_catalog.soundhar_schema.bronze_user_api2 order by user_registered_ts desc"
bronze_users = spark.sql(validation_sql)
display(bronze_users)