In [0]:
# Configure access to the projectadf1 ADLS Gen2 storage account for this Spark session.
# The account key is fetched from a Databricks secret scope instead of being embedded
# in the notebook, so it does not end up in source control or exported notebooks.
# NOTE(review): the scope and key names below are placeholders — point them at the
# secret that holds this account's key, and rotate the key that was previously
# committed in plain text.
spark.conf.set(
    "fs.azure.account.key.projectadf1.dfs.core.windows.net",
    dbutils.secrets.get(scope="projectadf1-secrets", key="storage-account-key"),
)

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat, date_format, current_timestamp
# Load the raw appointments table as a DataFrame; every later cell builds on this frame.
appointments_raw_df = spark.table("healthcare_raw.appointments")

In [0]:
from pyspark.sql.functions import col, to_date, to_timestamp, date_format, dayofmonth, current_timestamp

# Build the cleaned appointments frame in three steps:
#   1. derive the date parts, the timestamp and the ingestion time,
#   2. keep a single row per AppointmentID,
#   3. project onto the snake_case output columns.

# Step 1 - derived columns.
appointments_enriched_df = (
    appointments_raw_df
    # Carry the raw Date value through unchanged under its output name.
    .withColumn("appointment_date", col("Date"))
    # Helper column: Date read with the dd-MM-yyyy pattern. It only feeds the
    # year/month/day columns below and is not part of the final projection.
    .withColumn("date_parsed", to_date("Date", "dd-MM-yyyy"))
    .withColumn("appointment_year", date_format("date_parsed", "yyyy"))
    .withColumn("appointment_month", date_format("date_parsed", "MM"))
    .withColumn("appointment_day", dayofmonth("date_parsed"))
    # The trailing Z is quoted in the pattern, so it is matched as a literal
    # character and no zone offset is parsed from it.
    # NOTE(review): confirm the session time zone is the one these values are meant in.
    .withColumn("appointment_timestamp", to_timestamp("Time", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
    # Stamp each row with the time this query runs.
    .withColumn("ingestion_date", current_timestamp())
)

# Step 2 - one row per appointment id.
appointments_unique_df = appointments_enriched_df.dropDuplicates(["AppointmentID"])

# Step 3 - final column set, with the id columns renamed to snake_case.
appointments_cleaned_df = appointments_unique_df.select(
    col("AppointmentID").alias("appointment_id"),
    "appointment_timestamp",
    "appointment_date",
    "appointment_year",
    "appointment_month",
    "appointment_day",
    col("PatientID").alias("patient_id"),
    col("DoctorID").alias("doctor_id"),
    "ingestion_date",
)


In [0]:
# Persist the cleaned frame as the Delta table healthcare_processed.appointment,
# replacing any existing contents and partitioning the data by appointment_year.
appointment_delta_writer = (
    appointments_cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("appointment_year")
)
appointment_delta_writer.saveAsTable("healthcare_processed.appointment")

In [0]:
# Print the column names, types and nullability of the cleaned frame for a quick visual check.
appointments_cleaned_df.printSchema()

root
 |-- appointment_id: integer (nullable = true)
 |-- appointment_timestamp: timestamp (nullable = true)
 |-- appointment_date: date (nullable = true)
 |-- appointment_year: string (nullable = true)
 |-- appointment_month: string (nullable = true)
 |-- appointment_day: integer (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- doctor_id: integer (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



In [0]:
# Also export the cleaned appointments as Parquet files to the ADLS Gen2 path below,
# replacing whatever is already stored at that location.
# The path is a plain string literal: it has no placeholders, so no f-string is needed.
# NOTE(review): the frame is not cached, so this action re-runs the transformation;
# ingestion_date here may differ from the value written to the Delta table —
# confirm that is acceptable.
(appointments_cleaned_df.write
    .mode("overwrite")
    .parquet("abfss://source@projectadf1.dfs.core.windows.net/appointments"))

In [0]:
# Read the processed table back and show up to five rows as a smoke check,
# then print the completion message.
processed_preview_df = spark.read.table("healthcare_processed.appointment").limit(5)
display(processed_preview_df)
print("Appointments ingestion completed successfully!")

appointment_id,appointment_timestamp,appointment_date,appointment_year,appointment_month,appointment_day,patient_id,doctor_id,ingestion_date
105,2023-12-23T14:33:46.41Z,2020-01-29,2020,1,29,270,912,2025-08-25T10:14:19.164Z
109,2023-12-23T14:33:46.41Z,2020-11-08,2020,11,8,439,480,2025-08-25T10:14:19.164Z
121,2023-12-23T14:33:46.411Z,2020-04-17,2020,4,17,323,660,2025-08-25T10:14:19.164Z
126,2023-12-23T14:33:46.414Z,2020-07-12,2020,7,12,190,731,2025-08-25T10:14:19.164Z
131,2023-12-23T14:33:46.41Z,2020-12-24,2020,12,24,927,449,2025-08-25T10:14:19.164Z


Appointments ingestion completed successfully!


In [0]:
# End the notebook run and hand the string "Success" back to whatever invoked it.
dbutils.notebook.exit("Success")