In [0]:
from pyspark.sql.functions import (
    col, abs, initcap, regexp_replace, trim, split, 
    current_timestamp, to_timestamp
)
from pyspark.sql.types import IntegerType, BooleanType

# ---------------------------------------------------------
# إعدادات الاتصال بـ Azure Storage
# ---------------------------------------------------------
storage_account_name = "hospitalstorge"

# ملاحظة: يعتمد الكود على أن المفتاح مضاف مسبقاً في إعدادات الكلاستر (Spark Config).

# ---------------------------------------------------------
# إعدادات المسارات (Config)
# ---------------------------------------------------------
# مسار القراءة (Source)
source_path = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/CSVFILE"
# مسار الكتابة (Target)
target_path = f"abfss://silver@{storage_account_name}.dfs.core.windows.net/static_data"

# ---------------------------------------------------------
# 1. معالجة جدول الموظفين (Staff)
# ---------------------------------------------------------
print("Reading Staff data...")
# تم تصحيح اسم الملف هنا بناءً على الملف الذي رفعته
df_staff = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{source_path}/staff (1).csv")

df_staff_silver = df_staff \
    .dropDuplicates(["staff_id"]) \
    .filter(col("staff_id").isNotNull()) \
    .withColumn("age", abs(col("age").cast(IntegerType()))) \
    .withColumn("years_experience", col("years_experience").cast(IntegerType())) \
    .withColumn("on_leave", col("on_leave").cast(BooleanType())) \
    .drop("hospital_name") \
    .withColumn("first_name", initcap(trim(col("first_name")))) \
    .withColumn("last_name", initcap(trim(col("last_name")))) \
    .filter(col("age") >= 18) \
    .na.fill({
        "years_experience": 0,
        "qualification": "Unknown",
        "specialization": "General"
    }) \
    .withColumn("city", initcap(col("city"))) \
    .withColumn("processing_time", current_timestamp())

df_staff_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(f"{target_path}/silver_staff")

print(f"✅ تم حفظ الملف في: {target_path}/silver_staff")

# ---------------------------------------------------------
# 2. معالجة جدول المستشفيات (Hospitals)
# ---------------------------------------------------------
print("Reading Hospitals data...")
df_hospitals = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{source_path}/hospitals.csv")

df_hosp_silver = df_hospitals \
    .dropDuplicates(["hospital_id"]) \
    .withColumn("hospital_capacity", col("hospital_capacity").cast(IntegerType())) \
    .withColumn("current_occupancy", col("current_occupancy").cast(IntegerType())) \
    .withColumn("emergency_services", col("emergency_services").cast(BooleanType())) \
    .withColumn("services", split(regexp_replace(col("services"), "[\\[\\]']", ""), ", ")) \
    .withColumn("departments", split(regexp_replace(col("departments"), "[\\[\\]']", ""), ", ")) \
    .withColumn("hospital_name", trim(col("hospital_name"))) \
    .withColumn("processing_time", current_timestamp())

df_hosp_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(f"{target_path}/silver_hospitals")

print(f"✅ تم حفظ الملف في: {target_path}/silver_hospitals")

# ---------------------------------------------------------
# 3. معالجة جدول العلاجات (Department Treatment)
# ---------------------------------------------------------
print("Reading Department Treatment data...")
df_treatment = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{source_path}/department_treatment.csv")

df_treat_silver = df_treatment \
    .dropDuplicates(["diagnosis_id"]) \
    .withColumn("treatments", split(regexp_replace(col("treatments"), "[\\[\\]']", ""), ", ")) \
    .withColumn("department", initcap(col("department"))) \
    .withColumn("processing_time", current_timestamp())

df_treat_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(f"{target_path}/silver_department_treatment")

print(f"✅ تم حفظ الملف في: {target_path}/silver_department_treatment")

# ---------------------------------------------------------
# 4. معالجة جدول الأدوية (Medicine)
# ---------------------------------------------------------
print("Reading Medicine data...")
df_medicine = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{source_path}/medicine.csv")

df_med_silver = df_medicine \
    .dropDuplicates(["medicine_id"]) \
    .withColumn("name", initcap(trim(col("name")))) \
    .withColumn("compound", initcap(trim(col("compound")))) \
    .withColumn("side_effects", split(col("side_effects"), ", ")) \
    .withColumn("processing_time", current_timestamp())

df_med_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(f"{target_path}/silver_medicine")

print(f"✅ تم حفظ الملف في: {target_path}/silver_medicine")

# ---------------------------------------------------------
# 5. معالجة جدول مخزون الأدوية (Stock Medicine)
# ---------------------------------------------------------
print("Reading Stock Medicine data...")
df_stock = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{source_path}/stock_medicine.csv")

df_stock_silver = df_stock \
    .withColumn("stock_quantity", col("stock_quantity").cast(IntegerType())) \
    .withColumn("last_updated", to_timestamp(col("last_updated"))) \
    .withColumn("processing_time", current_timestamp())

df_stock_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save(f"{target_path}/silver_stock_medicine")

print(f"✅ تم حفظ الملف في: {target_path}/silver_stock_medicine")

Reading Staff data...
✅ تم حفظ الملف في: abfss://silver@hospitalstorge.dfs.core.windows.net/static_data/silver_staff
Reading Hospitals data...
✅ تم حفظ الملف في: abfss://silver@hospitalstorge.dfs.core.windows.net/static_data/silver_hospitals
Reading Department Treatment data...
✅ تم حفظ الملف في: abfss://silver@hospitalstorge.dfs.core.windows.net/static_data/silver_department_treatment
Reading Medicine data...
✅ تم حفظ الملف في: abfss://silver@hospitalstorge.dfs.core.windows.net/static_data/silver_medicine
Reading Stock Medicine data...
✅ تم حفظ الملف في: abfss://silver@hospitalstorge.dfs.core.windows.net/static_data/silver_stock_medicine


In [0]:
from pyspark.sql.functions import col, explode, current_timestamp
from delta.tables import *

# -------------------------------------------------------
# 1. إعداد المسارات (Configuration)
# -------------------------------------------------------
storage_account_name = "hospitalstorge"

# مسار القراءة: المكان الذي كتب فيه كود الـ Bronze السابق
source_path = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/Realtime"

# مسارات الـ Checkpoints (يجب أن تكون في Storage Account)
checkpoint_fact = f"abfss://silver@{storage_account_name}.dfs.core.windows.net/Checkpoints/fact_admissions"
checkpoint_dim = f"abfss://silver@{storage_account_name}.dfs.core.windows.net/Checkpoints/dim_patients"

# اسم قاعدة البيانات والجداول المستهدفة
target_db = "silver_data"
spark.sql(f"CREATE DATABASE IF NOT EXISTS {target_db}")

# -------------------------------------------------------
# 2. قراءة البيانات من Bronze (Stream)
# -------------------------------------------------------
# نقرأ من المسار مباشرة بصيغة Delta
df_bronze = spark.readStream.format("delta").load(source_path)

# -------------------------------------------------------
# أولاً: معالجة جدول الزيارات (Fact Table)
# -------------------------------------------------------
df_fact = (
    df_bronze
    .withColumn("rec", explode(col("records")))
    .select(
        col("rec.record_id").cast("string").alias("record_id"),
        col("patient_id").cast("string").alias("patient_id"),
        col("rec.hospital_id").cast("string").alias("hospital_id"),
        col("rec.doctor_id").cast("string").alias("doctor_id"),
        col("rec.department").alias("department"),
        col("rec.diagnosis").alias("diagnosis"),
        col("rec.treatment").alias("treatment"),
        col("rec.medicine_taken").alias("medicine_taken"),
        col("rec.severity_level").alias("severity_level"),
        col("rec.arrival_mode").alias("arrival_mode"),
        col("rec.ICU_admission").alias("ICU_admission"),
        col("rec.admission_time").cast("timestamp").alias("admission_time"),
        col("rec.discharge_time").cast("timestamp").alias("discharge_time"),
        col("ingestion_time") # مفيد للتتبع
    )
)

# كتابة الزيارات (Fact Table)
# نستخدم append لأن كل سجل هو زيارة جديدة
query_fact = (
    df_fact.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_fact)
    .option("mergeSchema", "true")
    .toTable(f"{target_db}.fact_admissions")
)

# -------------------------------------------------------
# ثانياً: معالجة جدول المرضى (Dim Table)
# -------------------------------------------------------
df_dim = (
    df_bronze
    .select(
        col("patient_id").cast("string").alias("patient_id"),
        col("full_name").alias("full_name"),
        col("first_name").alias("first_name"),
        col("last_name").alias("last_name"),
        col("age").cast("long").alias("age"),
        col("gender").alias("gender"),
        col("contact_number").alias("contact_number"),
        col("city").alias("city"),
        col("governorate").alias("governorate"),
        col("address").alias("address"),
        col("ingestion_time").alias("last_updated")
    )
    .dropDuplicates(["patient_id"]) # إزالة التكرار داخل الدفعة الواحدة
)

# دالة Upsert لدمج البيانات الجديدة مع القديمة (SCD Type 1)
def upsert_patients(microBatchDF, batchId):
    # التحقق من وجود الجدول، وإنشاؤه إذا لم يكن موجوداً
    table_full_name = f"{target_db}.dim_patients"
    
    if not spark.catalog.tableExists(table_full_name):
        microBatchDF.write.format("delta").saveAsTable(table_full_name)
    elif microBatchDF.count() > 0:
        deltaTable = DeltaTable.forName(spark, table_full_name)
        (
            deltaTable.alias("t")
            .merge(
                microBatchDF.alias("s"),
                "t.patient_id = s.patient_id"
            )
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )

# كتابة المرضى (Dim Table) باستخدام foreachBatch
query_dim = (
    df_dim.writeStream
    .format("delta")
    .foreachBatch(upsert_patients)
    .option("checkpointLocation", checkpoint_dim)
    .start()
)

print("🚀 Splitting data into Fact and Dim tables started...")
print(f"✅ Fact Table: {target_db}.fact_admissions")
print(f"✅ Dim Table: {target_db}.dim_patients")

# انتظار الدفق
query_fact.awaitTermination()
query_dim.awaitTermination()

In [0]:
# Define your database name
target_db = "your_database_name"

display(
    spark.read.format("delta").load(f"/mnt/{target_db}/Checkpoints/fact_admissions")
)
display(
    spark.read.format("delta").load(f"/mnt/{target_db}/Checkpoints/dim_patients")
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-7070053585862413>, line 4[0m
[1;32m      1[0m [38;5;66;03m# Define your database name[39;00m
[1;32m      2[0m target_db [38;5;241m=[39m [38;5;124m"[39m[38;5;124myour_database_name[39m[38;5;124m"[39m
[0;32m----> 4[0m display(
[1;32m      5[0m     spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39mload([38;5;124mf[39m[38;5;124m"[39m[38;5;124m/mnt/[39m[38;5;132;01m{[39;00mtarget_db[38;5;132;01m}[39;00m[38;5;124m/Checkpoints/fact_admissions[39m[38;5;124m"[39m)
[1;32m      6[0m )
[1;32m      7[0m display(
[1;32m      8[0m     spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39mload([38;5;124mf[39m[38;5;124m"[39m[38;