In [0]:
# Databricks notebook source
from pyspark.sql import SparkSession, functions as f

# Load each hospital's bronze-layer transaction extract (parquet).
# The mount paths are kept exactly as they are spelled in storage.
df_hosa = spark.read.format("parquet").load("/mnt/bronze/hosipitala/transactions")
df_hosb = spark.read.format("parquet").load("/mnt/bronze/hosipitalb/transactions")

# Stack both hospitals' transactions into a single DataFrame; unionByName
# aligns columns by name instead of by position.
df_merged = df_hosa.unionByName(df_hosb)
display(df_merged)

# Register the combined data under the name "transactions" so it can be
# queried from SQL.
df_merged.createOrReplaceTempView("transactions")

TransactionID,EncounterID,PatientID,ProviderID,DeptID,VisitDate,ServiceDate,PaidDate,VisitType,Amount,AmountType,PaidAmount,ClaimID,PayorID,ProcedureCode,ICDCode,LineOfBusiness,MedicaidID,MedicareID,InsertDate,ModifiedDate,datasource
TRANS000001,ENC001204,HOSP1-002372,PROV0456,DEPT002,2024-08-02,2024-05-25,2024-06-15,Routine,988.3699951171876,Medicare,315.70001220703125,CLAIM533365,PAYOR4707,94521,I19.4,Commercial,MEDI24173,MCARE19466,2021-01-16,2021-12-27,hosipitala
TRANS000002,ENC000029,HOSP1-002329,PROV0321,DEPT013,2024-05-02,2024-09-14,2024-09-19,Emergency,291.260009765625,Medicare,667.989990234375,CLAIM629724,PAYOR1481,51588,I54.0,Self-Pay,MEDI63110,MCARE97946,2021-02-06,2022-01-26,hosipitala
TRANS000003,ENC001088,HOSP1-004636,PROV0405,DEPT007,2024-07-25,2024-03-04,2024-06-13,Routine,91.51000213623048,Medicare,595.5599975585938,CLAIM305176,PAYOR8415,32053,I35.3,Medicaid,MEDI83622,MCARE77469,2022-05-09,2021-09-22,hosipitala
TRANS000004,ENC004215,HOSP1-004064,PROV0463,DEPT019,2024-01-30,2024-02-05,2024-01-21,Follow-up,893.2100219726562,Insurance,489.7999877929688,CLAIM987878,PAYOR5517,21422,I35.1,Commercial,MEDI89783,MCARE68786,2021-01-30,2023-02-20,hosipitala
TRANS000005,ENC006483,HOSP1-003625,PROV0167,DEPT011,2024-09-23,2024-04-11,2024-04-24,Emergency,729.3699951171875,Insurance,305.1099853515625,CLAIM988945,PAYOR8174,39210,I81.1,Commercial,MEDI52037,MCARE92710,2024-08-11,2022-01-28,hosipitala
TRANS000006,ENC007469,HOSP1-004823,PROV0288,DEPT018,2024-09-08,2024-05-15,2024-10-06,Routine,475.0899963378906,Co-pay,633.7100219726562,CLAIM402331,PAYOR1139,62808,I83.2,Medicaid,MEDI85005,MCARE90419,2020-12-29,2021-03-05,hosipitala
TRANS000007,ENC007100,HOSP1-001094,PROV0277,DEPT014,2024-07-04,2024-07-24,2024-07-10,Consultation,724.5900268554688,Insurance,332.010009765625,CLAIM306409,PAYOR2651,16169,I82.0,Self-Pay,MEDI42854,MCARE82454,2023-06-12,2023-03-30,hosipitala
TRANS000008,ENC004296,HOSP1-003520,PROV0100,DEPT007,2024-04-07,2024-10-06,2024-10-21,Consultation,951.3800048828124,Medicare,477.5299987792969,CLAIM896324,PAYOR6378,14394,I20.3,Medicare,MEDI80904,MCARE86559,2023-03-19,2023-12-08,hosipitala
TRANS000009,ENC001807,HOSP1-001227,PROV0164,DEPT015,2024-01-12,2024-03-19,2024-07-03,Consultation,633.22998046875,Co-pay,686.3400268554688,CLAIM231241,PAYOR8208,94338,I59.1,Commercial,MEDI57256,MCARE64019,2021-11-11,2020-10-16,hosipitala
TRANS000010,ENC006770,HOSP1-000065,PROV0130,DEPT006,2024-10-01,2024-01-16,2024-02-18,Follow-up,222.75,Medicare,792.739990234375,CLAIM201823,PAYOR5425,20590,I35.1,Self-Pay,MEDI97877,MCARE83122,2021-01-15,2021-05-17,hosipitala


In [0]:
%sql
-- Shape the merged "transactions" view for loading into silver:
--   * build a key that is unique across hospitals,
--   * keep the source system's id and audit dates under SRC_* names,
--   * flag rows missing any mandatory identifier or the visit date.
CREATE OR REPLACE TEMP VIEW quality_checks AS
SELECT
    -- Cross-hospital key: source transaction id suffixed with its datasource.
    concat(TransactionID, '-', datasource) AS TransactionID,
    TransactionID                          AS SRC_TransactionID,

    -- Business columns, passed through unchanged.
    EncounterID,
    PatientID,
    ProviderID,
    DeptID,
    VisitDate,
    ServiceDate,
    PaidDate,
    VisitType,
    Amount,
    AmountType,
    PaidAmount,
    ClaimID,
    PayorID,
    ProcedureCode,
    ICDCode,
    LineOfBusiness,
    MedicaidID,
    MedicareID,

    -- Source-system audit dates, renamed to keep them apart from pipeline audit columns.
    InsertDate   AS SRC_InsertDate,
    ModifiedDate AS SRC_ModifiedDate,
    datasource,

    -- TRUE when a required field is missing. IS NULL never evaluates to NULL,
    -- so this expression is always strictly TRUE or FALSE.
    -- NOTE(review): TransactionID here is expected to resolve to the source
    -- column, not the concatenated alias defined above - confirm.
    (
        EncounterID IS NULL
        OR PatientID IS NULL
        OR TransactionID IS NULL
        OR VisitDate IS NULL
    ) AS is_quarantined
FROM transactions;


In [0]:
%sql
-- Silver-layer Delta table for hospital transactions.
-- The first 24 columns line up, by name and order, with the quality_checks
-- temp view; the last three are maintained by the MERGE that loads this table.
-- IF NOT EXISTS makes the statement a no-op when the table is already present.
CREATE TABLE IF NOT EXISTS silver.transactions (
  -- Key built in quality_checks as concat(source TransactionID, '-', datasource).
  TransactionID STRING,
  -- Transaction id exactly as delivered by the source.
  SRC_TransactionID STRING,
  EncounterID STRING,
  PatientID STRING,
  ProviderID STRING,
  DeptID STRING,
  VisitDate DATE,
  ServiceDate DATE,
  PaidDate DATE,
  VisitType STRING,
  Amount DOUBLE,
  AmountType STRING,
  PaidAmount DOUBLE,
  ClaimID STRING,
  PayorID STRING,
  ProcedureCode INTEGER,
  ICDCode STRING,
  LineOfBusiness STRING,
  MedicaidID STRING,
  MedicareID STRING,
  -- InsertDate / ModifiedDate carried over from the source rows.
  SRC_InsertDate DATE,
  SRC_ModifiedDate DATE,
  datasource STRING,
  -- TRUE when EncounterID, PatientID, TransactionID or VisitDate was NULL in the source row.
  is_quarantined BOOLEAN,
  -- Set to current_timestamp() by the MERGE when the row is inserted.
  audit_insertdate TIMESTAMP,
  -- Set to current_timestamp() on insert and again when the MERGE expires the row.
  audit_modifieddate TIMESTAMP,
  -- TRUE on insert; the MERGE flips it to FALSE when a changed version arrives.
  is_current BOOLEAN
)
USING DELTA;


In [0]:
%sql 
-- SCD Type 2 load of quality_checks into silver.transactions.
--
-- For every incoming row:
--   * key not present as a current row -> insert it as the current version;
--   * key present and unchanged        -> do nothing;
--   * key present and changed          -> expire the current version AND insert
--                                         the new version in the same atomic MERGE.
--
-- A single MERGE row can either match (UPDATE) or not match (INSERT), never
-- both. To get both actions for a changed key, the source is staged twice:
--   1. once with merge_key = TransactionID  (matches -> expires the old row,
--      or does not match -> inserts a brand-new key);
--   2. once more with merge_key = NULL, only for changed keys (a NULL key can
--      never match, so the row falls through to INSERT as the new version).
--
-- Change detection uses the null-safe operator <=> so that a column moving
-- between NULL and a value is recognised as a change; plain != evaluates to
-- NULL in that case and the change would be missed.
--
-- NOTE(review): rows whose TransactionID key is NULL (source TransactionID or
-- datasource missing) can never match and are therefore inserted on every run,
-- exactly as before - consider filtering or routing them separately.
MERGE INTO silver.transactions AS target
USING (
    WITH incoming AS (
        SELECT
            src.*,
            -- TRUE when a current version of this key already exists in the target.
            cur.TransactionID IS NOT NULL AS has_current,
            -- TRUE when any tracked column differs from the current version.
            -- Only meaningful when has_current is TRUE.
            NOT (
                cur.SRC_TransactionID <=> src.SRC_TransactionID AND
                cur.EncounterID       <=> src.EncounterID AND
                cur.PatientID         <=> src.PatientID AND
                cur.ProviderID        <=> src.ProviderID AND
                cur.DeptID            <=> src.DeptID AND
                cur.VisitDate         <=> src.VisitDate AND
                cur.ServiceDate       <=> src.ServiceDate AND
                cur.PaidDate          <=> src.PaidDate AND
                cur.VisitType         <=> src.VisitType AND
                cur.Amount            <=> src.Amount AND
                cur.AmountType        <=> src.AmountType AND
                cur.PaidAmount        <=> src.PaidAmount AND
                cur.ClaimID           <=> src.ClaimID AND
                cur.PayorID           <=> src.PayorID AND
                cur.ProcedureCode     <=> src.ProcedureCode AND
                cur.ICDCode           <=> src.ICDCode AND
                cur.LineOfBusiness    <=> src.LineOfBusiness AND
                cur.MedicaidID        <=> src.MedicaidID AND
                cur.MedicareID        <=> src.MedicareID AND
                cur.SRC_InsertDate    <=> src.SRC_InsertDate AND
                cur.SRC_ModifiedDate  <=> src.SRC_ModifiedDate AND
                cur.datasource        <=> src.datasource AND
                cur.is_quarantined    <=> src.is_quarantined
            ) AS is_changed
        FROM quality_checks AS src
        LEFT JOIN silver.transactions AS cur
            ON cur.TransactionID = src.TransactionID
            AND cur.is_current = true
    )
    -- Pass 1: keyed rows - expire a changed current row, or insert a new key.
    SELECT incoming.TransactionID AS merge_key, incoming.*
    FROM incoming
    UNION ALL
    -- Pass 2: NULL-keyed copies of changed rows - always inserted as the new version.
    SELECT CAST(NULL AS STRING) AS merge_key, incoming.*
    FROM incoming
    WHERE incoming.has_current AND incoming.is_changed
) AS source
ON target.TransactionID = source.merge_key
AND target.is_current = true

-- Existing current row whose content changed: close it out.
WHEN MATCHED AND source.is_changed
THEN UPDATE SET
    target.is_current = false,
    target.audit_modifieddate = current_timestamp()

-- Brand-new key, or the replacement version staged with a NULL merge_key.
WHEN NOT MATCHED
THEN INSERT (
    TransactionID,
    SRC_TransactionID,
    EncounterID,
    PatientID,
    ProviderID,
    DeptID,
    VisitDate,
    ServiceDate,
    PaidDate,
    VisitType,
    Amount,
    AmountType,
    PaidAmount,
    ClaimID,
    PayorID,
    ProcedureCode,
    ICDCode,
    LineOfBusiness,
    MedicaidID,
    MedicareID,
    SRC_InsertDate,
    SRC_ModifiedDate,
    datasource,
    is_quarantined,
    audit_insertdate,
    audit_modifieddate,
    is_current
)
VALUES (
    source.TransactionID,
    source.SRC_TransactionID,
    source.EncounterID,
    source.PatientID,
    source.ProviderID,
    source.DeptID,
    source.VisitDate,
    source.ServiceDate,
    source.PaidDate,
    source.VisitType,
    source.Amount,
    source.AmountType,
    source.PaidAmount,
    source.ClaimID,
    source.PayorID,
    source.ProcedureCode,
    source.ICDCode,
    source.LineOfBusiness,
    source.MedicaidID,
    source.MedicareID,
    source.SRC_InsertDate,
    source.SRC_ModifiedDate,
    source.datasource,
    source.is_quarantined,
    current_timestamp(),
    current_timestamp(),
    true
);


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
20000,0,0,20000
