In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

# Get the notebook's Spark session (creating one if none is active yet) and
# tag it with this job's application name.
spark = (
    SparkSession.builder
    .appName("Data_Validation_PrimaryKey")
    .getOrCreate()
)

StatementMeta(SparkPool2448PP, 27, 14, Finished, Available, Finished)

In [14]:
# Root URIs of the two storage containers used by this notebook:
# the bronze container is listed and read, the silver container is written to.
bronze_dataset_folder_path, silver_dataset_folder_path = (
    "abfss://bronze@adlssalesproject2448pp2.dfs.core.windows.net",
    "abfss://silver@adlssalesproject2448pp2.dfs.core.windows.net",
)

StatementMeta(SparkPool2448PP, 27, 15, Finished, Available, Finished)

In [15]:
# Primary key definition per bronze subfolder.
#   key   -> subfolder name exactly as it appears in the bronze listing
#   value -> primary key column name; composite keys are written as a
#            comma-separated list of column names (no spaces)
pk_dict = {
    "PRODUCT": "NDC_CD",
    "PROVIDER": "PROVIDER_ID",
    "DIAGNOSIS": "DIAGNOSIS_CODE,ICD_VERSION_TYPE",
    "PROCEDURE": "PROCEDURE_CODE,PRC_VERS_TYP_ID",
    "PLAN": "PAYER_PLAN_ID",
    "RX PATIENT ACTIVITY": "PATIENT_ID",
    "PATIENT MPD": "PATIENT_ID,MPD_YEAR",
    "PATIENT COMMERCIAL": "PATIENT_ID,ACTIVITY_YEAR",
    "PATIENT DEMOGRAPHICS": "PATIENT_ID",
}

StatementMeta(SparkPool2448PP, 27, 16, Finished, Available, Finished)

In [16]:
def _split_by_primary_key(df, pk_col_list):
    """Split ``df`` into rows that pass and rows that fail the primary-key check.

    A row is rejected when any primary-key column is null, or when its
    primary-key value (all key columns taken together) occurs on more than
    one row. Every other row is valid.

    Args:
        df: Spark DataFrame to validate.
        pk_col_list: Non-empty list of primary-key column names. Any number
            of columns is supported for composite keys.

    Returns:
        Tuple ``(valid_df, reject_df)``; both have the same columns, in the
        same order, as ``df``.
    """
    # A key is considered null as soon as ANY of its columns is null.
    null_pk_condition = col(pk_col_list[0]).isNull()
    for pk_col in pk_col_list[1:]:
        null_pk_condition = null_pk_condition | col(pk_col).isNull()
    null_pk_df = df.filter(null_pk_condition)

    # Key values that appear on more than one row.
    duplicate_pk_df = (
        df.groupBy(*pk_col_list)
        .agg(count("*").alias("count"))
        .filter(col("count") > 1)
        .drop("count")
    )

    # Joining on a list of column names puts the key columns first in the
    # result, so its column order can differ from df's. Combine by column
    # NAME (not position) so values never land under the wrong column.
    duplicate_rows_df = df.join(duplicate_pk_df, on=pk_col_list, how="inner")
    reject_df = null_pk_df.unionByName(duplicate_rows_df)

    # subtract() is a distinct set difference. Collapsing identical rows is
    # harmless here: identical rows share a key and are therefore rejected.
    valid_df = df.subtract(reject_df)
    return valid_df, reject_df


# List all subfolders within the bronze dataset folder
bronze_subfolder_list = mssparkutils.fs.ls(bronze_dataset_folder_path)

for subfolder in bronze_subfolder_list:
    # List all files within the subfolder
    file_list = mssparkutils.fs.ls(subfolder.path)
    if not file_list:
        continue

    # Resolve the primary key once per subfolder; fail with a clear message
    # when a subfolder has no entry in pk_dict.
    try:
        pk_columns = pk_dict[subfolder.name]
    except KeyError:
        raise KeyError(
            f"No primary key configured in pk_dict for bronze subfolder '{subfolder.name}'"
        ) from None
    pk_col_list = [pk_col.strip() for pk_col in pk_columns.split(",")]

    for file in file_list:
        # NOTE(review): inferSchema=True lets Spark guess column types; confirm
        # that code-like key columns are not inferred as numbers, which would
        # drop leading zeros.
        # The DataFrame feeds several actions below, so cache it to avoid
        # re-reading and re-parsing the CSV for each of them.
        df = spark.read.csv(file.path, header=True, inferSchema=True).cache()
        try:
            valid_df, reject_df = _split_by_primary_key(df, pk_col_list)

            parquet_name = file.name.replace('.csv', '.parquet')

            # Write valid data to the silver folder path
            valid_path = f"{silver_dataset_folder_path}/{parquet_name}"
            valid_df.coalesce(1).write.mode("overwrite").parquet(valid_path)

            # Write rejected rows only when there are any; take(1) stops at
            # the first row instead of counting all of them.
            if reject_df.take(1):
                reject_path = f"{silver_dataset_folder_path}/Reject_{parquet_name}"
                reject_df.coalesce(1).write.mode("overwrite").parquet(reject_path)
        finally:
            # Release the cached source data before moving to the next file.
            df.unpersist()


StatementMeta(SparkPool2448PP, 27, 17, Finished, Available, Finished)