In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pandas import *
# Run-date stamp in YYYY_MM_DD form, taken from the local clock of the machine
# running this cell. Later cells use it to select today's load and to suffix
# the anomaly table names.
date = f"{datetime.today():%Y_%m_%d}"


In [0]:
# Build a lookup of notebook key -> fully qualified source table name for every
# table in the source schema. Tables named *_bronze are keyed as *_silver;
# all other tables keep their own name as the key.
BRONZE_SUFFIX = "_bronze"
SILVER_SUFFIX = "_silver"

tables = {}
table_list = spark.sql("SHOW TABLES IN workspace.`tegge-insurance-data`").toPandas()
for table, is_temporary in zip(table_list["tableName"], table_list["isTemporary"]):
    # SHOW TABLES also reports session-scoped temporary views. They do not live
    # in the schema, so the qualified name built below would not resolve.
    if is_temporary:
        continue
    if table.endswith(BRONZE_SUFFIX):
        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier "_bronze" that happens to occur inside the table name.
        key = table[: -len(BRONZE_SUFFIX)] + SILVER_SUFFIX
    else:
        key = table
    tables[key] = f"workspace.`tegge-insurance-data`.{table}"

In [0]:
# Table name -> primary-key column, read from the table_pk_fk reference table.
pk_lookup_pdf = (
    spark.read.format("delta")
    .table("workspace.`tegge-insurance-data-silver`.table_pk_fk")
    .select("Table_Name", "Primary_Key")
    .toPandas()
)

# If a table name appears on several rows, the last row wins.
pk_dict = {
    table_name: primary_key
    for table_name, primary_key in zip(
        pk_lookup_pdf["Table_Name"], pk_lookup_pdf["Primary_Key"]
    )
}


In [0]:
# Table name -> foreign-key column, read from the table_pk_fk reference table.
fk_lookup_pdf = (
    spark.read.format("delta")
    .table("workspace.`tegge-insurance-data-silver`.table_pk_fk")
    .select("Table_Name", "Foreign_Key")
    .toPandas()
)

# If a table name appears on several rows, the last row wins.
fk_dict = {
    table_name: foreign_key
    for table_name, foreign_key in zip(
        fk_lookup_pdf["Table_Name"], fk_lookup_pdf["Foreign_Key"]
    )
}

In [0]:
# Load the rows of every source table whose load_timestamp matches today's
# date stamp, then drop the "source" and "load_timestamp" columns.
# NOTE(review): `date` is formatted with underscores, and `_` is a
# single-character wildcard in SQL LIKE, so the pattern also matches values
# such as 2024-05-01. Presumably that is what lets it match load_timestamp;
# confirm the column's actual format before changing either side.
dfs = {
    df_name: (
        spark.read.format("delta")
        .table(table)
        .where(f"load_timestamp LIKE '%{date}%'")
        .drop("source", "load_timestamp")
    )
    for df_name, table in tables.items()
}


In [0]:
# Rename the pharmacy claims id column from rx_claim_id to pharmacy_claim_id.
pharmacy_claims_df = dfs["pharmacy_claims_silver"]
dfs["pharmacy_claims_silver"] = pharmacy_claims_df.withColumnRenamed(
    "rx_claim_id", "pharmacy_claim_id"
)

In [0]:
# Record the column names of every loaded DataFrame. A table whose schema
# cannot be read is reported and gets no entry.
df_schemas = {}
for df_name, df in dfs.items():
    try:
        field_names = df.schema.fieldNames()
    except Exception as e:
        print(f"Could not read schema for {df_name}: {e}")
    else:
        df_schemas[df_name] = field_names

In [0]:
def _has_column(df, name):
    """Return True when ``name`` is a non-empty string naming a column of ``df``.

    The match ignores case, so a configured key that differs from the real
    column only by case is still processed. Anything that is not a string
    (None, or a NaN coming out of the pandas lookup) is reported as absent.
    """
    if not isinstance(name, str) or not name:
        return False
    wanted = name.lower()
    return any(column.lower() == wanted for column in df.columns)


# Process all tables in dfs: set aside rows with a null primary key, record
# rows with a null foreign key, and keep the cleaned DataFrame in dfs.
for table_name, df in list(dfs.items()):
    # Step 1: Identify rows with null primary key
    pk_col = pk_dict.get(table_name)
    if _has_column(df, pk_col):
        null_pk_df = df.filter(col(pk_col).isNull())
        # Published as a module-level variable; the anomaly-writing cell looks
        # it up by this exact name.
        globals()[f"{table_name}_null_pks"] = null_pk_df
        df = df.filter(col(pk_col).isNotNull())
    elif isinstance(pk_col, str) and pk_col:
        # Configured key is not a column of this table: report it instead of
        # failing when the filter is analysed.
        print(f"Skipping null primary key check for {table_name}: column {pk_col} not found")

    # Step 2: Identify rows with null foreign key
    fk_col = fk_dict.get(table_name)
    if _has_column(df, fk_col):
        null_fk_df = df.filter(col(fk_col).isNull())
        globals()[f"{table_name}_null_fks"] = null_fk_df
        #df = df.filter(col(fk_col).isNotNull())
    elif isinstance(fk_col, str) and fk_col:
        print(f"Skipping null foreign key check for {table_name}: column {fk_col} not found")

    # Update dfs with cleaned DataFrame
    dfs[table_name] = df

In [0]:
# Persist the null-primary-key rows of each table as a dated anomaly table.
# Tables with no such rows, or with no recorded null-PK DataFrame, are skipped.
for table_name in dfs.keys():
    null_pk_df = globals().get(f"{table_name}_null_pks")
    # Only emptiness matters here, so cap the probe at one row instead of
    # counting every offending row before writing them.
    if null_pk_df is not None and null_pk_df.limit(1).count() > 0:
        null_pk_df.write.format("delta").mode("overwrite").saveAsTable(f"workspace.`tegge-insurance-data-anomalies`.null_pk_{table_name}_{date}")
    # null_fk_df = globals().get(f"{table_name}_null_fks")
    # if null_fk_df is not None and null_fk_df.count() > 0:
    #     null_fk_df.write.format("delta").mode("overwrite").saveAsTable(f"workspace.`tegge-insurance-data-anomalies`.null_fk_{table_name}_{date}")

In [0]:
# Placeholder written wherever employer_id is null.
UNKNOWN_EMPLOYER_ID = 9999

for table_name, df in list(dfs.items()):
    # Check the DataFrame's own columns rather than the df_schemas cache: the
    # cache has no entry for a table whose schema read failed (KeyError here)
    # and can be stale if dfs was reloaded without re-running that cell.
    if "employer_id" in df.columns:
        dfs[table_name] = df.fillna({"employer_id": UNKNOWN_EMPLOYER_ID})

In [0]:
# For every table with a usable primary key, collect the key values that occur
# more than once and report whether any were found.
duplicate_pk_tables = {}

for name, frame in dfs.items():
    key_col = pk_dict.get(name)
    if not key_col or key_col not in frame.columns:
        # No primary key configured, or it is not a column of this table.
        continue

    repeated_keys = frame.groupBy(key_col).count().where(col("count") > 1)
    has_duplicates = repeated_keys.count() > 0
    if has_duplicates:
        duplicate_pk_tables[name] = repeated_keys
        print(f"Duplicates found in {name} based on primary key: {key_col}")
    else:
        print(f"No duplicates in {name} based on primary key: {key_col}")

In [0]:
# import smtplib
# from email.message import EmailMessage

# def send_alert(subject, body, to_email):
#     msg = EmailMessage()
#     msg.set_content(body)
#     msg['Subject'] = subject
#     msg['From'] = 'your_email@example.com'
#     msg['To'] = to_email
#     with smtplib.SMTP('smtp.yourprovider.com') as s:
#         s.send_message(msg)

# alert_needed = False
# alert_tables = []
# for table_name in dfs.keys():
#     if globals().get(f"{table_name}_null_pks") is not None and globals()[f"{table_name}_null_pks"].count() > 0:
#         alert_needed = True
#         alert_tables.append(f"{table_name} (null PK)")
#     if globals().get(f"{table_name}_null_fks") is not None and globals()[f"{table_name}_null_fks"].count() > 0:
#         alert_needed = True
#         alert_tables.append(f"{table_name} (null FK)")

# if alert_needed:
#     send_alert(
#         subject="Null PK/FK Detected in Silver Tables",
#         body=f"Tables with null PK/FK: {', '.join(alert_tables)}",
#         to_email="analyst@example.com"
#     )

In [0]:
# # Check for null PK/FK and stop execution if any are found
# stop_execution = False
# stop_tables = []
# for table_name in dfs.keys():
#     null_pk_df = globals().get(f"{table_name}_null_pks")
#     if null_pk_df is not None and null_pk_df.count() > 0:
#         stop_execution = True
#         stop_tables.append(f"{table_name} (null PK)")

# if stop_execution:
#     raise RuntimeError(f"Null PK/FK detected in tables: {', '.join(stop_tables)}. Notebook execution stopped.")

In [0]:
from pyspark.sql.types import StructType, StructField

# Mark each table's primary-key column as non-nullable in the DataFrame schema.
for table_name, df in list(dfs.items()):
    pk_col = pk_dict.get(table_name)
    #fk_col = fk_dict.get(table_name)

    # Rebuilding a DataFrame from its RDD is expensive, so only do it when a
    # field matching the primary key exists and is still nullable.
    needs_tightening = any(
        field.name == pk_col and field.nullable for field in df.schema.fields
    )
    if not needs_tightening:
        continue

    new_schema = StructType(
        [
            # Keep the field's metadata; only the nullability flag changes.
            StructField(field.name, field.dataType, nullable=False, metadata=field.metadata)
            if field.name == pk_col  # or field.name == fk_col
            else field
            for field in df.schema.fields
        ]
    )
    # Recreate the DataFrame from its rows with the tightened schema and
    # publish it back into dfs.
    dfs[table_name] = spark.createDataFrame(df.rdd, schema=new_schema)