In [0]:
import pandas as pd
import networkx as nx
from pyspark.sql.types import *
from datetime import datetime
# Today's date formatted as YYYY_MM_DD (local time of the driver).
# NOTE(review): `date` is not referenced by any other cell visible in this notebook — confirm it is still needed.
date = datetime.today().strftime('%Y_%m_%d')

In [0]:
# Map a logical table name to its fully-qualified path in the source schema.
# Tables whose name ends in `_bronze` are keyed under the matching `_silver`
# name; every other table is keyed under its own name.
tables = {}
table_list = spark.sql("SHOW TABLES IN workspace.`tegge-insurance-data`").toPandas()
bronze_suffix = '_bronze'
for table in table_list['tableName']:
    if table.endswith(bronze_suffix):
        # Swap only the trailing suffix. str.replace() would also rewrite any
        # earlier "_bronze" occurrence inside the table name.
        key = table[:-len(bronze_suffix)] + '_silver'
    else:
        key = table
    tables[key] = f"workspace.`tegge-insurance-data`.{table}"


In [0]:
# Collect the column names of every table registered in `tables`.
# Result: df_schemas[<logical table name>] -> list of column names.
# A table whose schema cannot be read is reported and left out of the mapping.
df_schemas = {}
for df_name, table_path in tables.items():
    try:
        column_names = spark.read.table(table_path).schema.fieldNames()
    except Exception as err:
        print(f"Could not read schema for {df_name}: {err}")
    else:
        df_schemas[df_name] = column_names

In [0]:
# Identify primary keys and foreign keys, then persist them as a Delta table.
#
# Heuristic (name-based only):
#   * primary key  = every column ending in "_id" that starts with the first
#                    "_"-separated token of the table name; if none match,
#                    fall back to the table's first column.
#   * foreign keys = every other column ending in "_id".
from pyspark.sql.types import StructType, StructField, StringType

# Pass 1: infer the keys of every table so the widest FK list is known
# before any row is built.
table_keys = {}
for table_name, fields in df_schemas.items():
    name_prefix = table_name.split('_')[0]
    pk = [f for f in fields if f.endswith('_id') and f.startswith(name_prefix)]
    if not pk:
        pk = fields[:1]  # fallback: first field (empty list for an empty schema)
    fk = [f for f in fields if f.endswith('_id') and f not in pk]
    table_keys[table_name] = (pk, fk)

max_fk_count = max((len(fk) for _, fk in table_keys.values()), default=0)

# Pass 2: build one row per table. Every row carries the same
# Foreign_Key_1..Foreign_Key_N keys, padded with None.
pk_fk_rows = []
for table_name, (pk, fk) in table_keys.items():
    row = {
        'Table_Name': table_name,
        'Primary_Key': ', '.join(pk)
    }
    for i in range(max_fk_count):
        row[f'Foreign_Key_{i+1}'] = fk[i] if i < len(fk) else None
    pk_fk_rows.append(row)

# Two fixed columns followed by one nullable string column per FK slot.
pk_fk_fields = [
    StructField('Table_Name', StringType(), False),
    StructField('Primary_Key', StringType(), True)
]
for i in range(max_fk_count):
    pk_fk_fields.append(StructField(f'Foreign_Key_{i+1}', StringType(), True))
pk_fk_schema = StructType(pk_fk_fields)

spark.createDataFrame(pk_fk_rows, schema=pk_fk_schema).write.format("delta").mode('overwrite').saveAsTable("workspace.`tegge-insurance-data-silver`.table_pk_fk")

In [0]:
%sql
-- Manual corrections to the name-based key inference stored in table_pk_fk.
-- The table's columns are Table_Name, Primary_Key, Foreign_Key_1..N, so the
-- first related key is written to Foreign_Key_1 (this table has no
-- unique_identifier_1 column).
-- One statement per table; gl_transactions_silver keeps the value that the
-- last of the original statements assigned (gl_transaction_id).

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk
SET Primary_Key = 'pharmacy_claim_id', Foreign_Key_1 = 'pharmacy_provider_id'
WHERE Table_Name = 'pharmacy_claims_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk
SET Primary_Key = 'claim_line_id', Foreign_Key_1 = 'claim_id'
WHERE Table_Name = 'claim_lines_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk
SET Primary_Key = 'gl_transaction_id', Foreign_Key_1 = 'gl_account_id'
WHERE Table_Name = 'gl_transactions_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk
SET Primary_Key = 'provider_id', Foreign_Key_1 = 'network_id'
WHERE Table_Name = 'network_providers_silver';

In [0]:
# Gather Unique Identifiers and save to Delta Table
#
# Heuristic (name-based only):
#   * primary key        = every column ending in "_id" that starts with the
#                          first "_"-separated token of the table name; if
#                          none match, fall back to the table's first column.
#   * unique identifiers = every other column ending in "_id".
from pyspark.sql.types import StructType, StructField, StringType

# Pass 1: infer the keys of every table so the widest identifier list is
# known before any row is built.
table_keys = {}
for table_name, fields in df_schemas.items():
    name_prefix = table_name.split('_')[0]
    pk = [f for f in fields if f.endswith('_id') and f.startswith(name_prefix)]
    if not pk:
        pk = fields[:1]  # fallback: first field (empty list for an empty schema)
    fk = [f for f in fields if f.endswith('_id') and f not in pk]
    table_keys[table_name] = (pk, fk)

max_fk_count = max((len(fk) for _, fk in table_keys.values()), default=0)

# Pass 2: build one row per table. Every row carries the same
# unique_identifier_1..unique_identifier_N keys, padded with None.
pk_fk_rows = []
for table_name, (pk, fk) in table_keys.items():
    row = {
        'Table_Name': table_name,
        'Primary_Key': ', '.join(pk)
    }
    for i in range(max_fk_count):
        row[f'unique_identifier_{i+1}'] = fk[i] if i < len(fk) else None
    pk_fk_rows.append(row)

# Two fixed columns followed by one nullable string column per identifier slot.
pk_fk_fields = [
    StructField('Table_Name', StringType(), False),
    StructField('Primary_Key', StringType(), True)
]
for i in range(max_fk_count):
    pk_fk_fields.append(StructField(f'unique_identifier_{i+1}', StringType(), True))
pk_fk_schema = StructType(pk_fk_fields)

spark.createDataFrame(pk_fk_rows, schema=pk_fk_schema).write.format("delta").mode('overwrite').saveAsTable("workspace.`tegge-insurance-data-silver`.table_unique_identifier")

In [0]:
%sql
-- Manual corrections to the name-based inference stored in
-- table_unique_identifier (columns: Table_Name, Primary_Key,
-- unique_identifier_1..N).
-- One statement per table; gl_transactions_silver keeps the value that the
-- last of the original statements assigned (gl_transaction_id), which had
-- overwritten an earlier 'gl_txn_id' assignment.

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET Primary_Key = 'pharmacy_claim_id', unique_identifier_1 = 'pharmacy_provider_id'
WHERE Table_Name = 'pharmacy_claims_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET Primary_Key = 'claim_line_id', unique_identifier_1 = 'claim_id'
WHERE Table_Name = 'claim_lines_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET Primary_Key = 'gl_transaction_id', unique_identifier_1 = 'gl_account_id'
WHERE Table_Name = 'gl_transactions_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET Primary_Key = 'provider_id', unique_identifier_1 = 'network_id'
WHERE Table_Name = 'network_providers_silver';

-- Tables whose identifying column is not an "_id" column.
UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET unique_identifier_1 = 'specialty'
WHERE Table_Name = 'network_adequacy_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET unique_identifier_1 = 'period_start'
WHERE Table_Name = 'provider_scorecards_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_unique_identifier
SET unique_identifier_1 = 'lob'
WHERE Table_Name = 'statutory_financials_silver';


In [0]:
# Build ERD relationship table with separate columns for each foreign key.
#
# The key lookups are rebuilt here from the persisted table_pk_fk Delta table
# (written and manually corrected earlier in this notebook) so the cell runs
# on a fresh kernel: no other visible cell defines `primary_keys` or
# `foreign_keys`, and `pk_fk_rows` has no 'Foreign_Key' entry.
#
# Produces:
#   rows         - one dict per (table, foreign key) pair whose foreign key
#                  appears in another table's primary-key list
#   max_fk_count - number of Foreign_Key_N columns each row carries
pk_fk_df = spark.read.table("workspace.`tegge-insurance-data-silver`.table_pk_fk")
fk_columns = [c for c in pk_fk_df.columns if c.startswith('Foreign_Key_')]

primary_keys = {}
foreign_keys = {}
for record in pk_fk_df.collect():
    name = record['Table_Name']
    # Primary_Key is stored as a ", "-joined string and may be NULL.
    primary_keys[name] = [k.strip() for k in (record['Primary_Key'] or '').split(',') if k.strip()]
    foreign_keys[name] = [record[c] for c in fk_columns if record[c] is not None]

max_fk_count = max((len(fks) for fks in foreign_keys.values()), default=0)

rows = []
for table_name in df_schemas:
    fk_list = foreign_keys.get(table_name, [])
    for fk in fk_list:
        # Find referenced table: the first other table listing this column
        # among its primary keys.
        ref_table = next(
            (t for t, pk in primary_keys.items() if t != table_name and fk in pk),
            None,
        )
        if ref_table is None:
            continue
        row = {
            'Table_Name': table_name,
            'Joins_To': ref_table,
            'Primary_Key': ', '.join(primary_keys.get(table_name, []))
        }
        # Add each foreign key as its own column, padded with None.
        for i in range(max_fk_count):
            row[f'Foreign_Key_{i+1}'] = fk_list[i] if i < len(fk_list) else None
        rows.append(row)


In [0]:
# Schema of the relationship table: three fixed string columns followed by
# one nullable string column per foreign-key slot (Foreign_Key_1..N, where
# N is max_fk_count).
schema_fields = [
    StructField('Table_Name', StringType(), False),
    StructField('Joins_To', StringType(), True),
    StructField('Primary_Key', StringType(), False),
]
schema_fields.extend(
    StructField(f'Foreign_Key_{slot}', StringType(), True)
    for slot in range(1, max_fk_count + 1)
)
schema = StructType(schema_fields)

In [0]:
# Materialise the relationship rows and persist them as a Delta table.
# The DataFrame is bound before writing: saveAsTable() returns None, so
# chaining the write onto the assignment would leave erd_df set to None.
erd_df = spark.createDataFrame(rows, schema=schema)
# NOTE(review): the number of Foreign_Key_N columns depends on max_fk_count;
# if it changes between runs the overwrite may be rejected for a schema
# mismatch unless schema overwrite is enabled — confirm the desired behaviour.
(
    erd_df.write
    .format("delta")
    .mode('overwrite')
    .saveAsTable("workspace.`tegge-insurance-data-silver`.table_relationships")
)