In [0]:
import pandas as pd
import networkx as nx
from pyspark.sql.types import *
from datetime import datetime
# Run-date stamp (local time) formatted as YYYY_MM_DD.
date = format(datetime.today(), '%Y_%m_%d')

In [0]:
# Map a logical table key to the fully qualified path of every table listed in
# workspace.`tegge-insurance-data`. Tables whose name ends in '_bronze' are keyed
# under the matching '_silver' name; all other tables are keyed by their own name.
tables = {}
table_list = spark.sql("SHOW TABLES IN workspace.`tegge-insurance-data`").toPandas()
for table in table_list['tableName']:
    if table.endswith('_bronze'):
        # Swap only the trailing suffix: str.replace() would also rewrite any
        # earlier '_bronze' that happens to occur inside the table name.
        key = table[:-len('_bronze')] + '_silver'
    else:
        key = table
    # The path always points at the table's real name, not at the derived key.
    tables[key] = f"workspace.`tegge-insurance-data`.{table}"


In [0]:
# Read the column names of every table registered in `tables`.
# A table that cannot be read is reported and left out of the result.
df_schemas = {}
for name, path in tables.items():
    try:
        column_names = spark.read.table(path).schema.fieldNames()
    except Exception as err:
        print(f"Could not read schema for {name}: {err}")
    else:
        df_schemas[name] = column_names
# df_schemas: table key -> list of that table's column names

In [0]:
# Identify primary keys and foreign keys
#
# Heuristic, applied to every table in df_schemas:
#   * primary key  = each column ending in '_id' that also starts with the first
#                    '_'-separated token of the table name; when no column
#                    matches, the table's first column is used instead.
#   * foreign keys = every remaining column ending in '_id'.
#
# The per-table results are kept in `primary_keys` / `foreign_keys`
# (table name -> list of column names) because the relationship-building cell
# looks keys up by table name through exactly these two dicts. `pk_fk_rows`
# holds the same information flattened to comma-separated strings for storage.
from pyspark.sql.types import StructType, StructField, StringType

primary_keys = {}
foreign_keys = {}
for table_name, columns in df_schemas.items():
    name_prefix = table_name.split('_')[0]
    id_columns = [c for c in columns if c.endswith('_id')]
    pk = [c for c in id_columns if c.startswith(name_prefix)]
    if not pk and columns:
        pk = [columns[0]]  # fallback: first field (skipped for a column-less table)
    primary_keys[table_name] = pk
    foreign_keys[table_name] = [c for c in id_columns if c not in pk]

pk_fk_rows = [
    {
        'Table_Name': table_name,
        'Primary_Key': ', '.join(primary_keys[table_name]),
        'Foreign_Key': ', '.join(foreign_keys[table_name]),
    }
    for table_name in df_schemas
]

pk_fk_schema = StructType([
    StructField('Table_Name', StringType(), False),
    StructField('Primary_Key', StringType(), True),
    StructField('Foreign_Key', StringType(), True)
])
# Overwrites the stored table on every run.
spark.createDataFrame(pk_fk_rows, schema=pk_fk_schema).write.format("delta").mode('overwrite').saveAsTable("workspace.`tegge-insurance-data-silver`.table_pk_fk")

In [0]:
# Build ERD relationship table
#
# Requires `primary_keys` and `foreign_keys` (table name -> list of column
# names) to exist before this cell runs. For each foreign-key column of a table,
# the first *other* table whose primary-key list contains that column is recorded
# as the join target; foreign keys with no such table produce no row.
rows = []
for source_table in df_schemas:
    for fk_column in foreign_keys[source_table]:
        # First matching table in primary_keys iteration order, or None.
        target_table = next(
            (
                candidate
                for candidate, candidate_pk in primary_keys.items()
                if candidate != source_table and fk_column in candidate_pk
            ),
            None,
        )
        if not target_table:
            continue
        rows.append({
            'Table_Name': source_table,
            'Joins_To': target_table,
            # Primary key of the referencing table, not of the join target.
            'Primary_Key': ', '.join(primary_keys[source_table]),
            'Foreign_Key': fk_column,
        })

In [0]:
# Schema of the relationship table: four string columns, listed as
# (column name, nullable).
schema = StructType([
    StructField(column_name, StringType(), is_nullable)
    for column_name, is_nullable in (
        ('Table_Name', False),
        ('Joins_To', True),
        ('Primary_Key', False),
        ('Foreign_Key', True),
    )
])

In [0]:
# Build the relationship DataFrame first, then persist it. Chaining the write
# onto the assignment would bind `erd_df` to the return value of saveAsTable()
# (None) instead of to the DataFrame.
erd_df = spark.createDataFrame(rows, schema=schema)
# Overwrites the stored table on every run.
erd_df.write.format("delta").mode('overwrite').saveAsTable("workspace.`tegge-insurance-data-silver`.table_relationships")

In [0]:
%sql
-- Hand-set the key columns for specific rows of table_pk_fk.
-- Only table_pk_fk is modified by this cell; no other table is touched.
-- Assignments and comparisons use standard `=` with single-quoted string
-- literals, so the values remain string literals even in sessions where
-- double-quoted text is parsed as an identifier.

-- Disabled corrections for pharmacy_claims_silver (kept for reference):
-- UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Primary_Key == "pharmacy_claim_id" where table_name == "pharmacy_claims_silver";
-- UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Foreign_Key == "pharmacy_provider_id,member_id, drug_id" where table_name == "pharmacy_claims_silver";

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Primary_Key = 'claim_line_id' WHERE table_name = 'claim_lines_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Foreign_Key = 'claim_id' WHERE table_name = 'claim_lines_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Primary_Key = 'gl_txn_id' WHERE table_name = 'gl_transactions_silver';

UPDATE workspace.`tegge-insurance-data-silver`.table_pk_fk SET Foreign_Key = 'gl_account_id' WHERE table_name = 'gl_transactions_silver';
