In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import pandas as pd
from datetime import datetime


In [0]:
# Source tables to validate. Every table lives in a folder named after the
# table under the same bronze-container root, so the entries are derived from
# the table names rather than spelled out one by one.
BRONZE_ROOT = "abfss://bronze@migrationstoragerinith.dfs.core.windows.net/second_method"

tables_metadata = [
    {"table_name": name, "path": f"{BRONZE_ROOT}/{name}"}
    for name in ("customers", "products", "orders", "dates")
]


In [0]:
from pyspark.sql.types import StructType
import json

def validate_schema(table_name, df_new):
    """Check the columns of ``df_new`` against the stored baseline schema.

    The baseline is a Spark schema serialized as JSON and stored as text at
    ``.../second_method/{table_name}_schema.json`` in the ``schemafolder``
    container.

    Behaviour:
      * Baseline cannot be loaded (missing, empty, or unparsable): the schema
        of ``df_new`` is written as the new baseline and the function returns.
      * Baseline loads and the column-name sets are equal: a confirmation is
        printed.
      * Baseline loads and the column-name sets differ: ``ValueError`` is
        raised and the stored baseline is left untouched.

    Only column *names* are compared; column order and data types are not.

    Args:
        table_name: Logical table name, used to build the baseline path and
            in messages.
        df_new: DataFrame whose ``columns`` / ``schema`` are validated.

    Raises:
        ValueError: If columns were added to or removed from the baseline.
    """
    schema_path = f"abfss://schemafolder@migrationstoragerinith.dfs.core.windows.net/second_method/{table_name}_schema.json"

    # Only the baseline *load* is best-effort. The comparison below must stay
    # outside this try block: previously the drift exception was raised inside
    # it, got caught by the handler, and the baseline was silently overwritten
    # with the drifted schema.
    try:
        # Read existing schema JSON from ADLS (first row, first column).
        baseline_json = spark.read.text(schema_path).collect()[0][0]
        baseline_schema = StructType.fromJson(json.loads(baseline_json))
    except Exception as e:
        print(f"⚠️ No baseline schema found or error occurred: {e}")
        print(f"🔁 Saving new schema for `{table_name}`")

        schema_json = df_new.schema.json()

        # Save as a single-row text dataset using Spark so it lands in ADLS.
        spark.createDataFrame([(schema_json,)], ["schema"]) \
            .write.mode("overwrite") \
            .text(schema_path)
        return

    # Compare column names only.
    baseline_cols = {field.name for field in baseline_schema.fields}
    new_cols = set(df_new.columns)

    added_cols = new_cols - baseline_cols
    removed_cols = baseline_cols - new_cols

    if added_cols or removed_cols:
        # Propagate to the caller; the baseline must not be rewritten here.
        raise ValueError(f" Schema drift in `{table_name}`:\n Added: {added_cols}\n Removed: {removed_cols}")

    print(f"Schema matches for `{table_name}`")








In [0]:
# Run the schema check for every configured table. A failure on one table is
# printed and the loop moves on to the next table.
for table in tables_metadata:
    table_name, path = table["table_name"], table["path"]

    try:
        # CSV is read with the first line as header; the resulting DataFrame
        # is passed to the schema validation.
        df = spark.read.option("header", True).csv(path)
        validate_schema(table_name, df)
    except Exception as e:
        print(f"Failed for {table_name}: {e}")
    else:
        print(f"Processed schema check for: {table_name}")