In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()
base_path = '/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-product-permission-summary-info/'

print("=== DRUG PRODUCT PERMISSION SUMMARY INFO - EXPLORATION ===")

# Step 1-2: list the source directory and compare the chunk files found there
# against the expected chunk_001.json .. chunk_012.json inventory.
print(f"\n1. Checking directory: {base_path}")


def _chunk_sort_key(file_info):
    """Return a sort key ordering chunk files by their numeric suffix.

    Files whose suffix is not an integer (e.g. ``chunk_final.json``) sort after
    all numbered chunks instead of raising, so one oddly named file cannot
    abort the whole inventory.
    """
    token = file_info.name.split('_')[1].split('.')[0]
    try:
        return (0, int(token), file_info.name)
    except ValueError:
        return (1, 0, file_info.name)


try:
    # Only the listing itself is guarded: a failure here really is a directory
    # access problem, which is what the message below reports.
    files = dbutils.fs.ls(base_path)
except Exception as e:
    # Best-effort exploration: report and carry on so the later steps can
    # still attempt to read the sample files.
    print(f"ERROR accessing directory: {str(e)}")
else:
    json_files = sorted(
        (f for f in files if f.name.endswith('.json') and f.name.startswith('chunk_')),
        key=_chunk_sort_key,
    )

    print(f"Found {len(json_files)} JSON files")

    # Expected inventory: chunk_001.json .. chunk_012.json
    expected_file_count = 12
    expected_files = [f"chunk_{i:03d}.json" for i in range(1, expected_file_count + 1)]
    actual_files = [f.name for f in json_files]

    missing_files = set(expected_files) - set(actual_files)
    extra_files = set(actual_files) - set(expected_files)

    print("\n2. File Inventory:")
    print(f"   Expected files: {len(expected_files)} ({expected_files[0]} to {expected_files[-1]})")
    print(f"   Found files: {len(actual_files)}")
    print(f"   Missing files: {len(missing_files)}")
    print(f"   Extra files: {len(extra_files)}")

    if missing_files:
        print(f"   Missing: {sorted(missing_files)}")
    if extra_files:
        print(f"   Extra: {sorted(extra_files)}")

# Step 3: read a sample of the chunk files (first, middle, last) and record
# each file's inferred schema and row count for the comparison that follows.
test_files = [
    'chunk_001.json',  # First file
    'chunk_006.json',  # Middle file
    'chunk_012.json'   # Last file
]

schemas = {}            # file name -> inferred schema
record_counts = {}      # file name -> number of records
successful_reads = []   # file names that were read without error
failed_reads = []       # (file name, error message) pairs

print(f"\n3. Testing {len(test_files)} sample files for schema consistency:")

for file_name in test_files:
    file_path = f"{base_path}{file_name}"
    print(f"\n   Testing: {file_name}")

    try:
        df = spark.read.option("multiline", "true").option("encoding", "UTF-8").json(file_path)
        schema = df.schema
        # Deliberately not named `count`: this is module-level code, and the
        # name `count` would overwrite pyspark.sql.functions.count that the
        # star import above brings into the notebook namespace.
        record_count = df.count()
    except Exception as e:
        # Best-effort sampling: remember the failure and move on to the next file.
        failed_reads.append((file_name, str(e)))
        print(f"   ❌ Failed - Error: {str(e)}")
    else:
        schemas[file_name] = schema
        record_counts[file_name] = record_count
        successful_reads.append(file_name)

        print(f"   ✅ Success - Records: {record_count:,}, Columns: {len(schema.fields)}")

# Step 4: summarise which sample files could be read.
print("\n4. Schema Validation Results:")
print(f"   Successful reads: {len(successful_reads)}")
print(f"   Failed reads: {len(failed_reads)}")

if failed_reads:
    print("   Failed files:")
    for file_name, error in failed_reads:
        print(f"     {file_name}: {error}")

if successful_reads:
    # Step 5: the first readable file is the baseline; every other readable
    # sample is compared against it by column name and data type.
    reference_file = successful_reads[0]
    reference_schema = schemas[reference_file]
    reference_fields = {field.name: field.dataType for field in reference_schema.fields}

    print("\n5. Schema Consistency Analysis:")
    print(f"   Reference schema from: {reference_file}")
    print(f"   Reference columns: {len(reference_fields)}")

    all_schemas_match = True

    for file_name in successful_reads[1:]:
        current_schema = schemas[file_name]
        current_fields = {field.name: field.dataType for field in current_schema.fields}

        ref_columns = set(reference_fields)
        curr_columns = set(current_fields)

        missing_in_current = ref_columns - curr_columns
        extra_in_current = curr_columns - ref_columns

        # Loop variable is `column_name`, not `col`: at module level `col`
        # would overwrite the star-imported pyspark.sql.functions.col.
        common_columns = ref_columns & curr_columns
        type_differences = [
            (column_name, reference_fields[column_name], current_fields[column_name])
            for column_name in sorted(common_columns)
            if reference_fields[column_name] != current_fields[column_name]
        ]

        schema_differs = bool(missing_in_current or extra_in_current or type_differences)
        if schema_differs:
            all_schemas_match = False

        print(f"   {file_name}: Columns={len(current_fields)} ", end="")
        if schema_differs:
            print("❌ Schema differs")
            # Report what differs so the mismatch can be acted on.
            if missing_in_current:
                print(f"     Missing columns: {sorted(missing_in_current)}")
            if extra_in_current:
                print(f"     Extra columns: {sorted(extra_in_current)}")
            for column_name, ref_type, curr_type in type_differences:
                print(f"     Type mismatch in {column_name}: "
                      f"{ref_type.simpleString()} vs {curr_type.simpleString()}")
        else:
            print("✅ Schema matches")

    # Step 6: print the full schema of the reference file.
    print(f"\n6. Complete Schema from {reference_file}:")
    reference_df = spark.read.option("multiline", "true").option("encoding", "UTF-8").json(f"{base_path}{reference_file}")
    reference_df.printSchema()

    # Step 7: extrapolate the total row count from the sampled files.
    print("\n7. Record Count Analysis:")
    total_estimated_records = 0
    for file_name in successful_reads:
        # Not named `count`, to avoid overwriting pyspark.sql.functions.count.
        file_record_count = record_counts[file_name]
        print(f"   {file_name}: {file_record_count:,} records")
        total_estimated_records += file_record_count

    expected_file_count = 12  # chunk_001.json .. chunk_012.json
    avg_records = total_estimated_records / len(successful_reads)
    estimated_total = avg_records * expected_file_count

    print(f"\n   Average records per file: {avg_records:,.0f}")
    print(f"   Estimated total records (12 files): {estimated_total:,.0f}")

    # Step 8: show a few raw rows from the reference file.
    print(f"\n8. Sample Data from {reference_file}:")
    reference_df.show(3, truncate=False)

    # Step 9: look at column names for hints about summary-style fields.
    print("\n9. Key Field Analysis:")
    sample_columns = reference_df.columns[:10]  # First 10 columns
    print(f"   First 10 columns: {sample_columns}")

    # Columns whose (upper-cased) name contains any of these keywords.
    summary_keywords = ['TOTAL', 'COUNT', 'SUM', 'AVG', 'SUMMARY', 'CNT']
    summary_cols = [
        column_name
        for column_name in reference_df.columns
        if any(keyword in column_name.upper() for keyword in summary_keywords)
    ]
    if summary_cols:
        print(f"   Summary-related columns: {summary_cols}")

    print("\n=== EXPLORATION COMPLETE ===")

    # Recommend a loading strategy: at least two readable samples are needed
    # before schema consistency can be claimed either way.
    if all_schemas_match and len(successful_reads) >= 2:
        print("✅ RECOMMENDATION: Use wildcard pattern approach - schemas are consistent")
        print("   Ready for bronze table creation")
    elif len(successful_reads) >= 2:
        print("⚠️  RECOMMENDATION: Use union approach with schema handling")
    else:
        print("❌ RECOMMENDATION: Investigate file access issues")

else:
    print("❌ No files could be read successfully")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time

spark = SparkSession.builder.getOrCreate()

print("=== DRUG PRODUCT PERMISSION SUMMARY INFO - BRONZE TABLE CREATION ===")

# Configuration: source chunk files (matched by wildcard) and target Delta table.
base_path = '/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-product-permission-summary-info/'
wildcard_path = f"{base_path}chunk_*.json"
bronze_table_name = "main.default.drug_product_permission_summary_bronze"

print(f"\nSource path: {wildcard_path}")
print(f"Target table: {bronze_table_name}")

# Start timing; the reported processing time covers the read, the quality
# checks and the table write.
start_time = time.time()

print("\n1. Reading all 12 JSON files using wildcard pattern...")
try:
    # Read all files at once using the wildcard pattern
    df = (
        spark.read
        .option("multiline", "true")
        .option("encoding", "UTF-8")
        .json(wildcard_path)
    )

    print("✅ Successfully read all files")

    # Quick validation
    total_records = df.count()
    total_columns = len(df.columns)

    print(f"   Total records: {total_records:,}")
    print(f"   Total columns: {total_columns}")

    print("\n2. Data Quality Checks:")

    # Count null/empty values of every key column in ONE aggregation instead
    # of one filter+count job (and one scan of the JSON source) per column.
    # count(when(cond, 1)) counts only the rows where cond holds, because
    # `when` without `otherwise` yields NULL for the remaining rows.
    key_columns = ["ITEM_SEQ", "ITEM_NAME", "ENTP_NAME", "ITEM_PERMIT_DATE", "PRDUCT_TYPE"]
    null_or_empty_exprs = [
        count(when(col(key_col).isNull() | (col(key_col) == ""), 1)).alias(key_col)
        for key_col in key_columns
    ]
    null_counts = df.agg(*null_or_empty_exprs).first()
    for col_name in key_columns:
        null_count = null_counts[col_name]
        print(f"   {col_name}: {null_count:,} null/empty values")

    # Check for fully duplicated rows
    distinct_records = df.distinct().count()
    duplicate_count = total_records - distinct_records
    print(f"   Duplicate records: {duplicate_count:,}")

    # Show schema overview; the header reports the actual column count
    # rather than a hard-coded number.
    print(f"\n3. Schema Overview ({total_columns} columns):")
    schema_fields = df.dtypes
    column_types = dict(schema_fields)  # built once, reused for every lookup
    print("   Core summary columns:")
    core_cols = ["ITEM_SEQ", "ITEM_NAME", "ENTP_NAME", "ITEM_PERMIT_DATE",
                 "PRDUCT_TYPE", "ITEM_INGR_CNT", "PERMIT_KIND_CODE", "SPCLTY_PBLC"]
    for col_name in core_cols:
        col_type = column_types.get(col_name, "Not found")
        print(f"     {col_name}: {col_type}")

    print("\n4. Creating Bronze Table...")

    # Write to Delta table, replacing any existing table and its schema
    df.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(bronze_table_name)

    # End timing
    end_time = time.time()
    processing_time = end_time - start_time
    # Guard the division so a zero elapsed time cannot raise after the table
    # has already been written.
    records_per_second = total_records / processing_time if processing_time > 0 else 0.0

    print("✅ Successfully created bronze table")
    print(f"   Processing time: {processing_time:.2f} seconds")
    print(f"   Records per second: {records_per_second:,.0f}")

    print("\n5. Bronze Table Verification:")

    # Verify the table was created and holds as many rows as were read
    bronze_df = spark.table(bronze_table_name)
    verified_count = bronze_df.count()

    print(f"   Records in bronze table: {verified_count:,}")
    print(f"   Data integrity: {'✅ Passed' if verified_count == total_records else '❌ Failed'}")

    print("\n6. Sample Data from Bronze Table:")
    bronze_df.select(
        "ITEM_NAME",
        "ENTP_NAME",
        "ITEM_PERMIT_DATE",
        "PRDUCT_TYPE",
        "ITEM_INGR_CNT",
        "SPCLTY_PBLC"
    ).show(5, truncate=False)

    print("\n7. Business Intelligence Summary:")

    # Product type distribution
    print("   Product Type Distribution:")
    bronze_df.groupBy("PRDUCT_TYPE") \
        .count() \
        .orderBy(col("count").desc()) \
        .show(10, truncate=False)

    # Specialty public classification
    print("   Specialty Public Classification:")
    bronze_df.groupBy("SPCLTY_PBLC") \
        .count() \
        .orderBy(col("count").desc()) \
        .show(10, truncate=False)

    # Top enterprises by product count
    print("   Top 10 Enterprises by Product Count:")
    bronze_df.groupBy("ENTP_NAME") \
        .agg(countDistinct("ITEM_SEQ").alias("unique_products"),
             count("*").alias("total_records")) \
        .orderBy(col("unique_products").desc()) \
        .show(10, truncate=False)

    # Ingredient count distribution, ordered numerically via the int cast
    print("   Product Ingredient Count Distribution:")
    bronze_df.filter(col("ITEM_INGR_CNT").isNotNull() & (col("ITEM_INGR_CNT") != "")) \
        .groupBy("ITEM_INGR_CNT") \
        .count() \
        .orderBy(col("ITEM_INGR_CNT").cast("int")) \
        .show(20, truncate=False)

    # Permit kind distribution
    print("   Permit Kind Code Distribution:")
    bronze_df.groupBy("PERMIT_KIND_CODE") \
        .count() \
        .orderBy(col("count").desc()) \
        .show(10, truncate=False)

    # Products with images; an empty source must not raise ZeroDivisionError
    # here, after the table has already been created.
    image_count = bronze_df.filter(col("BIG_PRDT_IMG_URL").isNotNull() & (col("BIG_PRDT_IMG_URL") != "")).count()
    image_pct = image_count / total_records * 100 if total_records else 0.0
    print(f"\n   Products with images: {image_count:,} ({image_pct:.1f}%)")

    print("\n=== INGESTION COMPLETED SUCCESSFULLY ===")
    print(f"🎉 Bronze table '{bronze_table_name}' created with {verified_count:,} records")
    print("📊 Ready for silver layer transformations and summary analysis")

except Exception as e:
    print(f"❌ ERROR during ingestion: {str(e)}")
    print(f"Error type: {type(e).__name__}")

    # Provide debugging info
    print("\nDebugging information:")
    print("- Verify all 12 files are accessible")
    print("- Check Spark cluster memory (21 columns × 35K records)")
    print("- Consider batch processing if memory constraints exist")

    # Bare raise re-raises the active exception with its original traceback.
    raise