STEP 1: FILE STRUCTURE EXPLORATION

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

# Step 1 explores the source directory: it inventories the chunk files,
# summarises their sizes, and previews the structure of three sample files.
#
# `from pyspark.sql.functions import *` rebinds sum/min/max to Spark column
# functions, so the plain-Python aggregations below go through `builtins`.
import builtins

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Base path for the JSON files
base_path = '/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-product-material-info/'


def _chunk_sort_key(file_info):
    """Order chunk files by numeric index; names with a non-numeric index sort last.

    A name such as ``chunk_abc.json`` must not raise here: it is reported
    later as an "extra" file instead of aborting the whole exploration.
    """
    index_text = file_info.name.split('_')[1].split('.')[0]
    if index_text.isdecimal():
        return (0, int(index_text), file_info.name)
    return (1, 0, file_info.name)


# Check if directory exists and list files
print(f"\n1. Checking directory: {base_path}")

try:
    # Use dbutils to list files in the directory
    files = dbutils.fs.ls(base_path)

    # Filter for JSON files and sort them
    json_files = [f for f in files if f.name.endswith('.json') and f.name.startswith('chunk_')]
    json_files.sort(key=_chunk_sort_key)

    print(f"Found {len(json_files)} JSON files")

    # Check if we have all expected files (chunk_001.json to chunk_131.json)
    expected_files = [f"chunk_{i:03d}.json" for i in range(1, 132)]
    actual_files = [f.name for f in json_files]

    missing_files = set(expected_files) - set(actual_files)
    extra_files = set(actual_files) - set(expected_files)

    print(f"\n2. File Inventory:")
    print(f"   Expected files: 131 (chunk_001.json to chunk_131.json)")
    print(f"   Found files: {len(actual_files)}")
    print(f"   Missing files: {len(missing_files)}")
    print(f"   Extra files: {len(extra_files)}")

    if missing_files:
        print(f"   Missing: {sorted(missing_files)[:10]}...")  # Show first 10
    if extra_files:
        print(f"   Extra: {sorted(extra_files)[:10]}...")  # Show first 10

    # Check file sizes (best effort: a failure here must not stop the exploration)
    print(f"\n3. File Size Analysis:")
    try:
        file_sizes = [(f.name, f.size) for f in json_files]

        if file_sizes:
            bytes_per_mb = 1024 * 1024
            total_size = builtins.sum(size for _, size in file_sizes)
            avg_size = total_size / len(file_sizes)
            smallest = builtins.min(file_sizes, key=lambda entry: entry[1])
            largest = builtins.max(file_sizes, key=lambda entry: entry[1])

            print(f"   Total size: {total_size / bytes_per_mb:.2f} MB")
            print(f"   Average size: {avg_size / bytes_per_mb:.2f} MB")
            print(f"   Smallest file: {smallest[0]} ({smallest[1] / bytes_per_mb:.2f} MB)")
            print(f"   Largest file: {largest[0]} ({largest[1] / bytes_per_mb:.2f} MB)")
        else:
            print(f"   No chunk files found - nothing to measure")

    except Exception as size_error:
        print(f"   Could not access file size information: {str(size_error)}")
        print(f"   Proceeding with schema analysis...")

    # Sample a few files to check their structure
    print(f"\n4. Sample File Structure Analysis:")

    # Use direct file paths for testing
    sample_file_names = ['chunk_001.json', 'chunk_066.json', 'chunk_131.json']

    for i, file_name in enumerate(sample_file_names, 1):
        file_path = f"{base_path}{file_name}"
        print(f"\n   Sample {i}: {file_name}")
        print(f"   Full path: {file_path}")

        try:
            # Try to read the file
            sample_df = spark.read.option("multiline", "true").option("encoding", "UTF-8").json(file_path)
            column_total = len(sample_df.columns)
            print(f"   ✅ File readable")
            print(f"   Records: {sample_df.count()}")
            print(f"   Columns: {column_total}")
            print(f"   Schema preview:")

            # Show first few columns of schema
            for position, (col_name, col_type) in enumerate(sample_df.dtypes[:10], 1):
                print(f"     {position:2d}. {col_name}: {col_type}")

            if column_total > 10:
                print(f"     ... and {column_total - 10} more columns")

            # Show a sample record
            print(f"   Sample record:")
            sample_df.show(1, truncate=False, vertical=True)

        except Exception as e:
            # A single unreadable sample is reported and the loop continues.
            print(f"   ❌ ERROR reading file: {str(e)}")
            print(f"   Error type: {type(e).__name__}")

    print(f"\n=== EXPLORATION COMPLETE ===")
    print(f"Ready for Step 2: Schema Validation")

except Exception as e:
    print(f"ERROR: {str(e)}")
    print("Make sure the path is correct and accessible.")

STEP 2: SCHEMA VALIDATION ACROSS MULTIPLE FILES

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Step 2, part 1: read a spread of sample files and record, per file, the
# inferred schema and record count (or the error text if the read fails).
spark = SparkSession.builder.getOrCreate()
base_path = '/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-product-material-info/'

# Test files to validate schema consistency
test_files = [
    'chunk_001.json',  # First file
    'chunk_010.json',  # Early file
    'chunk_050.json',  # Middle file
    'chunk_100.json',  # Late file
    'chunk_131.json'   # Last file
]

schemas = {}            # file name -> inferred StructType
record_counts = {}      # file name -> number of records
successful_reads = []   # file names that were read without error
failed_reads = []       # (file name, error message) pairs

print(f"\n1. Testing {len(test_files)} sample files for schema consistency:")

for file_name in test_files:
    file_path = f"{base_path}{file_name}"
    print(f"\n   Testing: {file_name}")

    try:
        # Read the file
        df = spark.read.option("multiline", "true").option("encoding", "UTF-8").json(file_path)

        # Get schema and record count.
        # The count is deliberately not stored in a variable called `count`:
        # that would overwrite the `count` function pulled in by
        # `from pyspark.sql.functions import *` for the rest of the notebook.
        schema = df.schema
        num_records = df.count()
    except Exception as e:
        failed_reads.append((file_name, str(e)))
        print(f"   ❌ Failed - Error: {str(e)}")
    else:
        # Store results only once both the read and the count have succeeded
        schemas[file_name] = schema
        record_counts[file_name] = num_records
        successful_reads.append(file_name)

        print(f"   ✅ Success - Records: {num_records}, Columns: {len(schema.fields)}")

print(f"\n2. Schema Validation Results:")
print(f"   Successful reads: {len(successful_reads)}")
print(f"   Failed reads: {len(failed_reads)}")

if failed_reads:
    print(f"   Failed files:")
    for file_name, error in failed_reads:
        print(f"     {file_name}: {error}")

# Step 2, part 2: compare every successfully read schema against the first one,
# report differences, and extrapolate a total record count from the samples.
if successful_reads:
    # Compare schemas
    print(f"\n3. Schema Consistency Analysis:")

    # Use first successful schema as reference
    reference_file = successful_reads[0]
    reference_schema = schemas[reference_file]
    reference_fields = {field.name: field.dataType for field in reference_schema.fields}
    ref_columns = set(reference_fields)

    print(f"   Reference schema from: {reference_file}")
    print(f"   Reference columns: {len(reference_fields)}")

    # Check if all schemas match
    all_schemas_match = True
    schema_differences = {}

    for file_name in successful_reads[1:]:
        current_fields = {field.name: field.dataType for field in schemas[file_name].fields}

        # Compare column names
        curr_columns = set(current_fields)
        missing_in_current = ref_columns - curr_columns
        extra_in_current = curr_columns - ref_columns

        # Compare data types for common columns.
        # The loop variable is not called `col`, which would overwrite the
        # star-imported pyspark `col` function; sorting keeps output stable.
        type_differences = [
            (column_name, reference_fields[column_name], current_fields[column_name])
            for column_name in sorted(ref_columns & curr_columns)
            if reference_fields[column_name] != current_fields[column_name]
        ]

        schema_differs = bool(missing_in_current or extra_in_current or type_differences)
        if schema_differs:
            all_schemas_match = False
            schema_differences[file_name] = {
                'missing_columns': missing_in_current,
                'extra_columns': extra_in_current,
                'type_differences': type_differences
            }

        print(f"   {file_name}: Columns={len(current_fields)} ", end="")
        print("❌ Schema differs" if schema_differs else "✅ Schema matches")

    print(f"\n4. Detailed Schema Information:")
    print(f"   All schemas identical: {'✅ Yes' if all_schemas_match else '❌ No'}")

    if schema_differences:
        print(f"   Schema differences found in {len(schema_differences)} files:")
        for file_name, diffs in schema_differences.items():
            print(f"\n     {file_name}:")
            if diffs['missing_columns']:
                print(f"       Missing columns: {sorted(diffs['missing_columns'])}")
            if diffs['extra_columns']:
                print(f"       Extra columns: {sorted(diffs['extra_columns'])}")
            if diffs['type_differences']:
                print(f"       Type differences:")
                for column_name, ref_type, curr_type in diffs['type_differences']:
                    print(f"         {column_name}: {ref_type} -> {curr_type}")

    # Show complete schema from reference file
    print(f"\n5. Complete Schema from {reference_file}:")
    reference_df = spark.read.option("multiline", "true").option("encoding", "UTF-8").json(f"{base_path}{reference_file}")
    reference_df.printSchema()

    print(f"\n6. Record Count Analysis:")
    # Accumulated by hand: the builtin `sum` is rebound by the star import of
    # pyspark.sql.functions, and `count` must stay bound to the Spark function.
    sampled_records = 0
    for file_name in successful_reads:
        file_records = record_counts[file_name]
        print(f"   {file_name}: {file_records:,} records")
        sampled_records += file_records

    total_file_count = 131  # chunk_001.json .. chunk_131.json
    avg_records = sampled_records / len(successful_reads)
    estimated_total = avg_records * total_file_count

    print(f"\n   Average records per file: {avg_records:,.0f}")
    print(f"   Estimated total records ({total_file_count} files): {estimated_total:,.0f}")

    print(f"\n7. Sample Data from {reference_file}:")
    reference_df.show(3, truncate=False)

print(f"\n=== SCHEMA VALIDATION COMPLETE ===")

# Recommend an ingestion strategy based on the sample reads.
# `all_schemas_match` is only assigned when at least one file was read, so the
# read-count check comes first; otherwise a run in which every read failed
# would end in a NameError instead of the "investigate" recommendation.
if len(successful_reads) < 3:
    print(f"❌ RECOMMENDATION: Investigate file access issues before proceeding")
    print(f"   Next step: Debug file access problems")
elif all_schemas_match:
    print(f"✅ RECOMMENDATION: Use wildcard pattern approach - schemas are consistent")
    print(f"   Next step: Implement wildcard ingestion")
else:
    print(f"⚠️  RECOMMENDATION: Use union approach with schema handling")
    print(f"   Next step: Implement union-based ingestion with schema normalization")

STEP 3: WILDCARD PATTERN INGESTION

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time

# Step 3 ingests every chunk file in one wildcard read, runs basic quality
# checks, writes the result to a Delta bronze table, and verifies the write.
# Any failure is reported with debugging hints and then re-raised.
spark = SparkSession.builder.getOrCreate()

print("Drug Product Material Info - Bronze Table Creation")

# Configuration
base_path = '/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-product-material-info/'
wildcard_path = f"{base_path}chunk_*.json"
bronze_table_name = "main.default.drug_product_material_bronze"

print(f"\nSource path: {wildcard_path}")
print(f"Target table: {bronze_table_name}")

# Start timing
start_time = time.time()

print(f"\n1. Reading all JSON files using wildcard pattern...")
try:
    # Read all files at once using wildcard pattern
    df = spark.read \
        .option("multiline", "true") \
        .option("encoding", "UTF-8") \
        .json(wildcard_path)

    print(f"✅ Successfully read all files")

    # Quick validation
    total_records = df.count()
    total_columns = len(df.columns)

    print(f"   Total records: {total_records:,}")
    print(f"   Total columns: {total_columns}")

    print(f"\n2. Data Quality Checks:")

    # Check for null/empty values in key columns with ONE aggregation instead
    # of one full scan of the source files per column.
    # The empty-string test runs on the string form of the column so it stays
    # valid if a key column was inferred as a non-string type (presumably a
    # numeric column compared with "" fails under ANSI SQL mode - confirm).
    key_columns = ["ITEM_SEQ", "PRDUCT", "MTRAL_NM", "ENTRPS"]
    null_or_empty = df.agg(*[
        count(
            when(col(key).isNull() | (col(key).cast("string") == ""), True)
        ).alias(key)
        for key in key_columns
    ]).first()
    for col_name in key_columns:
        print(f"   {col_name}: {null_or_empty[col_name]:,} null/empty values")

    # Check for duplicates
    distinct_records = df.distinct().count()
    duplicate_count = total_records - distinct_records
    print(f"   Duplicate records: {duplicate_count:,}")

    # Show schema
    print(f"\n3. Final Schema:")
    df.printSchema()

    print(f"\n4. Creating Bronze Table...")

    # Write to Delta table (replaces any existing table and its schema)
    df.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(bronze_table_name)

    # End timing
    end_time = time.time()
    processing_time = end_time - start_time

    print(f"✅ Successfully created bronze table")
    print(f"   Processing time: {processing_time:.2f} seconds")
    print(f"   Records per second: {total_records/processing_time:,.0f}")

    print(f"\n5. Bronze Table Verification:")

    # Verify the table was created
    bronze_df = spark.table(bronze_table_name)
    verified_count = bronze_df.count()

    print(f"   Records in bronze table: {verified_count:,}")
    print(f"   Data integrity: {'✅ Passed' if verified_count == total_records else '❌ Failed'}")

    print(f"\n6. Sample Data from Bronze Table:")
    bronze_df.select(
        "ENTRPS",
        "PRDUCT",
        "MTRAL_NM",
        "MAIN_INGR_ENG",
        "QNT",
        "INGD_UNIT_CD"
    ).show(5, truncate=False)

    print(f"\n7. Data Distribution Analysis:")

    # Top enterprises by product count
    print(f"   Top 10 Enterprises by Product Count:")
    bronze_df.groupBy("ENTRPS") \
        .agg(countDistinct("ITEM_SEQ").alias("unique_products"),
             count("*").alias("total_materials")) \
        .orderBy(col("total_materials").desc()) \
        .show(10, truncate=False)

    # Material distribution
    print(f"   Top 10 Materials by Usage:")
    bronze_df.groupBy("MTRAL_NM") \
        .count() \
        .orderBy(col("count").desc()) \
        .show(10, truncate=False)

    print(f"\n=== INGESTION COMPLETED SUCCESSFULLY ===")
    print(f"🎉 Bronze table '{bronze_table_name}' created with {verified_count:,} records")

except Exception as e:
    print(f"❌ ERROR during ingestion: {str(e)}")
    print(f"Error type: {type(e).__name__}")

    # Try to provide helpful debugging info
    print(f"\nDebugging information:")
    print(f"- Check if all files are accessible")
    print(f"- Verify Spark cluster has sufficient memory")
    print(f"- Consider using batch processing approach if memory issues persist")

    # Bare raise re-raises the active exception with its original traceback
    raise