In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Path to the source JSON file that this notebook ingests.
file_path = "/Volumes/main/default/medical_data_volume/medical_data_complete_unzipped/medical_data_local/drug-easy-info/all_drug_data.json"

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Parse the file as multi-line JSON and decode it as UTF-8.
df = (
    spark.read
    .options(multiline="true", encoding="UTF-8")
    .json(file_path)
)

# Display the schema Spark inferred for the loaded data.
print("Schema:")
df.printSchema()

In [0]:
# Count the loaded rows once; the value is reported again after the bronze write.
total_records = df.count()
print(f"Total records: {total_records}")

# Preview the identifying / date fields of the first few rows.
print("\nFirst 5 records with key fields:")
key_fields = ["entpName", "itemName", "itemSeq", "openDe", "updateDe"]
df.select(*key_fields).show(5, truncate=False)

# Build one aggregate per column: count() only counts rows where when() produced
# a non-null value, i.e. rows where the column itself is null.
print("\nNull value counts:")
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(*null_count_exprs).show()

# Show a few untruncated examples of the efficacy text next to the item name.
print("\nSample drug information:")
content_fields = ["itemName", "efcyQesitm"]
df.select(*content_fields).show(3, truncate=False)

In [0]:
# Fully qualified name of the bronze table to (re)create.
bronze_table_name = "main.default.drug_easy_info_bronze"

print(f"Creating bronze table: {bronze_table_name}")

# Save as a Delta table; "overwrite" mode plus overwriteSchema replaces both the
# existing rows and the existing schema of the target table.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(bronze_table_name)
)

# total_records is the count taken from the source DataFrame in an earlier cell.
print(f"✅ Successfully created bronze table with {total_records} records")

# Read the table back and report how many rows it holds.
print("\nVerifying bronze table:")
bronze_df = spark.table(bronze_table_name)
print(f"Records in bronze table: {bronze_df.count()}")

# Schema as stored in the table.
print("\nBronze table schema:")
bronze_df.printSchema()

# A handful of rows from the stored table, identifying columns only.
print("\nSample data from bronze table:")
bronze_df.select(["entpName", "itemName", "itemSeq"]).show(5)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Read from the bronze table.
# NOTE: this must be the same table the write cell creates
# ("main.default.drug_easy_info_bronze"). The previous value,
# "main.default.medical_data_bronze", is never written by this notebook, so on a
# fresh workspace the overview would fail or describe an unrelated table.
bronze_table_name = "main.default.drug_easy_info_bronze"
bronze_df = spark.table(bronze_table_name)

print("=== BRONZE TABLE OVERVIEW ===")
print(f"Table: {bronze_table_name}")
print(f"Total records: {bronze_df.count()}")

print("\n=== TABLE SCHEMA ===")
bronze_df.printSchema()

print("\n=== SAMPLE DATA ===")
print("First 5 records:")
bronze_df.show(5, truncate=False)

print("\n=== KEY COLUMNS SAMPLE ===")
bronze_df.select("entpName", "itemName", "itemSeq", "openDe", "updateDe").show(10, truncate=False)

print("\n=== DATA QUALITY CHECK ===")
print("Null value counts per column:")
# count() skips nulls, and when() without otherwise() yields null for non-matching
# rows, so each aggregate is the number of rows where that column is null.
null_counts = bronze_df.select([count(when(col(c).isNull(), c)).alias(c) for c in bronze_df.columns])
null_counts.show()

print("\n=== RECENT DATA ===")
print("Most recently updated drugs:")
bronze_df.orderBy(col("updateDe").desc()).select("itemName", "entpName", "updateDe").show(5, truncate=False)

print("\n=== COMPANY ANALYSIS ===")
print("Top companies by product count:")
bronze_df.groupBy("entpName").count().orderBy(col("count").desc()).show(10)

print("\n=== SAMPLE DRUG DETAILS ===")
print("Drug efficacy information (first 3):")
bronze_df.select("itemName", "efcyQesitm").show(3, truncate=False)

# If metadata columns were added, show them. Only the columns that actually exist
# are selected: checking for "ingestion_timestamp" alone and then selecting all
# three would raise if "source_file" or "data_source" were missing.
metadata_columns = [
    c for c in ("ingestion_timestamp", "source_file", "data_source")
    if c in bronze_df.columns
]
if metadata_columns:
    print("\n=== METADATA INFO ===")
    bronze_df.select(*metadata_columns).distinct().show(truncate=False)