# In [0]:
# Import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns
# Configuration - root locations for raw inputs and derived outputs
BASE_PATH = "/Volumes/workspace/default/file_store"
RAW_DATA_PATH = BASE_PATH  # raw CSV extracts are read directly from the base folder
PROCESSED_DATA_PATH = BASE_PATH + "/processed_data"
FEATURE_DATA_PATH = BASE_PATH + "/feature_data"
MODEL_PATH = BASE_PATH + "/models"

# Source CSV files ingested by this notebook
BENEFICIARY_FILE = RAW_DATA_PATH + "/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv"
INPATIENT_CLAIMS_FILE = RAW_DATA_PATH + "/DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv"

# Delta Lake roots: one sub-folder per layer under a shared base
DELTA_BASE_PATH = BASE_PATH + "/delta"
DELTA_BRONZE_PATH, DELTA_SILVER_PATH, DELTA_GOLD_PATH = (
    f"{DELTA_BASE_PATH}/{layer}" for layer in ("bronze", "silver", "gold")
)

print("✓ Configuration loaded")

# MAGIC %md
# MAGIC ## 2. Load Beneficiary Summary File

# COMMAND ----------

# Read the beneficiary CSV: first line is the header, column types are inferred by Spark
beneficiary_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(BENEFICIARY_FILE)
)

# Report the size of what was loaded (count() runs a Spark job over the file)
beneficiary_row_count = beneficiary_df.count()
beneficiary_column_count = len(beneficiary_df.columns)
print(f"Beneficiary records loaded: {beneficiary_row_count:,}")
print(f"Number of columns: {beneficiary_column_count}")

# COMMAND ----------

# Print the inferred column names and types
beneficiary_df.printSchema()

# COMMAND ----------

# Preview the first 10 rows in the notebook output
beneficiary_preview = beneficiary_df.limit(10)
display(beneficiary_preview)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Load Inpatient Claims File

# COMMAND ----------

# Read the inpatient claims CSV: first line is the header, column types are inferred by Spark
inpatient_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(INPATIENT_CLAIMS_FILE)
)

# Report the size of what was loaded (count() runs a Spark job over the file)
inpatient_row_count = inpatient_df.count()
inpatient_column_count = len(inpatient_df.columns)
print(f"Inpatient claims loaded: {inpatient_row_count:,}")
print(f"Number of columns: {inpatient_column_count}")

# COMMAND ----------

# Print the inferred column names and types
inpatient_df.printSchema()

# COMMAND ----------

# Preview the first 10 rows in the notebook output
inpatient_preview = inpatient_df.limit(10)
display(inpatient_preview)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Initial Data Quality Assessment

# COMMAND ----------

def _null_fraction(column_name):
    """Return an aggregate Column: null rows of ``column_name`` divided by total rows.

    The result is aliased back to ``column_name`` so the output keeps the
    source column names.
    """
    null_rows = count(when(col(column_name).isNull(), column_name))
    total_rows = count(lit(1))
    return (null_rows / total_rows).alias(column_name)


# Beneficiary data: share of nulls for every column (values are ratios, not counts)
print("=== Beneficiary Data - Missing Values ===")
beneficiary_missing = beneficiary_df.select(
    [_null_fraction(name) for name in beneficiary_df.columns]
)
display(beneficiary_missing)

# COMMAND ----------

# Inpatient claims: same ratio, restricted to the first 20 columns
print("=== Inpatient Claims - Missing Values ===")
leading_inpatient_columns = inpatient_df.columns[:20]
inpatient_missing = inpatient_df.select(
    [_null_fraction(name) for name in leading_inpatient_columns]
)
display(inpatient_missing)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Save to Delta Lake - Bronze Layer

# COMMAND ----------

# Announce the Bronze target location (this cell only prints; nothing is created here)
print(f"Saving to Bronze layer: {DELTA_BRONZE_PATH}")

# COMMAND ----------

# Write the beneficiary DataFrame as a Delta table, replacing any existing data at the path
beneficiary_bronze_path = f"{DELTA_BRONZE_PATH}/beneficiary"
beneficiary_df.write.save(beneficiary_bronze_path, format="delta", mode="overwrite")

print("✓ Beneficiary data saved to Delta Bronze layer")

# COMMAND ----------

# Write the inpatient claims DataFrame as a Delta table, replacing any existing data at the path
inpatient_bronze_path = f"{DELTA_BRONZE_PATH}/inpatient_claims"
inpatient_df.write.save(inpatient_bronze_path, format="delta", mode="overwrite")

print("✓ Inpatient claims saved to Delta Bronze layer")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Verify Delta Tables

# COMMAND ----------

# Re-open both Bronze tables from storage and report their row counts.
# NOTE(review): the counts are only printed, not compared with the source DataFrames.
beneficiary_delta = spark.read.load(f"{DELTA_BRONZE_PATH}/beneficiary", format="delta")
inpatient_delta = spark.read.load(f"{DELTA_BRONZE_PATH}/inpatient_claims", format="delta")

beneficiary_delta_rows = beneficiary_delta.count()
inpatient_delta_rows = inpatient_delta.count()
print(f"✓ Beneficiary Delta table: {beneficiary_delta_rows:,} records")
print(f"✓ Inpatient Delta table: {inpatient_delta_rows:,} records")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary
# MAGIC 
# MAGIC Data successfully ingested and stored in Delta Lake Bronze layer:
# MAGIC - Beneficiary Summary: Patient demographics and conditions
# MAGIC - Inpatient Claims: Hospital admissions and diagnoses
# MAGIC 
# MAGIC Next step: Data Processing & EDA (02_data_processing)