In [0]:
%python
%pip install pdfplumber
import pdfplumber
from io import BytesIO
from pyspark.sql import Row
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql.functions import current_timestamp



In [0]:
%sql
-- Set the session's current catalog and schema so that unqualified table
-- names used in later cells (e.g. bronze_file_registry, bronze_raw) resolve
-- against ppe.bronze.
USE CATALOG ppe;
USE SCHEMA bronze;


In [0]:
# Layout of the (file_name, raw_text) tuples collected for the bronze text
# table; file_name is required, raw_text may be null.
bronze_schema = (
    StructType()
    .add("file_name", StringType(), nullable=False)
    .add("raw_text", StringType(), nullable=True)
)

# Layout of the (file_name, file_path, status) tuples collected for the file
# registry; all three columns are required.
registry_schema = (
    StructType()
    .add("file_name", StringType(), nullable=False)
    .add("file_path", StringType(), nullable=False)
    .add("status", StringType(), nullable=False)
)


In [0]:
# Configuration
# Location that is scanned for input PDF files by the listing cell below.
SOURCE_DIR = "/Volumes/ppe/ppe/ppe_input"


In [0]:
# Discover candidate inputs: files under SOURCE_DIR whose name matches *.pdf.
# Only the "path" column is selected, so the collected rows carry paths alone.
pdf_files_df = (
    spark.read
    .format("binaryFile")
    .option("pathGlobFilter", "*.pdf")
    .load(SOURCE_DIR)
    .select("path")
)

# Materialise the discovered paths on the driver as a plain Python list.
pdf_paths = [listing.path for listing in pdf_files_df.collect()]


In [0]:
# Fetch already-processed files.
# The registry is addressed by its fully qualified name (the same name the
# registry write cell uses), so this lookup does not depend on the session's
# current catalog/schema. If the registry table has not been created yet
# (e.g. on a first run, before anything has been appended to it), fall back to
# an empty set so that every discovered PDF is treated as new instead of the
# query failing.
registry_table = "ppe.bronze.bronze_file_registry"

if spark.catalog.tableExists(registry_table):
    # Only files with a SUCCESS entry count as processed; files whose only
    # entries are FAILED are picked up again.
    processed_files = {
        row["file_path"]
        for row in spark.sql(
            f"SELECT file_path FROM {registry_table} WHERE status = 'SUCCESS'"
        ).collect()
    }
else:
    processed_files = set()


In [0]:
# Keep only the discovered PDFs that have no successful registry entry,
# preserving the order in which they were listed.
new_files = [candidate for candidate in pdf_paths if candidate not in processed_files]

print(f"New files detected: {len(new_files)}")

# Nothing to ingest: report it and stop the notebook with an explicit status.
if not new_files:
    print("No new source files found. Exiting notebook.")
    dbutils.notebook.exit("NO_NEW_SOURCE_FILES")


In [0]:
# Accumulators for the two output tables: extracted text per file, and one
# registry entry per attempted file (successful or not).
bronze_rows, registry_rows = [], []

for path in new_files:
    # The last path segment is used as the file name.
    file_name = path.rsplit("/", 1)[-1]

    try:
        # Load this single file through Spark and pull its bytes to the driver.
        pdf_df = spark.read.format("binaryFile").load(path)
        pdf_bytes = pdf_df.select("content").collect()[0]["content"]

        # Join the text of every page with newlines; a page for which
        # extract_text() returns nothing contributes an empty string.
        with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
            raw_text = "\n".join(
                page.extract_text() or "" for page in pdf.pages
            )

        bronze_rows.append((file_name, raw_text))
        registry_rows.append((file_name, path, "SUCCESS"))

    except Exception as e:
        # Any failure for this file is recorded as FAILED and printed; the
        # loop then moves on to the next file.
        registry_rows.append((file_name, path, "FAILED"))
        print(f"Error processing {file_name}: {e}")


In [0]:
# Append the extracted text to the bronze table, adding an ingestion_ts column
# populated by current_timestamp(). Skipped when bronze_rows is empty.
if bronze_rows:
    bronze_df = spark.createDataFrame(bronze_rows, bronze_schema).withColumn(
        "ingestion_ts", current_timestamp()
    )
    bronze_df.write.mode("append").saveAsTable("ppe.bronze.bronze_raw_transactions")



In [0]:
# Append one registry row per attempted file (SUCCESS or FAILED), adding an
# ingestion_ts column populated by current_timestamp(). Skipped when
# registry_rows is empty.
if registry_rows:
    registry_df = spark.createDataFrame(registry_rows, registry_schema).withColumn(
        "ingestion_ts", current_timestamp()
    )
    registry_df.write.mode("append").saveAsTable("ppe.bronze.bronze_file_registry")


In [0]:
import pdfplumber
from io import BytesIO

# Path where PhonePe PDF is placed
pdf_path = "/Volumes/ppe/ppe/ppe_input"

# Read one PDF as binary. pdf_path is the same location as SOURCE_DIR, so the
# read is restricted to *.pdf files and limited to a single row: collecting
# without the limit would pull the bytes of every matched file to the driver
# only to use the first one.
pdf_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.pdf")
    .load(pdf_path)
)
pdf_content_rows = pdf_df.select("content").limit(1).collect()

# Fail with a clear message instead of an IndexError when nothing matched.
if not pdf_content_rows:
    raise FileNotFoundError(f"No PDF files found under {pdf_path}")
pdf_bytes = pdf_content_rows[0]["content"]

# THIS IS raw_text (do not change this logic)
with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
    raw_text = "\n".join(
        [page.extract_text() or "" for page in pdf.pages]
    )


In [0]:
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# NOTE: bronze_schema and bronze_rows are rebound here with a different shape
# than in the incremental-ingestion cells earlier in this notebook: there is
# no file_name column, and the timestamp comes from Python's datetime.now()
# rather than current_timestamp().
bronze_schema = (
    StructType()
    .add("raw_text", StringType(), True)
    .add("ingestion_ts", TimestampType(), True)
)

# A single row: the extracted text plus the time this cell ran.
bronze_rows = [(raw_text, datetime.now())]

# Append that row to the bronze_raw table.
df_bronze = spark.createDataFrame(bronze_rows, schema=bronze_schema)
df_bronze.write.mode("append").saveAsTable("ppe.bronze.bronze_raw")


In [0]:
%sql
-- Sanity check: character length of the stored text and the ingestion
-- timestamp for every row in bronze_raw, newest first. The table name is
-- unqualified, so it resolves against the session's current catalog/schema.
SELECT
  length(raw_text) AS text_length,
  ingestion_ts
FROM bronze_raw
ORDER BY ingestion_ts DESC;
