In [0]:
%pip install langchain pypdf2 streamlit

# Restart the Python process after the install above (this cell performs no imports).
# NOTE(review): presumably required so the freshly installed packages become importable — confirm.
# NOTE(review): the packages are installed unpinned, and langchain/streamlit are not
# imported by any of the visible cells — verify they are needed and consider pinning versions.
dbutils.library.restartPython()

In [0]:
# NOTE(review): the install cell above already ends with restartPython(); this second
# restart looks redundant — confirm before removing.
dbutils.library.restartPython()

In [0]:
# Probe the bucket root to decide how later cells should reach S3:
#   "dbutils"     -> the listing succeeded, so direct access is already configured
#   "credentials" -> the listing failed for any reason, so credentials must be set up
try:
    files = dbutils.fs.ls("s3://testbucketsudhanshudatabricks/")
    print("✅ Databricks already has S3 access!")
    print("Available files/folders:")
    for entry in files:
        print(f"  - {entry.name}")
except Exception as e:
    # Any failure (listing or printing) is treated as "no direct access"
    print(f"❌ No direct S3 access: {e}")
    print("Need to configure credentials...")
    S3_ACCESS_METHOD = "credentials"
else:
    # Listing worked, so the dbutils-based approach (Method A) can be used
    S3_ACCESS_METHOD = "dbutils"

In [0]:
%python
def ask_banking_question_dbutils(
    s3_path,
    question,
    pdf_path='/Volumes/sudhanshu/default/pdf/2025-07-02_Statement.pdf',
    endpoint="databricks-meta-llama-3-1-405b-instruct",
    max_chars=15000,
    max_tokens=500,
):
    """Answer a question about a bank-statement PDF with a Databricks chat endpoint.

    The PDF at ``pdf_path`` is parsed with PyPDF2, the extracted text is cut to
    ``max_chars`` characters and embedded in a prompt together with ``question``,
    and the prompt is sent to ``endpoint`` via the MLflow deployments client.

    Args:
        s3_path: Kept for backward compatibility; it is currently NOT read. The
            statement is always loaded from ``pdf_path``.
            NOTE(review): confirm whether callers expect this argument to select
            the file.
        question: The question to ask about the statement.
        pdf_path: Path of the PDF to read. Defaults to the statement that was
            previously hard-coded.
        endpoint: Name of the serving endpoint to query.
        max_chars: Maximum number of characters of statement text placed in the
            prompt; anything beyond this is dropped.
        max_tokens: ``max_tokens`` value forwarded to the endpoint.

    Returns:
        The ``content`` of the first choice's message in the endpoint response.

    Raises:
        ValueError: If no text could be extracted from the PDF (for example an
            image-only scan), because the model would otherwise be asked to
            analyze an empty statement.
    """
    from mlflow.deployments import get_deploy_client
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    # Join pages with a newline so the last token on one page cannot fuse with the
    # first token on the next (e.g. "10.00" + "20.00" -> "10.0020.00").
    # `or ''` tolerates pages for which no text is returned.
    text = '\n'.join((page.extract_text() or '') for page in reader.pages)
    if not text.strip():
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    # Process with LLM
    client = get_deploy_client("databricks")
    prompt = f"""
    Analyze this bank statement: {text[:max_chars]}
    
    Question: {question}
    
    Provide specific numbers and analysis.
    """

    response = client.predict(
        endpoint=endpoint,
        inputs={
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
        },
    )

    return response['choices'][0]['message']['content']

# Smoke-test the helper, but only when the S3 probe selected the "dbutils" method
if S3_ACCESS_METHOD == "dbutils":
    answer = ask_banking_question_dbutils(
        "s3://testbucketsudhanshudatabricks/pdfs/2025-07-02_Statement.pdf",
        "What's my total spending?",
    )
    # Banner first, then the model's answer on its own line
    print("✅ SUCCESS with dbutils method!", answer, sep="\n")

In [0]:
# Goal: read a PDF from S3 as directly as spark.read.csv("s3://bucket/file.csv") reads a CSV,
# i.e. a single call: banking_query("s3://bucket/file.pdf", question)

def _parse_s3_uri(s3_path):
    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``; raise ValueError if malformed."""
    # partition() rather than urlparse(): object keys may legally contain '?' or '#',
    # which URL parsing would strip off as a query string / fragment.
    scheme, separator, remainder = s3_path.partition("://")
    bucket, _, key = remainder.partition("/")
    if (
        not separator
        or scheme.lower() not in ("s3", "s3a", "s3n")
        or not bucket
        or not key
    ):
        raise ValueError(
            f"Expected an S3 URI like 's3://bucket/path/to/file.pdf', got {s3_path!r}"
        )
    return bucket, key


def banking_query(s3_path, question):
    """Answer a question about a bank-statement PDF stored in S3.

    The object is fetched with boto3 and parsed from memory, so nothing is
    written to local disk (the bytes are still downloaded). The extracted text
    and the question are then handed to ``llm_analyze``.

    NOTE(review): ``llm_analyze`` is not defined in this block; it must exist in
    the notebook namespace before this function is called, otherwise the call
    raises NameError.

    Args:
        s3_path: URI of the PDF, e.g. ``s3://bucket/path/to/file.pdf``.
        question: The question to ask about the statement.

    Returns:
        Whatever ``llm_analyze(text, question)`` returns.

    Raises:
        ValueError: If ``s3_path`` is not a well-formed S3 URI with both a bucket
            and a key, or if no text could be extracted from the PDF.
    """
    # Validate the path first so a malformed URI fails before any S3 work happens
    bucket, key = _parse_s3_uri(s3_path)

    import boto3
    from PyPDF2 import PdfReader
    from io import BytesIO

    # Read the whole object into memory
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    pdf_bytes = obj['Body'].read()

    # Join pages with a newline so the last token on one page cannot fuse with the
    # first token on the next; `or ''` tolerates pages for which no text is returned.
    reader = PdfReader(BytesIO(pdf_bytes))
    text = '\n'.join((page.extract_text() or '') for page in reader.pages)
    if not text.strip():
        raise ValueError(f"No extractable text found in PDF: {s3_path}")

    # Send to LLM
    return llm_analyze(text, question)

# Single-call usage (just like Spark!): pass the S3 URI and the question
answer = banking_query(
    s3_path="s3://testbucketsudhanshudatabricks/pdfs/2025-07-02_Statement.pdf",
    question="How much did I spend on food?",
)