In [51]:
!pip install faiss-cpu sentence-transformers transformers accelerate pypdf2 beautifulsoup4 gradio evaluate requests



In [52]:
import os
DATA_DIR = "/content/data"
os.makedirs(DATA_DIR, exist_ok=True)


In [53]:
import requests

urls = {
    "SR11_7.pdf": "https://www.federalreserve.gov/supervisionreg/srletters/sr1107a1.pdf",
    "OCC_2011_12.pdf": "https://www.occ.gov/news-issuances/bulletins/2011/bulletin-2011-12a.pdf",
    "FDIC_2017_01.pdf": "https://www.fdic.gov/news/financial-institution-letters/2017/fil17001a.pdf",
    "FRB_CECL.pdf": "https://www.federalreserve.gov/supervisionreg/srletters/sr1523a1.pdf",
    "Basel_III.pdf": "https://www.bis.org/bcbs/publ/d424.pdf",
    "Basel_II.pdf": "https://www.bis.org/publ/bcbs107.pdf",
    "BCBS239.pdf": "https://www.bis.org/publ/bcbs239.pdf",
    "FRTB.pdf": "https://www.bis.org/bcbs/publ/d352.pdf",
    "SR95_51.pdf": "https://www.federalreserve.gov/supervisionreg/srletters/sr9551.pdf",
    "FDIC_CreditEval.pdf": "https://www.fdic.gov/resources/bankers/banker-colleges/files/credit-evaluation.pdf",
}

for file, url in urls.items():
    print(f"Downloading {file} ...")
    try:
        r = requests.get(url, timeout=20)
        save_path = f"{DATA_DIR}/{file}"

        # Validate PDF content (>5 KB)
        if r.status_code == 200 and len(r.content) > 5000:
            with open(save_path, "wb") as f:
                f.write(r.content)
            print(f"✓ Saved {file} ({len(r.content)} bytes)")
        else:
            print(f"✗ FAILED {file} (invalid or empty response)")
    except Exception as e:
        print(f"✗ ERROR downloading {file}: {e}")


Downloading SR11_7.pdf ...
✓ Saved SR11_7.pdf (300659 bytes)
Downloading OCC_2011_12.pdf ...
✓ Saved OCC_2011_12.pdf (159116 bytes)
Downloading FDIC_2017_01.pdf ...
✗ FAILED FDIC_2017_01.pdf (invalid or empty response)
Downloading FRB_CECL.pdf ...
✗ FAILED FRB_CECL.pdf (invalid or empty response)
Downloading Basel_III.pdf ...
✓ Saved Basel_III.pdf (3053985 bytes)
Downloading Basel_II.pdf ...
✓ Saved Basel_II.pdf (1137745 bytes)
Downloading BCBS239.pdf ...
✓ Saved BCBS239.pdf (129752 bytes)
Downloading FRTB.pdf ...
✓ Saved FRTB.pdf (1456526 bytes)
Downloading SR95_51.pdf ...
✓ Saved SR95_51.pdf (317664 bytes)
Downloading FDIC_CreditEval.pdf ...
✗ FAILED FDIC_CreditEval.pdf (invalid or empty response)


In [54]:
!ls -lh /content/data

total 6.3M
-rw-r--r-- 1 root root  969 Nov 18 18:37 answers.txt
-rw-r--r-- 1 root root 3.0M Nov 18 19:15 Basel_III.pdf
-rw-r--r-- 1 root root 1.1M Nov 18 19:15 Basel_II.pdf
-rw-r--r-- 1 root root 127K Nov 18 19:15 BCBS239.pdf
-rw-r--r-- 1 root root    0 Nov 18 18:55 ECB_MRM.pdf
-rw-r--r-- 1 root root    0 Nov 18 18:55 FDIC_2017_01.pdf
-rw-r--r-- 1 root root    0 Nov 18 18:55 FRB_CECL.pdf
-rw-r--r-- 1 root root 1.4M Nov 18 19:15 FRTB.pdf
-rw-r--r-- 1 root root 156K Nov 18 19:15 OCC_2011_12.pdf
-rw-r--r-- 1 root root    0 Nov 18 18:55 PRA_SS3_18.pdf
-rw-r--r-- 1 root root  434 Nov 18 18:37 questions.txt
-rw-r--r-- 1 root root 294K Nov 18 19:15 SR11_7.pdf
-rw-r--r-- 1 root root 311K Nov 18 19:15 SR95_51.pdf


In [55]:
import PyPDF2

def load_pdf_pages(fname):
    try:
        reader = PyPDF2.PdfReader(open(fname,'rb'))
        pages = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                pages.append(text)
        return pages
    except Exception as e:
        print(f"Error reading {fname}: {e}")
        return []


In [56]:
import re

def chunk_text(text, chunk_size=400):
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current = []
    length = 0

    for sent in sentences:
        words = len(sent.split())
        current.append(sent)
        length += words

        if length >= chunk_size:
            chunks.append(" ".join(current))
            current = []
            length = 0

    if current:
        chunks.append(" ".join(current))

    return chunks


In [57]:
pdf_files = [f"{DATA_DIR}/{f}" for f in os.listdir(DATA_DIR) if f.endswith(".pdf")]

all_chunks = []

for pdf in pdf_files:
    pages = load_pdf_pages(pdf)

    if len(pages) == 0:
        print("Skipping EMPTY PDF:", pdf)
        continue

    for p in pages:
        page_chunks = chunk_text(p)
        all_chunks.extend(page_chunks)

print("Total chunks:", len(all_chunks))


Error reading /content/data/FDIC_2017_01.pdf: Cannot read an empty file
Skipping EMPTY PDF: /content/data/FDIC_2017_01.pdf
Error reading /content/data/ECB_MRM.pdf: Cannot read an empty file
Skipping EMPTY PDF: /content/data/ECB_MRM.pdf
Error reading /content/data/PRA_SS3_18.pdf: Cannot read an empty file
Skipping EMPTY PDF: /content/data/PRA_SS3_18.pdf
Error reading /content/data/FRB_CECL.pdf: Cannot read an empty file
Skipping EMPTY PDF: /content/data/FRB_CECL.pdf
Total chunks: 1016


In [58]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embs = embedder.encode(all_chunks, show_progress_bar=True)

index = faiss.IndexFlatL2(embs.shape[1])
index.add(np.array(embs))

print("FAISS index ready!")


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

FAISS index ready!


In [59]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "microsoft/phi-3-mini-4k-instruct"

tok = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16
).to("cuda")

print("LLM loaded successfully.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 78.12 MiB is free. Process 11139 has 14.66 GiB memory in use. Of the allocated memory 14.11 GiB is allocated by PyTorch, and 424.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def retrieve(query, k=5):
    q_emb = embedder.encode([query])
    _, I = index.search(np.array(q_emb), k)
    return [all_chunks[i] for i in I[0]]


In [None]:
def rag_answer(query):
    ctx = retrieve(query)
    ctx_text = "\n\n".join(ctx)

    prompt = f"""
You are a compliance expert. Use ONLY the context below:

CONTEXT:
{ctx_text}

QUESTION: {query}

ANSWER:
"""

    tokens = tok(prompt, return_tensors="pt").to("cuda")
    out = llm.generate(**tokens, max_new_tokens=300)
    return tok.decode(out[0], skip_special_tokens=True)


In [None]:
rag_answer("What are the model validation expectations under SR 11-7?")


In [None]:
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

questions = [
    "What are the three pillars of model risk management?",
    "Explain validation independence in SR 11-7.",
    "What documentation does OCC 2011-12 require?",
    "What does Basel II specify for credit risk?",
    "What is BCBS 239 about?",
]

answers = [
    "The three pillars include development, independent validation, and ongoing monitoring.",
    "Validation independence means validators must be separate from model developers.",
    "The OCC requires complete documentation including assumptions, development, monitoring, and testing.",
    "Basel II provides a global framework for credit, market, and operational risk.",
    "BCBS 239 defines principles for effective risk data aggregation and reporting.",
]

preds = [rag_answer(q) for q in questions]

rouge_results = rouge.compute(predictions=preds, references=answers)
bleu_results = bleu.compute(predictions=preds, references=[[a] for a in answers])

rouge_results, bleu_results


In [None]:
import gradio as gr
gr.Interface(fn=rag_answer, inputs="text", outputs="text",
             title="Compliance RAG Chatbot").launch()
