In [None]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Keep the key in a module-level name; only report whether it exists,
# never its value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
print("Key loaded:", OPENAI_API_KEY is not None)

In [None]:
import pdfplumber
from pathlib import Path
import json

# Source PDF and the JSON file that receives the raw per-page text.
pdf_path = Path("../data/raw/TD-2024-Annual-Report.pdf")
output_path = Path("../data/processed/td_2024_pages.json")

# One record per PDF page: {"pagenumber": 1-based page index, "text": extracted text}.
pagenumber_text = []

with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # Pages without extractable text (e.g. image-only pages) may come back
        # as None or "" depending on the pdfplumber version; normalise to ""
        # so later string processing never receives None.
        text = page.extract_text() or ""
        pagenumber_text.append({"pagenumber": page_number, "text": text})

# Create the output directory on a fresh checkout instead of failing in open().
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(pagenumber_text, f, indent=2, ensure_ascii=False)

print('Saved to:', output_path)

len(pagenumber_text)
# for page in pagenumber_text[:2]:
#     print(f"\n---page{page['pagenumber']}---\n")
#     print(page['text'][:1000])





In [None]:
import re

def clean_text(text: str) -> str:
    """Collapse redundant blank lines and repeated spaces in extracted text.

    Args:
        text: Raw page text. ``None`` or an empty string is tolerated because
            a page with no extractable text may produce either.

    Returns:
        The text with runs of two or more newlines reduced to one newline,
        runs of two or more spaces reduced to one space, and leading/trailing
        whitespace removed. Returns ``""`` for ``None`` or empty input.
    """
    # Guard first: re.sub() raises TypeError when handed None.
    if not text:
        return ""

    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"[ ]{2,}", " ", text)
    return text.strip()


# Store a cleaned copy of every page next to the raw extraction.
for page in pagenumber_text:
    page.update(text_clean=clean_text(page["text"]))

# Spot-check: first 500 characters of the page at index 17, raw vs. cleaned.
print ("raw:\n" + pagenumber_text[17]["text"][:500])
print ("\nclean:\n" + pagenumber_text[17]["text_clean"][:500])

In [None]:
import re
from collections import Counter

def normalize_header_footer(line: str) -> str:
    """Return a canonical form of a candidate header/footer line.

    The line is stripped, then three substitutions run in order:
    a leading number followed by whitespace is dropped (e.g. "10 TD BANK ..."),
    a trailing number preceded by whitespace is dropped
    (e.g. "... OUR STRATEGY 10"), and any run of two or more whitespace
    characters becomes a single space.
    """
    substitutions = (
        (r"^\d+\s+", ""),      # leading page number
        (r"\s+\d+\s*$", ""),   # trailing page number
        (r"\s{2,}", " "),      # repeated whitespace
    )

    normalized = line.strip()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

def collect_candidates(pagenumber_text, n=3):
    """Count normalized lines found at the top and bottom of each page.

    For every page record, the text is taken from "text_clean" (falling back
    to "text", then to ""), split into non-blank stripped lines, and the first
    ``n`` and last ``n`` lines are normalized with ``normalize_header_footer``
    and tallied. Pages with no non-blank lines are skipped.

    Returns:
        A ``(header_counter, footer_counter)`` pair of ``Counter`` objects
        keyed by normalized line.
    """
    top_counter = Counter()
    bottom_counter = Counter()

    for record in pagenumber_text:
        body = record.get("text_clean") or record.get("text") or ""
        stripped = (raw.strip() for raw in body.splitlines())
        non_blank = [entry for entry in stripped if entry]
        if len(non_blank) == 0:
            continue

        top_counter.update(normalize_header_footer(entry) for entry in non_blank[:n])
        bottom_counter.update(normalize_header_footer(entry) for entry in non_blank[-n:])

    return top_counter, bottom_counter

# Tally normalized lines seen in the first/last 3 lines of every page.
header_counts, footer_counts = collect_candidates(pagenumber_text, n=3)

# Show the ten most frequent candidates on each side for manual inspection.
for title, counts in (
    ("Top header candidates:", header_counts),
    ("\nTop footer candidates:", footer_counts),
):
    print(title)
    for line, count in counts.most_common(10):
        print(f"{count:3d} | {line}")

# A line is treated as page furniture once it occurs at least this many times.
HEADER_MIN_COUNT = 10
FOOTER_MIN_COUNT = 10

header_lines_to_remove = set()
for line, cnt in header_counts.items():
    if cnt >= HEADER_MIN_COUNT:
        header_lines_to_remove.add(line)

footer_lines_to_remove = set()
for line, cnt in footer_counts.items():
    if cnt >= FOOTER_MIN_COUNT:
        footer_lines_to_remove.add(line)

# Union of both sides: everything that will be stripped from page text.
lines_to_remove = header_lines_to_remove.union(footer_lines_to_remove)
print("\n# lines_to_remove =", len(lines_to_remove))

def remove_repeated_headers_footers(text: str, remove_set: set[str]) -> str:
    """Drop every line whose normalized form is listed in ``remove_set``.

    Each line of ``text`` is normalized with ``normalize_header_footer`` for
    the membership test only; surviving lines are kept unmodified and joined
    with "\\n". The check applies to every line, wherever it sits on the page.
    """
    return "\n".join(
        original
        for original in text.splitlines()
        if normalize_header_footer(original) not in remove_set
    )

# Write a header/footer-free variant of each page under "text_nostruct",
# starting from the cleaned text when present, else the raw text.
for page in pagenumber_text:
    page["text_nostruct"] = remove_repeated_headers_footers(
        page.get("text_clean") or page.get("text") or "",
        lines_to_remove,
    )




In [None]:
# quick check
# Compare the first and last 250 characters of one page before and after
# header/footer removal.
i = 11

before_text = pagenumber_text[i]["text_clean"]
print("BEFORE:\n", before_text[:250], "\n...\n", before_text[-250:])

after_text = pagenumber_text[i]["text_nostruct"]
print("\nAFTER:\n", after_text[:250], "\n...\n", after_text[-250:])


In [None]:

# Choose the most processed text available for each page and store it as "text_final".
for page in pagenumber_text:
    page["text_final"] = page.get("text_nostruct") or page.get("text_clean") or page.get("text") or ""

# Slim records for export: page number plus stripped final text.
pages_final = [
    {"pagenumber": page["pagenumber"], "text_final": page["text_final"].strip()}
    for page in pagenumber_text
]

# Pages whose final text is empty are left out of the export.
pages_final = [p for p in pages_final if p["text_final"]]

print("Final pages:", len(pages_final))

out_path = Path("../data/processed/td_2024_pages_final.json")

# Create the output directory if needed, matching how the chunk file is saved.
out_path.parent.mkdir(parents=True, exist_ok=True)

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(pages_final, f, indent=2, ensure_ascii=False)

print("Saved:", out_path)


In [None]:
import re

def normalize_spaces(s: str) -> str:
    """Flatten all whitespace in ``s`` to single spaces.

    NUL characters are first turned into spaces, every whitespace run
    (including newlines and tabs) is replaced by one space, and the result
    is stripped at both ends.
    """
    without_nuls = s.replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nuls).strip()

def chunk_text(text: str, chunk_size=1200, overlap=200):
    """Split text into fixed-size character chunks that overlap.

    The text is first passed through ``normalize_spaces``. Chunks are then cut
    every ``chunk_size - overlap`` characters, each up to ``chunk_size``
    characters long, until the end of the text is reached.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when the normalized text is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check, ``overlap >= chunk_size`` would keep the window from
            ever advancing (an endless loop), and a negative overlap would
            silently skip text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text = normalize_spaces(text)
    if not text:
        return []

    n = len(text)
    step = chunk_size - overlap  # always >= 1 thanks to the validation above

    chunks = []
    for start in range(0, n, step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a chunk reaches the end of the text, so no trailing chunk
        # made only of already-covered overlap is emitted.
        if start + chunk_size >= n:
            break

    return chunks


# Flat list of chunk records across all pages; "chunk_id" restarts at 0 on each page.
all_chunks = []
for page in pagenumber_text:
    # Keep the per-page chunk list on the page record as well.
    page["chunks"] = chunk_text(page.get("text_final") or "", chunk_size=1200, overlap=200)

    all_chunks.extend(
        {"pdf_page": page["pagenumber"], "chunk_id": idx, "text": piece}
        for idx, piece in enumerate(page["chunks"])
    )

len(all_chunks)


In [None]:
# Inspect one chunk record: source page, text length, and up to 1200 characters of text.
sample_chunk = all_chunks[11]
sample_chunk["pdf_page"], len(sample_chunk["text"]), sample_chunk["text"][:1200]


In [None]:


# Persist the chunk records as pretty-printed UTF-8 JSON.
out_chunks = Path("../data/processed/td_2024_chunks.json")
out_chunks.parent.mkdir(parents=True, exist_ok=True)

out_chunks.write_text(
    json.dumps(all_chunks, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

print("Saved:", out_chunks)
print("Total chunks:", len(all_chunks))


In [None]:
# Print page, chunk id and a 120-character preview for the first three chunks.
for record in all_chunks[:3]:
    preview = record["text"][:120]
    print(record["pdf_page"], record["chunk_id"], "=>", preview, "...\n")


In [None]:
import json
from pathlib import Path

# Reload the chunk records from disk so the following cells can start from the saved file.
chunks_path = Path("../data/processed/td_2024_chunks.json")
with chunks_path.open(encoding="utf-8") as fh:
    all_chunks = json.load(fh)

len(all_chunks), all_chunks[0].keys()


In [None]:
# OpenAI Embeddings

import os
from pathlib import Path
from dotenv import load_dotenv



# Load .env and report only whether the OpenAI key is present (never its value).
load_dotenv()
# assert os.getenv("OPENAI_API_KEY"), "Missing OPENAI_API_KEY in .env"
key_is_set = os.getenv("OPENAI_API_KEY") is not None
print("Key loaded:", key_is_set)

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# On-disk location of the OpenAI-embedded Chroma collection.
persist_dir = Path("../data/processed/chroma_td2024")
persist_dir.mkdir(parents=True, exist_ok=True)

emb = OpenAIEmbeddings(model="text-embedding-3-small")

if not all_chunks:
    # Fail with an explicit message instead of the opaque error max() gives on an empty list.
    raise ValueError("all_chunks is empty; build or reload the chunks before embedding")

texts = [c["text"] for c in all_chunks]
metadatas = [{"bank": "TD", "year": 2024, "pdf_page": c["pdf_page"], "chunk_id": c["chunk_id"]} for c in all_chunks]

# Deterministic ids derived from (page, chunk). Without explicit ids every run
# of this cell stores the same chunks again under fresh random ids, so the
# collection fills up with duplicates. With stable ids a re-run writes to the
# same records instead.
# NOTE(review): a store already populated by an earlier run without ids still
# contains those random-id copies; delete the directory once to rebuild cleanly.
chunk_ids = [f"TD-2024-p{c['pdf_page']}-c{c['chunk_id']}" for c in all_chunks]

# Length statistics help spot chunks that are unexpectedly large before embedding.
lengths = [len(t) for t in texts]
print("max len:", max(lengths), "avg:", sum(lengths)/len(lengths))
print("top 5:", sorted(lengths, reverse=True)[:5])

vectordb = Chroma.from_texts(
    texts=texts,
    embedding=emb,
    metadatas=metadatas,
    ids=chunk_ids,
    persist_directory=str(persist_dir),
    collection_name="bank_reports",
)

vectordb.persist()
print("Saved ChromaDB to:", persist_dir)
print("Docs stored:", len(texts))

In [None]:
# VertexAI Embeddings

from pathlib import Path
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.vectorstores import Chroma

# On-disk location of the Vertex-AI-embedded Chroma collection.
vec_path = Path("../data/processed/chroma_td2024_vertex")
vec_path.mkdir(parents=True, exist_ok=True)

emb_v = VertexAIEmbeddings(model_name="text-embedding-001",
                           project="bank-report-rag",
                           location="us-central1")


texts = [c["text"] for c in all_chunks]
metadatas = [{"bank": "TD", "year": 2024, "pdf_page": c["pdf_page"], "chunk_id": c["chunk_id"]} for c in all_chunks]

# Deterministic ids derived from (page, chunk). Without explicit ids every run
# of this cell stores the same chunks again under fresh random ids; with stable
# ids a re-run writes to the same records instead of adding duplicates.
chunk_ids = [f"TD-2024-p{c['pdf_page']}-c{c['chunk_id']}" for c in all_chunks]


vectordb_v = Chroma(
    persist_directory=str(vec_path),
    collection_name="bank_reports",
    embedding_function=emb_v,
)

# Add documents in small batches so a failure can be traced to a specific slice.
BATCH = 50
for i in range(0, len(texts), BATCH):
    batch_texts = texts[i:i+BATCH]
    batch_metas = metadatas[i:i+BATCH]
    batch_ids = chunk_ids[i:i+BATCH]
    try:
        vectordb_v.add_texts(texts=batch_texts, metadatas=batch_metas, ids=batch_ids)
        print(f"Added {min(i+BATCH, len(texts))}/{len(texts)}")
    except Exception:
        # Print diagnostics for the longest text in the failing batch, then
        # re-raise so the original error is not hidden.
        lens = [len(t) for t in batch_texts]
        j = max(range(len(lens)), key=lambda k: lens[k])
        print("FAILED BATCH starting at", i)
        print("Longest text len:", lens[j])
        print("Metadata:", batch_metas[j])
        raise


vectordb_v.persist()
print("Saved ChromaDB to:", vec_path)
print("Docs stored:", len(texts))


In [None]:
# Smoke test: fetch the 5 nearest chunks for a sample query, restricted by metadata.
td_2024_filter = {"$and": [{"bank": "TD"}, {"year": 2024}]}

docs = vectordb.similarity_search("risk factors", k=5, filter=td_2024_filter)

len(docs)


In [None]:
import os
from pathlib import Path
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

# Reopen the persisted OpenAI-embedded collection rather than rebuilding it.
vec_path = Path("../data/processed/chroma_td2024")
emb = OpenAIEmbeddings(model="text-embedding-3-small")

vectordb = Chroma(
    collection_name="bank_reports",
    embedding_function=emb,
    persist_directory=str(vec_path),
)

# Chat model used to answer questions; temperature is set to 0.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")


def answer_question(question, bank="TD", year=2024, k=5, show_snippets=True):
    """Answer a question from retrieved report chunks and list the sources used.

    Args:
        question: Natural-language question to answer.
        bank: Value the "bank" metadata field must equal for a chunk to be retrieved.
        year: Value the "year" metadata field must equal for a chunk to be retrieved.
        k: Number of chunks requested from the retriever.
        show_snippets: When true, each source entry also carries a "snippet"
            holding the first 220 characters of the chunk followed by "...".

    Returns:
        A dict with "answer" (the model's reply text) and "sources" (a list of
        dicts with "pdf_page", "chunk_id" and optionally "snippet").
    """
    # Build the metadata filter from the arguments so callers can actually
    # target another bank or year instead of always getting TD / 2024.
    retriever = vectordb.as_retriever(
        search_kwargs={
            "k": k,
            "filter": {"$and": [{"bank": bank}, {"year": year}]},
        }
    )

    docs = retriever.invoke(question)

    # Drop repeated hits for the same (page, chunk) pair, keeping first occurrences in order.
    seen = set()
    unique_docs = []
    for d in docs:
        md = d.metadata or {}
        key = (md.get("pdf_page"), md.get("chunk_id"))
        if key in seen:
            continue
        seen.add(key)
        unique_docs.append(d)

    context_blocks = []
    sources = []

    for d in unique_docs:
        md = d.metadata or {}
        pg = md.get("pdf_page")
        cid = md.get("chunk_id")
        text = d.page_content

        # Tag each block with its origin so the provenance is visible in the prompt.
        context_blocks.append(f"[page {pg} | chunk {cid}]\n{text}")

        src = {"pdf_page": pg, "chunk_id": cid}
        if show_snippets:
            src["snippet"] = text[:220].replace("\n", " ") + "..."
        sources.append(src)

    context = "\n\n---\n\n".join(context_blocks)

    prompt = f"""You are a helpful analyst reading a bank annual report.
Use ONLY the provided context.
If the answer is not in the context, say: "I do not know based on the provided text."

QUESTION:
{question}

CONTEXT:
{context}

Return:
1) Answer (short, clear)
2) Bullet list of key evidence (1-4 bullets)
"""

    resp = llm.invoke(prompt)
    return {"answer": resp.content, "sources": sources}


# Example run: ask one question against the TD 2024 report and show the answer with its sources.
demo_question = "What are the main risk factors mentioned for 2024?"
result = answer_question(demo_question, bank="TD", year=2024, k=5)

print(result["answer"])
print("\nSOURCES:", result["sources"])

