In [32]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

print("Key loaded:", OPENAI_API_KEY is not None)

Key loaded: True


In [1]:
import pdfplumber
from pathlib import Path
import json

pdf_path = Path("../data/raw/TD-2024-Annual-Report.pdf")

pagenumber_text = []

with pdfplumber.open(pdf_path) as pdf:
    for i , page in enumerate (pdf.pages):
        text = page.extract_text()
        pagenumber_text.append({"pagenumber": i+1, "text": text})


output_path = Path("../data/processed/td_2024_pages.json")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(pagenumber_text , f, indent=2, ensure_ascii=False)

print('Saved to:', output_path)

len (pagenumber_text)
# for page in pagenumber_text[:2]:
#     print(f"\n---page{page['pagenumber']}---\n")
#     print(page['text'][:1000])





Saved to: ..\data\processed\td_2024_pages.json


244

In [3]:
import re

def clean_text (text:str) -> str:

    text = re.sub(r"\n{2,}", "\n", text)

    text = re.sub(r"[ ]{2,}", " ", text)

    text = text.strip()

    return text


for page in pagenumber_text:

    page["text_clean"] = clean_text(page["text"])


print ("raw:\n" + pagenumber_text[17]["text"][:500])
print ("\nclean:\n" + pagenumber_text[17]["text_clean"][:500])

raw:
Board Committees
COMMITTEE MEMBERS1 KEY RESPONSIBILITIES2
Corporate Alan N. MacGibbon Responsibility for corporate governance of the Bank:
Governance (Chair) • Identify individuals qualified to become Board members, recommend to the Board the director
Committee Amy W. Brinkley nominees for the next annual meeting of shareholders and recommend candidates to fill vacancies
Claude Mongeau on the Board that occur between meetings of the shareholders.
Nancy G. Tower • Develop and recommend to the Boa

clean:
Board Committees
COMMITTEE MEMBERS1 KEY RESPONSIBILITIES2
Corporate Alan N. MacGibbon Responsibility for corporate governance of the Bank:
Governance (Chair) • Identify individuals qualified to become Board members, recommend to the Board the director
Committee Amy W. Brinkley nominees for the next annual meeting of shareholders and recommend candidates to fill vacancies
Claude Mongeau on the Board that occur between meetings of the shareholders.
Nancy G. Tower • Develop and recomm

In [None]:
import re
from collections import Counter

def normalize_header_footer(line: str) -> str:
    line = line.strip()

    # remove leading page number like "10 TD BANK ..."
    line = re.sub(r"^\d+\s+", "", line)

    # remove trailing page number like "... OUR STRATEGY 10"
    line = re.sub(r"\s+\d+\s*$", "", line)

    # collapse multiple spaces
    line = re.sub(r"\s{2,}", " ", line)

    return line

def collect_candidates(pagenumber_text, n=3):
    header_lines = []
    footer_lines = []

    for page in pagenumber_text:
        text = page.get("text_clean") or page.get("text") or ""
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if not lines:
            continue

        header_lines.extend([normalize_header_footer(ln) for ln in lines[:n]])
        footer_lines.extend([normalize_header_footer(ln) for ln in lines[-n:]])

    return Counter(header_lines), Counter(footer_lines)

header_counts, footer_counts = collect_candidates(pagenumber_text, n=3)

print("Top header candidates:")
for line, count in header_counts.most_common(10):
    print(f"{count:3d} | {line}")

print("\nTop footer candidates:")
for line, count in footer_counts.most_common(10):
    print(f"{count:3d} | {line}")

HEADER_MIN_COUNT = 10
FOOTER_MIN_COUNT = 10

header_lines_to_remove = {line for line, cnt in header_counts.items() if cnt >= HEADER_MIN_COUNT}
footer_lines_to_remove = {line for line, cnt in footer_counts.items() if cnt >= FOOTER_MIN_COUNT}

lines_to_remove = header_lines_to_remove | footer_lines_to_remove
print("\n# lines_to_remove =", len(lines_to_remove))

def remove_repeated_headers_footers(text: str, remove_set: set[str]) -> str:
    kept = []
    for ln in text.splitlines():
        ln_norm = normalize_header_footer(ln)
        if ln_norm in remove_set:
            continue
        kept.append(ln)
    return "\n".join(kept)

for page in pagenumber_text:
    base = page.get("text_clean") or page.get("text") or ""
    page["text_nostruct"] = remove_repeated_headers_footers(base, lines_to_remove)




Top header candidates:
 28 | |
 13 | (millions of Canadian dollars) As at
  6 | October 31 October 31 October 31 October
  6 | October 31, 2024 October 31,
  5 | (millions of Canadian dollars, except as noted) 2024
  5 | FINANCIAL RESULTS OVERVIEW
  5 | BUSINESS SEGMENT ANALYSIS
  5 | BUSINESS STRATEGY BUSINESS HIGHLIGHTS IN
  5 | 2023
  4 | Order/Agreement Key Requirements

Top footer candidates:
125 | TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS
 92 | TD BANK GROUP ANNUAL REPORT 2024 FINANCIAL RESULTS
 16 | TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY
  5 | section of this document.
  4 | For additional information about the Bank’s use of non-GAAP financial measures, refer to “Non-GAAP and Other Financial Measures” in the “Financial Results Overview”
  4 | Includes loans that are measured at FVOCI.
  3 | TD BANK GROUP ANNUAL REPORT 2024 GLOSSARY
  3 | The accompanying Notes are an integral part of these Consolidated
  3 | Financial Statements.
  3 | TD BANK 

In [17]:
# quick check
i = 11
print("BEFORE:\n", pagenumber_text[i]["text_clean"][:250], "\n...\n", pagenumber_text[i]["text_clean"][-250:])
print("\nAFTER:\n", pagenumber_text[i]["text_nostruct"][:250], "\n...\n", pagenumber_text[i]["text_nostruct"][-250:])


BEFORE:
 Sustainability
As a global financial institution, we know we have
an important role to play in supporting our customers,
colleagues, and communities in a changing world.
Awards and
milestones TD’s commitment to sustainability is reflected in
our appr 
...
  released an updated financial education initiatives
Sustainable Financing Framework. in Canada and the U.S. The
progress made on these targets
in 2024 will be shared in our 2024
Sustainability Report.
10 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

AFTER:
 Sustainability
As a global financial institution, we know we have
an important role to play in supporting our customers,
colleagues, and communities in a changing world.
Awards and
milestones TD’s commitment to sustainability is reflected in
our appr 
...
 d this through TD led and supported
year, TD also released an updated financial education initiatives
Sustainable Financing Framework. in Canada and the U.S. The
progress made on these targets
in 2024 will be shared i

In [18]:

for page in pagenumber_text:
    page["text_final"] = page.get("text_nostruct") or page.get("text_clean") or page.get("text") or ""


pages_final = []
for page in pagenumber_text:
    pages_final.append({
        "pagenumber": page["pagenumber"],
        "text_final": page["text_final"].strip()
    })


pages_final = [p for p in pages_final if p["text_final"]]

print("Final pages:", len(pages_final))


out_path = Path("../data/processed/td_2024_pages_final.json")

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(pages_final, f, indent=2, ensure_ascii=False)

print("Saved:", out_path)


Final pages: 244
Saved: ..\data\processed\td_2024_pages_final.json


In [20]:
import re

def normalize_spaces(s: str) -> str:
    s = s.replace("\x00", " ")
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def chunk_text(text: str, chunk_size=1200, overlap=200):
    """
    Simple character-based chunking with overlap.
    Works well as a first version.
    """
    text = normalize_spaces(text)
    if not text:
        return []

    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = min(start + chunk_size, n)
        chunks.append(text[start:end])
        if end == n:
            break
        start = max(0, end - overlap)

    return chunks


all_chunks = []
for page in pagenumber_text:
    final_text = page.get("text_final") or ""
    chunks = chunk_text(final_text, chunk_size=1200, overlap=200)

    page["chunks"] = chunks  

    
    for j, ch in enumerate(chunks):
        all_chunks.append({
            "pdf_page": page["pagenumber"],
            "chunk_id": j,
            "text": ch
        })

len(all_chunks)


1300

In [30]:
all_chunks[11]["pdf_page"], len(all_chunks[11]["text"]), all_chunks[11]["text"][:1200]


(5,
 1200,
 ' Canada’s top 100 Employers for the 18th consecutive year and America’s Best Employers This was a year with difficult challenges. for Diversity by Forbes for the third year in a row. In addition, The deficiencies of our U.S. AML program were serious. As a TD once again achieved the Great Place to Work certification Global Systemically Important Bank (G-SIB), and an integral in both Canada and the U.S. part of the financial system, we have a responsibility to Our colleagues also advanced programs that build on our protect the system and thwart criminal activity. We did not capabilities to innovate. TD Invent, the Bank’s enterprise deliver, and we apologize to all our stakeholders. approach to innovation, surpassed 10,000 implemented In October, we reached a resolution of these matters with ideas from colleagues across the Bank. And our patent U.S. regulators as well as the Department of Justice. The portfolio reached over 2,500 patents, with more than 800 terms were costly 

In [23]:


out_chunks = Path("../data/processed/td_2024_chunks.json")

out_chunks.parent.mkdir(parents=True, exist_ok=True)

with open(out_chunks, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)

print("Saved:", out_chunks)
print("Total chunks:", len(all_chunks))


Saved: ..\data\processed\td_2024_chunks.json
Total chunks: 1300


In [None]:
for r in all_chunks[:3]:
    print(r["pdf_page"], r["chunk_id"], "=>", r["text"][:120], "...\n")


1 0 => 2024 Annual Report ...

2 0 => Table of Contents OUR STRATEGY 1 Group President and CEO’s Message 2 Chair of the Board’s Message 3 Progress on Our U.S. ...

3 0 => Our Strategy Anchored in our proven business Proven Business Model model, we are guided by our Deliver consistent earnin ...



In [None]:
import json
from pathlib import Path

chunks_path = Path("../data/processed/td_2024_chunks.json")
all_chunks = json.loads(chunks_path.read_text(encoding="utf-8"))

len(all_chunks), all_chunks[0].keys()


(1300, dict_keys(['pdf_page', 'chunk_id', 'text']))

In [6]:
import os
from pathlib import Path
from dotenv import load_dotenv



load_dotenv()
# assert os.getenv("OPENAI_API_KEY"), "Missing OPENAI_API_KEY in .env"
print("Key loaded:", os.getenv("OPENAI_API_KEY") is not None)

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

persist_dir = Path("../data/processed/chroma_td2024")
persist_dir.mkdir(parents=True, exist_ok=True)

emb = OpenAIEmbeddings(model="text-embedding-3-small",
                       openai_api_key=os.getenv("OPENAI_API_KEY")
                       )


texts = [c["text"] for c in all_chunks]
metadatas = [{"bank": "TD", "year": 2024, "pdf_page": c["pdf_page"], "chunk_id": c["chunk_id"]} for c in all_chunks]

vectordb = Chroma.from_texts(
    texts=texts,
    embedding=emb,
    metadatas=metadatas,
    persist_directory=str(persist_dir),
    collection_name="bank_reports",
)

vectordb.persist()
print("Saved ChromaDB to:", persist_dir)
print("Docs stored:", len(texts))

Key loaded: True


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-3-small in organization org-tgYINwnqYzEyzRuEy2DslJoc on tokens per min (TPM): Limit 40000, Requested 83100. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}