In [None]:
import pdfplumber
from pathlib import Path
import json

# Input PDF and the JSON file that receives the per-page text.
pdf_path = Path("../data/raw/TD-2024-Annual-Report.pdf")
output_path = Path("../data/processed/td_2024_pages.json")

# One record per PDF page: {"pagenumber": 1-based page number, "text": page text}.
with pdfplumber.open(pdf_path) as pdf:
    pagenumber_text = [
        # Fall back to "" when no text comes back for a page, so "text" is
        # always a string and later string operations cannot hit None.
        {"pagenumber": page_no, "text": page.extract_text() or ""}
        for page_no, page in enumerate(pdf.pages, start=1)
    ]

# Create the target folder if needed so the write does not fail when the
# processed-data directory is missing.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters (e.g. typographic quotes) readable.
    json.dump(pagenumber_text, f, indent=2, ensure_ascii=False)

print('Saved to:', output_path)

# Last expression of the cell: display how many pages were extracted.
len(pagenumber_text)




Saved to: ..\data\processed\td_2024_pages.json


[{'pagenumber': 1, 'text': '2024\nAnnual Report'},
 {'pagenumber': 2,
  'text': 'Table of Contents\nOUR STRATEGY 1\nGroup President and CEO’s Message 2\nChair of the Board’s Message 3\nProgress on Our U.S. AML Program 5\nProven Business Model 6\nPurpose-Driven 8\nSustainability 10\nForward-Focused 14\nBoard Committees 16\nMANAGEMENT’S DISCUSSION AND ANALYSIS 18\nGlossary 143 See the TD Annual Report\nonline by visiting\nwww.td.com/ar2024/\nFINANCIAL RESULTS\nConsolidated Financial Statements 146\nNotes to Consolidated Financial Statements 159\nTen-Year Statistical Review 238\nShareholder and Investor Information 241\nFor information on TD’s commitment to the community and our environment, visit\nwww.td.com/content/dam/tdcom/canada/about-td/pdf/esg/2023-sustainability-report-en.pdf\n* 2024 Sustainability Report to be published in March 2025'}]

In [19]:
import re

def clean_text(text: str) -> str:
    """Normalize whitespace in a block of extracted page text.

    Steps, in order:
      1. Collapse runs of empty lines, including lines that contain only
         spaces or tabs, into a single newline.
      2. Collapse runs of two or more spaces into one space.
      3. Strip leading and trailing whitespace.

    Args:
        text: Raw text. ``None`` or an empty string is tolerated and is
            treated as "no text".

    Returns:
        The cleaned text; ``""`` when ``text`` is ``None`` or empty.
    """
    # Guard against missing text so callers do not have to pre-filter pages.
    if not text:
        return ""

    # A plain r"\n{2,}" misses "blank" lines that still hold spaces/tabs
    # (e.g. "a\n \nb"), so allow optional horizontal whitespace between newlines.
    text = re.sub(r"\n(?:[ \t]*\n)+", "\n", text)

    text = re.sub(r"[ ]{2,}", " ", text)

    return text.strip()


# 0-based index of the page used for the raw-vs-clean spot check below,
# and how many characters of each version to show.
SAMPLE_PAGE_INDEX = 17
PREVIEW_CHARS = 500

# Store the cleaned text next to the raw text; the raw "text" entry is left
# untouched, so re-running this cell gives the same result.
for page in pagenumber_text:
    # Substitute "" for a missing text so one empty page cannot abort the loop.
    page["text_clean"] = clean_text(page["text"] or "")

# Spot-check a single page; guard the index so a shorter document does not
# raise IndexError.
if SAMPLE_PAGE_INDEX < len(pagenumber_text):
    sample_page = pagenumber_text[SAMPLE_PAGE_INDEX]
    print("raw:\n" + (sample_page["text"] or "")[:PREVIEW_CHARS])
    print("\nclean:\n" + sample_page["text_clean"][:PREVIEW_CHARS])
else:
    print(
        f"No preview: sample index {SAMPLE_PAGE_INDEX} is out of range "
        f"for {len(pagenumber_text)} page(s)."
    )

raw:
Board Committees
COMMITTEE MEMBERS1 KEY RESPONSIBILITIES2
Corporate Alan N. MacGibbon Responsibility for corporate governance of the Bank:
Governance (Chair) • Identify individuals qualified to become Board members, recommend to the Board the director
Committee Amy W. Brinkley nominees for the next annual meeting of shareholders and recommend candidates to fill vacancies
Claude Mongeau on the Board that occur between meetings of the shareholders.
Nancy G. Tower • Develop and recommend to the Boa

clean:
Board Committees
COMMITTEE MEMBERS1 KEY RESPONSIBILITIES2
Corporate Alan N. MacGibbon Responsibility for corporate governance of the Bank:
Governance (Chair) • Identify individuals qualified to become Board members, recommend to the Board the director
Committee Amy W. Brinkley nominees for the next annual meeting of shareholders and recommend candidates to fill vacancies
Claude Mongeau on the Board that occur between meetings of the shareholders.
Nancy G. Tower • Develop and recomm

In [36]:
from collections import Counter

# Default text searched for on every page when reporting matching lines.
footer_keyword = 'TD BANK GROUP ANNUAL REPORT 2024'

def collect_header_footer_lines(pages, n=3, keyword=None, verbose=True):
    """Count how often each line appears at the top and bottom of the pages.

    For every page the cleaned text is split into non-empty, stripped lines.
    The first ``n`` lines are tallied as header candidates and the last ``n``
    lines as footer candidates. On pages with fewer than ``2 * n`` lines the
    two slices overlap, so a line can be counted in both tallies.

    Args:
        pages: Iterable of dicts that each provide a ``"text_clean"`` string.
        n: Number of lines taken from the top and from the bottom of each
            page. ``0`` collects nothing.
        keyword: Substring to look for on each page. Defaults to the
            module-level ``footer_keyword`` when ``None``.
        verbose: When true, print every line containing ``keyword`` under a
            ``---page <position>---`` heading, where the position is the
            1-based index of the page within ``pages``.

    Returns:
        A ``(header_counts, footer_counts)`` tuple of ``collections.Counter``
        objects keyed by line text.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if keyword is None:
        keyword = footer_keyword

    header_lines = []
    footer_lines = []

    for i, page in enumerate(pages, start=1):

        lines = [ln.strip() for ln in page["text_clean"].splitlines() if ln.strip()]

        if verbose:
            matches = [ln for ln in lines if keyword in ln]
            if matches:
                print(f"\n---page {i}---")
                for ln in matches:
                    print(ln)

        # n == 0 must be skipped explicitly: lines[-0:] is the same as
        # lines[0:], which would count the whole page as footer lines.
        if not lines or n == 0:
            continue

        header_lines.extend(lines[:n])
        footer_lines.extend(lines[-n:])

    return Counter(header_lines), Counter(footer_lines)

# Tally the first and last three lines of every page.
header_counts, footer_counts = collect_header_footer_lines(pagenumber_text, n=3)

# Print each tally as a heading followed by up to 15 "count | line" rows,
# most frequent first.
for heading, tally in (
    ("Top candidates for header:\n", header_counts),
    ("\nTop candidate for footer:\n", footer_counts),
):
    print(heading)
    for line_text, occurrences in tally.most_common(15):
        print(f"{occurrences:3d} | {line_text}")


---page 4---
2 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 6---
4 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 8---
6 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 10---
8 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 12---
10 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 14---
12 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 16---
14 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 18---
16 TD BANK GROUP ANNUAL REPORT 2024 OUR STRATEGY

---page 20---
18 TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS

---page 22---
20 TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS

---page 24---
22 TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS

---page 26---
24 TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS

---page 28---
26 TD BANK GROUP ANNUAL REPORT 2024 MANAGEMENT’S DISCUSSION AND ANALYSIS

---page 30---
28 TD BANK GROUP ANNUAL REPORT 2024 MANA