In [None]:
!apt-get install poppler-utils
!pip install pdf2image pytesseract pillow opencv-python

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (147 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126308 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading 

In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract
from collections import defaultdict
from pdf2image import convert_from_path


In [None]:
def convert_pdfs_to_images(pdf_folder, output_image_folder):
    """Render every PDF in ``pdf_folder`` to one PNG file per page.

    Files are visited in sorted name order. Each page is rendered at 300 dpi and
    written to ``output_image_folder`` as ``<pdf stem>_page_<n>.png`` with ``n``
    starting at 1. The extension check is case-insensitive, so ``scan.PDF`` is
    converted just like ``scan.pdf``.

    Args:
        pdf_folder: Directory that contains the source PDF files.
        output_image_folder: Directory for the PNG files; created if missing.

    Returns:
        None. The path of each saved image is printed.
    """
    os.makedirs(output_image_folder, exist_ok=True)

    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Case-insensitive so upper-case extensions are not silently skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        # The PDF's base name (e.g. "1904_1787") prefixes every page image.
        stem = os.path.splitext(pdf_file)[0]
        pages = convert_from_path(pdf_path, dpi=300)

        for page_number, page in enumerate(pages, start=1):
            output_path = os.path.join(output_image_folder, f"{stem}_page_{page_number}.png")
            page.save(output_path, "PNG")
            print(f"Saved image: {output_path}")


In [None]:
# Render every PDF found in /content/pdf_years to per-page PNG files under /content/pdf_images.
convert_pdfs_to_images(pdf_folder="/content/pdf_years", output_image_folder="/content/pdf_images")

Saved image: /content/pdf_images/1904_1787_page_1.png
Saved image: /content/pdf_images/1905_page_1.png
Saved image: /content/pdf_images/1906_952_page_1.png
Saved image: /content/pdf_images/1907_page_1.png
Saved image: /content/pdf_images/1912_page_1.png
Saved image: /content/pdf_images/1913_page_1.png
Saved image: /content/pdf_images/1914_1015_page_1.png
Saved image: /content/pdf_images/1914_1276_page_1.png
Saved image: /content/pdf_images/1915_page_1.png
Saved image: /content/pdf_images/1916_page_1.png
Saved image: /content/pdf_images/1917_page_1.png
Saved image: /content/pdf_images/1918_876_page_1.png
Saved image: /content/pdf_images/1919_905_page_1.png
Saved image: /content/pdf_images/1920_656_page_1.png
Saved image: /content/pdf_images/1920_989_page_1.png
Saved image: /content/pdf_images/1921_1035_page_1.png
Saved image: /content/pdf_images/1921_342_page_1.png
Saved image: /content/pdf_images/1922_1008_page_1.png
Saved image: /content/pdf_images/1923_page_1.png
Saved image: /conten

In [None]:
def preprocess_image(image_path):
    """Load an image as grayscale and return a cleaned-up binarized version.

    Pipeline: 2x bicubic upscale -> CLAHE contrast equalisation -> 3x3 Gaussian
    blur -> Otsu threshold -> 2x2 morphological opening -> one dilation pass with
    a 2x1 kernel.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed array, or ``None`` (after printing a message) when the
        file cannot be read.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        print(f"❌ Failed to load: {image_path}")
        return None

    # Double both dimensions before any filtering.
    upscaled = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Local contrast equalisation on 8x8 tiles.
    equalized = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(upscaled)

    # Light smoothing, then let Otsu pick the global threshold.
    smoothed = cv2.GaussianBlur(equalized, (3, 3), 0)
    binarized = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Opening followed by a single dilation.
    despeckled = cv2.morphologyEx(binarized, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
    return cv2.dilate(despeckled, np.ones((2, 1), np.uint8), iterations=1)


In [None]:
def detect_and_crop_columns(image_path, output_dir, base_filename, min_height_ratio=0.2):
    """Split a page image in two at the tall vertical line nearest its centre.

    The page is binarized (inverted Otsu), long vertical strokes are isolated
    with a morphological opening using a 1-pixel-wide kernel, and the bounding
    boxes of the surviving contours are kept when they are at least
    ``min_height_ratio`` of the page height. The page is then cut at the x
    position of the qualifying box closest to the horizontal centre.

    Only lines strictly inside the page (``0 < x < width``) are considered as
    cut positions: a line on the left edge would produce an empty crop, which
    ``cv2.imwrite`` refuses to write.

    Args:
        image_path: Path of the page image to split.
        output_dir: Directory for the two crops; created if missing.
        base_filename: Prefix for the crop names (``<base>_left.png`` /
            ``<base>_right.png``).
        min_height_ratio: Minimum line height as a fraction of the page height.

    Returns:
        ``[left_path, right_path]`` on success, or ``[]`` when the image cannot
        be loaded or no usable vertical line is found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"❌ Cannot load: {image_path}")
        return []

    height, width = image.shape
    min_height = int(min_height_ratio * height)

    # Inverted binarization: dark strokes become non-zero pixels.
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Opening with a 1 x (height/20) kernel keeps only long vertical strokes.
    # Clamp to 1 so images shorter than 20 px do not request a zero-height kernel.
    kernel_height = max(1, height // 20)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_height))
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=1)

    # Bounding boxes (x, y, w, h) of the remaining strokes, tall ones only, left to right.
    contours, _ = cv2.findContours(vertical_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    column_boxes = [cv2.boundingRect(c) for c in contours]
    tall_boxes = [box for box in column_boxes if box[3] >= min_height]
    tall_boxes.sort(key=lambda b: b[0])

    x_coords = [x for (x, y, w, h) in tall_boxes]
    print(f"Tall vertical line x-positions: {x_coords}")

    if not x_coords:
        print(" No tall vertical lines found.")
        return []

    # Discard lines sitting on the page edge; cutting there leaves one side empty.
    interior_x = [x for x in x_coords if 0 < x < width]
    if not interior_x:
        print(" No vertical line inside the page to split on.")
        return []

    # Find the line closest to center
    center_x = width // 2
    x_cut = min(interior_x, key=lambda x: abs(x - center_x))
    print(f"✅ Splitting at x={x_cut}, nearest to center x={center_x}")

    cropped_paths = []

    # Left column: everything before the cut.
    left_path = os.path.join(output_dir, f"{base_filename}_left.png")
    cv2.imwrite(left_path, image[:, 0:x_cut])
    cropped_paths.append(left_path)

    # Right column: the cut position and everything after it.
    right_path = os.path.join(output_dir, f"{base_filename}_right.png")
    cv2.imwrite(right_path, image[:, x_cut:width])
    cropped_paths.append(right_path)

    return cropped_paths


In [None]:
def ocr_columns(cropped_paths):
    """OCR each column crop and return the results as one labelled text block.

    Paths are processed in sorted order. Each readable crop is converted to
    grayscale, Otsu-thresholded and passed to Tesseract with ``--psm 4``; the
    stripped text is emitted under a ``### Column <n> (<file name>) ###``
    heading. Unreadable files are skipped but still consume a column number.

    Args:
        cropped_paths: Iterable of image paths for the column crops.

    Returns:
        The concatenated sections as a single string ("" when nothing was read).
    """
    sections = []
    for column_number, crop_path in enumerate(sorted(cropped_paths), start=1):
        crop = cv2.imread(crop_path)
        if crop is None:
            continue

        grayscale = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        recognized = pytesseract.image_to_string(binary, config="--psm 4").strip()

        sections.append(
            f"\n### Column {column_number} ({os.path.basename(crop_path)}) ###\n{recognized}\n"
        )
    return "".join(sections)


In [None]:
def process_all_years(input_folder="/content/pdf_images", output_text_folder="/content/ocr_texts", column_crop_root="/content/column_crops"):
    """Crop and OCR every page image, writing one text file per year.

    Images in ``input_folder`` are grouped by the part of the file name before
    the first underscore (the "year"). For each page the column crops are
    produced by ``detect_and_crop_columns`` and read by ``ocr_columns``; the
    per-page text is concatenated and saved as ``<year>.txt``.

    Two details differ from a naive implementation:

    * Crops are named after the full page stem (e.g. ``1905_page_2``), so two
      pages of the same document never overwrite each other's crops and file
      names without an underscore do not raise ``IndexError``.
    * Pages within a year are ordered numerically (``page_2`` before
      ``page_10``) rather than lexicographically.

    Args:
        input_folder: Directory containing the page images (.png/.jpg/.jpeg).
        output_text_folder: Directory for the per-year text files; created if missing.
        column_crop_root: Directory under which one crop folder per page is created.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    import re  # function-scope import so this cell does not depend on a later cell's imports

    def natural_key(path):
        # re.split with a capture group alternates text / digit runs, digits at odd indexes.
        parts = re.split(r"(\d+)", os.path.basename(path))
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    os.makedirs(output_text_folder, exist_ok=True)
    os.makedirs(column_crop_root, exist_ok=True)

    year_to_images = defaultdict(list)
    for image_file in sorted(os.listdir(input_folder)):
        if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            # Drop the extension first so a name without "_" still yields a clean key.
            year = os.path.splitext(image_file)[0].split("_")[0]
            year_to_images[year].append(os.path.join(input_folder, image_file))

    print(f"Detected Years: {list(year_to_images.keys())}")

    for year, image_paths in year_to_images.items():
        page_sections = []
        print(f"\n Processing Year: {year} ({len(image_paths)} page(s))")

        for img_path in sorted(image_paths, key=natural_key):
            page_name = os.path.basename(img_path)
            print(f" -  Page: {page_name}")

            # NOTE(review): the preprocessed array is only used as a readability check here;
            # cropping and OCR below work from the original file.
            if preprocess_image(img_path) is None:
                continue

            # The full stem is unique per page, unlike the second "_" field alone.
            page_stem = os.path.splitext(page_name)[0]
            cropped_columns = detect_and_crop_columns(
                img_path,
                output_dir=os.path.join(column_crop_root, page_stem),
                base_filename=page_stem
            )

            if not cropped_columns:
                print(f" Skipping OCR for {img_path} (no columns detected).")
                continue

            page_text = ocr_columns(cropped_columns)
            page_sections.append(f"\n--- Page: {page_name} ---\n{page_text}")

        # Save year OCR text (an empty file is still written when no page produced text).
        output_text_path = os.path.join(output_text_folder, f"{year}.txt")
        with open(output_text_path, "w", encoding="utf-8") as f:
            f.write("".join(page_sections))
        print(f" Year OCR saved: {output_text_path}")


In [None]:
# Run the crop-and-OCR pass with the default input/output folders from the function signature.
process_all_years()

🔎 Detected Years: ['1904', '1905', '1906', '1907', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1934', '1935', '1936', '1937', '1938', '1944', '1948']

📅 Processing Year: 1904 (1 page(s))
 - 🖼️ Page: 1904_1787_page_1.png
✅ Tall vertical line x-positions: [1016]
✅ Splitting at x=1016, nearest to center x=1200
✅ Year OCR saved: /content/ocr_texts/1904.txt

📅 Processing Year: 1905 (1 page(s))
 - 🖼️ Page: 1905_page_1.png
✅ Tall vertical line x-positions: [409, 1371]
✅ Splitting at x=1371, nearest to center x=1200
✅ Year OCR saved: /content/ocr_texts/1905.txt

📅 Processing Year: 1906 (1 page(s))
 - 🖼️ Page: 1906_952_page_1.png
✅ Tall vertical line x-positions: [397, 1304]
✅ Splitting at x=1304, nearest to center x=1200
✅ Year OCR saved: /content/ocr_texts/1906.txt

📅 Processing Year: 1907 (1 page(s))
 - 🖼️ Page: 1907_page_1.png
✅ Tall vertical line x-positions: [1080, 2013]
✅ 

In [None]:
!pip install google-generativeai



In [None]:
import google.generativeai as genai

# Never store the key in the notebook: take it from the GOOGLE_API_KEY environment
# variable, or prompt for it (input is not echoed) when the variable is unset.
import getpass

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: "))


In [None]:
import re

def extract_1807_context(text, window_size=500):
    """Return the text surrounding every occurrence of "1807".

    For each match, up to ``window_size`` characters on either side are taken
    (clamped to the bounds of ``text``) and surrounding whitespace is stripped.

    Args:
        text: The text to search.
        window_size: Number of characters of context to keep on each side.

    Returns:
        A list with one context string per match, in order of appearance.
    """
    text_length = len(text)
    return [
        text[max(hit.start() - window_size, 0):min(hit.end() + window_size, text_length)].strip()
        for hit in re.finditer(r'1807', text)
    ]


In [None]:
# Load the OCR text saved for 1904.
with open("/content/ocr_texts/1904.txt", "r", encoding="utf-8") as f:
    ocr_text = f.read()

# Collect the context around each "1807" hit, using the default window size.
contexts = extract_1807_context(ocr_text)

# Print the snippets numbered from 1 for a quick manual inspection.
for idx, snippet in enumerate(contexts, 1):
    print(f"\n Match {idx}: \n{snippet}\n")



 Match 1: 
--- Page: 1904_1787_page_1.png ---

### Column 1 (1904_1787_left.png) ###
DESIGNER:

 

PHOTO-ENGI

WIT MINNEAPOLIS ( C
Witte Max F (Witte Bros), r 1812 n 4th. |
WITTE OTTO

Bank, Store, Saloon and Office Fix-
tures, 16-18 Central av, tel T C 1942,
r 1609 Lyndale av n. (See under Sa-
loon and Office Fixtures.)

Wittenberg Charles A, tailor The Plym-
outh, r 260 Humboldt av n.

Witter Davia F, supt of agencies N Ww
Natl Life Ins Co, r 1807 Dupont ay s.

“Emmett B®, student U of M, b 1807 Du-
pont av s.

“ Glen F, student, b 1807 Dupont av s.

Wittey Thomas HE, driver Swift & Co, r

: 723 6th av n.

Wittgraf Clara A, b 3102 Blaisdell av.

i. dred A, chf elk M & St L RR, r 911
n

“ George W, elk Great Western Elev Co,
b 3102 Blaisdell av.

“ Louis H, slsmn Underwood Typewriter

' Co, r 2112 27th av s.

* Louis W, mngr Mutual Cigar Store 245
lst av s, r 8102 Blaisdell av.

“ Pearl A, pleater The New Store, b 3102
Blaisdell av.

Witt


 Match 2: 
--- Page: 1904_1787_page_1.png -

In [None]:
import re
import os
import json
import google.generativeai as genai

# Never store the key in the notebook: take it from the GOOGLE_API_KEY environment
# variable, or prompt for it (input is not echoed) when the variable is unset.
import getpass

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: "))

def extract_1807_context(text, window_size=500):
    """Return the text surrounding every occurrence of "1807".

    For each match, up to ``window_size`` characters on either side are taken
    (clamped to the bounds of ``text``) and surrounding whitespace is stripped.
    The function prints nothing; callers decide what to display.

    Args:
        text: The text to search.
        window_size: Number of characters of context to keep on each side.

    Returns:
        A list with one context string per match, in order of appearance.
    """
    matches = []
    for match in re.finditer(r'1807', text):
        start = max(match.start() - window_size, 0)
        end = min(match.end() + window_size, len(text))
        matches.append(text[start:end].strip())
    return matches

def build_gemini_prompt(year, snippet, target_address="1807 Dupont av s"):
    """Build the extraction prompt for one OCR snippet.

    The prompt asks the model to decide whether ``snippet`` refers to
    ``target_address`` and, if so, to return a single JSON object with the
    resident details; otherwise to output ``null``. ``target_address`` is used
    in both the matching step and the final fallback instruction, so the prompt
    stays consistent when a different address is passed.

    Args:
        year: Directory year, interpolated into the prompt and the JSON template.
        snippet: OCR text to analyse.
        target_address: Address the snippet must refer to.

    Returns:
        The prompt text as a string.
    """
    return f"""
You are a data extraction assistant for historical city directories.

Below is a short OCR text snippet from the {year} Minneapolis city directory:

\"\"\"{snippet}\"\"\"

Step 1: First, does this snippet refer to the address "{target_address}" (allow for minor OCR variations like punctuation, abbreviation, etc.)?
Step 2: If yes, extract these fields (if available):

- Year: {year}
- Full Resident Name
- Spouse Name
- Occupation
- Occupation Address
- Employer
- Address as written in snippet

Strictly output a single JSON object like:

{{
  "year": {year},
  "resident_full_name": "...",
  "spouse_name": "...",
  "occupation": "...",
  "occupation_address":"...",
  "employer": "...",
  "address_in_text": "..."
}}

If the address is not "{target_address}", output: null
"""

def query_gemini(prompt_text):
    """Send ``prompt_text`` to the 'gemini-1.5-pro' model and return the reply text.

    A new ``GenerativeModel`` is created on every call.

    Args:
        prompt_text: The prompt to submit.

    Returns:
        The ``.text`` attribute of the response returned by ``generate_content``.
    """
    return genai.GenerativeModel('gemini-1.5-pro').generate_content(prompt_text).text

def process_year_verify_address(file_path, target_address):
    """Query the model once for every "1807" snippet found in one year's OCR text.

    The year label is the file's base name with ".txt" removed. Snippets are
    extracted with a 50-character window, turned into prompts with
    ``build_gemini_prompt`` and sent through ``query_gemini`` one at a time.

    Args:
        file_path: Path of the year's OCR text file (read as UTF-8).
        target_address: Address passed through to the prompt builder.

    Returns:
        A list of the values returned by ``query_gemini``, one per snippet, in
        snippet order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        directory_text = handle.read()

    year = os.path.basename(file_path).replace(".txt", "")
    snippets = extract_1807_context(directory_text, window_size=50)
    print(f" {year}: Found {len(snippets)} context snippets with '1807'")

    # Build and submit one prompt per snippet, keeping the replies in order.
    return [
        query_gemini(build_gemini_prompt(year, snippet, target_address))
        for snippet in snippets
    ]


In [None]:
# Single-file smoke test of the verification step.
# NOTE(review): this reads from /content/text_files, not the /content/ocr_texts folder
# that process_all_years writes to by default — confirm which folder is intended.
file_path = "/content/text_files/1938.txt"  # Example year
target_address = "1807 Dupont av s"

# One model reply per "1807" snippet found in the file.
results = process_year_verify_address(file_path, target_address)

print(results)

['. “ Louis, slsmn r1223 Newton av N\n“Mildred maid 1807 Dupont av 8 “ Meyer student r909 Girard av N\n“ Ph']
['. “ Louis, slsmn r1223 Newton av N\n“Mildred maid 1807 Dupont av 8 “ Meyer student r909 Girard av N\n“ Ph', "ss dr *“Saml (Rose) h909 Girard av N\n“Vivian maid'1807 Dupont av 8 -|Wainstolk Ann siswn Sally Frocks In"]
 1938: Found 2 context snippets with '1807'
['{\n  "year": 1938,\n  "resident_full_name": "Mildred “",\n  "spouse_name": null,\n  "occupation": "maid",\n  "occupation_address": "1807 Dupont av 8",  \n  "employer": null,\n  "address_in_text": "1807 Dupont av 8"\n}\n', '{\n  "year": 1938,\n  "resident_full_name": "Vivian Wainstolk",\n  "spouse_name": null,\n  "occupation": "maid",\n  "occupation_address": "1807 Dupont av S",\n  "employer": null,\n  "address_in_text": "1807 Dupont av S"\n}\n']


In [None]:
import json
import os

# Folder with one OCR text file per year, and the address every snippet is checked against.
ocr_text_folder = "/content/text_files"
target_address = "1807 Dupont av s"

# Replies from every year, in file-name order.
all_results = []

for filename in sorted(os.listdir(ocr_text_folder)):
    if filename.endswith(".txt"):
        file_path = os.path.join(ocr_text_folder, filename)
        print(f"\n Processing: {filename}")

        # One model reply per "1807" snippet in this file.
        year_results = process_year_verify_address(file_path, target_address)

        # Save results for this year
        # The list is dumped exactly as returned by process_year_verify_address.
        output_json_path = os.path.join("/content", f"verified_1807_{filename.replace('.txt', '')}.json")
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(year_results, f, indent=2)
        print(f"Saved year results to: {output_json_path}")

        all_results.extend(year_results)

# Optionally save full combined timeline:
with open("/content/final_timeline_1807.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2)
print("\nFull timeline saved: /content/final_timeline_1807.json")



 Processing: 1904.txt
['avia F, supt of agencies N Ww\nNatl Life Ins Co, r 1807 Dupont ay s.\n\n“Emmett B®, student U of M, b 1807']
['avia F, supt of agencies N Ww\nNatl Life Ins Co, r 1807 Dupont ay s.\n\n“Emmett B®, student U of M, b 1807', '1807 Dupont ay s.\n\n“Emmett B®, student U of M, b 1807 Du-\npont av s.\n\n“ Glen F, student, b 1807 Dupont']
['avia F, supt of agencies N Ww\nNatl Life Ins Co, r 1807 Dupont ay s.\n\n“Emmett B®, student U of M, b 1807', '1807 Dupont ay s.\n\n“Emmett B®, student U of M, b 1807 Du-\npont av s.\n\n“ Glen F, student, b 1807 Dupont', 'of M, b 1807 Du-\npont av s.\n\n“ Glen F, student, b 1807 Dupont av s.\n\nWittey Thomas HE, driver Swift & Co']
 1904: Found 3 context snippets with '1807'
Saved year results to: /content/verified_1807_1904.json

 Processing: 1905.txt
['David F, mngr loan dept N W:°Natl.\nLife Ins Co, r 1807 Dupont av So. |\n“ Emmet E, real est; b 1807 Dupon']
['David F, mngr loan dept N W:°Natl.\nLife Ins Co, r 1807 Dupont av So. |\n