In [None]:
!pip install pytesseract pdf2image pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13


In [None]:
# System package for the PDF -> image step below: pdf2image relies on poppler's
# command-line tools (see the pdf2image documentation).
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (365 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126308 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
from pdf2image import convert_from_path

# Render only PDF pages 1-2 at 300 dpi.
# (An earlier note mentioned "page 147 and 148" - presumably the printed page numbers
# of the directory; TODO confirm.)
images = convert_from_path(
    "PG117_1900.pdf",
    dpi=300,
    first_page=1,
    last_page=2,
)

# Write every rendered page to page_<n>.png, numbered from 1.
for page_number, image in enumerate(images, start=1):
    image.save(f"page_{page_number}.png", "PNG")

In [None]:
import cv2
import numpy as np
from PIL import Image

def preprocess_image(image_path, save_path=None, apply_deskew=True):
    """Clean up a scanned page image so that it is easier to OCR.

    Pipeline: grayscale load -> 2x cubic upscale -> CLAHE contrast enhancement
    -> 3x3 Gaussian blur -> binarisation (adaptive or Otsu, whichever result
    has the larger standard deviation) -> 2x2 morphological opening -> 2x1
    dilation.

    Args:
        image_path: Path of the image file to read.
        save_path: If truthy, the processed image is also written to this path.
        apply_deskew: Currently unused - no deskewing is performed. Kept so
            that existing callers passing it keep working.

    Returns:
        The processed binary image as a ``PIL.Image.Image``.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path``.
    """
    # Load image in grayscale. cv2.imread signals failure by returning None
    # rather than raising, so check explicitly to fail with a clear message.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise FileNotFoundError(
            f"Could not load image (missing or unreadable file): {image_path!r}"
        )

    # Resize to improve OCR detail
    image = cv2.resize(image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Enhance contrast using CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(image)

    # Apply Gaussian Blur to reduce small noise
    blurred = cv2.GaussianBlur(enhanced, (3, 3), 0)

    # Candidate 1: adaptive thresholding (block size 15, constant 11)
    adaptive = cv2.adaptiveThreshold(
        blurred, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 11
    )

    # Candidate 2: Otsu's binarization (can be better for uniform documents)
    _, otsu = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Heuristic: keep whichever binarisation has the larger standard deviation
    thresh = adaptive if np.std(adaptive) > np.std(otsu) else otsu

    # Morphological opening with a 2x2 kernel to remove small white specks
    kernel_open = np.ones((2, 2), np.uint8)
    opened = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_open)

    # Dilation with a 2x1 (two rows, one column) kernel.
    # NOTE(review): after THRESH_BINARY the text is presumably dark on a white
    # background, in which case dilation grows the background rather than the
    # strokes - confirm this is the intended polarity for "connecting" text.
    kernel_dilate = np.ones((2, 1), np.uint8)
    dilated = cv2.dilate(opened, kernel_dilate, iterations=1)

    # Optionally save the processed image
    if save_path:
        cv2.imwrite(save_path, dilated)

    # Convert to PIL for pytesseract
    return Image.fromarray(dilated)


In [None]:
# Preprocess the first rendered page; the result is also written to
# processed_page_1.png, which the column-detection step reads.
preprocessed_image = preprocess_image("page_1.png", save_path="processed_page_1.png")

In [None]:
import cv2
import numpy as np
import os

def detect_vertical_lines_and_crop(image_path, output_dir="column_crops"):
    """Split a page image into vertical sections along detected vertical lines.

    The image is binarised (inverted Otsu), long vertical strokes are isolated
    with a morphological opening, and the left x-coordinate of each detected
    stroke is used as a cut position. Sections narrower than 1/20 of the page
    width are skipped. Each kept section is written to
    ``<output_dir>/column_<n>.png``, where ``n`` is the 1-based index of the
    boundary pair (so numbering can have gaps when narrow sections are skipped).

    Args:
        image_path: Path of the page image to read.
        output_dir: Directory the crops are written to; created if missing.
            Files already present in it are left untouched.

    Returns:
        A list of ``(save_path, crop)`` tuples in left-to-right order, where
        ``crop`` is the grayscale image slice.

    Raises:
        FileNotFoundError: If OpenCV cannot load ``image_path``.
    """
    # Load grayscale image. cv2.imread returns None on failure instead of
    # raising, which would otherwise surface as an AttributeError on .shape.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise FileNotFoundError(
            f"Could not load image (missing or unreadable file): {image_path!r}"
        )
    height, width = image.shape

    # Binarize the image (inverted, so dark strokes become white foreground)
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # 1-pixel-wide kernel, 1/20 of the page height tall; clamped to at least 1
    # so images shorter than 20 px do not produce a zero-height kernel.
    kernel_height = max(1, height // 20)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_height))

    # Opening keeps only strokes at least as tall as the kernel
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=1)

    # Find contours of the vertical lines
    contours, _ = cv2.findContours(vertical_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Unique left x-coordinates of the line bounding boxes, left to right
    line_xs = sorted({cv2.boundingRect(contour)[0] for contour in contours})

    # Add left edge and right edge
    column_boundaries = [0] + line_xs + [width]

    # Ensure output dir
    os.makedirs(output_dir, exist_ok=True)

    # Crop between consecutive boundaries, skipping slivers
    min_section_width = width // 20
    cropped_columns = []
    for i in range(len(column_boundaries) - 1):
        x_start = column_boundaries[i]
        x_end = column_boundaries[i + 1]
        if x_end - x_start < min_section_width:
            continue
        crop = image[:, x_start:x_end]
        save_path = os.path.join(output_dir, f"column_{i+1}.png")
        cv2.imwrite(save_path, crop)
        cropped_columns.append((save_path, crop))

    print(f"Cropped {len(cropped_columns)} vertical sections to: {output_dir}/")
    return cropped_columns


In [None]:
# Keep the returned (path, crop) pairs in a variable instead of leaving the call as the
# cell's last expression, which would dump the full pixel arrays into the cell output.
column_crops = detect_vertical_lines_and_crop("processed_page_1.png")

In [None]:
import os
import cv2
import pytesseract

def _natural_sort_key(filename):
    """Sort key that compares embedded numbers numerically (column_2 < column_10)."""
    import re  # local import: this cell does not import re at the top

    # re.split with a capturing group alternates text, digits, text, ...
    # so every odd position holds a run of digits.
    parts = re.split(r"(\d+)", filename)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


def ocr_columns_from_folder(folder_path):
    """OCR every image in ``folder_path`` and return the concatenated text.

    Files ending in .png/.jpg/.jpeg (case-insensitive) are processed in natural
    filename order, so ``column_2.png`` comes before ``column_10.png``. Each
    image is converted to grayscale, Otsu-thresholded and passed to Tesseract
    with ``--psm 4``.

    Args:
        folder_path: Directory containing the column images.

    Returns:
        One string with a ``### Column <n> (<filename>) ###`` header followed
        by the stripped OCR text for each image; ``""`` if there are no images.

    Raises:
        ValueError: If an image file cannot be decoded by OpenCV.
    """
    image_files = sorted(
        (f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))),
        key=_natural_sort_key,
    )

    sections = []
    for idx, filename in enumerate(image_files):
        img_path = os.path.join(folder_path, filename)

        # cv2.imread returns None (no exception) when the file cannot be decoded
        img = cv2.imread(img_path)
        if img is None:
            raise ValueError(f"Could not decode image: {img_path!r}")

        # Light preprocessing: grayscale + Otsu threshold
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # OCR with Tesseract
        column_text = pytesseract.image_to_string(binary, config="--psm 4")
        sections.append(f"\n### Column {idx + 1} ({filename}) ###\n{column_text.strip()}\n")

    # Join once instead of growing a string inside the loop
    return "".join(sections)
# Read the crops from the same relative "column_crops" directory that
# detect_vertical_lines_and_crop writes to by default, instead of a hard-coded
# absolute /content path that only exists on Colab.
fs = ocr_columns_from_folder("column_crops")


In [None]:
# Display the raw OCR text (all columns concatenated) for a visual check.
fs

'\n### Column 1 (column_1.png) ###\n° ea: et . Soot yt SON Rm Bem, * . ate as te\n. sat sola we tlt. . . . : - soe . . Teer\nSe TU De Ne Ft Bee Be, wos ne\nvee SU ee oe an aan re . Rope - ot - a ee\n: five - *. an ut we ost. : Tye - . ;\nvo ae . haan it ed wo tee ~ a 2 2 cot 7, : . oo\n: eT eds 7 7 . : ne ’ . — , oo 7 : - ~ a\neas i an " - oly. Pa, : /\n. . wt - pose o 4 ne aa ad . a . nos\nLoe us t. oo | . sg y | ‘ ; . i\n: : | in Of\n43. . V .\n? z : . . ‘\n\nthe =\nALL  —s MINNEAPOLIS CI\n\n    \n\n     \n\n| ‘Allen aston C, moved. to Fargo; N D. .\n‘ Warren F, contr; r 1522 20th av n\n“ wm cooper Co-op Bbl | Mnfg Co, r 2915\n\nSth.\n_ Wim A (Scofield & Allen), r 1613. Hillside\nave\n| ALLEN WILLIAM Cc, SEC THE GREAT\nWestern Indemnity Cor 619. ‘The Phoe-\nnix, r 314 12th av\n“wm D (Allen Bros), r. jst ‘Aldrich: av’ 8.\n“Wm D, ‘fireman Cc M & St P. Ry, r 1310 e\n\n25th.\n“Wm H, elk: R. s. Goodfellow & Cor St\n\nPau\n“Wm. a Yr 2899 15th av S. 7\n_ a wo Joseph, medicine mnfr 13. e 25t

In [None]:
import re

# Name, then an optional tail that must run to the end of the line. Anchoring the
# tail with `$` is what lets the lazy occupation group grow up to the residence
# marker/address; if the tail cannot be matched, the whole tail group is skipped
# and only the name is captured.
_ENTRY_PATTERN = re.compile(
    r"^(?P<first>[A-Z][a-z.']*)\s+(?P<last>[A-Z][a-zA-Z.'-]*)"                            # Name (two tokens)
    r"(?:"
    r"(?:,\s*(?P<occupation>.*?))?"                                                        # Occupation (optional, lazy)
    r"(?:[,;]?\s*(?<![^\s,;])(?P<res_indicator>(?:res|dom|r|b)\.?)(?=\s|$))?"              # Residence marker, period optional
    r"\s*"
    r"(?P<address>\d{1,5}[\w\s.]*)?"                                                       # Address (digits and street)
    r"\s*$"
    r")?"
)


def parse_entry(line):
    """Parse one OCR'd directory line into a dict of fields.

    Example: ``"Warren F, contr; r 1522 20th av n"`` yields occupation
    ``"contr"``, res_indicator ``"r"`` and address ``"1522 20th av n"``.

    Args:
        line: A single line of text.

    Returns:
        ``None`` if the line does not start with two capitalised tokens.
        Otherwise a dict with the keys ``first``, ``last``, ``occupation``,
        ``res_indicator``, ``address``, ``spouse_name`` and ``employer``;
        fields that could not be extracted are ``None``.

    Notes:
        - The first token is stored as ``first`` and the second as ``last``.
          NOTE(review): the sample output suggests the directory lists the
          surname first - confirm before relying on these labels.
        - The address must consist of word characters, whitespace and periods
          through to the end of the line; otherwise the remainder after the
          first comma is returned as the occupation.
    """
    match = _ENTRY_PATTERN.search(line)
    if not match:
        return None

    entry = match.groupdict()

    # Trim whitespace the lazy/greedy groups may have picked up at the edges
    for field in ('occupation', 'address'):
        if entry[field] is not None:
            entry[field] = entry[field].strip()

    # Detect widow case, e.g. "(wid John)"
    spouse_match = re.search(r'\(wid\s+([A-Za-z\s]+)\)', line)
    entry['spouse_name'] = spouse_match.group(1) if spouse_match else None

    # Employer (very fuzzy): text following an occupation keyword, up to the
    # next comma or the end of the line. "bkpr" is tried first.
    employer_match = re.search(r'bkpr\s+(.*?)(?:,|$)', line)
    if not employer_match:
        employer_match = re.search(r'(?:emp\.|clerk|foreman|pkr|tmstr|lineman)\s+(.*?)(?:,|$)', line)
    entry['employer'] = employer_match.group(1).strip() if employer_match else None

    return entry


In [None]:
parsed_people = []

for raw_line in fs.splitlines():
    candidate = raw_line.strip()
    # Only lines that start with a capitalised word are treated as resident entries.
    if not candidate or re.match(r'^[A-Z][a-z]+', candidate) is None:
        continue
    parsed = parse_entry(candidate)
    if not parsed:
        continue
    # Tag every entry with the year of the directory being processed.
    parsed['directory_year'] = 1900
    parsed_people.append(parsed)


In [None]:
import json

# Print only the first 10 parsed entries for review, not the entire list.
print(json.dumps(parsed_people[:10], indent=2))


[
  {
    "first": "Se",
    "last": "TU",
    "occupation": null,
    "res_indicator": null,
    "address": null,
    "spouse_name": null,
    "employer": null,
    "directory_year": 1900
  },
  {
    "first": "Western",
    "last": "Indemnity",
    "occupation": null,
    "res_indicator": null,
    "address": null,
    "spouse_name": null,
    "employer": null,
    "directory_year": 1900
  },
  {
    "first": "Allendort",
    "last": "W",
    "occupation": null,
    "res_indicator": null,
    "address": null,
    "spouse_name": null,
    "employer": null,
    "directory_year": 1900
  },
  {
    "first": "Alletzhausser",
    "last": "Mrs.",
    "occupation": null,
    "res_indicator": null,
    "address": null,
    "spouse_name": null,
    "employer": null,
    "directory_year": 1900
  },
  {
    "first": "Allgren",
    "last": "J",
    "occupation": null,
    "res_indicator": null,
    "address": null,
    "spouse_name": null,
    "employer": null,
    "directory_year": 1900
  },
  {

In [None]:
!pip install google-generativeai



In [None]:
import google.generativeai as genai

import os
from getpass import getpass

# Never hard-code the API key in the notebook: take it from the GOOGLE_API_KEY
# environment variable, or prompt for it (input hidden) when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: "))


In [None]:
from typing import List
import json
import re

# 1. Break text into chunks (by entries or lines)
def chunk_ocr_text(text: str, max_lines: int = 40) -> List[str]:
    """Split ``text`` into consecutive chunks of at most ``max_lines`` lines.

    Args:
        text: The text to split; it is broken up with ``str.splitlines``.
        max_lines: Maximum number of lines per chunk.

    Returns:
        The chunks in order, each with its lines re-joined by ``"\\n"``.
        Empty input yields an empty list.
    """
    all_lines = text.splitlines()
    return [
        "\n".join(all_lines[start:start + max_lines])
        for start in range(0, len(all_lines), max_lines)
    ]

# 2. Ensure Gemini only returns JSON
def build_prompt(chunk_text: str, year: int = 1900) -> str:
    """Build the Gemini prompt asking for JSON-only parsing of one OCR chunk.

    Args:
        chunk_text: The OCR text to parse; appended verbatim after ``Input:``.
        year: Directory year mentioned in the prompt and used as the default
            ``year`` field. Defaults to 1900 to match the directory this
            notebook processes (``PG117_1900.pdf``); the prompt previously
            hard-coded 1908.

    Returns:
        The full prompt string.
    """
    return f"""
You are an expert in parsing OCR-transcribed historical city directories like those from {year} Minneapolis.

Parse the input into structured JSON entries with these fields:
- first_name
- last_name
- spouse_name (or null)
- occupation (expand abbreviations like bkpr → bookkeeper)
- home_address (standardized)
- business_address (if available)
- year (default {year})

Return ONLY valid JSON list. Do not add explanation or notes.

Input:
{chunk_text}
"""

# 3. Process each chunk
def extract_all_entries(model, ocr_text: str) -> List[dict]:
    """Send the OCR text to the model chunk by chunk and collect the parsed entries.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose return value
            has a ``.text`` attribute holding the reply.
        ocr_text: The full OCR text; it is split with ``chunk_ocr_text`` and
            each chunk is wrapped with ``build_prompt``.

    Returns:
        All entry dicts from all chunks, in chunk order. Chunks whose reply
        contains no usable JSON list are reported with ``print`` and skipped.
        Errors raised by ``generate_content`` itself are not caught.
    """
    all_results = []
    chunks = chunk_ocr_text(ocr_text)

    for chunk_number, chunk in enumerate(chunks, start=1):
        prompt = build_prompt(chunk)
        response = model.generate_content(prompt)
        try:
            entries = _first_json_entry_list(response.text)
        except Exception as e:
            # Best effort: a reply that cannot be read must not abort the whole run
            print(f"Error parsing chunk {chunk_number}/{len(chunks)}:", e)
            continue

        if entries is None:
            # Make dropped chunks visible instead of skipping them silently
            print(f"No JSON list of entries found in chunk {chunk_number}/{len(chunks)}")
            continue

        all_results.extend(entries)

    return all_results


def _first_json_entry_list(text):
    """Return the first JSON array of objects embedded in ``text``, or ``None``.

    Each ``[`` is tried as a possible start and decoded with ``raw_decode``,
    which ignores whatever follows the JSON value. Unlike a greedy
    ``\\[.*\\]`` match, trailing notes that contain brackets do not break
    parsing. Arrays containing non-object items (e.g. a stray ``[1]``) are
    skipped.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            value = None
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            return value
        start = text.find("[", start + 1)
    return None


In [None]:
model = genai.GenerativeModel(model_name="gemini-1.5-pro")

# Keep the result in a variable: as a bare expression it was only shown as cell output
# and lost if the cell was interrupted or the output cleared.
directory_entries = extract_all_entries(model=model, ocr_text=fs)

# Preview the first few entries rather than dumping the whole list.
directory_entries[:10]

KeyboardInterrupt: 