In [42]:
import os
import numpy as np
import pandas as pd
import re
import json
import random
import time

from collections import Counter

import spacy
import nltk
from nltk.tokenize import sent_tokenize

from rapidfuzz import fuzz

import folium
import matplotlib.pyplot as plt

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut

import geopandas as gpd

from openai import OpenAI

In [43]:
# Set up paths
CWD = os.getcwd()
print("Current working dir:", CWD)
print(" ")

PATH_RAW = os.path.join(CWD, "data", "raw")
PATH_PROCESSED = os.path.join(CWD, "data", "processed")
PATH_TEXT = os.path.join(PATH_RAW, "The expedition of_expedition_of_pedro_de_ursua.txt")
PATH_COORDS_JSON = os.path.join(PATH_PROCESSED, "coords.json")

print("Text path:", PATH_TEXT, "Exists?", os.path.exists(PATH_TEXT))
print(" ")
print("Coords JSON path:", PATH_COORDS_JSON, "Exists?", os.path.exists(PATH_COORDS_JSON))

Current working dir: /Users/jamesbyers/code/github/openai_to_z-text
 
Text path: /Users/jamesbyers/code/github/openai_to_z-text/data/raw/The expedition of_expedition_of_pedro_de_ursua.txt Exists? True
 
Coords JSON path: /Users/jamesbyers/code/github/openai_to_z-text/data/processed/coords.json Exists? True


In [44]:
# Ensure nltk punkt is downloaded
nltk.download('punkt', quiet=True)
# Load spaCy model
nlp = spacy.load("en_core_web_trf")

def is_garbage_line(line, min_length=4):
    s = line.strip()
    if not s:
        return True
    if re.fullmatch(r'\d+\.?', s):  # just numbers or numbers.
        return True
    if re.fullmatch(r'[\W_]{1,3}', s):  # just punctuation, short
        return True
    if len(s) < min_length:
        return True
    return False

with open(PATH_TEXT, encoding='utf8') as f:
    raw = f.read()

# Remove "garbage" and normalize
lines = [line for line in raw.splitlines() if not is_garbage_line(line)]
text = ' '.join(lines)
text = ' '.join(text.split())  # normalize whitespace

doc = nlp(text)
sent_list = list(doc.sents)
print(f"Extracted {len(sent_list)} sentences.")

Extracted 407 sentences.


In [45]:
# Gather (location, sentence block, position) tuples
data = []
for i, sent in enumerate(sent_list):
    for ent in sent.ents:
        if ent.label_ in {'GPE', 'LOC'}:
            start = max(i-2, 0)
            end = min(i+3, len(sent_list))
            block = ' '.join(sent.text for sent in sent_list[start:end])
            data.append({
                'location': ent.text,
                'sentence_block': block,
                'sentence_number': i
            })

df = pd.DataFrame(data)
print(f"Location-sentence pairs: {len(df)}")
print("Unique locations (pre-grouping):", df['location'].nunique())

Location-sentence pairs: 168
Unique locations (pre-grouping): 82


In [46]:
# Group locations that are nearly identical (fuzzy match)
SIM_THRESH = 90
unique_locs = list(df["location"].unique())
groups = []
used = set()
for i, loc in enumerate(unique_locs):
    if loc in used:
        continue
    group = [loc]
    used.add(loc)
    for other in unique_locs[i+1:]:
        if other in used: continue
        score = fuzz.ratio(loc.lower(), other.lower())
        if score >= SIM_THRESH:
            group.append(other)
            used.add(other)
    groups.append(group)

counts = Counter(df["location"])
replace_map = {}
for group in groups:
    main = max(group, key=lambda x: counts[x])  # most common spelling
    for g in group:
        replace_map[g] = main

print(f'Num distinct locations before grouping: {len(unique_locs)}')
df["location"] = df["location"].map(lambda x: replace_map.get(x, x))
print(f'Num distinct after grouping: {df["location"].nunique()}')

Num distinct locations before grouping: 82
Num distinct after grouping: 75


In [51]:
def get_location_coords(location_set, verbose=False):
    geolocator = Nominatim(user_agent="generic_name")
    location_coord_dict = {}
    for count, location in enumerate(location_set, start=1):
        print(f"Getting coords for '{location}' [{count}/{len(location_set)}]")
        try:
            loc = geolocator.geocode(location)
            if loc is not None:
                coords = (loc.latitude, loc.longitude)
                location_coord_dict[location] = coords
                print(f"  Found: {coords}")
            else:
                print("  Not found.")
        except Exception as e:
            print("  Error:", e)
        time.sleep(1.1)
    return location_coord_dict

# -- Instead, just load the existing coords!
# with open(PATH_COORDS_JSON, "r") as f:
#     coords_dict = json.load(f)

coords_dict = get_location_coords(unique_locs, verbose=True)

print(f"Loaded {len(coords_dict)} locations with coordinates.")

Getting coords for 'Peru' [1/82]
  Found: (-6.8699697, -75.0458515)
Getting coords for 'Marahon' [2/82]
  Not found.
Getting coords for 'Santa Fe' [3/82]
  Found: (-30.3154739, -61.1645076)
Getting coords for 'the New Kingdom of Granada' [4/82]
  Not found.
Getting coords for 'Navarre' [5/82]
  Found: (42.6125488, -1.8307877)
Getting coords for 'Ursua' [6/82]
  Found: (55.4361509, -133.3430368)
Getting coords for 'Pampluna' [7/82]
  Not found.
Getting coords for 'New Granada' [8/82]
  Found: (14.6124566, 121.0354958)
Getting coords for 'ofTudela' [9/82]
  Not found.
Getting coords for 'Santa Martha' [10/82]
  Found: (19.3602817, -98.9952089)
Getting coords for 'Carthagena' [11/82]
  Found: (40.4367135, -84.5599552)
Getting coords for 'Panama' [12/82]
  Found: (8.559559, -81.1308434)
Getting coords for 'Lima' [13/82]
  Found: (-12.0621065, -77.0365256)
Getting coords for 'Canete' [14/82]
  Found: (46.3266525, 9.493132)
Getting coords for 'Brazil' [15/82]
  Found: (-10.3333333, -53.2)
Ge

In [55]:
amazon_gdf = gpd.read_file('data/raw/Amazonia-sensu-stricto.gpkg')
amazon_geom = amazon_gdf.unary_union  # Combine shapes

filtered_coords = {}
for place, coords in coords_dict.items():
    lat, lon = coords
    point = Point(lon, lat)  # lon, lat for shapely
    if amazon_geom.contains(point):
        filtered_coords[place] = (lat, lon)
print(f"Filtered down to {len(filtered_coords)} places inside Amazonia boundary.")

Filtered down to 3 places inside Amazonia boundary.


  amazon_geom = amazon_gdf.unary_union  # Combine shapes


In [56]:
# Compute spaCy word offsets for each sentence
sentence_starts = []
word_counter = 0
for sent in sent_list:
    sentence_starts.append((sent.text, word_counter))
    word_counter += len([t for t in sent if not t.is_space])
sent_start_dict = dict(sentence_starts)

# Assign 'start_word_index' to each row: match first sentence of block in doc
def find_start_word_index(block):
    # Use first sentence in block
    first_sent = sent_tokenize(block)[0] if block else ''
    # Use .strip() to align with sent.text
    for s, start_idx in sentence_starts:
        if first_sent.strip() == s.strip():
            return start_idx
    return -1

df['start_word_index'] = df['sentence_block'].map(find_start_word_index)
total_words = word_counter

print(f"Total words in text: {total_words}")

# Group by location, order by appearance
sentences_grouped = (
    df
    .groupby('location')
    .apply(lambda subdf:
        subdf.sort_values('start_word_index')[['sentence_block', 'start_word_index']].to_dict('records')
    )
)

def first_N_words(text, N=7):
    tokens = text.split()
    preview = " ".join(tokens[:N])
    return preview + ("..." if len(tokens) > N else "")

# Construct more readable popups
location_to_popup = {}
for location, sentences in sentences_grouped.items():
    popup_lines = []
    for i, entry in enumerate(sentences):
        pct_through = (100 * entry['start_word_index'] / total_words) if entry['start_word_index'] >= 0 else 0
        pct_str = f"<b>{pct_through:.1f}% through book</b>: "
        # First block: full, later: preview
        if i == 0:
            popup_lines.append(f"{pct_str}<span>{entry['sentence_block']}</span>")
        else:
            preview = first_N_words(entry['sentence_block'])
            popup_lines.append(f"{pct_str}<i>{preview}</i>")
    popup_text = "<hr>".join(popup_lines)
    location_to_popup[location] = popup_text

print("Sample popup for first filtered location:")
if filtered_coords:
    sample_place = next(iter(filtered_coords))
    print(sample_place)
    print(location_to_popup.get(sample_place, "(none)"))

Total words in text: 15833
Sample popup for first filtered location:
Peru
<b>0.0% through book</b>: <span>— 2. Certain Brazilian Indians give information in Peru , of very rich provinces near the river Marahon. — 3. The Marquis of Cahete determines to send Pedro de Ursua to explore them , and summons him to his presence. — 4. Various opinions are formed in Peru respecting this expedition. 1. The Captain Pedro de Ursua entered the city of Santa Fe, in the New Kingdom of Granada, with his uncle the licentiate Miguel Diaz de Armendariz, the first Juez de Resi- dencia, who arrived there after the discovery of the kingdom, and settlement of the city.1 This Pedro de Ursua was a native of the kingdom of Navarre, and of a town called Ursua, near the noble city of Pampluna.</span><hr><b>0.0% through book</b>: <i>These Indians brought news respecting the provinces...</i><hr><b>0.0% through book</b>: <i>The arrival of Don Juan was hailed...</i><hr><b>1.1% through book</b>: <i>From Santa Fe he wen

  .apply(lambda subdf:


In [57]:
if filtered_coords:
    lats = [coords[0] for coords in filtered_coords.values()]
    lons = [coords[1] for coords in filtered_coords.values()]
    mean_lat, mean_lon = np.mean(lats), np.mean(lons)
    m = folium.Map(location=[mean_lat, mean_lon], zoom_start=4)
else:
    m = folium.Map(location=[-3, -60], zoom_start=4)  # Amazonia central

# Add Amazonia mask as GeoJSON (optional but illustrative)
folium.GeoJson(
    amazon_gdf.to_json(),
    name='Amazonia region',
    style_function=lambda x: {"fillColor": "#90ee90", "color": "#228B22", "weight": 2, "fillOpacity": 0.25}
).add_to(m)

for name, coords in filtered_coords.items():
    folium.Marker(
        location=coords,
        popup=folium.Popup(location_to_popup.get(name, name), max_width=350),
        icon=folium.Icon(color='blue', icon='info-sign')
    ).add_to(m)

m.save("map_of_locations.html")
print("Map saved to 'map_of_locations.html'. Displaying below (if in notebook):")
m

Map saved to 'map_of_locations.html'. Displaying below (if in notebook):


In [20]:
# Check which coordinate names lack a popup
names_with_coords = set(filtered_coords)
names_with_text = set(location_to_popup)

missing_text = names_with_coords - names_with_text
print(f"There are {len(missing_text)} places with coords but no text: {missing_text}")

There are 43 places with coords but no text: {'Breves', 'Bolivia', 'BOLIVIA', 'Chimoré', 'Principe da Beira', 'Cayubaba', 'Mamoré', 'Itonama', 'River Piray', 'Madeira Rivers', 'Itenez', 'Canichana', 'Izozo', 'Pará', 'Yuracaré', 'Marajo', 'Purus', 'CARIPUNAS', 'Reyes', 'PARÁ', 'PERU', 'Mayosa', 'Manáos', 'Yacuma', 'Tijamuchi', 'Maués', 'Manicoré', 'Tapajoz', 'Securé', 'Baures', 'San Ignacio', 'Mojos', 'AMAZON', 'Exaltacion', 'Quilpa', 'Cavinas', 'Yata', 'Gurupá', 'Mojeño', 'Cobija', 'Umaitá', 'Amazon', 'Matto Grosso'}


In [21]:
print("Sample missing places with coords but no text:", list(missing_text)[:5])

Sample missing places with coords but no text: ['Breves', 'Bolivia', 'BOLIVIA', 'Chimoré', 'Principe da Beira']


In [22]:
for name in list(missing_text)[:5]:
    hits = df[df['location'].str.lower() == name.lower()]
    print(f"Blocks in df for '{name}': {len(hits)}")
    if not hits.empty:
        print(hits.head())

Blocks in df for 'Breves': 0
Blocks in df for 'Bolivia': 0
Blocks in df for 'BOLIVIA': 0
Blocks in df for 'Chimoré': 0
Blocks in df for 'Principe da Beira': 0
