In [2]:
import pandas as pd
import re

# Load data
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded run of this cell emitted a warning
# pointing at this read (presumably pandas' mixed-type DtypeWarning — confirm);
# whole-file inference avoids columns whose values silently mix types.
master = pd.read_csv('data/master.csv', low_memory=False)
matches = pd.read_csv('data/matches.csv')

# Years for which '<year> publisher' / '<year> editor' columns are looked up
# in each master row (see extract_for_row).
years = [1869, 1871, 1872, 1873, 1876, 1877, 1878, 1879, 1880, 1882, 1883, 1884, 1885, 1890]

# --- Matching logic ---

def levenshtein_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    Insertions, deletions and substitutions each cost 1. The computation
    keeps only two rows of the dynamic-programming table at a time.
    """
    # Work with the longer sequence as the outer loop, as the table rows
    # are sized by the shorter one.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    previous = list(range(len(s2) + 1))
    for row_no, ch1 in enumerate(s1, start=1):
        current = [row_no]
        for col, ch2 in enumerate(s2):
            cost_substitute = previous[col] + (ch1 != ch2)
            cost_insert = previous[col + 1] + 1
            cost_delete = current[col] + 1
            current.append(min(cost_insert, cost_delete, cost_substitute))
        previous = current
    return previous[-1]

def strings_match(s1, s2, max_distance=1):
    """Return True when two strings are equal or nearly equal.

    Both strings are stripped and lower-cased first. They match when the
    cleaned forms are identical, or when their Levenshtein distance is at
    most ``max_distance``.
    """
    left, right = (s.strip().lower() for s in (s1, s2))
    # Exact equality short-circuits so the distance is only computed when needed.
    return left == right or levenshtein_distance(left, right) <= max_distance

def tokenize_publisher(publisher_str):
    """Split a publisher string into its significant tokens.

    Commas and semicolons are treated as whitespace; the string is then
    split on whitespace and only tokens of at least 4 characters are kept.
    A falsy input (None or empty string) yields an empty list.
    """
    if not publisher_str:
        return []
    words = re.sub(r'[;,]', ' ', publisher_str).split()
    # split() already yields whitespace-free tokens, so only length is checked.
    return [word for word in words if len(word) >= 4]

def publishers_match_tokenized(pub1, pub2):
    """Return True when any token of ``pub1`` fuzzily matches any token of ``pub2``.

    Tokens come from tokenize_publisher(); two tokens match when
    strings_match() accepts them with an edit distance of at most 1.
    If either side has no tokens the result is False.
    """
    left_tokens = tokenize_publisher(pub1)
    right_tokens = tokenize_publisher(pub2)
    # any() over an empty product is False, which covers the no-token case.
    return any(
        strings_match(left, right, max_distance=1)
        for left in left_tokens
        for right in right_tokens
    )

def normalize_publisher(pub):
    """Lower-case ``pub`` and drop every character other than a-z and 0-9.

    A falsy input (None or empty string) normalizes to the empty string.
    """
    return re.sub(r'[^a-z0-9]', '', pub.lower()) if pub else ""

def publishers_match_normalized(pub1, pub2):
    """Return True when one normalized publisher string contains the other.

    Both inputs go through normalize_publisher(). If either normalizes to
    the empty string the result is False.
    """
    first = normalize_publisher(pub1)
    second = normalize_publisher(pub2)
    if first and second:
        # Equal strings contain each other, so containment also covers equality.
        return first in second or second in first
    return False

def names_match(name1, name2):
    """Return True when two names match by either strategy.

    The token-level fuzzy comparison is tried first; the normalized
    containment comparison is only evaluated if that fails.
    """
    if publishers_match_tokenized(name1, name2):
        return True
    return publishers_match_normalized(name1, name2)

def clean_field(value):
    """Convert a cell value to a stripped string, mapping missing values to ''.

    Values pandas considers missing, and any value whose stripped string
    form is 'nan' (case-insensitive), both become the empty string.
    """
    if pd.isna(value):
        return ''
    text = str(value).strip()
    return '' if text.lower() == 'nan' else text

def remove_blips(data_points):
    """
    Drop single-entry 'blips' that are likely data entry errors.

    An interior entry is a blip when it does not match the entry before it
    while the entries before and after it match each other. Neighbours are
    always taken from the original list, not from the filtered result.
    The first and last entries are never removed, and lists of two or
    fewer entries are returned unchanged.
    """
    if len(data_points) <= 2:
        return data_points

    last_index = len(data_points) - 1
    kept = []
    for idx, point in enumerate(data_points):
        if 0 < idx < last_index:
            _, before = data_points[idx - 1]
            _, current = point
            _, after = data_points[idx + 1]
            is_blip = not names_match(before, current) and names_match(before, after)
            if is_blip:
                continue
        kept.append(point)
    return kept

def get_distinct_names(values):
    """
    Group (year, name) tuples by name, using names_match() to decide
    whether two names are the same.

    Returns a list of (representative_name, [years]) tuples in order of
    first appearance. The representative is the first name seen for the
    group; each later name is added to the first group whose
    representative it matches.
    """
    groups = []  # list of (representative_name, [years])
    for year, name in values:
        for representative, seen_years in groups:
            if names_match(representative, name):
                seen_years.append(year)
                break
        else:
            # No existing group accepted this name: it starts a new one.
            groups.append((name, [year]))
    return groups

def extract_for_row(row):
    """Return (distinct publishers, distinct editors) for one master row.

    For every year in ``years`` the '<year> publisher' and '<year> editor'
    fields are read and cleaned with clean_field(); empty values are
    skipped. Each series then has blips removed and is grouped with
    get_distinct_names().
    """
    pub_data, ed_data = [], []
    for year in years:
        columns = ((pub_data, f'{year} publisher'), (ed_data, f'{year} editor'))
        for bucket, column in columns:
            value = clean_field(row.get(column, ''))
            if value:
                bucket.append((year, value))

    # Blips are removed before deduplicating.
    return (
        get_distinct_names(remove_blips(pub_data)),
        get_distinct_names(remove_blips(ed_data)),
    )

# Build one row per distinct publisher or editor
rows_out = []
for _, match_row in matches.iterrows():
    # A match without a master id cannot be linked to a master row.
    if pd.isna(match_row['master_id']):
        continue
    master_id = int(match_row['master_id'])
    issn = match_row.get('issn', '')
    newspapers_all_years_name = match_row.get('newspapers_all_years_name', '')
    master_name = match_row.get('master_name', '')

    # master_id is used as a positional index into master, so it must be in range.
    if not 0 <= master_id < len(master):
        continue

    m_row = master.iloc[master_id]
    state = clean_field(m_row.get('state', ''))
    town = clean_field(m_row.get('town', ''))
    newspaper_name = clean_field(m_row.get('newspaper_name', ''))

    distinct_pubs, distinct_eds = extract_for_row(m_row)

    # Columns shared by every output row for this newspaper.
    base = {
        'master_id': master_id,
        'issn': issn,
        'newspapers_all_years_name': newspapers_all_years_name,
        'master_name': master_name,
        'state': state,
        'town': town,
        'newspaper_name': newspaper_name,
    }

    # Publishers are emitted before editors for each newspaper.
    for role, entries in (('publisher', distinct_pubs), ('editor', distinct_eds)):
        for name, yr_list in entries:
            rows_out.append({
                **base,
                'role': role,
                'name': name,
                'years': '; '.join(str(y) for y in yr_list),
                'first_year': min(yr_list),
                'last_year': max(yr_list),
                'num_years': len(yr_list),
            })

out_df = pd.DataFrame(rows_out)
# unique_id is a 1-based running number over the output rows.
out_df.insert(0, 'unique_id', range(1, len(out_df) + 1))
out_df.to_csv('data/owners_and_editors.csv', index=False)

print(f"Wrote {len(out_df)} rows to data/owners_and_editors.csv")
print(f"  Publishers: {len(out_df[out_df['role'] == 'publisher'])}")
print(f"  Editors:    {len(out_df[out_df['role'] == 'editor'])}")
print(f"  Newspapers: {out_df['master_id'].nunique()}")
print("\nSample rows:")
print(out_df[['newspaper_name', 'role', 'name', 'years']].head(10).to_string(index=False))

  master = pd.read_csv('data/master.csv')


Wrote 2088 rows to data/owners_and_editors.csv
  Publishers: 1075
  Editors:    1013
  Newspapers: 545

Sample rows:
newspaper_name      role                                                                                        name                                                                        years
       Tribune publisher                                                                          TribuneAssociation                   1871; 1873; 1876; 1877; 1879; 1880; 1882; 1883; 1884; 1885
       Tribune publisher                                                       Tribune circulation-daily Association                                                                         1878
       Tribune publisher                                                                               Whitelaw Reid                                                                         1890
       Tribune    editor Charles A. Dana,George Ripley,Bayard Taylor,Henry J. Raymond,Horace Greeley,Margar

In [1]:
import pandas as pd
import re

# Input is the publisher/editor table written by the earlier cell to
# data/owners_and_editors.csv; the 'unique_id' and 'name' columns are used below.
df = pd.read_csv('data/owners_and_editors.csv')

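# --- Matching logic (whole-string comparison only; stricter than the token/substring matching used to build owners_and_editors.csv) ---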

def levenshtein_distance(s1, s2):
    """Return the unit-cost edit distance between two sequences.

    Only two rows of the dynamic-programming table are held in memory.
    """
    # The longer input drives the outer loop; rows are as wide as the shorter one.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
    if len(shorter) == 0:
        return len(longer)

    above = list(range(len(shorter) + 1))
    for row_index, ch_long in enumerate(longer):
        row = [row_index + 1]
        for col_index, ch_short in enumerate(shorter):
            insert_cost = above[col_index + 1] + 1
            delete_cost = row[col_index] + 1
            replace_cost = above[col_index] + (ch_long != ch_short)
            row.append(min(insert_cost, delete_cost, replace_cost))
        above = row
    return above[-1]

def names_match(a, b):
    """Return True when the whole strings are at most 1 edit apart.

    Both names are stripped and lower-cased first. If either is empty
    after cleaning, they never match.
    """
    left = a.strip().lower()
    right = b.strip().lower()
    if not (left and right):
        return False
    if left == right:
        return True
    return levenshtein_distance(left, right) <= 1

# Deduplicate names across all rows using names_match()
# Each distinct name group gets a person_id and keeps the first occurrence as representative
distinct = []  # list of (person_id, representative_name, [unique_ids])

for _, row in df.iterrows():
    # Missing names come back from read_csv as NaN, and str(NaN) is the
    # non-empty string 'nan', so missing values must be detected before
    # stringifying or the empty-name guard below never fires.
    raw_name = row['name']
    name = '' if pd.isna(raw_name) else str(raw_name).strip()
    uid = row['unique_id']
    if not name:
        continue

    # Attach to the first existing group whose representative matches;
    # otherwise this name starts a new group with the next person_id.
    for pid, rep, uid_list in distinct:
        if names_match(rep, name):
            uid_list.append(uid)
            break
    else:
        distinct.append((len(distinct) + 1, name, [uid]))

# Build output: one row per distinct person, with all linked unique_ids
rows_out = [
    {
        'person_id': pid,
        'name': rep_name,
        'unique_ids': '; '.join(str(u) for u in uid_list),
        'num_entries': len(uid_list),
    }
    for pid, rep_name, uid_list in distinct
]

# Explicit columns keep the frame well-formed (and the filter below valid)
# even when no names were found.
out_df = pd.DataFrame(rows_out, columns=['person_id', 'name', 'unique_ids', 'num_entries'])
out_df.to_csv('data/distinct_names.csv', index=False)

print(f"Input rows: {len(df)}")
print(f"Distinct names: {len(out_df)}")
print(f"\nNames appearing more than once:")
multi = out_df[out_df['num_entries'] > 1].sort_values('num_entries', ascending=False)
print(multi[['person_id', 'name', 'num_entries']].head(20).to_string(index=False))

Input rows: 2088
Distinct names: 1312

Names appearing more than once:
 person_id                                       name  num_entries
        67                               Register Co.            7
       993                        Herman Stockenstrom            6
       846                             Ben F. Stanton            6
       844                              David W. Cobb            6
       142                              David Higgins            5
        83                                E.D. Kelley            5
        68                 West Virginia Printing Co.            5
        24                                Lewis Baker            5
       613                             John H. Marion            5
       980                    Edward Alexander Oldham            4
       955                            Robert D. Blair            4
       948                              John W. Kelly            4
      1015                      Clement A. Lounsberry     

In [1]:
import pandas as pd
import json
import os
import time
from dotenv import load_dotenv
from google import genai

# Configuration comes from the environment. load_dotenv() presumably reads a
# local .env file first so the lookups below can see its values — confirm.
load_dotenv()
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
# The model name can be overridden with GEMINI_MODEL; otherwise this default is used.
MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite")

# Deduplicated names; the columns read below are person_id, name and unique_ids.
df = pd.read_csv('data/distinct_names.csv')
# Number of names sent to the model in a single request.
BATCH_SIZE = 250

# Instructions sent ahead of each numbered batch of names. The batch is
# appended by plain string concatenation (no str.format), so the braces in
# the example output are written literally; doubled "{{ }}" escapes would
# reach the model verbatim and show it an invalid JSON example.
PROMPT_TEMPLATE = """You are cleaning a list of newspaper owner/editor names from the 1860s-1890s.
For each numbered entry, do the following:
1. If the entry is a real person's name (or multiple people's names), return the corrected name(s).
2. Fix obvious typos and formatting issues (e.g. "Jonh" -> "John", missing spaces). But be very cautious here - don't change names on hunches.
3. If the entry contains MULTIPLE people's names (separated by commas, semicolons, "and", "&", etc.), split them into separate names.
4. If the entry is NOT a person's name (e.g. it's a company name like "Tribune Association", "Printing Co.", an organization, a place name, gibberish, or just a title/abbreviation with no real name), return an empty list.
5. Keep honorifics like "Dr." or "Col." if they appear to be part of a real person's name.

Return ONLY a JSON object mapping the entry number (as a string) to a list of cleaned names.
Example input:
1. Charles A. Dana,George Ripley,Bayard Taylor
2. TribuneAssociation
3. Jonh Smtih
4. Caine, Sloan & Dun- bar

Example output:
{"1": ["Charles A. Dana", "George Ripley", "Bayard Taylor"], "2": [], "3": ["John Smith"], "4": ["Caine", "Sloan", "Dunbar"]}

Here are the entries to clean:
"""

def clean_batch(batch_df):
    """Send a batch of names to Gemini for cleaning.

    Each row of ``batch_df`` (columns ``name``, ``person_id``, ``unique_ids``)
    is numbered from 1 and appended to PROMPT_TEMPLATE. The reply is expected
    to be a JSON object mapping each entry number (as a string) to a list of
    cleaned names.

    The request is attempted up to 3 times, sleeping 5s then 10s between
    attempts. A reply that cannot be parsed, or that is not a JSON object,
    counts as a failed attempt.

    Returns:
        A list of dicts with keys ``person_id``, ``cleaned_name`` and
        ``unique_ids`` — one per cleaned name, so an entry that splits into
        several people yields several dicts. Entries missing from the reply,
        or mapped to an empty list, yield nothing. If all attempts fail the
        whole batch is skipped and an empty list is returned.
    """
    lines = []
    pid_map = {}  # entry_number -> (person_id, unique_ids)
    for entry_no, (_, row) in enumerate(batch_df.iterrows(), 1):
        lines.append(f"{entry_no}. {row['name']}")
        pid_map[str(entry_no)] = (row['person_id'], row['unique_ids'])

    prompt = PROMPT_TEMPLATE + "\n".join(lines)

    result = None
    for attempt in range(3):
        try:
            response = client.models.generate_content(model=MODEL, contents=prompt)
            text = response.text.strip()
            # Extract JSON from response (handle markdown code blocks)
            if text.startswith("```"):
                text = text.split("\n", 1)[1].rsplit("```", 1)[0].strip()
            parsed = json.loads(text)
            # Valid JSON of the wrong shape (a list, a bare string, ...) is
            # rejected here so it is retried rather than crashing below.
            if not isinstance(parsed, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )
            result = parsed
            break
        except Exception as e:
            # Deliberately broad: any API, parsing or shape failure is retried.
            print(f"  Attempt {attempt+1} failed: {e}")
            if attempt < 2:
                time.sleep(5 * (attempt + 1))
            else:
                print(f"  Skipping batch after 3 failures")
                return []

    cleaned_rows = []
    for entry_num, (pid, uids) in pid_map.items():
        # json.loads always produces string keys, so a string lookup suffices.
        names = result.get(entry_num)
        if isinstance(names, str):
            names = [names]
        elif not isinstance(names, list):
            # Missing entry, null, or another unexpected value: nothing usable.
            names = []
        for name in names:
            if not isinstance(name, str):
                continue
            name = name.strip()
            if name:
                cleaned_rows.append({
                    'person_id': pid,
                    'cleaned_name': name,
                    'unique_ids': uids,
                })
    return cleaned_rows

# Process in batches
all_cleaned = []
num_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

# Walk the frame in consecutive BATCH_SIZE-row slices; i is the 0-based batch number.
for i, start in enumerate(range(0, len(df), BATCH_SIZE)):
    end = min(start + BATCH_SIZE, len(df))
    batch = df.iloc[start:end]
    print(f"Batch {i+1}/{num_batches} (rows {start+1}-{end})...")
    cleaned = clean_batch(batch)
    all_cleaned += cleaned
    print(f"  -> {len(cleaned)} cleaned names")
    # Pause between requests, but not after the final batch.
    if i != num_batches - 1:
        time.sleep(2)

cleaned_df = pd.DataFrame(all_cleaned)
cleaned_df.to_csv('data/cleaned_distinct_names.csv', index=False)

print(f"\nDone! {len(df)} input entries -> {len(cleaned_df)} cleaned names")
print(f"Saved to data/cleaned_distinct_names.csv")
print(f"\nSample:")
print(cleaned_df.head(20).to_string(index=False))

Batch 1/6 (rows 1-250)...
  -> 264 cleaned names
Batch 2/6 (rows 251-500)...
  -> 290 cleaned names
Batch 3/6 (rows 501-750)...
  -> 304 cleaned names
Batch 4/6 (rows 751-1000)...
  -> 294 cleaned names
Batch 5/6 (rows 1001-1250)...
  -> 298 cleaned names
Batch 6/6 (rows 1251-1312)...
  -> 70 cleaned names

Done! 1312 input entries -> 1520 cleaned names
Saved to data/cleaned_distinct_names.csv

Sample:
 person_id             cleaned_name unique_ids
         3            Whitelaw Reid       3; 5
         4          Charles A. Dana          4
         4            George Ripley          4
         4            Bayard Taylor          4
         4         Henry J. Raymond          4
         4           Horace Greeley          4
         4          Margaret Fuller          4
         5             Crosby Noyes       6; 9
         8            Edgar Snowden     10; 11
         9                     Frew     12; 13
         9                   Hagans     12; 13
         9                    