In [2]:
import pandas as pd
import re

# Years for which "<year> publisher" / "<year> editor" columns are looked up
# on each master row.
years = [
    1869, 1871, 1872, 1873, 1876, 1877, 1878,
    1879, 1880, 1882, 1883, 1884, 1885, 1890,
]

# Input tables: `matches` supplies the master_id of each matched newspaper,
# `master` supplies the per-year publisher/editor fields for that row.
matches = pd.read_csv('data/matches.csv')
master = pd.read_csv('data/master.csv')

# --- Same matching logic as the original script ---

def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # The longer input drives the outer loop, so each row has
    # len(shorter) + 1 cells.
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)
    if not shorter:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row_number, ch_long in enumerate(longer, start=1):
        current = [row_number]
        # diag/above are the cells of the previous row to the upper-left and
        # directly above; current[-1] is the cell immediately to the left.
        for diag, above, ch_short in zip(previous, previous[1:], shorter):
            current.append(
                min(above + 1, current[-1] + 1, diag + (ch_long != ch_short))
            )
        previous = current
    return previous[-1]

def strings_match(s1, s2, max_distance=1):
    """Return True when two strings are equal or nearly equal.

    Both strings are stripped of surrounding whitespace and lower-cased
    first. They match if the cleaned forms are identical or their
    Levenshtein distance is at most ``max_distance``.
    """
    left, right = (text.strip().lower() for text in (s1, s2))
    # The equality short-circuit avoids the edit-distance computation.
    return left == right or levenshtein_distance(left, right) <= max_distance

def tokenize_publisher(publisher_str):
    """Split a publisher string into its significant words.

    Semicolons and commas are treated as whitespace, the string is split on
    whitespace, and only words of at least four characters are kept.
    A falsy input (empty string, None) yields an empty list.
    """
    if not publisher_str:
        return []
    without_separators = re.sub(r'[;,]', ' ', publisher_str)
    # split() with no argument already drops surrounding whitespace from
    # every word, so no further stripping is required.
    return [word for word in without_separators.split() if len(word) >= 4]

def publishers_match_tokenized(pub1, pub2):
    """Return True if any significant word of ``pub1`` matches one of ``pub2``.

    Words come from tokenize_publisher(); two words match when
    strings_match() accepts them with an edit distance of at most 1.
    If either name has no significant words the result is False.
    """
    first_words = tokenize_publisher(pub1)
    second_words = tokenize_publisher(pub2)
    # any() over an empty product is False, which covers the
    # "no tokens on one side" case.
    return any(
        strings_match(a, b, max_distance=1)
        for a in first_words
        for b in second_words
    )

def normalize_publisher(pub):
    """Reduce a name to its lower-case ASCII letters and digits.

    Everything other than a-z and 0-9 (after lower-casing) is removed.
    A falsy input yields the empty string.
    """
    return re.sub(r'[^a-z0-9]', '', pub.lower()) if pub else ""

def publishers_match_normalized(pub1, pub2):
    """Return True if one normalized name is contained in the other.

    Both names are passed through normalize_publisher(). If either
    normalizes to the empty string the result is False.
    """
    key1, key2 = normalize_publisher(pub1), normalize_publisher(pub2)
    if not (key1 and key2):
        return False
    # Substring containment also covers the case of identical keys.
    return key1 in key2 or key2 in key1

def names_match(name1, name2):
    """Return True if the two names match by either strategy.

    The word-level fuzzy comparison is tried first; the normalized
    containment comparison is used only if that fails.
    """
    if publishers_match_tokenized(name1, name2):
        return True
    return publishers_match_normalized(name1, name2)

def clean_field(value):
    """Convert a cell value to a stripped string, mapping missing values to ''.

    A value that pandas regards as missing, or whose stripped text is "nan"
    in any letter case, becomes the empty string.
    """
    if pd.isna(value):
        return ''
    text = str(value).strip()
    return '' if text.lower() == 'nan' else text

def get_distinct_names(values):
    """
    Group (year, name) tuples by name, using names_match() to decide whether
    two names are the same.

    Returns a list of (representative_name, [years]) tuples in order of first
    appearance. The representative is the first spelling encountered, and
    each later name is compared against representatives only; its year is
    added to the first group whose representative matches.
    """
    groups = []
    for year, name in values:
        for representative, seen_years in groups:
            if names_match(representative, name):
                seen_years.append(year)
                break
        else:
            # No existing group accepted this name: it starts a new one.
            groups.append((name, [year]))
    return groups

def extract_for_row(row):
    """Collect the distinct publishers and editors recorded on one row.

    For every year in the module-level ``years`` list, the
    "<year> publisher" and "<year> editor" fields are read with ``row.get``
    and cleaned with clean_field(); empty results are skipped.

    Returns a pair ``(publishers, editors)``, each being the output of
    get_distinct_names() for the corresponding (year, name) entries.
    """
    publisher_by_year = [
        (year, clean_field(row.get(f'{year} publisher', ''))) for year in years
    ]
    editor_by_year = [
        (year, clean_field(row.get(f'{year} editor', ''))) for year in years
    ]
    # Keep only the years in which the field actually holds a name.
    pub_entries = [(year, name) for year, name in publisher_by_year if name]
    ed_entries = [(year, name) for year, name in editor_by_year if name]
    return get_distinct_names(pub_entries), get_distinct_names(ed_entries)

# Build one output row per distinct publisher or editor of every matched
# newspaper, write the result to CSV, and print a short summary.

# Explicit column order for the output frame. Passing it to the DataFrame
# constructor keeps the schema (and the CSV header) intact even when no rows
# are produced; otherwise the 'role' lookups in the summary below would raise
# KeyError on a column-less frame.
OUTPUT_COLUMNS = [
    'master_id',
    'issn',
    'newspapers_all_years_name',
    'master_name',
    'state',
    'town',
    'newspaper_name',
    'role',
    'name',
    'years',
    'first_year',
    'last_year',
    'num_years',
]

rows_out = []
for _, match_row in matches.iterrows():
    # Rows without a master_id have no master record to read from.
    if pd.isna(match_row['master_id']):
        continue
    master_id = int(match_row['master_id'])

    # master_id is used as a positional index into `master` (iloc), so it
    # must lie within the table's row range.
    if not 0 <= master_id < len(master):
        continue

    issn = match_row.get('issn', '')
    newspapers_all_years_name = match_row.get('newspapers_all_years_name', '')
    master_name = match_row.get('master_name', '')

    m_row = master.iloc[master_id]
    state = clean_field(m_row.get('state', ''))
    town = clean_field(m_row.get('town', ''))
    newspaper_name = clean_field(m_row.get('newspaper_name', ''))

    distinct_pubs, distinct_eds = extract_for_row(m_row)

    # Fields shared by every output row for this newspaper.
    base = {
        'master_id': master_id,
        'issn': issn,
        'newspapers_all_years_name': newspapers_all_years_name,
        'master_name': master_name,
        'state': state,
        'town': town,
        'newspaper_name': newspaper_name,
    }

    # Publishers are emitted before editors for each newspaper.
    for role, groups in (('publisher', distinct_pubs), ('editor', distinct_eds)):
        for name, yr_list in groups:
            rows_out.append({
                **base,
                'role': role,
                'name': name,
                'years': '; '.join(str(y) for y in yr_list),
                'first_year': min(yr_list),
                'last_year': max(yr_list),
                'num_years': len(yr_list),
            })

out_df = pd.DataFrame(rows_out, columns=OUTPUT_COLUMNS)
out_df.to_csv('data/owners_and_editors.csv', index=False)

print(f"Wrote {len(out_df)} rows to data/owners_and_editors.csv")
print(f"  Publishers: {len(out_df[out_df['role'] == 'publisher'])}")
print(f"  Editors:    {len(out_df[out_df['role'] == 'editor'])}")
print(f"  Newspapers: {out_df['master_id'].nunique()}")
print("\nSample rows:")
print(out_df[['newspaper_name', 'role', 'name', 'years']].head(10).to_string(index=False))

  master = pd.read_csv('data/master.csv')


Wrote 2081 rows to data/owners_and_editors.csv
  Publishers: 1064
  Editors:    1017
  Newspapers: 502

Sample rows:
newspaper_name      role                                                                                        name                                                                        years
       Tribune publisher                                                                          TribuneAssociation                   1871; 1873; 1876; 1877; 1879; 1880; 1882; 1883; 1884; 1885
       Tribune publisher                                                       Tribune circulation-daily Association                                                                         1878
       Tribune publisher                                                                               Whitelaw Reid                                                                         1890
       Tribune    editor Charles A. Dana,George Ripley,Bayard Taylor,Henry J. Raymond,Horace Greeley,Margar