###  Estimating the effect of new editors/owners on Newspaper Coverage

In [9]:
# add publisher and editor change information to Master.csv
# NOW SPLITS editor_and_publisher_change into:
#   - editor_and_publisher_change_same_year
#   - editor_and_publisher_change_diff_year
# ALSO: ignores single-entry "blips" (likely data entry errors)

import pandas as pd
import re

# Load the data
df = pd.read_csv('data/master.csv')

# Define the years we're tracking
years = [1869, 1871, 1872, 1873, 1876, 1877, 1878, 1879, 1880, 1882, 1883, 1884, 1885, 1890]

def levenshtein_distance(s1, s2):
    """Calculate the Levenshtein distance between two strings."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    prev_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        curr_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = prev_row[j + 1] + 1
            deletions = curr_row[j] + 1
            substitutions = prev_row[j] + (c1 != c2)
            curr_row.append(min(insertions, deletions, substitutions))
        prev_row = curr_row
    return prev_row[-1]

def strings_match(s1, s2, max_distance=1):
    """Check if two strings match within max_distance edits."""
    s1_clean = s1.strip().lower()
    s2_clean = s2.strip().lower()
    if s1_clean == s2_clean:
        return True
    return levenshtein_distance(s1_clean, s2_clean) <= max_distance

def tokenize_publisher(publisher_str):
    if not publisher_str:
        return []
    cleaned = re.sub(r'[;,]', ' ', publisher_str)
    tokens = cleaned.split()
    return [t.strip() for t in tokens if len(t.strip()) >= 4]

def publishers_match_tokenized(pub1, pub2):
    tokens1 = tokenize_publisher(pub1)
    tokens2 = tokenize_publisher(pub2)
    if not tokens1 or not tokens2:
        return False
    for t1 in tokens1:
        for t2 in tokens2:
            if strings_match(t1, t2, max_distance=1):
                return True
    return False

def normalize_publisher(pub):
    if not pub:
        return ""
    return re.sub(r'[^a-z0-9]', '', pub.lower())

def publishers_match_normalized(pub1, pub2):
    n1 = normalize_publisher(pub1)
    n2 = normalize_publisher(pub2)
    if not n1 or not n2:
        return False
    return n1 == n2 or n1 in n2 or n2 in n1

def names_match(name1, name2):
    return publishers_match_tokenized(name1, name2) or publishers_match_normalized(name1, name2)

def clean_field(value):
    if pd.isna(value):
        return ''
    s = str(value).strip()
    if s.lower() == 'nan':
        return ''
    return s

def remove_blips(data_points):
    """
    Remove single-entry 'blips' that are likely data entry errors.
    A blip is where value changes at index i but reverts back at index i+1
    (i.e., data_points[i-1] and data_points[i+1] match, but data_points[i] doesn't
    match either of them).
    
    For example: A, A, B, A, A -> the lone B is a blip and gets removed.
    """
    if len(data_points) <= 2:
        return data_points

    filtered = []
    i = 0
    while i < len(data_points):
        if 0 < i < len(data_points) - 1:
            prev_year, prev_val = data_points[i - 1]
            curr_year, curr_val = data_points[i]
            next_year, next_val = data_points[i + 1]

            # Check if this is a blip: current doesn't match prev,
            # but prev and next DO match each other
            is_blip = (
                not names_match(prev_val, curr_val)
                and names_match(prev_val, next_val)
            )
            if is_blip:
                # Skip this data point entirely
                i += 1
                continue

        filtered.append(data_points[i])
        i += 1

    return filtered

def detect_first_change(data_points):
    # First remove blips, then detect changes on the cleaned sequence
    cleaned = remove_blips(data_points)
    for i in range(1, len(cleaned)):
        prev_year, prev_val = cleaned[i - 1]
        curr_year, curr_val = cleaned[i]
        if not names_match(prev_val, curr_val):
            return curr_year
    return None

def analyze_changes(row):
    """
    Analyze a newspaper row for publisher and editor changes.
    Returns: (category, publisher_change_year, editor_change_year)
    Categories:
      - 'insufficient_data'
      - 'no_change'
      - 'publisher_change_only'
      - 'editor_change_only'
      - 'editor_and_publisher_change_same_year'
      - 'editor_and_publisher_change_diff_year'
    """
    pub_data = []
    ed_data = []
    for year in years:
        publisher = clean_field(row.get(f'{year} publisher', ''))
        editor = clean_field(row.get(f'{year} editor', ''))
        if publisher:
            pub_data.append((year, publisher))
        if editor:
            ed_data.append((year, editor))

    has_enough_pub = len(pub_data) >= 3
    has_enough_ed = len(ed_data) >= 3

    if not has_enough_pub and not has_enough_ed:
        return ('insufficient_data', None, None)

    pub_change_year = detect_first_change(pub_data) if has_enough_pub else None
    ed_change_year = detect_first_change(ed_data) if has_enough_ed else None

    has_pub_change = pub_change_year is not None
    has_ed_change = ed_change_year is not None

    if has_pub_change and has_ed_change:
        if pub_change_year == ed_change_year:
            category = 'editor_and_publisher_change_same_year'
        else:
            category = 'editor_and_publisher_change_diff_year'
    elif has_pub_change:
        category = 'publisher_change_only'
    elif has_ed_change:
        category = 'editor_change_only'
    else:
        category = 'no_change'

    return (category, pub_change_year, ed_change_year)

# Apply analysis to each row
results = df.apply(analyze_changes, axis=1)
df['category'] = results.apply(lambda x: x[0])
df['publisher_change_year'] = results.apply(lambda x: x[1])
df['editor_change_year'] = results.apply(lambda x: x[2])

# Filter out insufficient data
valid_df = df[df['category'] != 'insufficient_data'].copy()

# Count categories
category_counts = valid_df['category'].value_counts()

print("=" * 60)
print("PUBLISHER & EDITOR CHANGE ANALYSIS RESULTS")
print("=" * 60)
print(f"\nTotal newspapers analyzed: {len(df)}")
print(f"Newspapers with at least 4 years of data: {len(valid_df)}")
print(f"Newspapers with insufficient data: {len(df) - len(valid_df)}")
print("\n" + "-" * 40)
print("CATEGORY BREAKDOWN:")
print("-" * 40)

category_labels = {
    'editor_and_publisher_change_same_year': 'Editor & publisher changed (same year)',
    'editor_and_publisher_change_diff_year': 'Editor & publisher changed (diff years)',
    'publisher_change_only': 'Publisher changed only',
    'editor_change_only': 'Editor changed only',
    'no_change': 'No change detected',
}

for cat, label in category_labels.items():
    count = category_counts.get(cat, 0)
    pct = (count / len(valid_df) * 100) if len(valid_df) > 0 else 0
    print(f"{label}: {count} ({pct:.1f}%)")

# Save updated CSV
df.to_csv('data/master.csv', index=False)
print("\n" + "=" * 60)
print("Updated master.csv with 'category', 'publisher_change_year', and 'editor_change_year' columns")
print("=" * 60)

# Show sample of newspapers with changes
print("\n" + "-" * 40)
print("SAMPLE: Newspapers with changes")
print("-" * 40)
change_categories = list(category_labels.keys())
change_categories.remove('no_change')
changes_df = valid_df[valid_df['category'].isin(change_categories)]
if len(changes_df) > 0:
    sample_cols = ['state', 'town', 'newspaper_name', 'category', 'publisher_change_year', 'editor_change_year']
    print(changes_df[sample_cols].head(10).to_string(index=False))
else:
    print("No changes found.")

# Create lists for each category
publisher_change_only_list = valid_df[valid_df['category'] == 'publisher_change_only'][
    ['state', 'town', 'newspaper_name', 'publisher_change_year']
]
editor_change_only_list = valid_df[valid_df['category'] == 'editor_change_only'][
    ['state', 'town', 'newspaper_name', 'editor_change_year']
]
editor_and_publisher_same_list = valid_df[valid_df['category'] == 'editor_and_publisher_change_same_year'][
    ['state', 'town', 'newspaper_name', 'publisher_change_year', 'editor_change_year']
]
editor_and_publisher_diff_list = valid_df[valid_df['category'] == 'editor_and_publisher_change_diff_year'][
    ['state', 'town', 'newspaper_name', 'publisher_change_year', 'editor_change_year']
]
no_change_list = valid_df[valid_df['category'] == 'no_change'][
    ['state', 'town', 'newspaper_name']
]

print("\n" + "=" * 60)
print("DataFrames created:")
print("  - publisher_change_only_list")
print("  - editor_change_only_list")
print("  - editor_and_publisher_same_list")
print("  - editor_and_publisher_diff_list")
print("  - no_change_list")
print("=" * 60)

  df = pd.read_csv('data/master.csv')


PUBLISHER & EDITOR CHANGE ANALYSIS RESULTS

Total newspapers analyzed: 48137
Newspapers with at least 4 years of data: 12597
Newspapers with insufficient data: 35540

----------------------------------------
CATEGORY BREAKDOWN:
----------------------------------------
Editor & publisher changed (same year): 4686 (37.2%)
Editor & publisher changed (diff years): 838 (6.7%)
Publisher changed only: 816 (6.5%)
Editor changed only: 741 (5.9%)
No change detected: 5516 (43.8%)

Updated master.csv with 'category', 'publisher_change_year', and 'editor_change_year' columns

----------------------------------------
SAMPLE: Newspapers with changes
----------------------------------------
state      town     newspaper_name                              category  publisher_change_year  editor_change_year
  NaN     Afton            Tribane editor_and_publisher_change_same_year                 1876.0              1876.0
  NaN     Albia Spirit of the West editor_and_publisher_change_same_year            

In [10]:
# filter down to newspapers that we can match 

import pandas as pd

master = pd.read_csv("data/master.csv")
matches = pd.read_csv("data/matches.csv")

matches["publisher_change_year"] = matches["master_id"].dropna().astype(int).map(master["publisher_change_year"])
matches["editor_change_year"] = matches["master_id"].dropna().astype(int).map(master["editor_change_year"])
matches["category"] = matches["master_id"].dropna().astype(int).map(master["category"])
matches = matches[matches.master_id.notna()]
matches.to_csv("data/final_list.csv", index=False)

print(f"Rows with master_id: {matches['master_id'].notna().sum()}")
print(f"Rows with publisher_change_year: {matches['publisher_change_year'].notna().sum()}")

print(len(matches[matches.category.str.contains('editor_change_only')]))
print(len(matches[matches.category.str.contains('publisher_change_only')]))

  master = pd.read_csv("data/master.csv")


Rows with master_id: 566
Rows with publisher_change_year: 285
49
64


In [1]:
# structural drift panel creation — multi-treatment DID
# Updated: uses editor_and_publisher_change_same_year only
#          (drops editor_and_publisher_change_diff_year observations)
# Updated: topic_counts keyed by issn
# Updated: adds year-over-year volatility (Y_vol) and lifecycle-anchored
#          drift (Y_lifecycle) as alternative outcome measures to address
#          the anchoring/variance-cone concern.

import pandas as pd
import json
import numpy as np

# Load data sources
newspapers = pd.read_csv('data/final_list.csv')
with open('data/topic_counts.json', 'r') as f:
    topic_data = json.load(f)

TOPICS = [
    'labor_workers', 'politics_elections', 'congress_government',
    'business_commerce', 'railroads_transportation', 'agriculture_farming',
    'courts_law', 'finance_money', 'immigration_foreign', 'crime_police'
]

# Only these three categories are treated; diff-year papers are excluded entirely
TREATMENT_CATEGORIES = [
    'publisher_change_only',
    'editor_change_only',
    'editor_and_publisher_change_same_year',
]

# =============================================================================
# Step 1: Build raw panel with topic rates per 1,000 headlines
# =============================================================================
records = []
for year, papers in topic_data.items():
    for issn, data in papers.items():
        if 'topic_counts' in data and 'total_headlines' in data:
            total = data['total_headlines']
            if total >= 75:
                record = {'year': int(year), 'issn': issn}
                for topic in TOPICS:
                    count = data['topic_counts'].get(topic, 0)
                    record[topic] = (count / total) * 1000
                records.append(record)

panel = pd.DataFrame(records)

# Merge with metadata
panel = panel.merge(
    newspapers[[
        'issn', 'master_id', 'master_name',
        'category', 'publisher_change_year', 'editor_change_year'
    ]],
    on='issn', how='left'
)

# =============================================================================
# Step 1.5: Drop diff-year papers entirely so they don't pollute control group
# =============================================================================
n_before = panel['master_id'].nunique()
panel = panel[panel['category'] != 'editor_and_publisher_change_diff_year'].copy()
n_after = panel['master_id'].nunique()
print(f"Dropped {n_before - n_after} diff-year papers from panel")

# =============================================================================
# Step 2: Determine treatment status and treatment year per paper
# =============================================================================
panel['is_treated'] = panel['category'].isin(TREATMENT_CATEGORIES)

def get_treatment_year(row):
    """Return the earliest change year for a treated paper, or NaN for control."""
    if row['category'] not in TREATMENT_CATEGORIES:
        return np.nan
    years = []
    if pd.notna(row['publisher_change_year']):
        years.append(row['publisher_change_year'])
    if pd.notna(row['editor_change_year']):
        years.append(row['editor_change_year'])
    return min(years) if years else np.nan

panel['treatment_year'] = panel.apply(get_treatment_year, axis=1)

# Adjust: subtract 1 so the treatment year marks the last pre-treatment year
panel['treatment_year'] = panel['treatment_year'] - 1

print(f"Median treatment year among treated papers: "
      f"{panel.loc[panel['is_treated'], 'treatment_year'].median()}")

# =============================================================================
# Step 3: Create treatment group dummies
# =============================================================================
for cat in TREATMENT_CATEGORIES:
    panel[f'is_{cat}'] = (panel['category'] == cat).astype(int)

# =============================================================================
# Step 4: Define anchor cutoff year for each paper
#         Treated  → treatment_year (already set above)
#         Control  → max(first_year + 3, individual median year)
# =============================================================================
panel = panel.dropna(subset=['master_id']).copy()
panel = panel.sort_values(['master_id', 'year']).reset_index(drop=True)

paper_stats = panel.groupby('master_id')['year'].agg(['min', 'median'])
paper_stats.columns = ['first_year', 'median_year']
panel = panel.merge(paper_stats, on='master_id', how='left')

panel['anchor_cutoff'] = np.where(
    panel['is_treated'],
    panel['treatment_year'],
    np.maximum(panel['first_year'] + 3, panel['median_year'])
).astype(int)

# =============================================================================
# Step 5: Filter papers with at least 3 pre-treatment years
# =============================================================================
pre_counts = panel[panel['year'] < panel['anchor_cutoff']].groupby('master_id').size()
valid_papers = pre_counts[pre_counts >= 3].index
panel = panel[panel['master_id'].isin(valid_papers)].copy()
print(f"Papers with ≥3 pre-treatment years: {len(valid_papers)}")

# =============================================================================
# Step 6: Calculate anchor vectors for BOTH anchoring strategies
# =============================================================================

# --- 6a: Original anchor (pre-cutoff mean) ---
pre_panel = panel[panel['year'] < panel['anchor_cutoff']]
anchors_original = pre_panel.groupby('master_id')[TOPICS].mean()
anchors_original.columns = [f'anchor_{t}' for t in TOPICS]
panel = panel.merge(anchors_original, on='master_id', how='left')

# --- 6b: Lifecycle anchor (first 3 observed years for every paper) ---
# This puts treated and control papers on the same footing: everyone's
# anchor is their earliest topic distribution, so time-from-anchor is
# comparable across groups.
first_3 = (
    panel.sort_values(['master_id', 'year'])
    .groupby('master_id')
    .head(3)
)
anchors_lifecycle = first_3.groupby('master_id')[TOPICS].mean()
anchors_lifecycle.columns = [f'lifecycle_anchor_{t}' for t in TOPICS]
panel = panel.merge(anchors_lifecycle, on='master_id', how='left')

# Also store each paper's lifecycle anchor start year (for rel_year_lifecycle)
lifecycle_start = first_3.groupby('master_id')['year'].max()
lifecycle_start.name = 'lifecycle_anchor_end'
panel = panel.merge(lifecycle_start, on='master_id', how='left')

# =============================================================================
# Step 7: Calculate outcome variables
# =============================================================================

# --- 7a: Y_it = original drift from pre-cutoff anchor ---
def calc_drift(row, anchor_prefix='anchor'):
    sq_diffs = sum((row[t] - row[f'{anchor_prefix}_{t}'])**2 for t in TOPICS)
    return np.sqrt(sq_diffs)

panel['Y_it'] = panel.apply(lambda r: calc_drift(r, 'anchor'), axis=1)

# --- 7b: Y_lifecycle = drift from lifecycle (first-3-years) anchor ---
panel['Y_lifecycle'] = panel.apply(
    lambda r: calc_drift(r, 'lifecycle_anchor'), axis=1
)

# --- 7c: Y_vol = year-over-year volatility (Euclidean distance from t-1) ---
# This sidesteps anchoring entirely: measures turbulence/instability.
panel = panel.sort_values(['master_id', 'year']).reset_index(drop=True)

topic_arr = panel[TOPICS].values
# Shift within each paper group
shifted = panel.groupby('master_id')[TOPICS].shift(1)
diff = topic_arr - shifted.values
panel['Y_vol'] = np.sqrt(np.nansum(diff**2, axis=1))

# First observation per paper has no lag → NaN
first_obs_mask = panel.groupby('master_id').cumcount() == 0
panel.loc[first_obs_mask, 'Y_vol'] = np.nan

# =============================================================================
# Step 8: Create Post_it dummy and interaction terms
# =============================================================================
panel['Post_it'] = (panel['year'] >= panel['anchor_cutoff']).astype(int)

for cat in TREATMENT_CATEGORIES:
    panel[f'Post_x_{cat}'] = panel['Post_it'] * panel[f'is_{cat}']

# =============================================================================
# Step 9: Event-study relative time variables
# =============================================================================
# Original: relative to anchor_cutoff
panel['rel_year'] = panel['year'] - panel['anchor_cutoff']

# Lifecycle: relative to end of lifecycle anchor window
panel['rel_year_lifecycle'] = panel['year'] - panel['lifecycle_anchor_end']

# =============================================================================
# Step 10: Build final output table
# =============================================================================
output_cols = [
    'master_id', 'master_name', 'year', 'category',
    'is_treated', 'anchor_cutoff', 'Post_it',
    'Y_it', 'Y_lifecycle', 'Y_vol',
    'rel_year', 'rel_year_lifecycle',
] + [f'is_{cat}' for cat in TREATMENT_CATEGORIES] \
  + [f'Post_x_{cat}' for cat in TREATMENT_CATEGORIES]

output = panel[output_cols].rename(columns={
    'master_id': 'Newspaper_ID',
    'master_name': 'Newspaper_Name',
    'year': 'Year',
    'anchor_cutoff': 'Anchor_Cutoff_Year',
    'rel_year': 'Rel_Year',
    'rel_year_lifecycle': 'Rel_Year_Lifecycle',
}).sort_values(['Newspaper_ID', 'Year']).reset_index(drop=True)

# Clean up intermediate anchor columns before saving
panel.drop(
    columns=[f'anchor_{t}' for t in TOPICS]
    + [f'lifecycle_anchor_{t}' for t in TOPICS]
    + ['first_year', 'median_year', 'lifecycle_anchor_end'],
    inplace=True, errors='ignore'
)

# Display diagnostics
print("\nSample of final panel:\n")
print(output[['Newspaper_ID', 'Year', 'category', 'Post_it',
              'Y_it', 'Y_lifecycle', 'Y_vol', 'Rel_Year']].head(20).to_string(index=False))

print(f"\n--- Panel Summary ---")
print(f"Total observations: {len(output)}")
print(f"Unique newspapers: {output['Newspaper_ID'].nunique()}")

treated_counts = panel[panel['is_treated']].groupby('category')['master_id'].nunique()
control_count = panel[~panel['is_treated']]['master_id'].nunique()
print(f"Control papers (no_change): {control_count}")
for cat in TREATMENT_CATEGORIES:
    count = treated_counts.get(cat, 0)
    print(f"Treated papers ({cat}): {count}")

print(f"Pre-treatment obs: {(output['Post_it'] == 0).sum()}")
print(f"Post-treatment obs: {(output['Post_it'] == 1).sum()}")

# Outcome variable summary
print(f"\n--- Outcome Variable Summary ---")
for y_var in ['Y_it', 'Y_lifecycle', 'Y_vol']:
    valid = output[y_var].dropna()
    print(f"{y_var}: mean={valid.mean():.3f}, sd={valid.std():.3f}, "
          f"median={valid.median():.3f}, n={len(valid)}")

# Save
output.to_csv('data/panel_structural_drift2.csv', index=False)
print("\nSaved to 'panel_structural_drift2.csv'")

Dropped 71 diff-year papers from panel
Median treatment year among treated papers: 1877.0
Papers with ≥3 pre-treatment years: 313

Sample of final panel:

 Newspaper_ID  Year  category  Post_it       Y_it  Y_lifecycle     Y_vol  Rel_Year
          4.0  1877 no_change        0  42.951005    28.844740       NaN        -6
          4.0  1878 no_change        0  11.705858    16.640013 43.564128        -5
          4.0  1879 no_change        0  17.509141    17.888121 19.019220        -4
          4.0  1880 no_change        0  19.699659    32.985300 27.094376        -3
          4.0  1881 no_change        0  26.164697    42.832762 21.520588        -2
          4.0  1882 no_change        0  20.727700    34.030956 22.411459        -1
          4.0  1883 no_change        1  30.203680    43.337589 22.440928         0
          4.0  1884 no_change        1  45.615109    46.038748 51.874028         1
          4.0  1885 no_change        1  73.492058    66.489024 60.423300         2
          4.0  