In [3]:
# Join the Wikipedia API scraper output (output_v2.csv) with the human-evaluated
# AI summarizer output (human_evaluated_output.csv) on the shared "Name" key.

import pandas as pd

wikipedia_api_scraper_path = "../wikipedia_api_scraper/output_v2.csv"
ai_summarizer_path = "../ai_summarizer/wikipedia_extraction/human_evaluated_output.csv"

wikipedia_api_scraper = pd.read_csv(wikipedia_api_scraper_path)
# Rename without inplace mutation so both frames expose the join key as "Name".
ai_summarizer = pd.read_csv(ai_summarizer_path).rename(columns={"name": "Name"})

# Track which side each merged row came from with merge's indicator column.
# Testing a data column for NaN (the previous approach) also flags rows that DID
# match on Name but simply have no link recorded in the other file.
_JOIN_SIDE = "_join_side"
_left_with_side = wikipedia_api_scraper.merge(
    ai_summarizer, on="Name", how="left", indicator=_JOIN_SIDE
)
_right_with_side = wikipedia_api_scraper.merge(
    ai_summarizer, on="Name", how="right", indicator=_JOIN_SIDE
)

# Public join frames keep their original column layout (no indicator column).
left_join = _left_with_side.drop(columns=_JOIN_SIDE)
right_join = _right_with_side.drop(columns=_JOIN_SIDE)

# Scraper rows with no summarizer row of the same Name, and vice versa.
left_join_nulls = left_join[_left_with_side[_JOIN_SIDE] == "left_only"].copy()
right_join_nulls = right_join[_right_with_side[_JOIN_SIDE] == "right_only"].copy()

# Rows whose Name appears in both files (one row per matching pair).
inner_join = wikipedia_api_scraper.merge(ai_summarizer, on="Name", how="inner")




In [4]:
# Build DataFrames where Wikipedia links match vs don't match (based on inner_join)

# Normalize link strings for robust equality (trim, lowercase, https, no trailing slash)
def _normalize_links(series):
    return (
        series.fillna("")
              .str.strip()
              .str.lower()
              .str.replace("http://", "https://", regex=False)
              .str.rstrip("/")
    )

# Both link columns are required below. Fail fast with a clear message instead of
# carrying a None placeholder into the comparison (None == None would be True).
_missing_link_columns = [
    column for column in ("Wikipedia Link", "wikipedia_url")
    if column not in inner_join.columns
]
if _missing_link_columns:
    raise KeyError(f"inner_join is missing link column(s): {_missing_link_columns}")

left_link_norm = _normalize_links(inner_join["Wikipedia Link"])
right_link_norm = _normalize_links(inner_join["wikipedia_url"])

# Match only when both links are present and their normalized values are identical.
# Presence is judged on the normalized text, so NaN and blank/whitespace-only cells
# both count as missing and two empty links are never reported as a match.
both_present = left_link_norm.ne("") & right_link_norm.ne("")
link_match_mask = both_present & (left_link_norm == right_link_norm)

links_match_df = inner_join[link_match_mask].copy()
links_non_match_df = inner_join[~link_match_mask].copy()

print(f"Exact link matches: {len(links_match_df)}")
print(f"Non-matches (including missing): {len(links_non_match_df)}")


Exact link matches: 35
Non-matches (including missing): 253


In [23]:
# Preview the inner-join rows whose two normalized Wikipedia links are identical.
links_match_df

Unnamed: 0,Name,Primary Occupation,Race,Sex,Birth Date,Death Date,Wikipedia Link,original_designation,county,wikipedia_url,...,education,dob,dod,place_of_birth,place_of_death,gender,involved_in_sports,involved_in_politics,involved_in_military,involved_in_music
1,Allan Bense,politician,,male,1951-10-06,,https://en.wikipedia.org/wiki/Allan_Bense,Allan Bense Highway,Bay,https://en.wikipedia.org/wiki/Allan_Bense,...,"['University of West Florida, Bachelor of Arts...",1947-10-25,2014-07-10,"Pensacola, Florida, United States","Pensacola, Florida, United States",male,no,yes,no,no
8,Harriet Tubman,writer,African Americans,female,1821-00-00,1913-03-10,https://en.wikipedia.org/wiki/Harriet_Tubman,Harriet Tubman Highway/U.S. 1/S.R. 5,Miami-Dade,https://en.wikipedia.org/wiki/Harriet_Tubman,...,,1822-01-01,1913-03-10,"Dorchester County, Maryland, United States","Auburn, New York, United States",female,no,yes,yes,no
9,Harriet Tubman,writer,African Americans,female,1821-00-00,1913-03-10,https://en.wikipedia.org/wiki/Harriet_Tubman,Harriet Tubman Highway/State Road 909,Miami-Dade,https://en.wikipedia.org/wiki/Harriet_Tubman,...,,1822-01-01,1913-03-10,"Dorchester County, Maryland, United States","Auburn, New York, United States",female,no,yes,yes,no
32,Carmelau Monestime,radio personality,,male,1931-04-06,2016-01-16,https://en.wikipedia.org/wiki/Carmelau_Monestime,Carmelau Monestime Street,Miami‐Dade,https://en.wikipedia.org/wiki/Carmelau_Monestime,...,,,,not found,not found,not found,no,no,no,no
40,Larcenia Bullard,politician,,female,1947-07-21,2013-03-16,https://en.wikipedia.org/wiki/Larcenia_Bullard,Larcenia Bullard Way,Miami‐Dade,https://en.wikipedia.org/wiki/Larcenia_Bullard,...,,1947-01-01,2023-01-01,not found,not found,female,no,yes,no,no
41,Betty Pino,announcer,,female,1948-04-21,2013-08-07,https://en.wikipedia.org/wiki/Betty_Pino,Betty Pino Way,Miami‐Dade,https://en.wikipedia.org/wiki/Betty_Pino,...,,,1979-11-18,not found,"Jonestown, Guyana",female,no,no,no,no
45,C. Bette Wimbish,lawyer,,female,1924-03-24,2009-11-30,https://en.wikipedia.org/wiki/C._Bette_Wimbish,C. Bette Wimbish Highway,Pinellas,https://en.wikipedia.org/wiki/C._Bette_Wimbish,...,"[""Virginia State University, Bachelor's degree...",1924-01-01,2016-01-01,"Petersburg, Virginia, United States",not found,female,no,yes,no,no
55,Albert W. Gilchrist,politician,,male,1858-01-15,1926-05-15,https://en.wikipedia.org/wiki/Albert_W._Gilchrist,Albert W. Gilchrist Bridge,Charlotte,https://en.wikipedia.org/wiki/Albert_W._Gilchrist,...,['East Florida Seminary (now part of the Unive...,1858-01-15,1926-05-15,"Nassau County, Florida, United States","Gainesville, Florida, United States",male,no,yes,yes,no
74,Claude Pepper,politician,,male,1900-09-08,1989-05-30,https://en.wikipedia.org/wiki/Claude_Pepper,Claude Pepper Memorial Highway,Multiple Counties,https://en.wikipedia.org/wiki/Claude_Pepper,...,"['University of Alabama, Bachelor of Arts', 'H...",1900-09-08,1989-05-30,"Dudleyville, Alabama, United States","Washington, D.C., United States",male,no,yes,yes,no
85,Billy Bowlegs III,,,male,1862-01-01,1965-01-01,https://en.wikipedia.org/wiki/Billy_Bowlegs_III,Billy Bowlegs III Bridge,Okeechobee,https://en.wikipedia.org/wiki/Billy_Bowlegs_III,...,,1910-01-01,1965-01-01,"Florida, United States","Florida, United States",male,no,yes,no,no


In [24]:
# Preview the remaining inner-join rows: the links differ or at least one is missing.
links_non_match_df

Unnamed: 0,Name,Primary Occupation,Race,Sex,Birth Date,Death Date,Wikipedia Link,original_designation,county,wikipedia_url,...,education,dob,dod,place_of_birth,place_of_death,gender,involved_in_sports,involved_in_politics,involved_in_military,involved_in_music
0,Jim Tullis,politician,,male,1941-11-03,2017-10-14,https://en.wikipedia.org/wiki/Jim_Tullis,Jim Tullis Memorial Boulevard,Duval,https://en.wikipedia.org/wiki/James_B._Fuller,...,,,,not found,not found,male,no,yes,no,no
2,Robert L. Clark,lawyer,,male,1872-01-31,,https://en.wikipedia.org/wiki/Robert_L._Clark,Robert L. Clark Memorial Highway,Broward,https://en.wikipedia.org/wiki/Plantation%2C_Fl...,...,,,,not found,not found,not found,no,no,no,no
3,James Harold Thompson,politician,,male,1944-11-10,,https://en.wikipedia.org/wiki/James_Harold_Tho...,James Harold Thompson Highway,Gadsden,https://en.wikipedia.org/wiki/Harold_Sebring,...,"[""University of Michigan (Bachelor's degree)"",...",1893-01-01,1980-01-01,not found,not found,male,yes,no,yes,no
4,John U. Lloyd,,,,,,https://en.wikipedia.org/wiki/Dr._Von_D._Mizel...,John U. Lloyd Bridge,Broward,https://en.wikipedia.org/wiki/John_O%27Hurley,...,,1954-10-09,,"Kittery, Maine, United States",not found,male,no,no,no,no
5,Helen Gordon Davis,actor,,female,1926-12-25,2015-05-18,https://en.wikipedia.org/wiki/Helen_Davis,Helen Gordon Davis Boulevard,Hillsborough,https://en.wikipedia.org/wiki/Helen_Aguirre_Fe...,...,"['University of Miami, Bachelor of Arts in Pol...",,,not found,not found,female,no,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,First Coast,,,,,,https://gl.wikipedia.org/wiki/Northeast_Florida,First Coast Highway,Multiple Counties,https://en.wikipedia.org/wiki/First_Coast_High...,...,,,,not found,not found,not found,yes,no,no,no
282,Atlantic Beach,,,,,,"https://en.wikipedia.org/wiki/Atlantic_Beach,_...",Atlantic Beach Boulevard,Multiple Counties,https://en.wikipedia.org/wiki/Atlantic_Beach,...,,,,not found,not found,not found,no,no,no,no
285,Archbishop Edward A. McCarthy High School,,,,,,https://en.wikipedia.org/wiki/Archbishop_Edwar...,Archbishop Edward A. McCarthy High School Way,Broward,https://en.wikipedia.org/wiki/Justin_Lebron,...,,,,not found,not found,male,yes,no,no,no
286,Lois D. Martin,,,female,,2022-01-09,https://en.wikipedia.org/wiki/Lois_D._Martin,Lois D. Martin Way,Palm Beach,https://en.wikipedia.org/wiki/Lois_Lenski,...,"['Ohio State University, Bachelor of Science i...",1893-10-14,1974-09-11,"Springfield, Ohio, United States","Tallahassee, Florida, United States",female,no,no,no,no


In [5]:
# Persist both partitions of the inner join next to the notebook; the DataFrame
# index is not written to the files.
links_match_df.to_csv(path_or_buf="links_match.csv", index=False)
links_non_match_df.to_csv(path_or_buf="links_non_match.csv", index=False)

In [15]:
# Narrow the unmatched-rows frame to the name, the two link columns and the
# human-review flag column.
left_join_nulls = left_join_nulls.loc[
    :, ["Name", "wikipedia_url", "Wikipedia Link", "Does_Sam_Think_is_real"]
]

In [None]:
# Count of wikipedia_api_scraper rows treated as having no ai_summarizer match
# on Name (161 in the recorded output).
len(left_join_nulls)

161

In [None]:
# Count of ai_summarizer rows treated as having no wikipedia_api_scraper match
# on Name (464 in the recorded output).
len(right_join_nulls)

464