In [7]:
import pandas as pd

In [21]:
import pandas as pd
import os

# 1. Load the titles to search for
print("Loading titles...")
with open('papers_with_missing_affiliations.txt', 'r') as f:
    titles = [line.strip() for line in f if line.strip()]
data = pd.DataFrame(titles, columns=['title'])
print(f"Loaded {len(data)} titles.")

# 2. Parse the latex affiliations file
print("Parsing latex affiliation sections... this may take a moment.")
paper_sections = {}
current_paper = None
affiliation_started = False
failed_download_count = 0
papers_with_errors = []

with open("latex_affiliations_output.txt", "r") as f:
    for line in f:
        if line.startswith("PAPER: "):
            current_paper = line.replace("PAPER: ", "").strip()
            paper_sections[current_paper] = []
            affiliation_started = False
        elif line.startswith("ERROR: Failed to download LaTeX sources"):
            if current_paper:
                papers_with_errors.append(current_paper)
                failed_download_count += 1
        elif line.startswith("AFFILIATION SECTION:"):
            affiliation_started = True
        elif line.startswith("-" * 10):
            continue
        elif current_paper and affiliation_started:
            stripped = line.strip()
            if stripped:
                paper_sections[current_paper].append(line.rstrip('\n'))

print(f"Successfully parsed {len(paper_sections)} papers.")
print(f"Total number of papers in output file with download errors: {failed_download_count}")

# 3. Load the list of countries
print("Loading country list...")
list_countries = pd.read_csv('world_coords.csv')['country'].tolist()
list_countries.extend(['United States', 'United Kingdom', 'Canada', 'Australia', 'New Zealand', 'USA', 'UK', 'The Netherlands', 'China', 'South Korea',
                       'UAE','The United Arab Emirates', 'The United States', 'The United Kingdom', 'The United States of America', 'Italy', 'France', 'Germany', 'Spain', 'Japan'])
# Remove duplicates
list_countries = list(set(list_countries))
print(f"Loaded {len(list_countries)} countries.")

# 4. Search and extract lines
print("Matching and filtering papers...")
results = []
no_match_papers = []
error_titles_norm = set(t.strip().lower() for t in papers_with_errors)
normalized_sections = {k.strip().lower(): k for k in paper_sections.keys()}

for title in data['title']:
    matching_lines = []
    title_norm = str(title).strip().lower()
    
    is_error = title_norm in error_titles_norm
    found_in_latex = title_norm in normalized_sections
    
    if found_in_latex:
        original_key = normalized_sections[title_norm]
        section_lines = paper_sections[original_key]
        
        for l in section_lines:
            for country in list_countries:
                if country in l:
                    matching_lines.append(l.strip())
    
    # Track papers with no matches that aren't download errors
    if not is_error and len(matching_lines) == 0:
        no_match_papers.append(title)
        
    results.append({
        'title': title,
        'lines': matching_lines
    })

# 5. Save results
output_df = pd.DataFrame(results)
output_df.to_csv('extracted_affiliations.csv', index=False)
print(f"\nTask complete! Saved to 'extracted_affiliations.csv'.")
print(f"Found matches for {len(output_df[output_df['lines'].map(len) > 0])} papers.")

# Report on papers in 'data' that failed to download
data_error_count = sum(1 for t in data['title'] if str(t).strip().lower() in error_titles_norm)
print(f"Of the {len(data)} papers in your request, {data_error_count} had LaTeX download errors recorded.")

# Report on papers with no matches
print(f"Number of papers with NO country matches (excluding errors): {len(no_match_papers)}")
if no_match_papers:
    print("\nPapers with no country matches:")
    for p in no_match_papers[:20]: # Show first 20
        print(f"- {p}")
    if len(no_match_papers) > 20:
        print(f"... and {len(no_match_papers)-20} more.")

output_df.head()


Loading titles...
Loaded 3336 titles.
Parsing latex affiliation sections... this may take a moment.
Successfully parsed 10685 papers.
Total number of papers in output file with download errors: 12917
Loading country list...
Loaded 253 countries.
Matching and filtering papers...

Task complete! Saved to 'extracted_affiliations.csv'.
Found matches for 1041 papers.
Of the 3336 papers in your request, 2152 had LaTeX download errors recorded.
Number of papers with NO country matches (excluding errors): 149

Papers with no country matches:
- Probabilistic mapping between multiparticle production variables and the depth of maximum in proton-induced extensive air showers
- The Leinster-Cobbold diversity index as a criterion for sub-clustering
- The Youngest Star Clusters in the Large Magellanic Cloud
- On the presence of a fifth force at the Galactic Center
- Heating, Excitation, Dissociation, and Ionization of Molecules by High-Energy Photons in Planetary Atmospheres
- Conceptual framework fo

Unnamed: 0,title,lines
0,Can a multi-tracer approach improve the constr...,[]
1,First large scale spatial and velocity pattern...,[]
2,A Stellar Magnesium to Silicon ratio in the at...,[]
3,Photometric and spectroscopic variability of b...,[]
4,Homogeneous measurements of proximity zone siz...,"[$^{1}$Leiden Observatory, Leiden University, ..."


In [22]:
import pandas as pd
import os

# 1. Load the titles to search for
print("Loading titles...")
with open('papers_with_missing_affiliations.txt', 'r') as f:
    titles = [line.strip() for line in f if line.strip()]
data = pd.DataFrame(titles, columns=['title'])
print(f"Loaded {len(data)} titles.")

# 2. Parse the latex affiliations file
print("Parsing latex affiliation sections... this may take a moment.")
paper_sections = {}
current_paper = None
affiliation_started = False
failed_download_count = 0
papers_with_errors = []

with open("latex_affiliations_output_2.txt", "r") as f:
    for line in f:
        if line.startswith("PAPER: "):
            current_paper = line.replace("PAPER: ", "").strip()
            paper_sections[current_paper] = []
            affiliation_started = False
        elif line.startswith("ERROR: Failed to download LaTeX sources"):
            if current_paper:
                papers_with_errors.append(current_paper)
                failed_download_count += 1
        elif line.startswith("AFFILIATION SECTION:"):
            affiliation_started = True
        elif line.startswith("-" * 10):
            continue
        elif current_paper and affiliation_started:
            stripped = line.strip()
            if stripped:
                paper_sections[current_paper].append(line.rstrip('\n'))

print(f"Successfully parsed {len(paper_sections)} papers.")
print(f"Total number of papers in output file with download errors: {failed_download_count}")

# 3. Load the list of countries
print("Loading country list...")
list_countries = pd.read_csv('world_coords.csv')['country'].tolist()
list_countries.extend(['United States', 'United Kingdom', 'Canada', 'Australia', 'New Zealand', 'USA', 'UK', 'The Netherlands', 'China', 'South Korea',
                       'UAE','The United Arab Emirates', 'The United States', 'The United Kingdom', 'The United States of America', 'Italy', 'France', 'Germany', 'Spain', 'Japan'])
# Remove duplicates
list_countries = list(set(list_countries))
print(f"Loaded {len(list_countries)} countries.")

# 4. Search and extract lines
print("Matching and filtering papers...")
results = []
no_match_papers = []
error_titles_norm = set(t.strip().lower() for t in papers_with_errors)
normalized_sections = {k.strip().lower(): k for k in paper_sections.keys()}

for title in data['title']:
    matching_lines = []
    title_norm = str(title).strip().lower()
    
    is_error = title_norm in error_titles_norm
    found_in_latex = title_norm in normalized_sections
    
    if found_in_latex:
        original_key = normalized_sections[title_norm]
        section_lines = paper_sections[original_key]
        
        for l in section_lines:
            for country in list_countries:
                if country in l:
                    matching_lines.append(l.strip())
    
    # Track papers with no matches that aren't download errors
    if not is_error and len(matching_lines) == 0:
        no_match_papers.append(title)
        
    results.append({
        'title': title,
        'lines': matching_lines
    })

# 5. Save results
output_df = pd.DataFrame(results)
output_df.to_csv('extracted_affiliations_2.csv', index=False)
print(f"\nTask complete! Saved to 'extracted_affiliations.csv'.")
print(f"Found matches for {len(output_df[output_df['lines'].map(len) > 0])} papers.")

# Report on papers in 'data' that failed to download
data_error_count = sum(1 for t in data['title'] if str(t).strip().lower() in error_titles_norm)
print(f"Of the {len(data)} papers in your request, {data_error_count} had LaTeX download errors recorded.")

# Report on papers with no matches
print(f"Number of papers with NO country matches (excluding errors): {len(no_match_papers)}")
if no_match_papers:
    print("\nPapers with no country matches:")
    for p in no_match_papers[:20]: # Show first 20
        print(f"- {p}")
    if len(no_match_papers) > 20:
        print(f"... and {len(no_match_papers)-20} more.")

output_df.head()


Loading titles...
Loaded 3336 titles.
Parsing latex affiliation sections... this may take a moment.
Successfully parsed 5974 papers.
Total number of papers in output file with download errors: 212
Loading country list...
Loaded 253 countries.
Matching and filtering papers...

Task complete! Saved to 'extracted_affiliations.csv'.
Found matches for 1519 papers.
Of the 3336 papers in your request, 126 had LaTeX download errors recorded.
Number of papers with NO country matches (excluding errors): 1691

Papers with no country matches:
- Can a multi-tracer approach improve the constraints on the turnover scale?
- Homogeneous measurements of proximity zone sizes for 59 quasars in the Epoch of Reionization
- The COSMOS-Web Lens Survey (COWLS) III: forecasts versus data
- JWST observations of segregated $^{12}$CO$_2$ and $^{13}$CO$_2$ ices in protostellar envelopes
- A direct black hole mass measurement in a Little Red Dot at the Epoch of Reionization
- Why the northern hemisphere needs a 30-4

Unnamed: 0,title,lines
0,Can a multi-tracer approach improve the constr...,[]
1,First large scale spatial and velocity pattern...,"[Universit\'e C\^ote d'Azur, Observatoire de l..."
2,A Stellar Magnesium to Silicon ratio in the at...,[\affil[1]{School of Earth and Space Explorati...
3,Photometric and spectroscopic variability of b...,"[\institute{Tartu Observatory, University of T..."
4,Homogeneous measurements of proximity zone siz...,[]
