## Phase 1 

### Retrieve articles related to a topic from arXiv

In [1]:
import arxiv
import json
import pandas as pd
import urllib.request as libreq
import certifi
import os
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime, timedelta
from tqdm import tqdm
from IPython.display import display, Latex
from bs4 import BeautifulSoup
import re
import PyPDF2
import io
os.environ['SSL_CERT_FILE'] = certifi.where()

#### Fuzzy Search

In [2]:
client = arxiv.Client()

# Search for the 10 most recently submitted articles matching "Computational Finance".
# NOTE(review): the captured output lists papers unrelated to finance, so the unquoted
# free-text query presumably matches far more than the exact phrase — confirm against
# the arXiv API query syntax before relying on these results.
search = arxiv.Search(
    query="Computational Finance",
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# `client.results(search)` is a lazy generator. Materialise it once so the request is
# issued a single time and `results` can be iterated again by later cells.
results = list(client.results(search))

for r in results:
    print(r.title)
    print(r.authors)

PERSE: Personalized 3D Generative Avatars from A Single Portrait
[arxiv.Result.Author('Hyunsoo Cha'), arxiv.Result.Author('Inhee Lee'), arxiv.Result.Author('Hanbyul Joo')]
Action-Agnostic Point-Level Supervision for Temporal Action Detection
[arxiv.Result.Author('Shuhei M. Yoshida'), arxiv.Result.Author('Takashi Shibata'), arxiv.Result.Author('Makoto Terao'), arxiv.Result.Author('Takayuki Okatani'), arxiv.Result.Author('Masashi Sugiyama')]
Branes Screening Quarks and Defect Operators
[arxiv.Result.Author('Andreas Karch'), arxiv.Result.Author('Marcos Riojas')]
SoS Certificates for Sparse Singular Values and Their Applications: Robust Statistics, Subspace Distortion, and More
[arxiv.Result.Author('Ilias Diakonikolas'), arxiv.Result.Author('Samuel B. Hopkins'), arxiv.Result.Author('Ankit Pensia'), arxiv.Result.Author('Stefan Tiegel')]
Distributed Mixture-of-Agents for Edge Inference with Large Language Models
[arxiv.Result.Author('Purbesh Mitra'), arxiv.Result.Author('Priyanka Kaswan'), a

#### `ArxivDataframe`: scrape an arXiv listing and enrich it from the PDFs

In [26]:
class ArxivDataframe:
    def __init__(self, subject, date=None):
        """Prepare a scraper for one arXiv listing.

        Args:
            subject: identifier interpolated into the listing URLs; stored lower-cased.
            date: optional value interpolated into the catch-up URL. When falsy,
                ``construct_dataframe`` reads the ``/new`` listing instead.
        """
        normalized_subject = subject.lower()
        self.subject, self.date = normalized_subject, date
        # API client from the `arxiv` package
        self.client = arxiv.Client()
        # The parser class itself is stored; it is later invoked as
        # self.bs4_client(markup, 'html.parser').
        self.bs4_client = BeautifulSoup

    
    def _retrieve_html(self):
        base_url = f'https://arxiv.org/list/{self.subject}/new'
        page = libreq.urlopen(base_url)
        html = page.read().decode('utf-8')
        return html
    
    def _retrieve_html_dt(self):
        query = f'https://arxiv.org/catchup/{self.subject}/{self.date}?abs=True'
        print(query)
        page = libreq.urlopen(query)

        html = page.read().decode('utf-8')

        print(html)
        return html
    
    def _remove_brackets(self, text):
        """Remove content within brackets from text"""
        return re.sub(r'\(.*?\)', '', text).strip()
    
    def _clean_subjects(self, df):
        """Clean primary and secondary subjects"""
        df['primary_subject'] = df['primary_subject'].map(self._remove_brackets)
        df['secondary_subjects'] = df['secondary_subjects'].map(
            lambda x: [self._remove_brackets(subject) for subject in x] if isinstance(x, list) else x,
            na_action='ignore'
        )
        return df
    
    def _clean_journal(self, df):
        """Clean journal information"""
        df['submitted_journal'] = df['submitted_journal'].str.split(r'[,;:.]').str[0]
        return df
    
    def _extract_affiliations(self, pdf_reader, authors, max_pages=2):
        """
        Extract author affiliations from PDF using a simplified approach with better logging
        """
        print("\n=== Starting Affiliation Extraction ===")
        print(f"Processing authors: {authors}")
        
        affiliations = [None] * len(authors)
        
        try:
            # Get text from first pages
            full_text = ""
            for page_num in range(min(max_pages, len(pdf_reader.pages))):
                try:
                    page_text = pdf_reader.pages[page_num].extract_text()
                    full_text += page_text + "\n"
                    print(f"Successfully read page {page_num + 1}")
                except Exception as e:
                    print(f"Error reading page {page_num + 1}: {str(e)}")
                    continue

            # Clean text
            full_text = re.sub(r'\s+', ' ', full_text)
            
            # Truncate text at common section markers
            section_markers = ['Abstract', 'Introduction', 'Keywords', 'I.', '1.', 'Methods']
            for marker in section_markers:
                pos = full_text.find(marker)
                if pos != -1:
                    full_text = full_text[:pos]
                    print(f"Truncated text at marker: {marker}")
            
            print("\nLooking for affiliation blocks...")
            
            # Simple pattern to find potential affiliation blocks
            affiliation_patterns = [
                # Look for institutional addresses
                r'(?i)(?:Department|University|Institute|Laboratory|School|Center|Centre)[^.]*(?:[^.]*(?:University|Institute|Laboratory|School|Center|Centre)[^.]*)*\.',
            ]
            
            potential_affiliations = []
            for pattern in affiliation_patterns:
                matches = re.finditer(pattern, full_text)
                for match in matches:
                    aff = match.group(0).strip()
                    if len(aff) > 20:  # Filter out very short matches
                        potential_affiliations.append(aff)
                        print(f"Found potential affiliation: {aff}")
            
            # Remove duplicates while preserving order
            potential_affiliations = list(dict.fromkeys(potential_affiliations))
            
            print(f"\nFound {len(potential_affiliations)} unique potential affiliations")
            
            # For each author, try to find their affiliation
            for i, author in enumerate(authors):
                try:
                    author_name = author.split()[-1]  # Get last name
                    print(f"\nProcessing author: {author} (searching for: {author_name})")
                    
                    # Look for affiliations near author name
                    author_pos = full_text.find(author)
                    if author_pos != -1:
                        # Look at text chunk around author mention
                        window = 500  # Increased window size
                        start = max(0, author_pos - window//2)
                        end = min(len(full_text), author_pos + window//2)
                        nearby_text = full_text[start:end]
                        
                        author_affiliations = []
                        for aff in potential_affiliations:
                            if aff in nearby_text:
                                author_affiliations.append(aff)
                                print(f"Found matching affiliation: {aff}")
                        
                        if author_affiliations:
                            affiliations[i] = author_affiliations
                        else:
                            print(f"No affiliations found near author {author}")
                    else:
                        print(f"Could not find author {author} in text")
                
                except Exception as e:
                    print(f"Error processing author {author}: {str(e)}")
                    continue
            
            print("\n=== Affiliation Extraction Complete ===")
            print(f"Final affiliations: {affiliations}")
            return affiliations
            
        except Exception as e:
            print(f"Error in affiliation extraction: {str(e)}")
            return [None] * len(authors)
        
    def _extract_pdf_metrics(self, pdf_reader):
        """Extract metrics (pages, figures, tables) from PDF"""
        metrics = {
            'pages': len(pdf_reader.pages),
            'figures': 0,
            'tables': 0
        }
        
        for page in pdf_reader.pages:
            text = page.extract_text()
            # Find figures
            figure_numbers = re.findall(r'(?i)(?:Figure|Fig.|Figure.|Fig})\s+(\d+)', text)
            if figure_numbers:
                metrics['figures'] = max(metrics['figures'], max(map(int, figure_numbers)))
            
            # Find tables
            table_numbers = re.findall(r'(?i)(?:Table|Table.})\s+(\d+)', text)
            if table_numbers:
                metrics['tables'] = max(metrics['tables'], max(map(int, table_numbers)))
                
        return metrics
    
    def _process_pdf(self, pdf_link, current_metrics=None, authors=None):
        """Process PDF to extract metrics, keywords, and affiliations"""
        try:
            pdf_response = libreq.urlopen('https://' + pdf_link)
            pdf_file = pdf_response.read()
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            
            # Extract metrics if needed
            metrics = self._extract_pdf_metrics(pdf_reader)
            
            # Only update metrics that are currently NaN
            if current_metrics:
                for key in metrics:
                    if pd.isna(current_metrics[key]):
                        current_metrics[key] = metrics[key]
                metrics = current_metrics
            
            # Extract keywords
            keywords = self._extract_keywords(pdf_reader)
            
            # Extract affiliations if authors are provided
            affiliations = None
            if authors:
                affiliations = self._extract_affiliations(pdf_reader, authors)
            
            return {**metrics, 'keywords': keywords, 'affiliations': affiliations}
            
        except Exception as e:
            print(f"Error processing PDF {pdf_link}: {str(e)}")
            return None
        
    def _extract_keywords(self, pdf_reader, max_pages=5):
        """
        Extract keywords from PDF with improved accuracy and efficiency across subjects
        Args:
            pdf_reader: PyPDF2.PdfReader object
            max_pages: Maximum number of pages to search (default: 5, as keywords are usually at the start)
        Returns:
            list: Extracted keywords
        """
        keywords = []
        patterns = [
            r'(?i)(?:key[ -]?words?|index terms)[:.]?\s*(.*?)(?:[.;]|\n|(?=\n\n)|$)',
            r'(?i)(?:PACS numbers?|Mathematics Subject Classification|AMS subject classifications?'
            r'|Computing Classification System|ACM CCS|MeSH terms)[:.]?\s*(.*?)(?:[.;]|\n|(?=\n\n)|$)',
            r'(?i)(?:subject headings?|thesaurus terms?|subject terms?|descriptors?)[:.]?\s*(.*?)(?:[.;]|\n|(?=\n\n)|$)',
            r'(?i)(?:mots[- ]?cl[ée]s?|schlüsselwörter|palabras[- ]?clave)[:.]?\s*(.*?)(?:[.;]|\n|(?=\n\n)|$)'
        ]

        # Common section headers that indicate the end of front matter
        section_markers = [
            '1. Introduction', '1 Introduction', 'Introduction', 
            'Background', 'Literature Review', 'Methods',
            'Methodology', 'Results', 'Discussion',
            'I. ', 'II. ', 'Section 1', 'Section 2'
        ]
        
        try:
            # Only search first few pages for efficiency
            pages_to_search = min(max_pages, len(pdf_reader.pages))          
            for page_num in range(pages_to_search):
                try:
                    text = pdf_reader.pages[page_num].extract_text()
                    if not text:
                        continue
                        
                    # Clean text while preserving important separators
                    text = re.sub(r'\s+', ' ', text)
                    text = re.sub(r'(?<=[.,;])\s*(?=[A-Z])', '\n', text)  # Add breaks at major punctuation
                    
                    # Check for section markers and truncate text
                    for marker in section_markers:
                        marker_pos = text.find(marker)
                        if marker_pos != -1:
                            text = text[:marker_pos]
                            break
                    
                    # Extract keywords using patterns
                    for pattern in patterns:
                        matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
                        for match in matches:
                            # Handle both string and tuple matches
                            match_text = match[0] if isinstance(match, tuple) else match
                            
                            # Clean and split the matched text
                            cleaned_keywords = match_text.strip()
                            # Split on common keyword separators
                            for separator in [';', ',', '•', '·', '—', '-', '\n']:
                                if separator in cleaned_keywords:
                                    keywords.extend([k.strip() for k in cleaned_keywords.split(separator)])
                                    break
                            else:
                                keywords.append(cleaned_keywords)
                            
                except Exception as e:
                    print(f"Error processing page {page_num}: {str(e)}")
                    continue
                
            # Post-processing of keywords
            processed_keywords = []
            for keyword in keywords:
                # Skip if too short or too long
                if not keyword or len(keyword) < 3 or len(keyword) > 100:
                    continue
                # Clean up the keyword
                cleaned = re.sub(r'^\W+|\W+$', '', keyword)  # Remove leading/trailing non-word chars
                cleaned = re.sub(r'\s+', ' ', cleaned)       # Normalize whitespace
                cleaned = cleaned.strip()               
                if cleaned and len(cleaned) >= 3:
                    processed_keywords.append(cleaned)
            
            # Remove duplicates while preserving order
            seen = set()
            final_keywords = []
            for keyword in processed_keywords:
                lower_keyword = keyword.lower()
                if lower_keyword not in seen:
                    seen.add(lower_keyword)
                    final_keywords.append(keyword)
            
            return final_keywords[:10]  # Limit to top 10 keywords
            
        except Exception as e:
            print(f"Error in keyword extraction: {str(e)}")
            return []
    
    def _metadata(self, xml_part):
        soup = self.bs4_client(xml_part, 'html.parser')
        title_tag = soup.find('div', class_='list-title mathjax')
        title = title_tag.get_text(strip=True).replace('Title:', '').strip() if title_tag else None

        # abstract
        abstract_tag = soup.find('p', class_='mathjax')
        abstract = abstract_tag.get_text(strip=True) if abstract_tag else None

        # authors
        authors_section = soup.find('div', class_='list-authors')
        authors = [author.get_text(strip=True) for author in authors_section.find_all('a')] if authors_section else []

        # comments
        comments_tag = soup.find('div', class_='list-comments mathjax')
        comments = comments_tag.get_text(strip=True).replace('Comments:', '').strip() if comments_tag else ''
        
        # figures, pages, tables
        figures_match = re.search(r'(\d+)\s+figures', comments)
        figures = int(figures_match.group(1)) if figures_match else None
        pages_match = re.search(r'(\d+)\s+pages', comments)
        pages = int(pages_match.group(1)) if pages_match else None
        tables_match = re.search(r'(\d+)\s+table[s]?', comments)
        tables = int(tables_match.group(1)) if tables_match else None

        # PDF link
        pdf_tag = soup.find('a', title='Download PDF')
        pdf_link = pdf_tag['href'] if pdf_tag else None

        # primary subject
        primary_subject_tag = soup.find('span', class_='primary-subject')
        primary_subject = primary_subject_tag.get_text(strip=True) if primary_subject_tag else None

        # secondary subjects
        subjects_section = soup.find('div', class_='list-subjects')
        if subjects_section:
            subjects_text = subjects_section.get_text(strip=True)
            subjects_split = subjects_text.split(';')
            secondary_subjects = [subject.strip() for subject in subjects_split[1:]] if len(subjects_split) > 1 else None
        else:
            secondary_subjects = None

        # journal
        submitted_journal = None
        if comments:
            for prefix in ['Submitted to ', 'Accepted to ', 'Accepted for publication in ', 'Accepted by ', 'Submitted by ']:
                if prefix in comments:
                    submitted_journal = comments.split(prefix)[-1]
                    break

        # published
        published_tag = soup.find('div', class_='list-journal-ref')
        published_journal = published_tag.get_text(strip=True).replace('Journal-ref:', '').strip() if published_tag else None

        return {
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'figures': figures,
            'pages': pages,
            'tables': tables,
            'pdf_link': f'arxiv.org{pdf_link}' if pdf_link else None,
            'primary_subject': primary_subject,
            'secondary_subjects': secondary_subjects,
            'submitted_journal': submitted_journal,
            'published_journal': published_journal
        }
    
    def process_dataframe(self, df):
        """Clean the scraped columns and enrich every row from its PDF.

        Subjects and journal names are cleaned first. Then, for each row, the PDF is
        processed and the 'pages', 'figures', 'tables', 'keywords' and 'affiliations'
        cells are filled from the result. The frame is modified in place and returned.
        """
        df = self._clean_journal(self._clean_subjects(df))

        # Both enrichment columns start out empty
        for column in ('keywords', 'affiliations'):
            df[column] = None

        metric_columns = ('pages', 'figures', 'tables')
        row_numbers = tqdm(range(len(df)), desc='Processing PDFs, for metrics, keywords and affiliations')
        for row in row_numbers:
            current_metrics = {column: df[column][row] for column in metric_columns}

            # Skip the download when nothing is missing for this row
            something_missing = any(
                pd.isna(value)
                for value in (*current_metrics.values(), df['keywords'][row], df['affiliations'][row])
            )
            if not something_missing:
                continue

            pdf_data = self._process_pdf(
                df['pdf_link'][row],
                current_metrics,
                authors=df['authors'][row] if 'authors' in df else None
            )
            if not pdf_data:
                continue

            for column in (*metric_columns, 'keywords'):
                df.at[row, column] = pdf_data[column]
            # Affiliations are only written when something was found
            if pdf_data['affiliations']:
                df.at[row, 'affiliations'] = pdf_data['affiliations']

        return df
    
    def construct_dataframe(self):
        """Scrape the listing page and return the fully processed dataframe.

        The catch-up page is used when ``self.date`` is set, otherwise the ``/new``
        page. The number of papers is read from the "New submissions" header (second
        word inside its parentheses) or, when that header is absent, taken as the
        number of ``<title>`` tags. Each paper's markup runs from its named anchor to
        the next named anchor.

        Returns:
            pandas.DataFrame: one row per paper after ``process_dataframe``; an empty
            frame when the page cannot be interpreted.
        """
        # Get initial data
        if self.date:
            html = self._retrieve_html_dt()
        else:
            html = self._retrieve_html()

        soup = self.bs4_client(html, 'html.parser')

        h3_tag = soup.find('h3', string=lambda x: x and 'New submissions' in x)
        title_tag = soup.find('title')
        if not h3_tag and not title_tag:
            print("New submissions header not found")
            return pd.DataFrame()

        try:
            if not h3_tag:
                number_of_papers = len(soup.find_all('title'))
            else:
                number_of_papers = int(h3_tag.string.split('(')[1].split()[1])

            print(f"Number of papers: {number_of_papers}")
        except (IndexError, ValueError):
            print("Could not extract number of papers")
            return pd.DataFrame()

        # Get metadata for all papers
        items = soup.find_all('a', attrs={'name': True})

        if not items:
            print("No paper items found")
            return pd.DataFrame()

        # Never index past the anchors actually present on the page.
        paper_count = min(number_of_papers, len(items))

        # Serialise the document and the anchors once instead of on every iteration.
        soup_text = str(soup)
        anchors = [str(item) for item in items[:paper_count + 1]]

        all_metadata = []
        for i in tqdm(range(paper_count), desc='Processing Papers'):
            start_index = soup_text.find(anchors[i])
            if start_index == -1:
                continue

            # The entry ends where the next anchor starts, or at the end of the page
            # for the very last anchor.
            end_index = -1
            if i + 1 < len(anchors):
                end_index = soup_text.find(anchors[i + 1], start_index + 1)
            if end_index == -1:
                end_index = len(soup_text)

            all_metadata.append(self._metadata(soup_text[start_index:end_index]))

        if not all_metadata:
            print("No paper items found")
            return pd.DataFrame()

        # Create and process dataframe
        df = pd.DataFrame(all_metadata)
        return self.process_dataframe(df)

In [None]:
# Subject groupings kept for reference.
# NOTE(review): 'q_finanace' looks like a typo for 'q_finance', and the labels in the
# first two lists do not look like the identifiers used in arXiv listing URLs
# (compare 'astro-ph' below) — confirm before passing them to ArxivDataframe.
left_subject_class = [
    'computer_science',
    'economics',
    'eess',
    'mathematics',
    'physics',
]
right_subject_class = [
    'q_biology',
    'q_finanace',
    'statistics',
]
the_physics_list = [
    'astro-ph',
    'cond-mat',
    'gr-qc',
    'hep-ex',
    'hep-lat',
    'hep-ph',
    'hep-th',
    'math-ph',
    'nlin',
    'nucl-ex',
    'nucl-th',
    'physics',
    'quant-ph',
]

In [28]:
# Scrape the astro-ph catch-up listing for 2025-01-03 and enrich every paper with
# PDF-derived metrics, keywords and affiliations.
arxiv_data = ArxivDataframe(subject='astro-ph', date='2025-01-03')
df = arxiv_data.construct_dataframe()

yo
https://arxiv.org/catchup/astro-ph/2025-01-03?abs=True
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">

<head>  <title></title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png">
  <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest">
  <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5">
  <meta name="msapplication-TileColor" content="#da532c">
  <meta name="theme-color" content="#ffffff

Processing Papers: 100%|██████████| 21/21 [00:00<00:00, 25.27it/s]
Processing PDFs, for metrics, keywords and affiliations:   5%|▍         | 1/22 [00:01<00:24,  1.14s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['John F. Wu']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: Institute, 3700 San Martin Dr, Baltimore, MD 21218 2Department of Physics & Astronomy, Johns Hopkins University, 3400 N Charles St, Baltimore, MD 21218 3Department of Computer Science, Johns Hopkins University, 3400 N Charles St, Baltimore, MD 21218 ABSTRACT Galaxy appearances reveal the physics of how they formed and evolved.

Found 1 unique potential affiliations

Processing author: John F. Wu (searching for: Wu)
No affiliations found near author John F. Wu

=== Affiliation Extraction Complete ===
Final affiliations: [None]


Processing PDFs, for metrics, keywords and affiliations:   9%|▉         | 2/22 [00:05<01:01,  3.07s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Shi-Ju Kang', 'Shan-Shan Ren', 'Yong-Gang Zheng', 'Qingwen Wu']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: School of Physics and Electrical Engineering, Liupanshui Normal University, Liupanshui, Guizhou, 553004, People’s Republic of China 2Institute of Space Sciences, Shandong University, Weihai, Shandong, 264209, People’s Republic of China 3Department of Physics, Yunnan Normal University, Kunming, Yunnan, 650092, People’s Republic of China 4Department of Astronomy, School of Physics, Huazhong University of Science and Technology, Wuhan, Hubei, 430074, People’s Republic of China (Received March 1, 2021; Revised April 1, 2021; Accepted January 3, 2025; Published January 3, 2025) Submitted to ApJ ABSTRACT The changing-look blazars (CLBs) are the blazars that their optical spectral lines at different epochs show a signific

Processing PDFs, for metrics, keywords and affiliations:  14%|█▎        | 3/22 [00:06<00:39,  2.08s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Mathilde Mâlin', 'Anthony Boccaletti', 'Clément Perrot', 'Pierre Baudoz', 'Daniel Rouan', 'Pierre-Olivier Lagage', 'Rens Waters', 'Manuel Güdel', 'Thomas Henning', 'Bart Vandenbussche', 'Olivier Absil', 'David Barrado', 'Benjamin Charnay', 'Elodie Choquet', 'Christophe Cossou', 'Camilla Danielski', 'Leen Decin', 'Adrian M. Glauser', 'John Pye', 'Goran Olofsson', 'Alistair Glasse', 'Polychronis Patapis', 'Pierre Royer', 'Silvia Scheithauer', 'Eugene Serabyn', 'Pascal Tremblin', 'Niall Whiteford', 'Ewine F. van Dishoeck', 'Göran Ostlin', 'Tom P. Ra', 'Gillian Wright']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Introduction
Truncated text at marker: 1.
Truncated text at marker: Methods

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Mathilde Mâlin (searching for: Mâlin)
No affiliations found near author Mathilde Mâlin

Processing author: Anthony Boc

Processing PDFs, for metrics, keywords and affiliations:  18%|█▊        | 4/22 [00:06<00:25,  1.40s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['S.I. Ipatov']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract
Truncated text at marker: I.

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: S.I. Ipatov (searching for: Ipatov)
Could not find author S.I. Ipatov in text

=== Affiliation Extraction Complete ===
Final affiliations: [None]


Processing PDFs, for metrics, keywords and affiliations:  23%|██▎       | 5/22 [00:07<00:21,  1.28s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Somayeh Khakpash', 'Federica Bianco', 'Georgios Vernardos', 'Gregory Dobler', 'Charles Keeton']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: University–New Brunswick, Department of Physics & Astronomy, 136 Frelinghuysen Rd, Piscataway, NJ 08854, USA 2LSST-DA Catalyst Fellow 3University of Delaware Department of Physics and Astronomy 217 Sharp Lab Newark, DE 19716 USA 4University of Delaware Joseph R.
Found potential affiliation: School of Public Policy and Administration, 184 Academy St, Newark, DE 19716 USA 5University of Delaware Data Science Institute 6Vera C.
Found potential affiliation: Department of Physics and Astronomy, Lehman College of the City University of New York, Bronx, NY, 10468, USA 8Department of Astrophysics, American Museum of Natural History, Central Park West and 79th Street, NY, 10024, USA ABSTRACT 

Processing PDFs, for metrics, keywords and affiliations:  27%|██▋       | 6/22 [00:08<00:19,  1.21s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Hisashi Hayakawa', 'Edward W. Cliver', 'Frédéric Clette', 'Yusuke Ebihara', 'Shin Toriumi', 'Ilaria Ermolli', 'Theodosios Chatzistergos', 'Kentaro Hattori', 'Delores J. Knipp', 'Séan P. Blake', 'Gianna Cauzzi', 'Kevin Reardon', 'Philippe-A. Bourdin', 'Dorothea Just', 'Mikhail Vokhmyanin', 'Keitaro Matsumoto', 'Yoshizumi Miyoshi', 'José R. Ribeiro', 'Ana P. Correia', 'David M. Willis', 'Matthew N. Wild', 'Sam M. Silverman']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...
Found potential affiliation: Institute for Space-Earth Environmental Research and Institute for Advanced Researches, Nagoya University, Nagoya 4648601, Japan; hisashi@nagoya-u.
Found potential affiliation: Laboratory, Harwell Campus, Didcot OX11 0QX, UK3National Solar Observatory, Boulder, CO 80303, USA; ecliver@nso.
Found potential affiliation: Center SILSO, Observatoire Royal de Belgiq

Processing PDFs, for metrics, keywords and affiliations:  32%|███▏      | 7/22 [00:14<00:39,  2.61s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Philippe-A. Bourdin', 'Yasuhito Narita']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Philippe-A. Bourdin (searching for: Bourdin)
Could not find author Philippe-A. Bourdin in text

Processing author: Yasuhito Narita (searching for: Narita)
Could not find author Yasuhito Narita in text

=== Affiliation Extraction Complete ===
Final affiliations: [None, None]


Processing PDFs, for metrics, keywords and affiliations:  36%|███▋      | 8/22 [00:18<00:43,  3.13s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Di Wu', 'Jing-Zhi Zhou', 'Yu-Ting Kuang', 'Zhi-Chao Li', 'Zhe Chang', 'Qing-Guo Huang']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...
Found potential affiliation: School of Fundamental Physics and Mathematical Sciences, Hangzhou Institute for Advanced Study, UCAS, Hangzhou 310024, China bCenter for Joint Quantum Studies and Department of Physics, School of Science, Tianjin University, Tianjin 300350, China cInstitute of High Energy Physics, Chinese Academy of Sciences, Beijing 100049, China dUniversity of Chinese Academy of Sciences, Beijing 100049, China eCASKeyLaboratoryofTheoreticalPhysics, InstituteofTheoreticalPhysics, ChineseAcademy of Sciences E-mail: wudi@ucas.

Found 1 unique potential affiliations

Processing author: Di Wu (searching for: Wu)
No affiliations found near author Di Wu

Processing author: Jing-Zhi Zhou (searching for: Zhou)
No a

Processing PDFs, for metrics, keywords and affiliations:  41%|████      | 9/22 [00:19<00:32,  2.52s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Hui Liu', 'Hui Li', 'Sizhong Zou', 'Kaifan Ji', 'Zhenyu Jin', 'Jiahui Shan', 'Jingwei Li', 'Guanglu Shi', 'Yu Huang', 'Li Feng', 'Jianchao Xue', 'Qiao Li', 'Dechao Song', 'Ying Li']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...
Found potential affiliation: Laboratory of Dark Matter and Space Astronomy, Purple Mountain Observatory, Chinese Academy of Sciences, Nanjing 210023, China; nj.

Found 1 unique potential affiliations

Processing author: Hui Liu (searching for: Liu)
No affiliations found near author Hui Liu

Processing author: Hui Li (searching for: Li)
No affiliations found near author Hui Li

Processing author: Sizhong Zou (searching for: Zou)
No affiliations found near author Sizhong Zou

Processing author: Kaifan Ji (searching for: Ji)
Could not find author Kaifan Ji in text

Processing author: Zhenyu Jin (searching for: Jin)
No affiliations

Processing PDFs, for metrics, keywords and affiliations:  45%|████▌     | 10/22 [00:20<00:22,  1.90s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Bing Zhang']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Bing Zhang (searching for: Zhang)
No affiliations found near author Bing Zhang

=== Affiliation Extraction Complete ===
Final affiliations: [None]


Processing PDFs, for metrics, keywords and affiliations:  50%|█████     | 11/22 [00:23<00:25,  2.35s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Becca Spejcher', 'Noel D. Richardson', 'Herbert Pablo', 'Marina Beltran', 'Payton Butler', 'Eddie Avila']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords
Truncated text at marker: 1.

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Becca Spejcher (searching for: Spejcher)
Could not find author Becca Spejcher in text

Processing author: Noel D. Richardson (searching for: Richardson)
Could not find author Noel D. Richardson in text

Processing author: Herbert Pablo (searching for: Pablo)
Could not find author Herbert Pablo in text

Processing author: Marina Beltran (searching for: Beltran)
Could not find author Marina Beltran in text

Processing author: Payton Butler (searching for: Butler)
Could not find author Payton Butler in text

Processing author: Eddie Avila (searching for: Avila)
Could not find author Eddie Avila in text

=== Affiliation

Processing PDFs, for metrics, keywords and affiliations:  55%|█████▍    | 12/22 [00:24<00:19,  1.97s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Sripan Mondal', 'Akash Bairagi', 'A. K. Srivastava']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: Department of Physics, Indian Institute of Technology (BHU), Varanasi-221005, India.
Found potential affiliation: Department of Physics, Indian Institute of Technology (BHU), Varanasi-221005, India 3Department of Physics, Indian Institute of Technology (BHU), Varanasi-221005, India.

Found 2 unique potential affiliations

Processing author: Sripan Mondal (searching for: Mondal)
Found matching affiliation: Department of Physics, Indian Institute of Technology (BHU), Varanasi-221005, India.

Processing author: Akash Bairagi (searching for: Bairagi)
Found matching affiliation: Department of Physics, Indian Institute of Technology (BHU), Varanasi-221005, India.

Processing author: A. K. Srivastava (searching for: Srivastava)
Coul

Processing PDFs, for metrics, keywords and affiliations:  59%|█████▉    | 13/22 [00:25<00:14,  1.59s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Takeru K. Suzuki', 'Keiichi Ohnaka', 'Yuki Yasuda']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...
Found potential affiliation: School of Arts & Sciences, The University of Tokyo, 3-8-1, Komaba, Meguro, Tokyo 153-8902, Japan; Department of Astronomy, The University of Tokyo, 7-3-1, Hongo, Bunkyo, Tokyo, 113-0033, Japan; Komaba Institute for Science, The University of Tokyo, 3-8-1 Komaba, Meguro, Tokyo 153-8902, Japan 2Instituto de Astrofísica, Departamento de Física y Astronomía, Facultad de Ciencias Exactas, Universidad Andrés Bello, Fernández Concha 700, Las Condes, Santiago, Chile 3Division of Physics, Faculty of Science, Kita 10 Nishi 8, Kita-ku, Hokkaido University, Sapporo 060-0810, Japan ∗E-mail: stakeru@ea.

Found 1 unique potential affiliations

Processing author: Takeru K. Suzuki (searching for: Suzuki)
Could not find author Takeru K. Suzuki 

Processing PDFs, for metrics, keywords and affiliations:  64%|██████▎   | 14/22 [00:27<00:13,  1.67s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Shi Pi', 'Misao Sasaki', 'Volodymyr Takhistov', 'Jianing Wang']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Abstract

Looking for affiliation blocks...
Found potential affiliation: Laboratory of Theoretical Physics, Institute of Theoretical Physics, Chinese Academy of Sciences, Beijing 100190, China bCenter for High Energy Physics, Peking University, Beijing 100871, China cKavli Institute for the Physics and Mathematics of the Universe (WPI), UTIAS, The Uni- versity of Tokyo, Kashiwa, Chiba 277-8583, Japan dCenter for Gravitational Physics and Quantum Information, Yukawa Institute for Theoret- ical Physics, Kyoto University, Kyoto 606-8502, Japan eLeung Center for Cosmology and Particle Astrophysics, National Taiwan University, Taipei 10617 fInternational Center for Quantum-field Measurement Systems for Studies of the Universe and Particles (QUP,WPI), High Energy Accelerator Research Organiza

Processing PDFs, for metrics, keywords and affiliations:  68%|██████▊   | 15/22 [00:30<00:15,  2.22s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Miftahul Hilmi', 'Nicha Leethochawalit', 'Michele Trenti', 'Benjamin Metha']
Successfully read page 1
Successfully read page 2
Truncated text at marker: 1.

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Miftahul Hilmi (searching for: Hilmi)
Could not find author Miftahul Hilmi in text

Processing author: Nicha Leethochawalit (searching for: Leethochawalit)
Could not find author Nicha Leethochawalit in text

Processing author: Michele Trenti (searching for: Trenti)
Could not find author Michele Trenti in text

Processing author: Benjamin Metha (searching for: Metha)
Could not find author Benjamin Metha in text

=== Affiliation Extraction Complete ===
Final affiliations: [None, None, None, None]


Processing PDFs, for metrics, keywords and affiliations:  73%|███████▎  | 16/22 [00:35<00:17,  2.93s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Qiguo Tian', 'Lei Hao', 'Yipeng Zhou', 'Xiheng Shi', 'Tuo Ji', 'Peng Jiang', 'Lin Lin', 'Zhenya Zheng', 'Hongyan Zhou']
Successfully read page 1
Successfully read page 2
Truncated text at marker: 1.

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Qiguo Tian (searching for: Tian)
Could not find author Qiguo Tian in text

Processing author: Lei Hao (searching for: Hao)
Could not find author Lei Hao in text

Processing author: Yipeng Zhou (searching for: Zhou)
Could not find author Yipeng Zhou in text

Processing author: Xiheng Shi (searching for: Shi)
Could not find author Xiheng Shi in text

Processing author: Tuo Ji (searching for: Ji)
Could not find author Tuo Ji in text

Processing author: Peng Jiang (searching for: Jiang)
Could not find author Peng Jiang in text

Processing author: Lin Lin (searching for: Lin)
Could not find author Lin Lin in text

Processing author: Zhenya Z

Processing PDFs, for metrics, keywords and affiliations:  77%|███████▋  | 17/22 [00:36<00:11,  2.35s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['T. Şahin', 'F. Güney', 'S.A. Şentürk', 'N. Çınar', 'M. Marışmak']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: University, Faculty of Science, Department of Space Sciences and Technologies 07058, Antalya, Türkiye 2Institute of Graduate Studies in Science, Akdeniz University, Türkiye ABSTRACT ThisstudyintroducesalinelistfortheabundanceanalysisofF-andG-typestarsacrossthe4080–9675Åwavelength range.

Found 1 unique potential affiliations

Processing author: T. Şahin (searching for: Şahin)
No affiliations found near author T. Şahin

Processing author: F. Güney (searching for: Güney)
No affiliations found near author F. Güney

Processing author: S.A. Şentürk (searching for: Şentürk)
No affiliations found near author S.A. Şentürk

Processing author: N. Çınar (searching for: Çınar)
No affiliations found near author N. Çınar

Proc

Processing PDFs, for metrics, keywords and affiliations:  82%|████████▏ | 18/22 [00:41<00:12,  3.20s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['V. Hocdé', 'A. Matter', 'N. Nardetto', 'A. Gallenne', 'P. Kervella', 'A. Mérand', 'G. Pietrzyński', 'W. Gieren', 'J. Leftley', 'S. Robbe-Dubois', 'B. Lopez', 'M. C. Bailleul', 'G. Bras', 'R. Smolec', 'P. Wielgórski', 'G. Hajdu', 'A. Afanasiev']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Introduction
Truncated text at marker: 1.
Truncated text at marker: Methods

Looking for affiliation blocks...
Found potential affiliation: Centre, Polish Academy of Sciences, Bartycka 18, 00-716 Warszawa, Poland email : vhocde@camk.
Found potential affiliation: Laboratory for Astronomy, IRL 3386, CNRS, Casilla 36-D, Santiago, Chile 5LESIA, Observatoire de Paris, Université PSL, CNRS, Sorbonne Université, Université Paris-Cité, 5 Place Jules Janssen,92195 Meudon, France, 6European Southern Observatory, Karl-Schwarzschild-Str.

Found 2 unique potential affiliations

Processing author: V. Hocdé (searching for: 

Processing PDFs, for metrics, keywords and affiliations:  86%|████████▋ | 19/22 [00:44<00:09,  3.02s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Guanwen Fang', 'Yao Dai', 'Zesen Lin', 'Chichun Zhou', 'Jie Song', 'Yizhou Gu', 'Xiaotong Guo', 'Anqi Mao', 'Xu Kong']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Introduction
Truncated text at marker: 1.

Looking for affiliation blocks...
Found potential affiliation: Institute of Astronomy and Astrophysics, Anqing Normal University, Anqing 246133, People’s Republic of China, e-mail: wen@mail.
Found potential affiliation: Department of Physics, The Chinese University of Hong Kong, Shatin, N.
Found potential affiliation: School of Engineering, Dali University, Dali 671003, People’s Republic of China e-mail: zhouchichun@dali.
Found potential affiliation: Department of Astronomy, University of Science and Technology of China, Hefei 230026, China, e-mail: xkong@ustc.
Found potential affiliation: School of Astronomy and Space Science, University of Science and Technology of China, Hefei 230026, Pe

Processing PDFs, for metrics, keywords and affiliations:  91%|█████████ | 20/22 [00:49<00:07,  3.59s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Yanping Cong', 'Bin Yue', 'Yidong Xu', 'Furen Deng', 'Jiajun Zhang', 'Xuelei Chen']
Successfully read page 1
Successfully read page 2
Truncated text at marker: Keywords

Looking for affiliation blocks...
Found potential affiliation: Laboratory of Radio Astronomy and Technology, Chinese Academy of Sciences, 20A Datun Road, Chaoyang District, Beijing 100101, China 4School of Astronomy and Space Science, University of Chinese Academy of Sciences, Beijing 100049, China ABSTRACT Loop I/North Polar Spur (NPS) is the giant arc structure above the Galactic plane observed in the radio sky.
Found potential affiliation: Center (GC), associated with the Fermi Bubble and eROSITA X-ray bubble.

Found 2 unique potential affiliations

Processing author: Yanping Cong (searching for: Cong)
No affiliations found near author Yanping Cong

Processing author: Bin Yue (searching for: Yue)
No affiliations found near author Bin Yue

Processing auth

Processing PDFs, for metrics, keywords and affiliations:  95%|█████████▌| 21/22 [00:55<00:04,  4.48s/it]


=== Starting Affiliation Extraction ===
Processing authors: ['Tanvi Sharma', 'Wen-Ping Chen', 'Beth Biller', 'Loic Albert', 'Belinda Damian', 'Jessy Jose', 'Bhavana Lalchand', 'Michael C. Liu', 'Yumiko Oasa']
Successfully read page 1
Successfully read page 2
Truncated text at marker: I.

Looking for affiliation blocks...

Found 0 unique potential affiliations

Processing author: Tanvi Sharma (searching for: Sharma)
Could not find author Tanvi Sharma in text

Processing author: Wen-Ping Chen (searching for: Chen)
Could not find author Wen-Ping Chen in text

Processing author: Beth Biller (searching for: Biller)
Could not find author Beth Biller in text

Processing author: Loic Albert (searching for: Albert)
Could not find author Loic Albert in text

Processing author: Belinda Damian (searching for: Damian)
Could not find author Belinda Damian in text

Processing author: Jessy Jose (searching for: Jose)
Could not find author Jessy Jose in text

Processing author: Bhavana Lalchand (searc

Processing PDFs, for metrics, keywords and affiliations:  95%|█████████▌| 21/22 [21:01<01:00, 60.08s/it]


KeyboardInterrupt: 

In [5]:
# Display the current contents of `df` as this cell's output.
df

Unnamed: 0,title,abstract,authors,figures,pages,tables,pdf_link,primary_subject,secondary_subjects,submitted_journal,published_journal,keywords,affiliations
0,Insights on Galaxy Evolution from Interpretabl...,Galaxy appearances reveal the physics of how t...,[John F. Wu],4.0,10.0,2.0,arxiv.org/pdf/2501.00089,Astrophysics of Galaxies,[Machine Learning],AAS Journals,,[Galaxies (573],[None]
1,Hunting for the candidates of Changing-Look Bl...,The changing-look blazars (CLBs) are the blaza...,"[Shi-Ju Kang, Shan-Shan Ren, Yong-Gang Zheng, ...",7.0,13.0,3.0,arxiv.org/pdf/2501.00094,High Energy Astrophysical Phenomena,,ApJ,,"[Active galactic nuclei (16) – Blazars (164, B...","[None, None, None, None]"
2,First unambiguous detection of ammonia in the ...,The newly accessible mid-infrared (MIR) window...,"[Mathilde Mâlin, Anthony Boccaletti, Clément P...",6.0,18.0,4.0,arxiv.org/pdf/2501.00104,Earth and Planetary Astrophysics,,A&A,,[Planetary systems],"[None, None, None, None, None, None, None, Non..."
3,Exchange of meteorites between the terrestrial...,The evolution of the orbits of bodies ejected ...,[S.I. Ipatov],0.0,6.0,0.0,arxiv.org/pdf/2501.00134,Earth and Planetary Astrophysics,,,Modern astronomy: from the Early Universe to e...,[planets and satellites: terrestrial planets],[None]
4,Autoencoder Reconstruction of Cosmological Mic...,Enhanced modeling of microlensing variations i...,"[Somayeh Khakpash, Federica Bianco, Georgios V...",11.0,18.0,1.0,arxiv.org/pdf/2501.00153,Instrumentation and Methods for Astrophysics,,The Astrophysical Journal,,[Interdisciplinary astronomy(804) 1],"[None, None, None, None, None]"
5,The Extreme Space Weather Event of 1872 Februa...,"We review observations of solar activity, geom...","[Hisashi Hayakawa, Edward W. Cliver, Frédéric ...",13.0,20.0,2.0,arxiv.org/pdf/2501.00176,Solar and Stellar Astrophysics,"[Earth and Planetary Astrophysics, Geophysics,...",,ApJ 959:23 (20pp) 2023,[],"[None, None, None, None, None, None, None, Non..."
6,Electromotive field in space and astrophysical...,The concept of electromotive field appears in ...,"[Philippe-A. Bourdin, Yasuhito Narita]",8.0,23.0,0.0,arxiv.org/pdf/2501.00181,Solar and Stellar Astrophysics,"[Earth and Planetary Astrophysics, High Energy...",,"Rev. Mod. Plasma Phys. 9, 1 (2025)","[Electromotive field, Dynamo mechanism, Turbul...","[None, None]"
7,Can tensor-scalar induced GWs dominate PTA obs...,Observational constraints on small-scale primo...,"[Di Wu, Jing-Zhi Zhou, Yu-Ting Kuang, Zhi-Chao...",7.0,19.0,1.0,arxiv.org/pdf/2501.00228,Cosmology and Nongalactic Astrophysics,"[High Energy Astrophysical Phenomena, General ...",,,[],"[None, None, None, None, None, None]"
8,Improving image quality of the Solar Disk Imag...,The in-flight calibration and performance of t...,"[Hui Liu, Hui Li, Sizhong Zou, Kaifan Ji, Zhen...",10.0,14.0,1.0,arxiv.org/pdf/2501.00231,Solar and Stellar Astrophysics,[Instrumentation and Methods for Astrophysics],,,"[techniques: image processing, sun: chromosphe...","[None, None, None, None, None, None, None, Non..."
9,On the Duration of Gamma-Ray Bursts,"Recently, a short-duration GRB with supernova ...",[Bing Zhang],1.0,10.0,0.0,arxiv.org/pdf/2501.00239,High Energy Astrophysical Phenomena,,,,[Gamma-ray bursts],[None]


In [None]:
def _extract_affiliations(self, pdf_reader, authors, max_pages=2):
        """
        Extract author affiliations from the first pages of a PDF.

        The header of the paper (everything before the first section heading
        such as "Abstract" or "1. Introduction") is scanned line by line for
        affiliation entries.  Each author name is then located in that header
        and the superscript-style markers that follow it (numbers, lowercase
        letters or footnote symbols) are resolved to the affiliation carrying
        the same label.  If an author has no resolvable marker, the first
        affiliation that appears within a short window after the author's
        name is used instead.

        This is a best-effort heuristic: failures are printed and reported as
        ``None`` rather than raised.

        Args:
            pdf_reader: PyPDF2.PdfReader-like object exposing ``.pages`` whose
                items provide ``extract_text()``
            authors: List of author names
            max_pages: Maximum pages to search (usually in first 2 pages)
        Returns:
            list: One entry per author, in the same order as ``authors``.
                Each entry is either ``None`` (nothing found) or a list of
                affiliation strings without duplicates, in order of discovery.
        """
        affiliations = [None] * len(authors)
        # How many characters after an author's name the fallback search covers.
        nearby_window = 200
        try:
            # Only search first few pages where affiliations typically appear
            pages_to_search = min(max_pages, len(pdf_reader.pages))

            page_texts = []
            for page_num in range(pages_to_search):
                try:
                    # Guard against a falsy result so one empty page cannot
                    # abort the whole extraction.
                    page_texts.append(pdf_reader.pages[page_num].extract_text() or "")
                except Exception as e:
                    print(f"Error reading page {page_num}: {str(e)}")
                    continue

            # Normalise whitespace but KEEP line breaks: the affiliation and
            # heading patterns below are anchored to line starts.
            full_text = "\n".join(page_texts)
            full_text = re.sub(r'[^\S\n]+', ' ', full_text)
            full_text = re.sub(r' *\n[ \n]*', '\n', full_text).strip()

            # Truncate at the first section heading.  Headings must start a
            # line and may carry a numeric/roman prefix ("1. Introduction",
            # "I. INTRODUCTION").  Bare "1." / "I." are deliberately NOT
            # markers: they collide with numbered affiliation labels and with
            # author initials.
            section_names = [
                'Abstract', 'Introduction', 'Keywords',
                'Methods', 'Background', 'Results'
            ]
            heading_alternatives = '|'.join(
                f'{name}|{name.upper()}' for name in section_names
            )
            heading_re = re.compile(
                rf'^(?:(?:[0-9]{{1,2}}|[IVX]+)[.)]?\s*)?(?:{heading_alternatives})\b',
                re.MULTILINE,
            )
            heading = heading_re.search(full_text)
            if heading:
                full_text = full_text[:heading.start()]

            # Affiliation line shapes (one affiliation per line):
            #   "1. Dept ..." / "a) Dept ..."  label followed by . or )
            #   "1Dept ..."   / "bCenter ..."  label glued to a capitalised word,
            #                                  as extracted superscripts often are
            numbered_re = re.compile(
                r'^([1-9][0-9]?|[a-z])(?:[).]\s*|(?=[A-Z]))(\S.*)$'
            )
            #   "† Dept ..."                    footnote symbol
            symbol_re = re.compile(r'^([†*§¶‡#])\s*(\S.*)$')
            #   "Affiliations: Dept ..."        explicit keyword, no label
            explicit_re = re.compile(
                r'^(?:Affiliation|Address|Institution)(?:es|s)?\s*:?\s*(\S.*)$',
                re.IGNORECASE,
            )

            labelled = {}            # marker label -> affiliation text
            affiliation_blocks = []  # every affiliation text, in document order
            for line in full_text.split('\n'):
                match = explicit_re.match(line)
                if match:
                    label, text = None, match.group(1)
                else:
                    match = numbered_re.match(line) or symbol_re.match(line)
                    if not match:
                        continue
                    label, text = match.groups()
                text = text.strip()
                affiliation_blocks.append(text)
                if label is not None:
                    # First occurrence wins if a label shows up more than once.
                    labelled.setdefault(label, text)

            # A single marker: a 1-2 digit number, a lone lowercase letter, or
            # a footnote symbol.  The look-aheads stop "20" being read out of
            # "2024" and "a" being read out of "and".
            marker_atom = (
                r'(?:[1-9][0-9]?(?![0-9])|[a-z](?![A-Za-z0-9])|[†*§¶‡#])'
            )

            # Process each author
            for i, author in enumerate(authors):
                try:
                    name_tokens = str(author).split()
                    if not name_tokens:
                        continue
                    # Escape (rather than strip) punctuation so "S.I. Ipatov"
                    # or "Jing-Zhi Zhou" still match the PDF text; tolerate any
                    # whitespace, including line breaks, between name parts.
                    name_re = r'\s+'.join(re.escape(tok) for tok in name_tokens)

                    # The name is matched case-insensitively, the markers are
                    # not (so an uppercase initial is never taken as a marker).
                    marker_re = re.compile(
                        rf'(?<!\w)(?i:{name_re})[\s{{}}]*'
                        rf'({marker_atom}(?:\s*,\s*{marker_atom})*)'
                    )

                    author_affiliations = []
                    for match in marker_re.finditer(full_text):
                        for marker in match.group(1).split(','):
                            block = labelled.get(marker.strip())
                            if block is not None:
                                author_affiliations.append(block)

                    if author_affiliations:
                        # De-duplicate while keeping discovery order.
                        affiliations[i] = list(dict.fromkeys(author_affiliations))
                        continue

                    # Fallback: If no marker resolved, look for a nearby affiliation
                    name_match = re.search(rf'(?<!\w)(?i:{name_re})', full_text)
                    if name_match:
                        start = name_match.start()
                        nearby_text = full_text[start:start + nearby_window]
                        for block in affiliation_blocks:
                            if block in nearby_text:
                                affiliations[i] = [block]
                                break

                except Exception as e:
                    print(f"Error processing author {author}: {str(e)}")
                    continue

            return affiliations

        except Exception as e:
            print(f"Error in affiliation extraction: {str(e)}")
            return [None] * len(authors)