In [1]:
import os
import re
import fitz # PyMuPDF
import pandas as pd

# Source table; the extraction cells below read its 'Record.ID' and 'Comment' columns.
compiled_df = pd.read_csv('nih.apc.csv')

# Folder containing all uploaded PDFs.
# The default is the original, machine-specific location. Set the
# NIH_APC_FILES_DIR environment variable to point at the folder on another
# machine instead of editing this cell.
file_path = os.environ.get(
    "NIH_APC_FILES_DIR",
    r"C:\Users\sherr\OneDrive - University of Ottawa\uOttawa MIS\ROARA\nih-apc-rfi\files",
)

In [2]:
### Extracting in-text citations ###

def extract_citations(text):
    """Find APA-, MLA- and Chicago-style in-text citations in ``text``.

    Parameters
    ----------
    text : str
        Plain text to scan.

    Returns
    -------
    list of str
        Matched citation strings, grouped by pattern (in the order the
        patterns are listed below) and, within a pattern, in order of
        appearance. A citation is reported once per match, so text that
        occurs several times, or that satisfies several patterns, appears
        several times. Line breaks inside a match are collapsed to a single
        space. Matches containing a blocklist word (case-insensitive
        substring test) are dropped.
    """
    citation_patterns = [

    # APA in-text citation styles

        # (Author, Year) OR (Author & Author, Year) OR (Author et al., Year, Page#)
        r'\([^)]+?,\s*\d{4}(?:[a-z])?(?:,\s*p{1,2}\.?\s*\d+(?:-\d+)?)?\)',

        # Author (Year) OR Author et al. (Year) OR Author and Author (Year)
        r'[A-Z][a-zA-Z\-\']+(?:(?:\s+(?:and|&)\s+[A-Z][a-zA-Z\-\']+)+|\s+et\s+al\.)?\s*\(\d{4}(?:[a-z])?\)',

        # Author et al. (Author, Year)
        r'[A-Z][a-zA-Z\-\']+\s+et\s+al\.\s*\([^)]+?,\s*\d{4}\)',

    # MLA in-text citation styles

        # (Author Page#) OR (Author and Author Page#) OR (Author et al. Page#)
        # This also covers the plain single-author "(Author Page#)" form; a
        # separate pattern for that form would only report every hit twice.
        r'\([A-Z][a-zA-Z\-\']+(?:\s+and\s+[A-Z][a-zA-Z\-\']+|\s+et\s+al\.)?\s+\d+(?:-\d+)?\)',

        # ("Title" Page#)
        r'\("[^"]+"\s+\d+(?:-\d+)?\)',

    # Chicago

        # (Author Year) OR (Author Year, Page#)
        r'\([A-Za-z\s\.\-\'&]+?\s\d{4}(?:,\s\d+(?:-\d+)?)?\)',

        # (Year, Page#)
        r'\(\d{4},\s\d+(?:-\d+)?\)',

        # (Author, n.d.) OR (Author n.d., Page#)
        r'\([A-Za-z\s\.\-\']+,?\sn\.d\.(?:,\s\d+(?:-\d+)?)?\)',

    ]

    # Parenthesised cross-references such as "(Table 2)" look like MLA
    # citations; anything containing one of these words is discarded.
    blocklist = ["option", "table", "figure", "box"]

    res = []

    for pattern in citation_patterns:

        # finditer + group(0) always yields the full matched text, even if a
        # capturing group is added to a pattern later.
        for match in re.finditer(pattern, text):

            # A line break inside a citation stands where a space was in the
            # running text; replace it with one space so that the words on
            # either side are not fused together.
            citation_str = re.sub(r'\s*\n\s*', ' ', match.group(0))

            if not any(word in citation_str.lower() for word in blocklist):
                res.append(citation_str)

    return res

In [3]:
### Extracting DOIs, PMIDs, and PMCIDs ###

def extract_identifiers(text):
    """Collect the DOIs, PMIDs and PMCIDs mentioned in ``text``.

    Matching is case-insensitive. The result is one flat list: every DOI
    first, then the digits of every PMID (the "PMID" label itself is not
    kept), then every PMCID as written in the text.
    """
    identifier_patterns = (
        r'\b(10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+)\b',  # DOI
        r'PMID:?\s*(\d{1,8})',                      # PMID (captures the digits only)
        r'\bPMC\d{1,8}\b',                          # PMCID
    )

    found = []
    for pattern in identifier_patterns:
        found.extend(re.findall(pattern, text, re.IGNORECASE))
    return found

In [4]:
### Extracting URLs and web domains ###

def extract_urls(text):
    """Return the distinct URLs and bare web domains found in ``text``.

    E-mail addresses are blanked out before matching so that their host part
    is not reported as a domain. One trailing '.', ',', ';' or ')' is
    trimmed from each match. A bare domain is left out when it occurs as a
    substring of one of the extracted URLs. The returned list has no
    particular order.
    """
    # Blank out e-mail addresses
    without_emails = re.sub(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ' ', text
    )

    def _trim(hit):
        # Drop a single trailing punctuation character picked up from the sentence
        return re.sub(r'[.,;)]$', '', hit)

    urls = {_trim(hit) for hit in re.findall(r'(https?://[^\s]+)', without_emails)}
    domains = {
        _trim(hit)
        for hit in re.findall(r'\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b', without_emails)
    }

    # Keep only the domains that are not already part of an extracted URL
    standalone_domains = [
        domain for domain in domains
        if not any(domain in url for url in urls)
    ]

    return list(urls.union(standalone_domains))

In [5]:
### Extract sources from compiled dataframe ###

compiled_res = []

# Only these two fields of each record are needed.
for record_id, comment in zip(compiled_df['Record.ID'], compiled_df['Comment']):

    # pandas reads an empty CSV cell as NaN (a float), which the regex
    # helpers cannot scan; a record without comment text has no sources.
    if not isinstance(comment, str):
        continue

    citations = extract_citations(comment)
    links = extract_urls(comment)
    identifiers = extract_identifiers(comment)

    # Keep only records in which at least one kind of source was found
    if citations or links or identifiers:
        compiled_res.append({'Record.ID': record_id, 'Citation': citations, 'Identifiers': identifiers, 'Links': links})

# NOTE: this rebinds compiled_df from the raw table to the extracted sources
# (the merge cell reads it under this name), so re-running this cell requires
# re-running the cell that loads the CSV first.
# Explicit columns keep the frame and the CSV header well-formed even when no
# record yielded a source.
compiled_df = pd.DataFrame(compiled_res, columns=['Record.ID', 'Citation', 'Identifiers', 'Links'])

compiled_df.to_csv('compiled_sources.csv', index=False)

In [6]:
### Extract the sources defined above and also all hyperlinks from uploaded PDFs ###

def extract_hyperlinks(file_path):
    """Return the distinct link URIs embedded in a PDF.

    Parameters
    ----------
    file_path : str
        Path of the PDF to open with PyMuPDF.

    Returns
    -------
    list of str
        Whitespace-stripped, de-duplicated "uri" values of the entries
        returned by ``page.get_links()`` on every page, excluding
        ``mailto:``, ``tel:`` and ``whatsapp:`` targets. The order is not
        defined. An empty list is returned if the document cannot be opened
        or read.
    """
    excluded_schemes = ("mailto:", "tel:", "whatsapp:")
    hyperlinks = set()

    try:
        # The document is opened only here, inside the try/with, so that it
        # is always closed again and a file that cannot be opened produces
        # the empty result instead of an exception.
        with fitz.open(file_path) as doc:
            for page in doc:
                for link in page.get_links():
                    if "uri" in link:
                        uri = link["uri"].strip()

                        # Filter out mail, phone and WhatsApp targets
                        if not uri.lower().startswith(excluded_schemes):
                            hyperlinks.add(uri)
    except Exception:
        return []
    return list(hyperlinks)

In [7]:
### Extract text from uploaded PDFs for regex extractions ###

def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    The document at ``pdf_path`` is opened with PyMuPDF and ``get_text()``
    is called on each page in order. If opening or reading the document
    raises, an empty string is returned instead.
    """
    try:
        with fitz.open(pdf_path) as pdf:
            page_texts = [pdf_page.get_text() for pdf_page in pdf]
    except Exception:
        return ""

    return "\n".join(page_texts)

In [8]:
### Extract sources and hyperlinks from uploaded PDFs and store in separate csv ### 

uploaded_res = []

if os.path.exists(file_path):

    # Sorted so that files are processed, and rows written, in the same order on every run
    for filename in sorted(os.listdir(file_path)):

        # PDF file name is exactly 2 OR 3 digits
        if not re.match(r'^\d{2,3}\.pdf$', filename, re.IGNORECASE):
            continue

        current_file_path = os.path.join(file_path, filename)
        record_id = os.path.splitext(filename)[0]

        try:
            # Extract content
            plain_text = extract_pdf_text(current_file_path)
            embedded_links = extract_hyperlinks(current_file_path)

            # Run regex
            citations = extract_citations(plain_text)
            ids = extract_identifiers(plain_text)
            text_links = extract_urls(plain_text)
        except Exception as exc:
            # Best effort: one unreadable file must not stop the batch, but
            # report it so that a dropped record does not go unnoticed.
            print(f"Skipping {filename}: {exc!r}")
            continue

        # Combine links
        all_links = list(set(embedded_links + text_links))

        # Store results if data found. extract_identifiers returns a plain
        # list, so its truthiness is the "any identifier found" test.
        if citations or all_links or ids:
            uploaded_res.append({
                'Record.ID': record_id,
                'Citation': citations,
                'Identifiers': ids,
                'Links': all_links
            })

    if not uploaded_res:
        print("No matches found in files")
else:
    print("Folder path not found")

# Built unconditionally, with explicit columns, so that this line and the
# merge cell never hit an undefined (or stale) uploaded_df when nothing was
# found or the folder is missing.
uploaded_df = pd.DataFrame(uploaded_res, columns=['Record.ID', 'Citation', 'Identifiers', 'Links'])

uploaded_df.to_csv('uploaded_sources.csv', index=False)

In [None]:
### Merge the main sources and uploaded sources dataframes on Record.ID and store the unique results ###

def agg_func(source):
    """Merge one group's per-row lists into a single de-duplicated list.

    The result is sorted because the iteration order of a set of strings can
    differ between interpreter runs, which would make the written CSV differ
    from run to run for identical input.
    """
    return sorted(set(sum(source, [])))


# Stack the sources found in the comments and in the uploaded PDFs
combined_df = pd.concat([compiled_df, uploaded_df], ignore_index=True)

# Both frames must use the same key type so that rows for one record fall
# into the same group.
combined_df['Record.ID'] = combined_df['Record.ID'].astype(str)

final_df = combined_df.groupby("Record.ID", as_index=False).agg({
    "Citation": agg_func,
    "Identifiers": agg_func,
    "Links": agg_func
})

final_df.to_csv('all_sources.csv', index=False)