In [2]:
import pandas as pd 
import re

# File name of the responses CSV; a bare name, so it is resolved against the current working directory.
responses_csv = "nih.apc.csv"

# Load all responses into a DataFrame. Later cells read its 'Record.ID' and 'Comment' columns.
df = pd.read_csv(responses_csv)

In [None]:
### extract references and links from comments ###

def extract_links(text):
    """Return the URLs, DOIs and bare web addresses found in ``text``.

    Parameters
    ----------
    text : str
        Free-text comment to scan. Anything that is not a string (for example
        the NaN that pandas produces for an empty CSV cell) yields an empty list
        instead of raising a TypeError inside ``re``.

    Returns
    -------
    list of str
        All URL matches, followed by all DOI matches, followed by all bare
        web-address matches, each group in order of appearance. The groups are
        not de-duplicated against each other, so the host part of a URL is
        also reported as a web address.
    """
    if not isinstance(text, str):
        return []

    # define url pattern as starting with http OR https and running up to the next whitespace.
    # This is a single pattern without capture groups so findall returns plain strings.
    url_pattern = r'https?://[^\s]+'

    # define web address as any letter preceding and following a period, e.g., openapc.net
    web_address = r'([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

    # DOI prefix and suffix are matched case-insensitively (see the IGNORECASE flag below),
    # so "DOI:" and lower-case suffix characters are accepted as well.
    doi_pattern = r'(doi:\s*10\.\d{4,9}/[-._;()/:A-Z0-9]+)'

    return (
        re.findall(url_pattern, text)
        + re.findall(doi_pattern, text, flags=re.IGNORECASE)
        + re.findall(web_address, text)
    )

def extract_citations(text):
    """Return citation-like substrings found in ``text``.

    Each pattern below is applied independently, so one citation can be
    reported by several patterns (e.g. both ``'Ginther et al.'`` and
    ``'Ginther et al. (Science, 2011)'``). Results are grouped by pattern, in
    the order the patterns are listed, and within a pattern in order of
    appearance.

    Parameters
    ----------
    text : str
        Free-text comment to scan. Anything that is not a string (for example
        the NaN that pandas produces for an empty CSV cell) yields an empty list.

    Returns
    -------
    list of str
        The matched citation strings.
    """
    if not isinstance(text, str):
        return []

    res = []

    patterns_list = [

        r'\([^()]+?,\s*\d{4}\)',  # (Author, Year)
        r'[A-Z][a-zA-Z]+ et al\.', # Author et al.
        # (Author et al., Year) -- the capture group makes this one report the
        # text inside the parentheses only.
        r'\(([^()]+? et al\., \s*\d{4})\)',
        # The parentheses around the year are escaped so they match literally and
        # the whole citation is returned, not just the year.
        r'[A-Z][a-zA-Z]+ et al\. \(\d{4}\)',  # Author et al. (Year)
        r'[A-Z][a-zA-Z]+ and [A-Z][a-zA-Z]+ \(\d{4}\)',  # Author and Author (Year)
        r'[A-Z][a-zA-Z]+ & [A-Z][a-zA-Z]+ \(\d{4}\)',  # Author & Author (Year)
        r'[A-Z][a-zA-Z]+ et al\. \([^()]+?,\s*\d{4}\)',  # Author et al. (Author, 2011)

    ]
    for pattern in patterns_list:
        for match in re.finditer(pattern, text):
            # Patterns with a capture group report the captured text; all others
            # report the full match.
            res.append(match.group(1) if match.re.groups else match.group(0))
    return res

In [9]:
### check individual cases ###

# Spot-check the extractor on a single comment. df['Comment'][139] looks the row up
# by index label; df was read without index_col, so label 139 is also the 140th row.
extract_citations(df['Comment'][139])


['(Science, 2011)', 'Ginther et al.', 'Ginther et al. (Science, 2011)']

In [10]:
### loop over all comments and extract citations and links to a list with their corresponding Record.ID ###

# One record per comment that produced at least one citation or link;
# comments with neither are left out entirely.
citations_data = []

for record_id, comment in zip(df['Record.ID'], df['Comment']):
    citations = extract_citations(comment)
    links = extract_links(comment)
    if citations or links:
        record = {'Record.ID': record_id, 'Citation': citations, 'Links': links}
        citations_data.append(record)

# Build the frame once from the collected records, preview it, and persist it.
citations_df = pd.DataFrame(citations_data)
print(citations_df.head(20))

citations_df.to_csv('nih_apc_citations.csv', index=False)

    Record.ID                                           Citation  \
0           6                                                 []   
1          16                              [(Coalition S, 2022)]   
2          24                                                 []   
3          42                                                 []   
4          52                                                 []   
5          57                                                 []   
6          78                                                 []   
7         115                                                 []   
8         140  [(Science, 2011), Ginther et al., Ginther et a...   
9         144                                                 []   
10        159                                                 []   
11        166                                                 []   
12        169                               [(from Jan 1, 2025)]   
13        193                                   

In [None]:
### retrieve citations and links in-context with surrounding text ###
def get_context(text, target, window=50):
    """Return each occurrence of ``target`` in ``text`` with its surrounding characters.

    ``target`` is matched literally (it is escaped before being used as a
    regular expression). For every non-overlapping occurrence the returned
    snippet spans up to ``window`` characters before the match and up to
    ``window`` characters after it, clipped to the bounds of ``text``.

    Returns a list of snippets in order of appearance; empty if there is no match.
    """
    literal = re.escape(target)
    text_length = len(text)
    return [
        text[max(hit.start() - window, 0):min(hit.end() + window, text_length)]
        for hit in re.finditer(literal, text)
    ]