In [None]:
import json
import re
import urllib
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def get_page_contents(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raises; this function is called once per scraped page, so unclosed
    # responses would otherwise pile up.
    with urllib.request.urlopen(url) as page:
        return page.read().decode("utf-8")

def get_abstract_urls(page_text):
    """Collect absolute URLs of the abstract pages linked from ``page_text``.

    Every ``href`` value that contains "/prb/abstract" is kept, unless it also
    contains "#fulltext". Each kept value is prefixed with the APS host.
    Document order and repeated links are preserved.
    """
    hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', page_text)
    return [
        'https://journals.aps.org' + href
        for href in hrefs
        if "/prb/abstract" in href and "#fulltext" not in href
    ]

# Extract Issue URLs

In [None]:
# Highest volume number to scrape; volumes 1..LATEST_VOL_NUMBER are visited.
LATEST_VOL_NUMBER = 108

# Maps volume number -> list of absolute issue-page URLs for that volume.
vol_issue = {}
for n in range(1, 1 + LATEST_VOL_NUMBER):
    vol_url = f'https://journals.aps.org/prb/issues/{n}#v{n}'
    page_text = get_page_contents(vol_url)
    # Raw f-string: "/" needs no escaping, and "\/" in a normal string is an
    # invalid escape sequence that newer Python versions warn about.
    partial_urls = re.findall(rf'/prb/issues/{n}/[0-9]+', page_text)
    # Drop repeated links (keeping first-seen order) so that no issue is
    # fetched or counted twice further down the notebook.
    partial_urls = list(dict.fromkeys(partial_urls))
    # Matches already start with "/", so the host carries no trailing slash
    # (otherwise the result would be "https://journals.aps.org//prb/...").
    vol_issue[n] = ['https://journals.aps.org' + u for u in partial_urls]
    print(f'\tVolume {n:>3} has {len(vol_issue[n]):>4} issues.', end='\r')

# Get Abstract URLs from Each Issue

In [None]:
%%time
# Maps volume number -> {issue number -> list of abstract URLs}.
abstract_urls_dict = {}

count_abstracts = 0
# Iterate over the volumes that were actually collected instead of repeating
# the volume count as a literal, so this cell cannot drift out of sync with
# the cell that builds `vol_issue`.
for vol_num, issue_urls in vol_issue.items():
    abstract_urls_dict[vol_num] = {}
    for issue_url in issue_urls:
        # The issue number is the last path component of the issue URL.
        issue_num = int(issue_url.split('/')[-1])
        print(f'\tVolume {vol_num:>3}, Issue {issue_num:>3}', end='\r')
        abstract_urls_dict[vol_num][issue_num] = get_abstract_urls(
            get_page_contents(issue_url))
        count_abstracts += len(abstract_urls_dict[vol_num][issue_num])

print(f'\nTotal number of abstracts: {count_abstracts}')

In [None]:
# Persist the volume -> issue -> abstract-URL mapping as indented JSON.
# JSON object keys are strings, so the integer volume/issue keys are written
# out as strings.
with open("aps_prb_abstract_urls.json", "w") as out_fh:
    out_fh.write(json.dumps(abstract_urls_dict, indent=4))

# Extract Abstract Metadata

In [None]:
def get_data_from_abstract(abstract_url):
    """Fetch an abstract page and extract its metadata.

    Args:
        abstract_url: URL of the abstract page; fetched with
            ``get_page_contents``.

    Returns:
        A dict with the keys
        ``"Title"`` (str),
        ``"Published Date"`` (str, "<day> <month> <year>"),
        ``"Number of Citations"`` (int; 0 when the page shows no
        "Citing Articles (N)" counter) and
        ``"Authors"`` (list of str, empty when no author tag is present).

    Raises:
        ValueError: If the page has no title or more than one title, no
            published date or more than one, or several *different*
            "Citing Articles" counts.
    """
    abstract_page_content = get_page_contents(abstract_url)

    # Title:
    # The lazy `.*?` stops at the first closing `"/>`. A greedy `.*` would
    # run on to the last `"/>` of the line and swallow neighbouring <meta>
    # tags whenever several of them share one line.
    title = re.findall(r'"citation_title" content="(.*?)"/>', abstract_page_content)
    if len(title) == 0:
        raise ValueError("No title was found!")
    if len(title) > 1:
        raise ValueError(f"More than one title was found!\n{title}")
    title = title[0]

    # Published Date:
    published_date = re.findall(r'Published\s+(\d{1,2}) (\w+) (\d{4})', abstract_page_content)
    if len(published_date) == 0:
        raise ValueError("No published date!")
    if len(published_date) > 1:
        raise ValueError(f"More than one published date!\n{published_date}")
    # Each match is a (day, month, year) tuple of strings.
    published_date = ' '.join(published_date[0])

    # Citations:
    # The counter may be printed several times on a page; identical values
    # are collapsed so that only conflicting counts are treated as an error.
    citing_articles = re.findall(r'Citing Articles \(([0-9]+)\)', abstract_page_content)
    citing_articles = list(set(citing_articles))
    if len(citing_articles) == 0:
        citing_articles = 0
    elif len(citing_articles) == 1:
        citing_articles = int(citing_articles[0])
    else:
        raise ValueError(f'More than one unique reference to "Citing Articles"!\n{citing_articles}')

    # Authors (one <meta> tag per author; lazy match for the same reason as
    # the title):
    authors = re.findall(r'"citation_author" content="(.*?)"/>', abstract_page_content)

    return {
        "Title": title,
        "Published Date": published_date,
        "Number of Citations": citing_articles,
        "Authors": authors,
    }

In [None]:
# # Test:
# get_data_from_abstract(abstract_urls_dict[1][1][0])

## Get Abstract Metadata:

In [None]:
# %%time
# abstract_data = []

# for vol_num, issue_dict in abstract_urls_dict.items():
#     for issue_num, abstract_urls in issue_dict.items():
#         for abs_url in abstract_urls:
#             print(f"\tVol. {vol_num:>3} Iss. {issue_num:>3} {abs_url:<128}", end="\r")
#             temp_dict = get_data_from_abstract(abs_url)
#             temp_dict['Volume'] = vol_num
#             temp_dict['Issue'] = issue_num
#             temp_dict['Abstract URL'] = abs_url
#             abstract_data.append(temp_dict)

# print("\nDone!")

## Multi-Threaded Extraction:

In [None]:
import threading

def global_issue_count(vol_issue):
    """Number every issue consecutively across all volumes.

    ``vol_issue`` maps a volume number to a list of issue URLs whose last
    path component is the issue number. The result maps each
    ``(volume number, issue number)`` tuple to a running index that starts
    at 1 and follows the iteration order of ``vol_issue``.
    """
    vol_issue_pairs = (
        (vol_num, int(issue_url.split('/')[-1]))
        for vol_num, issues in vol_issue.items()
        for issue_url in issues
    )
    return {pair: position for position, pair in enumerate(vol_issue_pairs, start=1)}

def threaded_extractor(thread_idx, num_threads, abstract_urls_dict, vol_issue_count):
    """Worker body: scrape the abstracts of the issues assigned to one thread.

    An issue belongs to this worker when its global index (looked up in
    ``vol_issue_count``) modulo ``num_threads`` equals ``thread_idx``. For
    every abstract URL of such an issue the metadata is fetched with
    ``get_data_from_abstract``, tagged with its volume, issue and URL, and
    appended to the module-level ``abstract_data_dict[thread_idx]`` list.
    """
    for vol_num, issue_dict in abstract_urls_dict.items():
        for issue_num, abstract_urls in issue_dict.items():
            owner_idx = vol_issue_count[(vol_num, issue_num)] % num_threads
            if owner_idx == thread_idx:
                for abs_url in abstract_urls:
                    print(f"\t[{thread_idx:^3}] Vol. {vol_num:^3} Iss. {issue_num:^3} {abs_url:<128}", end="\r")
                    record = get_data_from_abstract(abs_url)
                    record.update({
                        'Volume': vol_num,
                        'Issue': issue_num,
                        'Abstract URL': abs_url,
                    })
                    abstract_data_dict[thread_idx].append(record)

In [None]:
num_threads = 10
vol_issue_count = global_issue_count(vol_issue)

# One result list per worker, keyed by thread index; threaded_extractor
# appends its records to abstract_data_dict[<its index>].
abstract_data_dict = {t_idx: [] for t_idx in range(num_threads)}
threads = [
    threading.Thread(
        target=threaded_extractor,
        args=(t_idx, num_threads, abstract_urls_dict, vol_issue_count),
    )
    for t_idx in range(num_threads)
]

# Launch every worker before waiting on any of them.
for worker in threads:
    worker.start()

for worker in threads:
    worker.join()

# Flatten the per-thread lists into a single list of records.
abstract_data = [
    record
    for thread_records in abstract_data_dict.values()
    for record in thread_records
]
print(f"\nNumber of records: {len(abstract_data)}")

## Convert to Pandas DataFrame:

In [None]:
# Build a table with one row per metadata record collected in `abstract_data`.
df = pd.DataFrame(abstract_data)

# Show the table as the cell's rich output.
display(df)

In [None]:
# Export DataFrame to CSV; index=False keeps the row index out of the file.
df.to_csv("aps_prb_articles_meta_data.csv", index=False)