In [1]:
import json
import os
import re
import time
import urllib
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# import socks
# import socket
# import ssl
# from urllib.request import Request, urlopen

# IP_ADDR = '66.97.37.164'
# PORT = 80

# n = 108
# url = f'https://journals.aps.org/prb/issues/{n}#v{n}'

# ctx = ssl.create_default_context()
# ctx.check_hostname = False
# ctx.verify_mode = ssl.CERT_NONE

# request = Request(url)
# socks.set_default_proxy(socks.SOCKS5, IP_ADDR, PORT)
# socket.socket = socks.socksocket
# response = urlopen(request, context=ctx)

# print(response.read())

In [None]:


# proxy_support = urllib.request.ProxyHandler({
#     'http' : 'http://66.97.37.164:80', 
#     'https': 'https://66.97.37.164:80'
# })
# opener = urllib.request.build_opener(proxy_support)
# urllib.request.install_opener(opener)
# with urllib.request.urlopen(url) as response:
#     contents = response.read()
# print(contents)

# Crawl APS PRB

In [2]:
# Number of the most recent PRB volume; volumes are enumerated from 1 up to
# and including this value.
LATEST_VOL_NUMBER = 108
# Proxy hosts tried earlier. The assignment is commented out, so PROXY_HOST is
# undefined unless this line is re-enabled.
# PROXY_HOST = "http://47.91.88.100:8080"  #"http://81.171.24.199"  # "http://91.211.245.176"

In [13]:
def get_page_contents(url, proxy_host=None):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        proxy_host: Optional HTTP proxy given as ``"host:port"``. A leading
            scheme such as ``"http://"`` is tolerated and stripped, because
            ``Request.set_proxy`` expects a bare host. ``None`` or ``''``
            means the request is sent directly.

    Returns:
        The decoded body. The charset announced in the response headers is
        used when present; otherwise UTF-8 is assumed.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError``, a
            subclass, is raised for error status codes such as 403).
    """
    if proxy_host:
        request = urllib.request.Request(url)
        # set_proxy() wants "host:port"; drop any "scheme://" prefix and
        # trailing slash so values like "http://1.2.3.4:8080" also work.
        proxy = proxy_host.split("://", 1)[-1].rstrip("/")
        request.set_proxy(proxy, 'http')
    else:
        request = url

    # The context manager closes the connection even if read/decode fails;
    # this matters when the function is called hundreds of thousands of times.
    with urllib.request.urlopen(request) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset)

def get_abstract_urls(page_text):
    """Collect absolute abstract-page URLs from the HTML of an issue page.

    Every ``href`` value containing ``/prb/abstract`` is kept, except links
    that point at the ``#fulltext`` anchor. Order and duplicates are
    preserved, and each link is prefixed with the journal's host.
    """
    host = 'https://journals.aps.org'
    abstract_links = []
    for match in re.finditer(r'href=[\'"]?([^\'" >]+)', page_text):
        link = match.group(1)
        if "/prb/abstract" in link and "#fulltext" not in link:
            abstract_links.append(host + link)
    return abstract_links

## Extract Issue URLs:

In [None]:
# vol_issue = {}
# for n in range(1, 1 + LATEST_VOL_NUMBER):
#     vol_url = f'https://journals.aps.org/prb/issues/{n}#v{n}'
#     page_text = get_page_contents(vol_url, proxy_host=PROXY_HOST)
#     partial_urls = re.findall(f'\/prb\/issues/{n}/[0-9]+', page_text)
#     vol_issue[n] = ['https://journals.aps.org/' + u for u in partial_urls]
#     print(f'\tVolume {n:>3} has {len(vol_issue[n]):>4} issues.', end='\r')

## Get Abstract URLs from each Issue:

In [4]:
%%time

abstract_urls_dict = {}
count_abstracts = 0
try:
    with open("aps_prb_abstract_urls.json", "r") as file:
      abstract_urls_dict = json.load(file)
except:
    for vol_num in range(1, 1 + 108):
        abstract_urls_dict[vol_num] = {}
        for issue_url in vol_issue[vol_num]:
            issue_num = int(issue_url.split('/')[-1])
            print(f'\tVolume {vol_num:>3}, Issue {issue_num:>3}', end='\r')
            abstract_urls_dict[vol_num][issue_num] = get_abstract_urls(
              get_page_contents(issue_url, proxy_host=PROXY_HOST))
            count_abstracts += len(abstract_urls_dict[vol_num][issue_num])
    with open("aps_prb_abstract_urls.json", "w") as file:
        json.dump(abstract_urls_dict, file, indent=4)
else:
    for vol_num, issue_dict in abstract_urls_dict.items():
        for issue_num, abstract_urls in issue_dict.items():
            count_abstracts += len(abstract_urls)

print(f'\nTotal number of abstracts: {count_abstracts}')


Total number of abstracts: 216964
CPU times: user 48.6 ms, sys: 10.1 ms, total: 58.8 ms
Wall time: 64.1 ms


## Extract Abstract Metadata:

In [6]:
def get_data_from_abstract(abstract_url):
    """Download one abstract page and extract its metadata.

    Args:
        abstract_url: URL of the abstract page to fetch.

    Returns:
        A dict with the keys ``"Title"`` (str), ``"Published Date"``
        (``"<day> <month> <year>"`` string), ``"Number of Citations"`` (int,
        0 when the page shows no "Citing Articles" counter), ``"Authors"``
        (list of str, possibly empty) and ``"Contents"`` (the raw page text).

    Raises:
        ValueError: If the page has no title or several titles, no published
            date or several of them, or conflicting "Citing Articles" counts.
    """
    abstract_page_content = get_page_contents(abstract_url)

    # Title: the non-greedy group stops at the first closing `"/>`, so several
    # <meta> tags on one line are not swallowed into a single match.
    titles = re.findall(r'"citation_title" content="(.*?)"/>', abstract_page_content)
    if len(titles) == 0:
        raise ValueError("No title was found!")
    if len(titles) > 1:
        raise ValueError(f"More than one title was found!\n{titles}")
    title = titles[0]

    # Published Date: captured as (day, month name, year) and re-joined.
    published_dates = re.findall(
        r'Published\s+(\d{1,2}) (\w+) (\d{4})', abstract_page_content)
    if len(published_dates) == 0:
        raise ValueError("No published date!")
    if len(published_dates) > 1:
        raise ValueError(f"More than one published date!\n{published_dates}")
    published_date = ' '.join(published_dates[0])

    # Citations: the counter may be repeated on the page, so identical values
    # are collapsed before checking for conflicts.
    citation_counts = list(set(
        re.findall(r'Citing Articles \(([0-9]+)\)', abstract_page_content)))
    if len(citation_counts) > 1:
        raise ValueError(
            f"More than one unique reference to \"Citing Articles\"!\n{citation_counts}")
    citing_articles = int(citation_counts[0]) if citation_counts else 0

    # Authors: one entry per citation_author <meta> tag (non-greedy, as above).
    authors = re.findall(r'"citation_author" content="(.*?)"/>', abstract_page_content)

    return {
        "Title": title,
        "Published Date": published_date,
        "Number of Citations": citing_articles,
        "Authors": authors,
        "Contents": abstract_page_content
    }

In [16]:
# # Test:
# get_data_from_abstract(abstract_urls_dict["107"]["1"][0])

### Get abstract metadata:

In [18]:
# Scrape the metadata of every abstract, one CSV per volume.
VOL_DELAY = 900.0   # seconds to pause between volumes
DELAY = 1.0         # seconds to pause between abstract requests
OUTPUT_DIR = "abstracts"

# to_csv() does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only volumes without a finished CSV are scraped, so re-running this cell
# after a failure (e.g. HTTP 403) resumes instead of starting from volume 1.
pending_volumes = [
    (vol_num, issue_dict)
    for vol_num, issue_dict in abstract_urls_dict.items()
    if not os.path.exists(f"{OUTPUT_DIR}/vol_{vol_num}.csv")
]

for position, (vol_num, issue_dict) in enumerate(pending_volumes, start=1):
    abstract_data = []
    try:
        for issue_num, abstract_urls in issue_dict.items():
            for abs_url in abstract_urls:
                print(f"\tVol. {vol_num:>3} Iss. {issue_num:>3} {abs_url:<128}", end="\r")
                temp_dict = get_data_from_abstract(abs_url)
                temp_dict['Volume'] = vol_num
                temp_dict['Issue'] = issue_num
                temp_dict['Abstract URL'] = abs_url
                abstract_data.append(temp_dict)
                time.sleep(DELAY)
    except BaseException:
        # Keep what was downloaded before the failure or interrupt. It goes to
        # a separate file so the volume is still treated as unfinished on the
        # next run; the error itself is re-raised.
        if abstract_data:
            pd.DataFrame(abstract_data).to_csv(
                f"{OUTPUT_DIR}/vol_{vol_num}.partial.csv", index=False)
        raise

    df = pd.DataFrame(abstract_data)
    df.to_csv(f"{OUTPUT_DIR}/vol_{vol_num}.csv", index=False)

    # No point in waiting once the last volume has been written.
    if position < len(pending_volumes):
        time.sleep(VOL_DELAY)

print("\nDone!")

	Vol.   2 Iss.   7 https://journals.aps.org/prb/abstract/10.1103/PhysRevB.2.2819.2                                                                 

HTTPError: HTTP Error 403: Forbidden

### Multi-threaded extraction:

In [None]:
# import threading
# import random
# import time

# def global_issue_count(vol_issue):
#     """Given a tuple of (vol, iss.), returns the 
#     issue number across all volumes.
#     """
#     vol_issue_count = {}
#     counter = 1
#     for vol_num, issues in vol_issue.items():
#         for issue_url in issues:
#             issue_num = int(issue_url.split('/')[-1])
#             vol_issue_count[(vol_num, issue_num)] = counter
#             counter += 1 
#     return vol_issue_count

# def threaded_extractor(thread_idx, num_threads, abstract_urls_dict, vol_issue_count):
#     global abstract_data_dict
#     for vol_num, issue_dict in abstract_urls_dict.items():
#         for issue_num, abstract_urls in issue_dict.items():
#             if vol_issue_count[(vol_num, issue_num)] % num_threads != thread_idx:
#                 continue
#             for abs_num, abs_url in enumerate(abstract_urls, start=1):
#                 print(f"\t[{thread_idx:^3}] Vol. {vol_num:^3} Iss. {issue_num:^3} {abs_url:<128}", end="\r")
#                 temp_dict = get_data_from_abstract(abs_url, proxy_host=PROXY_HOST))
#                 temp_dict['Volume'] = vol_num
#                 temp_dict['Issue'] = issue_num
#                 temp_dict['Abstract URL'] = abs_url
#                 abstract_data_dict[thread_idx].append(temp_dict)
    
#                 # sleep_ms = random.randint(100, 500)
#                 # time.sleep(0.001 * sleep_ms)
  
#             # if len(abstract_data_dict[thread_idx]) >= 10:
#             #     return 

In [None]:
# num_threads = 10
# threads = []
# abstract_data_dict = {}
# vol_issue_count = global_issue_count(vol_issue)
# for t_idx in range(num_threads):
#     abstract_data_dict[t_idx] = []
#     threads.append(
#         threading.Thread(
#             target=threaded_extractor, 
#             args=(t_idx, num_threads, abstract_urls_dict, vol_issue_count))
#     )

# for t_idx in range(num_threads):
#     threads[t_idx].start()

# for t_idx in range(num_threads):
#     threads[t_idx].join()

# abstract_data = []
# for key, val in abstract_data_dict.items():
#     abstract_data.extend(val)
# print(f"\nNumber of records: {len(abstract_data)}")

## Convert to Pandas DataFrame:

In [None]:
# df = pd.DataFrame(abstract_data)

# display(df)

In [None]:
# # Export DataFrame to CSV:
# df.to_csv("aps_prb_articles_meta_data.csv", index=False)