In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 2 13:20:39 2022
Updated on Mon Jun 20 20:11:48 2022

@author: Binyang
"""

import json
import sys
import time
from importlib import reload
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests import get

In [6]:
def sleep(duration):
    """Announce the pause on stdout, then block for ``duration`` seconds."""
    message = f'sleep {duration}s'
    print(message)
    time.sleep(duration)

def parse(text, parser='html.parser'):
    """Build a BeautifulSoup tree from ``text`` using the named parser.

    ``parser`` is handed to BeautifulSoup as its second positional argument
    and defaults to ``'html.parser'``.
    """
    soup = BeautifulSoup(text, parser)
    return soup

def fetch(url, file_path=None, session=None, max_attempts=10, custom_headers=None, timeout=None):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    file_path : str or path-like, optional
        When given, the raw response bytes are also written to this path.
        If the path already exists, nothing is requested and ``None`` is
        returned.
    session : optional
        Object exposing ``get(url, headers=...)`` and ``cookies`` (for
        example a ``requests.Session``). When omitted, the module-level
        ``get`` is used instead.
    max_attempts : int
        Maximum number of requests made while the server keeps answering
        HTTP 429. Must be at least 1.
    custom_headers : dict, optional
        Headers sent with each request; defaults to an empty dict.
    timeout : optional
        Forwarded as the ``timeout`` keyword of the request when not
        ``None``. The default keeps the previous behaviour (no explicit
        timeout is passed).

    Returns
    -------
    str or None
        ``response.text``, or ``None`` when ``file_path`` already existed.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    Exception
        Whatever ``response.raise_for_status()`` raises for an error
        status, including a 429 that persisted through every attempt.
    """
    if custom_headers is None:
        custom_headers = {}

    if file_path is not None:
        print(f'Fetch {url} to {file_path}')
        if Path(file_path).exists():
            print('Already exists. Skipped.')
            return None
    else:
        print(f'Fetch {url}')

    # Without at least one attempt there would be no response to inspect.
    if max_attempts < 1:
        raise ValueError(f'max_attempts must be >= 1, got {max_attempts}')

    # Pick the callable once; ``get`` is only looked up when no session is given.
    request = session.get if session is not None else get
    request_kwargs = {'headers': custom_headers}
    if timeout is not None:
        request_kwargs['timeout'] = timeout

    for attempt in range(max_attempts):
        response = request(url, **request_kwargs)

        # handle HTTP 429 Too Many Requests
        if response.status_code != 429:
            break
        if attempt + 1 == max_attempts:
            # Out of attempts: do not sleep again, let raise_for_status() report it.
            break
        delay = (attempt + 1) * 10
        print(f'Retrying in {delay} seconds')
        time.sleep(delay)

    response.raise_for_status()

    if session is not None:
        print(f'Session cookies: {len(session.cookies)}')

    if file_path is not None:
        with open(file_path, 'wb') as file:
            file.write(response.content)

    return response.text

In [1]:
##data_scraping
# Headers sent with every request issued through fetch() below.
custom_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# Fetch the journal's browse-by-year index page and parse it into `doc`.
# `parse` and `fetch` must already be defined when this cell runs.
doc = parse(
    fetch(
        "https://asmedigitalcollection.asme.org/mechanicaldesign/issue/browse-by-year",
        custom_headers=custom_headers,
    )
)

NameError: name 'parse' is not defined

In [None]:
# Scrape article metadata for one publication year of the journal and save it
# to "jmd_all_updated.xlsx". Relies on `parse`, `fetch` and `custom_headers`
# being defined by earlier cells.
all_docs_info = []
main_url = "https://asmedigitalcollection.asme.org"
columns_to_take = ["title", "abstract", "doi", "year", "issue", "citation_info", "authors", "pdf_link", "issue_section",
                   "topics", "keywords", "related article", "source of related articles", "cited times",
                   "author affiliation", "reference"]
# Only issues listed under this year are scraped.
year_to_scrape = 2022


def _absolute_url(href):
    """Resolve ``href`` against ``main_url``; already-absolute URLs pass through unchanged."""
    return urljoin(main_url, href)


def _joined_text(nodes, separator):
    """Join the ``.text`` of every node with ``separator`` and strip the ends of the result."""
    return separator.join([node.text for node in nodes]).strip()


def _reference_entries(article_doc):
    """Return the reference list of an article page as one string.

    Each reference becomes ``title::...;source::...;doi::...`` and references
    are separated by ``||``. A field that is absent from a reference is left
    empty. Returns ``""`` when the page has no ``div.ref-list``.
    """
    ref_list = article_doc.select_one("div.ref-list")
    if ref_list is None:
        return ""

    entries = []
    for ref in ref_list.select('div.citation'):
        # Start every reference from empty fields so a missing title/source/doi
        # is not silently filled with the value of the previous reference.
        title_part = source_part = doi_part = ''

        title_node = ref.select_one('div.article-title')
        if title_node is not None:
            title_part = "title::" + title_node.text.strip()

        source_node = ref.select_one('div.source')
        if source_node is not None:
            source_part = "source::" + source_node.text.strip()

        doi_node = ref.select_one('div.crossref-doi a')
        if doi_node is not None:
            doi_part = "doi::" + doi_node['href'].replace('dx.', '')

        entries.append(';'.join([title_part, source_part, doi_part]))
    return '||'.join(entries)


def _scrape_article(article, year, issue_text):
    """Build the metadata dict for one article entry of an issue page.

    Fields found on the issue page (title, authors, pdf link) are read from
    ``article``; the rest are read from the article's own page, which is
    fetched only when the title carries a link. Fields that cannot be found
    stay ``""``.
    """
    article_obj = dict.fromkeys(columns_to_take, "")
    article_obj["year"] = year
    article_obj["issue"] = issue_text

    ##title
    title_node = article.select_one("h5.item-title")
    if title_node is not None:
        article_obj["title"] = title_node.text.strip()
    ##author
    article_obj["authors"] = ";".join([author.text.strip() for author in article.select("div.al-authors-list")])
    ##pdf link
    pdf_node = article.select_one("a.article-pdfLink")
    if pdf_node is not None:
        article_obj["pdf_link"] = _absolute_url(pdf_node["href"])

    ##other info
    title_link = article.select_one("h5.item-title a")
    if title_link is None:
        return article_obj
    article_doc = parse(fetch(_absolute_url(title_link["href"]), custom_headers=custom_headers))

    ##abstract
    article_obj["abstract"] = "\n".join([section.text for section in article_doc.select(".abstract")])
    ##cite_info
    article_obj["citation_info"] = ";".join([section.text.strip() for section in article_doc.select(".ww-citation-primary")])
    ##doi
    doi_node = article_doc.select_one("a.ww-doi-link")
    if doi_node is not None:
        article_obj["doi"] = doi_node["href"]
    ##issue_section
    article_obj["issue_section"] = _joined_text(article_doc.select(".content-metadata-tocSections"), " ")
    ##topic
    article_obj["topics"] = _joined_text(article_doc.select(".content-metadata-topics"), " ")
    ##keywords
    article_obj["keywords"] = _joined_text(article_doc.select(".content-metadata-keywords"), " ")
    ##most related ASME papers assigned by ASME
    article_obj["related article"] = _joined_text(article_doc.select("div.article-title a"), ";")
    article_obj["source of related articles"] = _joined_text(article_doc.select("div.content-meta"), ";")
    ##times being cited
    article_obj["cited times"] = _joined_text(article_doc.select("div.article-cited-link-wrap a"), ";")
    ##author affiliations
    affiliations = [
        aff
        for affiliation in article_doc.select("div.author-affiliation")
        for aff in affiliation.select("div.aff")
    ]
    article_obj["author affiliation"] = _joined_text(affiliations, ";")
    ##references of a focal paper
    article_obj["reference"] = _reference_entries(article_doc)
    return article_obj


doc = parse(fetch("https://asmedigitalcollection.asme.org/mechanicaldesign/issue/browse-by-year", custom_headers=custom_headers))
##get the links to the webpage of each year
for year_link in doc.select("div.widget-ContentBrowseAllYearsManifest a"):
    try:
        year = int(year_link.text.strip())
    except ValueError:
        # Link text is not a year number; nothing to scrape behind it.
        continue

    ##scrape JMD website by year
    if year != year_to_scrape:
        continue
    # Links are resolved with urljoin so both absolute and site-relative hrefs work.
    year_doc = parse(fetch(_absolute_url(year_link["href"]), custom_headers=custom_headers))

    ##scrape all issues in a certain year
    for issue_link in year_doc.select("div.widget-ContentBrowseByYearManifest a"):
        issue_text = issue_link.text.strip()
        issue_url = _absolute_url(issue_link["href"])
        print(year, issue_url)
        issue_doc = parse(fetch(issue_url, custom_headers=custom_headers))

        ##get the link to each paper in a certain issue
        ##scrape the desired data from paper webpage
        for article in issue_doc.select("div.al-article-item-wrap"):
            article_obj = _scrape_article(article, year, issue_text)
            print(article_obj["title"])
            all_docs_info.append(article_obj)

print("Number of docs: ", len(all_docs_info))

# Passing the column list keeps the expected columns even when nothing was scraped.
df_with_articles = pd.DataFrame(all_docs_info, columns=columns_to_take)

# The `encoding` keyword was dropped: it was unused by pandas and is rejected
# by pandas >= 2.0.
df_with_articles.to_excel("jmd_all_updated.xlsx")

In [None]:
# import urllib
# def download_pdf_by_link(download_url, filename):
#     try:
#         response = urllib.request.urlopen(download_url)
#         pdf_data = response.read()
#         assert len(pdf_data) > 5000
#         file = open(filename, 'wb')
#         file.write(pdf_data)
#         file.close()
#         print("Downloaded file ", filename)
#     except Exception as err:
#         print(err)

In [None]:
# Rebinds the module-level `custom_headers` to a shorter User-Agent string.
# `download_pdf_by_link` reads this global when it requests each PDF.
custom_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

In [None]:
def download_pdf_by_link(download_url, filename):
    """Download ``download_url`` and write the response body to ``filename``.

    The request is sent with the module-level ``custom_headers``. This is a
    best-effort helper: any exception (network failure, HTTP error status,
    unwritable path) is printed rather than raised, and in that case no file
    is written by this call.

    Parameters
    ----------
    download_url : str
        Address of the file to download.
    filename : str or path-like
        Destination path; opened in binary write mode.

    Returns
    -------
    None
    """
    try:
        r = requests.get(download_url, headers = custom_headers)
        # Reject 4xx/5xx answers so an error page is never saved as if it
        # were the requested PDF.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
        print("Downloaded file ", filename)
    except Exception as err:
        print(err)

In [None]:
import os
import time

# Download every PDF listed in the spreadsheet into "folder_with_pdfs",
# naming each file after its row position ("<row>.pdf").
# The folder is created up front: download_pdf_by_link opens the target file
# directly and would fail for every row if the directory were missing.
os.makedirs("folder_with_pdfs", exist_ok=True)

df_with_articles = pd.read_excel('jmd_all_updated.xlsx')
n = 0
for i, pdf_link in enumerate(df_with_articles["pdf_link"].values):
    print(pdf_link)
    # Empty cells are read back from Excel as NaN (a float), which has no
    # .strip(); only non-blank strings are treated as links.
    if isinstance(pdf_link, str) and pdf_link.strip():
        download_pdf_by_link(pdf_link, os.path.join("folder_with_pdfs", "%d.pdf" % i))
        n += 1
        # Pause for 5 seconds after every 15 downloads.
        if n % 15 == 0:
            time.sleep(5)

In [None]:
##detect cases that multiple papers share the same pdf file
# Prints the row number of every PDF whose size on disk equals that of the
# previous row's PDF. Equal size is used as the signal here; it does not
# compare file contents.
for i in range(1, len(df_with_articles)):
    current_pdf = os.path.join("folder_with_pdfs", "%d.pdf" % i)
    previous_pdf = os.path.join("folder_with_pdfs", "%d.pdf" % (i - 1))
    # Rows without a pdf link (or whose download failed) have no file on
    # disk; skip the pair instead of crashing in getsize().
    if not (os.path.exists(current_pdf) and os.path.exists(previous_pdf)):
        continue
    if os.path.getsize(current_pdf) == os.path.getsize(previous_pdf):
        print (i)