In [1]:
import polars as pl

from src.file_handling import file_location
from src.web_scraping import asme_digitial_jmd

In [2]:
# Locate the ASME folders through the project helper, then build one polars
# frame from the issue pages found under `asme_jmd_html_issues`.
# NOTE(review): presumably these are saved HTML issue pages -- confirm in src.web_scraping.
asme_path: file_location.FileLocation = file_location.FolderPathOfASME()
asme_issues_paths = asme_path.asme_jmd_html_issues
all_issues_infor_df: pl.DataFrame = asme_digitial_jmd.all_issues_in_folder_to_df(
    asme_issues_paths
)

In [3]:
# Preview the combined article table; polars abbreviates long frames and long
# cell values in the rendered output.
all_issues_infor_df

title,authors,doi,year,month,volume,issue,article_url,pdf_url,topics,pdf_filename
str,str,str,str,str,str,str,str,str,str,str
"""Editorial""","""Charles W. McLarnan""","""https://doi.org/10.1115/1.3453…","""1978""","""January""","""100""","""1""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""""","""100_1_Editorial.pdf"""
"""Fatigue Life Prediction of Com…","""B. N. Leis""","""https://doi.org/10.1115/1.3453…","""1978""","""January""","""100""","""1""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Fatigue life;Fatigue;Fatigue a…","""100_1_Fatigue Life Prediction …"
"""Dynamic Severity Criterion for…","""G. S. A. Shawki""","""https://doi.org/10.1115/1.3453…","""1978""","""January""","""100""","""1""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Design;High cycle fatigue;Stre…","""100_1_Dynamic Severity Criteri…"
"""Wear-Related Topography of Ion…","""D. J. Sturges;S. W. Martin;C. …","""https://doi.org/10.1115/1.3453…","""1978""","""January""","""100""","""1""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Wear;Nitriding;Composite mater…","""100_1_Wear-Related Topography …"
"""Simulation of Resonances and I…","""M. Benton;A. Seireg""","""https://doi.org/10.1115/1.3453…","""1978""","""January""","""100""","""1""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Gears;Resonance;Simulation;Com…","""100_1_Simulation of Resonances…"
…,…,…,…,…,…,…,…,…,…,…
"""Product Dataset Platform: Syst…","""Mohammad Arjomandi Rad;Julian …","""https://doi.org/10.1115/1.4068…","""2026""","""March""","""148""","""3""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Design;Sleep""","""148_3_Product Dataset Platform…"
"""Concept Chaining Patterns Duri…","""Madhurima Das;Jessica Meza;Chr…","""https://doi.org/10.1115/1.4068…","""2026""","""March""","""148""","""3""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Chain;Design""","""148_3_Concept Chaining Pattern…"
"""A Quasi-Optimal Shape Design M…","""Sifan Chen;Guoyue Luo;Yuan Kon…","""https://doi.org/10.1115/1.4068…","""2026""","""March""","""148""","""3""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Construction;Cutting;Optimizat…","""148_3_A Quasi-Optimal Shape De…"
"""Gradient-Based Optimization of…","""Daniel Krsikapa;Il Yong Kim""","""https://doi.org/10.1115/1.4068…","""2026""","""March""","""148""","""3""","""https://asmedigitalcollection.…","""https://asmedigitalcollection.…","""Design;Optimization;Packing (S…","""148_3_Gradient-Based Optimizat…"


In [4]:
def cluster_url_by_issue(
        issues_dfs: pl.DataFrame,
        proxy_suffix: str = '.remotexs.ntu.edu.sg',
) -> pl.DataFrame:
    """Number each journal issue and add a proxied PDF URL for every article.

    Every distinct (year, volume, issue) combination gets a 0-based ``index``
    in order of first appearance. That index is joined back onto the article
    rows of ``issues_dfs``, and a ``pdf_url_remotexs`` column is derived from
    ``pdf_url`` by inserting ``proxy_suffix`` right after the first ``.org``.

    Args:
        issues_dfs: Article table containing at least the columns ``year``,
            ``volume``, ``issue`` and ``pdf_url``.
        proxy_suffix: Host suffix inserted after the first ``.org`` of each
            PDF URL.

    Returns:
        The article rows of ``issues_dfs`` with the added ``index`` and
        ``pdf_url_remotexs`` columns. A URL without ``.org`` is copied
        unchanged; a null URL stays null.
    """
    issue_keys = ['year', 'volume', 'issue']
    numbered_issues: pl.DataFrame = (
        issues_dfs.select(issue_keys)
        .unique(maintain_order=True)
        .with_row_index(name="index", offset=0)
    )
    # Join against the frame that was passed in rather than a notebook-level
    # global, so the function works for any input table.
    articles = numbered_issues.join(issues_dfs, on=issue_keys)
    return articles.with_columns(
        # Literal, first-occurrence replacement: anything after a later '.org'
        # in the path or query string is preserved.
        pl.col('pdf_url')
        .str.replace('.org', '.org' + proxy_suffix, literal=True)
        .alias('pdf_url_remotexs')
    )

# Build the per-issue table, then show the first article's original PDF link
# next to its proxied counterpart as a quick visual check.
cluster_url_by_issue_df = cluster_url_by_issue(issues_dfs=all_issues_infor_df)
test_pdf_urls = tuple(
    cluster_url_by_issue_df[0][column][0]
    for column in ('pdf_url', 'pdf_url_remotexs')
)
test_pdf_urls

('https://asmedigitalcollection.asme.org/mechanicaldesign/article-pdf/100/1/1/5567254/1_1.pdf',
 'https://asmedigitalcollection.asme.org.remotexs.ntu.edu.sg/mechanicaldesign/article-pdf/100/1/1/5567254/1_1.pdf')

### Request blocked by bot detection

In [8]:
import requests
import re
from pathlib import Path

# Probe one proxied PDF URL with plain `requests`. The timeout keeps the
# kernel from hanging indefinitely if the proxy never answers.
r = requests.get(
    'https://asmedigitalcollection.asme.org.remotexs.ntu.edu.sg/mechanicaldesign/article-pdf/doi/10.1115/1.2918913/5923895/071402_1.pdf',
    timeout=30,
)
r

<Response [403]>

In [33]:
import webbrowser
# Issue indices from highest to lowest. `unique()` does not promise any
# particular order, so sort explicitly instead of reversing its result.
issues_numbers: list = sorted(
    cluster_url_by_issue_df['index'].unique().to_list(),
    reverse=True,
)


def doi_to_filename(doi: str) -> str:
    """Turn a DOI (bare or embedded in a URL) into a file-name stem.

    The first DOI-shaped substring is extracted, every ``/`` and ``:`` in it
    is replaced by ``_``, and the result is prefixed with ``DOI_``.

    Args:
        doi: Text containing a DOI, e.g. ``https://doi.org/10.1115/1.3453888``.

    Returns:
        The stem, e.g. ``DOI_10.1115_1.3453888``.

    Raises:
        ValueError: If no DOI-shaped substring is found in ``doi``.
    """
    found = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', doi, re.IGNORECASE)
    if found is None:
        raise ValueError("Invalid DOI format")
    return "DOI_" + re.sub(r'[\/:]', '_', found[1])

def filename_to_doi(filename: str) -> str:
    """Rebuild a ``https://doi.org/`` URL from a ``DOI_``-prefixed stem.

    Only the first ``_`` after the prefix is turned back into ``/``; any
    further underscores are kept as they are.

    Args:
        filename: Stem starting with ``DOI_``, e.g. ``DOI_10.1115_1.3453888``.

    Returns:
        The DOI URL, e.g. ``https://doi.org/10.1115/1.3453888``.

    Raises:
        ValueError: If ``filename`` does not start with ``DOI_``.
    """
    leading, marker, encoded_doi = filename.partition("DOI_")
    # A non-empty `leading` or a missing `marker` means the prefix is absent.
    if leading or not marker:
        raise ValueError("Invalid filename format")
    return f"https://doi.org/{encoded_doi.replace('_', '/', 1)}"

# Visit the issues in the order held by `issues_numbers` and pair each
# article's DOI with its proxied PDF URL.
for issue_no in issues_numbers:
    issue_df = cluster_url_by_issue_df.filter(pl.col('index') == issue_no)
    issue_no_pdf_urls: list = issue_df['pdf_url_remotexs'].to_list()
    issue_no_pdf_dois: list = issue_df['doi'].to_list()
    # Disabled experiment: open the PDFs of the first issue in browser tabs,
    # then stop.
    #     for pdf_url in issue_no_pdf_urls:
    #         webbrowser.open_new_tab(pdf_url)
    #     break
    for doi, url in zip(issue_no_pdf_dois, issue_no_pdf_urls):
        # TODO: only the DOI-derived file name is computed so far; `url` is
        # not fetched or saved anywhere in this cell yet.
        doi_filename = doi_to_filename(doi)

In [7]:
"""
test_pdf_urlclient_type :str = soup_article.select('span.article-client_type')[0].text.strip()
title_paper :str = soup_article.select('title')[0].text.strip().split('|')[0].strip()
author_full_names :str = ';'.join([name_html.text.strip() for name_html in soup_article.select('div.author-full-name')])
author_affiliation :str = ';'.join([aff.text.strip() for aff in soup_article.select('div.author-affiliation > div.aff')])
author_emails :str = ';'.join([email_html.text.strip() for email_html in soup_article.select('div.article-footnote > a[href]')])
publish_date = dt.strptime(soup_article.select('span.publish-date-label')[0].text.strip().split(':')[1].strip(),'%B %d, %Y')
abstract :str = soup_article.select('div.article-section-wrapper > section.abstract > p')[0].text.strip()
issue_section :str = soup_article.select('div.content-metadata-tocSections > a[href]')[0].text.strip()
keywords = ';'.join([keyword_html.text.strip() for keyword_html in soup_article.select('div.content-metadata-keywords > a[href]')])
topics = ';'.join([topic_html.text.strip() for topic_html in soup_article.select('div.content-metadata-topics > a[href]')])
content_sections :list = [section_html.text.strip() for section_html in soup_article.select('h2.section-title')]
paragraphs_sections :list = []
for section_paragraph in soup_article.select('h2.section-title + div'):
    _section :list = []
    for paragraph in section_paragraph.select('div > div > p'):
        _section += [paragraph.text.strip()]
    paragraphs_sections.append('\n'.join(_section))
article_content :list = [f'{header}:{paragraphs}' for header, paragraphs in zip(content_sections, paragraphs_sections)]

acknowledgement :str = soup_article.select('h2.backacknowledgements-title + div')[0].text.strip()
back_section_header :list = [header_html.text.strip() for header_html in soup_article.select('h2.backsection-title')]
back_section_content :list = [content_html.text.strip() for content_html in soup_article.select('h2.backsection-title + div > div > p')]
article_back_section :dict = {header:content for header,content in zip(back_section_header, back_section_content)}

cited_article_columns :tuple = ('title', 'cited_authors','doi_links','source')
cited_article_infor :dict = {col:[] for col in cited_article_columns}
for ref in soup_article.select("div.ref-list")[0].select('div.citation'):
    ref_title, ref_authors, ref_doi, ref_source = None, None, None, None
    if ref.select('div.article-title'): ref_title = ref.select('div.article-title')[0].text.strip()
    if ref.select('span.person-group'): ref_authors = ref.select('span.person-group')[0].text.strip()
    if ref.select('div.crossref-doi'): ref_doi = ref.select('div.crossref-doi a')[0]['href'].replace('dx.', '')
    if ref.select('div.source'): ref_source = ref.select('div.source')[0].text.strip()
    cited_article_infor['title'] += [ref_title]
    cited_article_infor['cited_authors'] += [ref_authors]
    cited_article_infor['doi_links'] += [ref_doi]
    cited_article_infor['source'] += [ref_source]

pl.DataFrame(cited_article_infor)
"""

'\ntest_pdf_urlclient_type :str = soup_article.select(\'span.article-client_type\')[0].text.strip()\ntitle_paper :str = soup_article.select(\'title\')[0].text.strip().split(\'|\')[0].strip()\nauthor_full_names :str = \';\'.join([name_html.text.strip() for name_html in soup_article.select(\'div.author-full-name\')])\nauthor_affiliation :str = \';\'.join([aff.text.strip() for aff in soup_article.select(\'div.author-affiliation > div.aff\')])\nauthor_emails :str = \';\'.join([email_html.text.strip() for email_html in soup_article.select(\'div.article-footnote > a[href]\')])\npublish_date = dt.strptime(soup_article.select(\'span.publish-date-label\')[0].text.strip().split(\':\')[1].strip(),\'%B %d, %Y\')\nabstract :str = soup_article.select(\'div.article-section-wrapper > section.abstract > p\')[0].text.strip()\nissue_section :str = soup_article.select(\'div.content-metadata-tocSections > a[href]\')[0].text.strip()\nkeywords = \';\'.join([keyword_html.text.strip() for keyword_html in soup_