In [None]:
import polars as pl
import requests
from pathlib import Path

from src.file_handling import file_location
from src.web_scraping import asme_digitial_jmd

# Resolve the project's data folder relative to the location reported by the
# project helper `FileLocation` (its `root` attribute is used as the anchor).
folder_path = file_location.FileLocation()
root_path = folder_path.root
# The data lives two directory levels above `root`, under `fyp/data`.
# NOTE(review): this relies on a fixed on-disk layout around the repository — confirm.
data_path = root_path.parent.parent / 'fyp' / 'data'

# `FolderPathOfASME` exposes the ASME-related sub-folders of `data_path`.
asme_path = file_location.FolderPathOfASME(data_path)
# Folder handed to `all_issues_in_folder_to_df` later on; presumably holds the
# saved issue HTML pages of the journal — TODO confirm.
asme_html_issues_path = asme_path.asme_jmd_html_issues
# Parquet cache for the per-paper table; it is stored inside the PDF folder.
jmd_bare_paper_infor_path = asme_path.asme_jmd_pdf / 'jmd_papers_bare_infor.parquet'

In [None]:
def cluster_url_by_issue_remotexs(
        issues_dfs: pl.DataFrame
)-> pl.DataFrame:
    """Group paper rows by issue and add a RemoteXS-proxied PDF URL column.

    Args:
        issues_dfs: One row per paper. Must contain the columns 'year',
            'volume', 'issue' and 'pdf_url'.

    Returns:
        The rows of ``issues_dfs`` ordered by first appearance of each
        (year, volume, issue) key, with an extra column 'pdf_url_remotexs'
        in which the first '.org' of 'pdf_url' is followed by
        '.remotexs.ntu.edu.sg'. Rows whose 'pdf_url' is null or contains no
        '.org' get a null 'pdf_url_remotexs'.
    """
    issue_keys = ['year', 'volume', 'issue']
    issues_in_order = issues_dfs.select(issue_keys).unique(maintain_order=True)
    # Join against the function's own argument (not a notebook global) so the
    # helper gives the same result for whatever frame it is called with.
    papers_by_issue = issues_in_order.join(issues_dfs, on=issue_keys)

    has_org = pl.col('pdf_url').str.contains('.org', literal=True)
    # `str.replace` only rewrites the first match, so a later '.org' inside the
    # path or query string is preserved instead of truncating the URL.
    proxied_url = pl.col('pdf_url').str.replace(
        '.org', '.org.remotexs.ntu.edu.sg', literal=True
    )
    return papers_by_issue.with_columns(
        pl.when(has_org).then(proxied_url).alias('pdf_url_remotexs')
    )


# Build the per-paper table once and cache it as parquet; later runs read the cache.
if not jmd_bare_paper_infor_path.exists():
    all_issues_infor_df :pl.DataFrame = asme_digitial_jmd.all_issues_in_folder_to_df(asme_html_issues_path)
    cluster_url_by_issue_df = cluster_url_by_issue_remotexs(issues_dfs=all_issues_infor_df)
    # The cache lives inside the PDF folder, which may not exist yet on a fresh run.
    jmd_bare_paper_infor_path.parent.mkdir(parents=True, exist_ok=True)
    cluster_url_by_issue_df.write_parquet(jmd_bare_paper_infor_path)
else:
    cluster_url_by_issue_df = pl.read_parquet(jmd_bare_paper_infor_path)

cluster_url_by_issue_df

### Request blocked by bot detection

In [None]:
# Plain HTTP request to one proxied PDF URL, made without a browser session,
# to show the response the server gives to a non-browser client.
# A timeout is set so an unresponsive proxy cannot hang the kernel forever.
r = requests.get(
    'https://asmedigitalcollection.asme.org.remotexs.ntu.edu.sg/mechanicaldesign/article-pdf/doi/10.1115/1.2918913/5923895/071402_1.pdf',
    timeout=30,
)
r

In [None]:
# Keep only the recent papers.
# NOTE(review): the variable name says "gte_2022" but the filter keeps
# year >= '2021' — confirm which bound is intended.
# 'year' is compared with a string literal, so the column is presumably stored
# as text and the comparison is lexicographic — TODO confirm.
clustered_url_gte_2022_df = cluster_url_by_issue_df.filter(pl.col('year')>='2021')

# Parallel lists: proxied URL to open and the file name to save it under.
url_to_open_list = clustered_url_gte_2022_df['pdf_url_remotexs'].to_list()
pdf_filename_list = clustered_url_gte_2022_df['pdf_filename'].to_list()
pdf_folder :Path = asme_path.asme_jmd_pdf
# Create the target folder together with any missing parents; no error if it exists.
pdf_folder.mkdir(parents=True, exist_ok=True)
pdf_save_paths_list = [pdf_folder / pdf_filename for pdf_filename in pdf_filename_list]
# First URL of the selection; opened in the browser by the next cell.
test_pdf_url = url_to_open_list[0]

In [None]:
from selenium import webdriver


# login to remotexs
# Start a Chrome window and open the first proxied PDF URL in it.
# NOTE(review): presumably the proxy shows its login page here and the user signs
# in by hand before running the download cell, which reuses this `driver` — confirm.
driver = webdriver.Chrome()
driver.get(test_pdf_url)

In [None]:
# Download every PDF that is not on disk yet, reusing the browser's session cookies.
# (url, HTTP status) of each response that was not saved because it was not a PDF.
failed_downloads = []
for save_path, pdf_url in zip(pdf_save_paths_list, url_to_open_list):
    if save_path.exists():
        continue
    # Pass the URL as a script argument instead of interpolating it into the
    # JavaScript source, so quotes or backslashes in the URL cannot break the script.
    driver.execute_script("window.open(arguments[0], '_blank');", pdf_url)
    driver.switch_to.window(driver.window_handles[-1])
    try:
        # Download original PDF with requests (session cookies preserved)
        cookies = {c['name']: c['value'] for c in driver.get_cookies()}
        r = requests.get(driver.current_url, cookies=cookies, timeout=60)
        # Only write real PDFs. Saving an error or bot-detection page under a
        # .pdf name would make the `exists()` check above skip it on every re-run.
        if r.ok and b'%PDF' in r.content[:1024]:
            save_path.write_bytes(r.content)
        else:
            failed_downloads.append((pdf_url, r.status_code))
    finally:
        # Always close the tab and return to the first window, even when the
        # request raised, so tabs do not pile up.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

failed_downloads


In [None]:
"""
test_pdf_urlclient_type :str = soup_article.select('span.article-client_type')[0].text.strip()
title_paper :str = soup_article.select('title')[0].text.strip().split('|')[0].strip()
author_full_names :str = ';'.join([name_html.text.strip() for name_html in soup_article.select('div.author-full-name')])
author_affiliation :str = ';'.join([aff.text.strip() for aff in soup_article.select('div.author-affiliation > div.aff')])
author_emails :str = ';'.join([email_html.text.strip() for email_html in soup_article.select('div.article-footnote > a[href]')])
publish_date = dt.strptime(soup_article.select('span.publish-date-label')[0].text.strip().split(':')[1].strip(),'%B %d, %Y')
abstract :str = soup_article.select('div.article-section-wrapper > section.abstract > p')[0].text.strip()
issue_section :str = soup_article.select('div.content-metadata-tocSections > a[href]')[0].text.strip()
keywords = ';'.join([keyword_html.text.strip() for keyword_html in soup_article.select('div.content-metadata-keywords > a[href]')])
topics = ';'.join([topic_html.text.strip() for topic_html in soup_article.select('div.content-metadata-topics > a[href]')])
content_sections :list = [section_html.text.strip() for section_html in soup_article.select('h2.section-title')]
paragraphs_sections :list = []
for section_paragraph in soup_article.select('h2.section-title + div'):
    _section :list = []
    for paragraph in section_paragraph.select('div > div > p'):
        _section += [paragraph.text.strip()]
    paragraphs_sections.append('\n'.join(_section))
article_content :list = [f'{header}:{paragraphs}' for header, paragraphs in zip(content_sections, paragraphs_sections)]

acknowledgement :str = soup_article.select('h2.backacknowledgements-title + div')[0].text.strip()
back_section_header :list = [header_html.text.strip() for header_html in soup_article.select('h2.backsection-title')]
back_section_content :list = [content_html.text.strip() for content_html in soup_article.select('h2.backsection-title + div > div > p')]
article_back_section :dict = {header:content for header,content in zip(back_section_header, back_section_content)}

cited_article_columns :tuple = ('title', 'cited_authors','doi_links','source')
cited_article_infor :dict = {col:[] for col in cited_article_columns}
for ref in soup_article.select("div.ref-list")[0].select('div.citation'):
    ref_title, ref_authors, ref_doi, ref_source = None, None, None, None
    if ref.select('div.article-title'): ref_title = ref.select('div.article-title')[0].text.strip()
    if ref.select('span.person-group'): ref_authors = ref.select('span.person-group')[0].text.strip()
    if ref.select('div.crossref-doi'): ref_doi = ref.select('div.crossref-doi a')[0]['href'].replace('dx.', '')
    if ref.select('div.source'): ref_source = ref.select('div.source')[0].text.strip()
    cited_article_infor['title'] += [ref_title]
    cited_article_infor['cited_authors'] += [ref_authors]
    cited_article_infor['doi_links'] += [ref_doi]
    cited_article_infor['source'] += [ref_source]

pl.DataFrame(cited_article_infor)
"""