In [1]:
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from seleniumbase import Driver
from openpyxl import load_workbook, Workbook
from auto_download_undetected_chromedriver import download_undetected_chromedriver

In [2]:
# Download a chromedriver binary into the current working directory.
# NOTE(review): force_update=True presumably re-downloads on every run even
# when a copy already exists — confirm against the package documentation.
# NOTE(review): chromedriver_path is not referenced by the other cells shown
# in this notebook; verify whether it is still needed.
chromedriver_path = download_undetected_chromedriver(
    os.getcwd(),
    undetected=True,
    arm=False,
    force_update=True,
)

version: 124.0.6367.156 | major_version: 124
downloading: https://googlechromelabs.github.io/chrome-for-testing/latest-patch-versions-per-build-with-downloads.json
system: mac-x64
[]
124.0.6367.156 could not be found
[]
124.0.6367.15 could not be found
[]
124.0.6367.1 could not be found
['https://storage.googleapis.com/chrome-for-testing-public/124.0.6367.207/mac-x64/chromedriver-mac-x64.zip']
downloading: https://storage.googleapis.com/chrome-for-testing-public/124.0.6367.207/mac-x64/chromedriver-mac-x64.zip
not patched yet
found block:
b'{window.cdc_adoQpoasnfa76pfcZLmcfl_Array = window.Array;window.cdc_adoQpoasnfa76pfcZLmcfl_Object = window.Object;window.cdc_adoQpoasnfa76pfcZLmcfl_Promise = window.Promise;window.cdc_adoQpoasnfa76pfcZLmcfl_Proxy = window.Proxy;window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol = window.Symbol;window.cdc_adoQpoasnfa76pfcZLmcfl_JSON = window.JSON;}'
replacing with:
b'{console.log("undetected chromedriver 1337!")}                                                  

In [3]:
def _open_article_sheet(file, sheet_name, headers):
    """Open (or create) the workbook at ``file`` and return ``(wb, ws)``.

    If the file does not exist, a new workbook is created whose only sheet is
    renamed to ``sheet_name``, the header row is written, and the file is saved
    immediately. If the file exists, the sheet named ``sheet_name`` is reused
    when present; otherwise it is created and the header row is appended.
    """
    if not os.path.exists(file):
        wb = Workbook()
        ws = wb.active
        ws.title = sheet_name
        ws.append(headers)
        wb.save(file)
        return wb, ws

    wb = load_workbook(file)
    if sheet_name in wb.sheetnames:
        return wb, wb[sheet_name]
    ws = wb.create_sheet(title=sheet_name)
    ws.append(headers)
    return wb, ws


def _extract_article_ids(article_tags):
    """Return the article identifiers found in the listing-page heading tags.

    The identifier is the first single-quoted value inside the ``onclick``
    attribute of each heading's ``<a>``. Headings without an anchor, without an
    ``onclick`` attribute, or without a quoted value are skipped instead of
    raising.
    """
    article_ids = []
    for tag in article_tags:
        link = tag.find('a')
        if link is None:
            continue
        onclick_value = link.get('onclick')
        if not onclick_value:
            continue
        pieces = onclick_value.split("'")
        if len(pieces) > 1:
            article_ids.append(pieces[1])
    return article_ids


def _parse_article_page(page_source):
    """Parse one article detail page and return its six fields in header order.

    Returns ``[original title, translated title, authors, source, abstract,
    keywords]``; each missing element is replaced by a "... not found"
    placeholder string.
    """
    soup = BeautifulSoup(page_source, "lxml")
    original_title = soup.find("h2", class_=["ustyle_heading_H2 mainTitleColor"])
    foreign_title = soup.find("h3", class_=["ustyle_heading_H3 subTitleColor"])
    author_elements = soup.find_all("span", class_=["author"])
    source = soup.find("i", class_=["source"])
    abstract_section = soup.find("div", class_="academicAbstractSet")
    abstract = abstract_section.find("div", class_=["academicAbstractGroup"]) if abstract_section else None
    keywords_elements = abstract_section.find_all("a", class_=["點擊關鍵字"]) if abstract_section else []

    title_o = original_title.text.strip() if original_title else "Original Title not found"
    title_f = foreign_title.text.strip() if foreign_title else "Foreign Title not found"
    if author_elements:
        authors = " ".join(author.text.strip() for author in author_elements)
    else:
        authors = "Author not found"
    sources = source.text.strip().replace("\n", "") if source else "Sources not found"
    abstracts = abstract.text.strip() if abstract else "Abstract not found"
    if keywords_elements:
        keywords = ", ".join(keyword.text.strip() for keyword in keywords_elements)
    else:
        keywords = "Keywords not found"

    return [title_o, title_f, authors, sources, abstracts, keywords]


def scrape_article(driver, publicationID_dict, start, wait_time=2, end=2024):
    """Scrape article metadata for each journal and store it in Excel and a DataFrame.

    For every ``(journal name, publication id)`` pair, the yearly listing pages
    from ``start`` to ``end`` (inclusive) are visited page by page. Each listed
    article's detail page is opened and parsed, and the resulting row is
    appended both to the journal's sheet in ``thesis_articles.xlsx`` and to the
    returned DataFrame.

    Parameters
    ----------
    driver : browser driver exposing ``get(url)``, ``page_source`` and ``quit()``.
        It is always quit before this function returns or raises.
    publicationID_dict : dict
        Maps journal name -> ``publicationID`` value used in the listing URL.
    start : int
        First issue year to scrape.
    wait_time : int or float, default 2
        Seconds to sleep after each page load.
    end : int, default 2024
        Last issue year to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame or None
        One row per scraped article with the six header columns, or ``None``
        if opening/creating the workbook failed (the error is printed).

    Notes
    -----
    Rows are appended to existing sheets, so re-running the function against an
    existing workbook adds the same articles again.
    """
    file = "thesis_articles.xlsx"  # Store all journal articles in a single file
    headers = ["原文題目", "翻譯題目", "作者", "來源", '摘要', '關鍵字']
    # Collect rows in a plain list and build the DataFrame once at the end;
    # concatenating a DataFrame per row is quadratic in the number of rows.
    records = []

    try:
        for thesis_name, publication_id in publicationID_dict.items():
            print(f"Scraping {thesis_name} with ID {publication_id}")
            sheet_name = f"{thesis_name}(文章列表)"

            try:
                wb, ws = _open_article_sheet(file, sheet_name, headers)
            except Exception as e:
                print(f"Load workbook with error: {e}")
                return None

            try:
                for year in range(start, end + 1):
                    page = 1
                    while True:
                        # Open the journal list page for the current year
                        url = (
                            "https://www.airitilibrary.com/Publication/Information"
                            f"?publicationID={publication_id}&type=%E6%9C%9F%E5%88%8A&tabName=2"
                            f"&issueYear={year}&page={page}&publisherID=47"
                        )
                        driver.get(url)
                        time.sleep(wait_time)

                        soup = BeautifulSoup(driver.page_source, "lxml")
                        article_tags = soup.find_all("h3", class_="ustyle_heading_H3")
                        article_ids = _extract_article_ids(article_tags)

                        # A page past the last one shows a single heading and no
                        # articles. Also stop when nothing usable was found at
                        # all, otherwise an empty page would loop forever.
                        if len(article_tags) <= 1 or not article_ids:
                            break

                        for article_id in article_ids:
                            article_url = f"https://www.airitilibrary.com/Publication/alDetailedMesh?docid={article_id}"
                            driver.get(article_url)
                            time.sleep(wait_time)

                            each_data = _parse_article_page(driver.page_source)
                            ws.append(each_data)
                            records.append(each_data)

                        # Move to the next page
                        page += 1

                    # After processing the year, move to the next one
                    print(f"{year} finished")
            finally:
                # Persist whatever was collected for this journal, even when the
                # scrape is interrupted by an error part-way through.
                wb.save(file)
    finally:
        # Always release the browser, including on errors and early returns.
        driver.quit()

    return pd.DataFrame(records, columns=headers)

In [4]:
# First issue year to scrape.
year_start = 2000

# Journals to scrape: (journal title, publicationID used in the listing URL).
# Insertion order is the order in which the journals are scraped.
publicationID_dict = dict([
    ("經濟論文", "1018161x"),
    ("經濟論文叢刊", "10183833"),
    ("台灣經濟預測與政策", "17298849"),
    ("經濟研究", "10181245"),
    ("應用經濟論叢", "05469600"),
    ("農業經濟叢刊", "10277757"),
    ("農業與經濟", "1011520x"),
    ("人文與社會科學集刊", "1018189X"),
    ("中國大陸研究", "10132716"),
    ("文化創意產業研究學報", "22216170"),
])

# Launch the browser and run the scrape; scrape_article quits the driver
# itself when it finishes.
driver = Driver(uc=True)
df = scrape_article(driver, publicationID_dict, start=year_start)
df

Scraping 農業經濟叢刊 with ID 10277757
2000 finished
2001 finished
2002 finished
2003 finished
2004 finished
2005 finished
2006 finished
2007 finished
2008 finished
2009 finished
2010 finished
2011 finished
2012 finished
2013 finished
2014 finished
2015 finished
2016 finished
2017 finished
2018 finished
2019 finished
2020 finished
2021 finished
2022 finished
2023 finished
2024 finished
Scraping 農業與經濟 with ID 1011520x
2000 finished
2001 finished
2002 finished
2003 finished
2004 finished
2005 finished
2006 finished
2007 finished
2008 finished
2009 finished
2010 finished
2011 finished
2012 finished
2013 finished
2014 finished
2015 finished
2016 finished
2017 finished
2018 finished
2019 finished
2020 finished
2021 finished
2022 finished
2023 finished
2024 finished
Scraping 人文與社會科學集刊 with ID 1018189X
2000 finished
2001 finished
2002 finished
2003 finished
2004 finished
2005 finished
2006 finished
2007 finished
2008 finished
2009 finished
2010 finished
2011 finished
2012 finished
2013 finished
201

Unnamed: 0,原文題目,翻譯題目,作者,來源,摘要,關鍵字
0,戰前台灣菸葉生產之成長來源分析—技術進步與政策誘因,Sources of Growth of Tobacco in Colonial Taiwa...,董安琪(An-Chi Tung)\n\n；\n\n傅祖壇(Tsu-Tan Fu),《農業經濟叢刊》6卷1期(2000/12)Pp. 1-32,本文旨在有系統地分析戰前台灣菸葉之成長來源。菸葉雖然在政府財政及專賣政策上深具重要性，但菸葉...,"技術進步, 成長來源, 菸葉, 異質性產品"
1,目標區與農產品價格的穩定：小型開放經濟之分析,Target Zones and Agricultural Price Stabilizat...,賴景昌(Ching-Chong Lai)\n\n；\n\n王葳(Vey Wang)\n\n；...,《農業經濟叢刊》6卷1期(2000/12)Pp. 33-66,維持農產品價格的穩定是許多國家重要的政策之一，政府為能平抑國內農產品價格，往往制定不少穩定價...,"平準實物制度, 平準基金制度, 目標區, 農產品價格穩定, 蜜月效果"
2,貨幣供給與進口物價對台灣地區農工產品價格影響之長短期效果：共整合方法之應用,The Impacts of Changing in Money Supply and Pr...,劉祥熹(Hsiang-Hsi Liu)\n\n；\n\n洪德佳(Te-Chia Hung),《農業經濟叢刊》6卷1期(2000/12)Pp. 67-114,本文研究目的旨在探討台灣地區的貨幣供給、進口產品價格與農業產品價格、製造業產品價格相互影響的...,"貨幣供給, 進口物價, 農工產品價格, 共整合, 向量誤差修正模型"
3,日本戰後農地價格變動因素之分析—資訊元分析法的應用,Factors Influencing Farmland Prices of Post-Wa...,陳建宏(Chien-Hung Chen)\n\n；\n\n戴錦周(Jin-Jou Dai),《農業經濟叢刊》6卷1期(2000/12)Pp. 115-141,本文利用資本化模式和資訊元分析法，應用在1962－1996年日本各都道府縣的資料上，以研究農...,"農地價格, 偏相關係數, 資本化模式, 資訊元, 自變數資訊不等數, 迴歸資訊不等數"
4,Harmonizing Agricultural and Eevironmental Pol...,農業政策與環境政策之調和,Erik Lichtenberg,《農業經濟叢刊》5卷2期(2000/06)Pp. 133-163,本文討論三個課題，此三個課題主要是由經濟理論與改善農業生產對環境保護及資源保育之歷史經驗歸納...,"農業與環境, 環境保護, 資源保育, 農藥, 肥料, 稀有水資源, 森林開伐"
...,...,...,...,...,...,...
1469,集與市：上海創意市集與創意產業、城市的關系演進,Bazaar and City: The Evolution of the Relation...,張琳(Lin Zhang),《文化創意產業研究學報》13卷1期(2023/08)Pp. 65-70,創意市集作為創意產業發展過程中出現的公眾交流模式，旨在為各類新興設計師、藝術家提供多元開放的...,"創意市集, 文化產業, 創意城市, 歷史演變, 上海模式"
1470,社會資本與百年家族企業突破困境－以新和春漬物醬油工廠為例,Social Capital and a Century-old Family Busine...,吳昆宗(Kun-Tsung Wu)\n\n；\n\n王信文(Hsing-Wen Wang),《文化創意產業研究學報》13卷1期(2023/08)Pp. 71-80,本研究在探討一家已有百年，傳承四代的家族中小企業醬油工廠，透過鑲嵌於地方各種人脈、資源等社會...,"社會資本, 企業接班, 產業創新, 體驗行銷"
1471,金基德電影中魔幻語境下的敘事內涵－以《末日飛船》為例,The Narrative Connotation in the Magical Conte...,黎煥勤(Huanchin Li),《文化創意產業研究學報》13卷1期(2023/08)Pp. 81-87,韓國導演金基德（Ki-duk Kim）的電影，可以說是韓國藝術電影的代表。他的敘事策略是以寫...,"金基德, 魔幻寫實, 情節分析, 人性解構, 末日飛船"
1472,文化古蹟體驗空間設計與體驗價值之研究,Study on the Experience Space Design and Exper...,吳慧潔(Wai-Kit Ng)\n\n；\n\n陳俊良(Chun-Liang Chen),《文化創意產業研究學報》13卷1期(2023/08)Pp. 89-99,文化古蹟已從保存思維轉為積極的活化經營，因此體驗空間的設計對於改善顧客體驗融合文化創意產業至...,"文化古蹟, 體驗空間, 體驗價值, 服務設計, 文化創意產業"
